Using Inline Assembly in C for Xilinx Microblaze

Question

I have an application written in C with inline assembly for a Xilinx Microblaze core. My inline assembly has a delay task. Function "_delay_loop_X_x" delays exactly 4 cycles per loop of processor. The input signal determines the number of loop to be made. Function "_NOPx" is to achieve more precision. The function works fine, but at the end of the signal it gives twice extra delay. I'm afraid I use registers incorrectly. Can someone please check my assembly code?

For Microblaze I use this documentation: https://www.xilinx.com/support/documentation/sw_manuals/mb_ref_guide.pdf

Assembler code:

    static __inline__ void _delay_loop_1_x( uint8_t) __attribute__((always_inline));

    static __inline__ void _NOP1 (void) {__asm__ volatile ("nop                  \n\t"            ); } //1 cycle
    static __inline__ void _NOP2 (void) {__asm__ volatile ("beqi r12, 1f \n\t""1:\n\t" ::: "r12", "cc" ); } //2 cycle
    static __inline__ void _NOP3 (void) {__asm__ volatile ("brk r12, r0          \n\t" ::: "r12", "cc" ); } //3 cycle

    static __inline__ void      /* exactly 4 cycles */
    _delay_loop_1_x( uint8_t __n )
    {                                        /* cycles per loop */
        __asm__ volatile (                        
           "   addik r11, r0, 1             \n\t"  /*    1   */
           "1: rsub %[input], r11, %[input] \n\t"  /*    1   */
           "   beqi %[input], 2f            \n\t"  /*    1   */
           "2: bnei %[input], 1b            \n\t"  /*    1   */
           :                                       /*  ----- */
           : [input]"r" (__n)                      /*  ----- */
           : "r11", "cc"                           /*    4   */
       );
    }

    static __inline__ void      /* exactly 4 cycles/loop */
    _delay_loop_2_x( uint16_t __n )
    {                                               /* cycles per loop      */
        __asm__ volatile (                            /* __n..one */
               "   addik r11, r0, 1             \n\t" /*    1   */
               "1: rsub %[loops], r11, %[loops] \n\t" /*    1   */
               "   beqi %[loops], 2f            \n\t" /*    1   */
               "2: bnei %[loops], 1b            \n\t" /*    1   */
               :                                      /*  ----- */
               : [loops]"r" (__n)                     /*  ----- */
               : "r11", "cc"                          /*    4   */
           );
    }

    static __inline__ void
    _delay_cycles(const double __ticks_d)
    {
        uint32_t __ticks = (uint32_t)(__ticks_d);
        uint32_t __padding;
        uint32_t __loops;

        if( __ticks <= 3 )  {       
            __padding = __ticks;

        } else if( __ticks <= 0x400 )  {
            __ticks -= 1;                
            __loops = __ticks / 4;
            __padding = __ticks % 4;
            if( __loops != 0 )
                _delay_loop_1_x( (uint8_t)__loops );

        } else if( __ticks <= 0x40001 )  {
            __ticks -= 2;                  
            __loops = __ticks / 4;
            __padding = __ticks % 4;
            if( __loops != 0 )
                _delay_loop_2_x( (uint16_t)__loops );
        } 

       if( __padding ==  1 )  _NOP1();
       if( __padding ==  2 )  _NOP2();
       if( __padding ==  3 )  _NOP3();
    }

C code:

    #define _delay_ns(__ns)     _delay_cycles( (double)(F_CPU)*((double)__ns)/1.0e9 + 0.5 )
    #define _delay_us(__us)     _delay_cycles( (double)(F_CPU)*((double)__us)/1.0e6 + 0.5 )
    #define _delay_ms(__ms)     _delay_cycles( (double)(F_CPU)*((double)__ms)/1.0e3 + 0.5 )

    #define BIT_DELAY_1        _delay_ns(2070) 
    #define BIT_DELAY_5        _delay_us(19) 
    #define BIT_DELAY_7        _delay_us(26)
    #define RX_TX_DELAY        _delay_us(78) 
    #define SHA204_SWI_FLAG_TX      ((uint8_t) 0x88)

    XGpio GpioPIN;

    uint8_t swi_send_bytes(uint8_t count, uint8_t *buffer);
    uint8_t swi_send_byte(uint8_t value);

    int main()
    {
        init_platform();
        XGpio_Initialize(&GpioPIN, GPIO_PIN_DEVICE_ID);
        XGpio_SetDataDirection(&GpioPIN, PIN_CHANNEL, ~PIN);

        (void) swi_send_byte(SHA204_SWI_FLAG_TX);
        cleanup_platform();
        return 0;
    }

    uint8_t swi_send_byte(uint8_t value)
    {
        return swi_send_bytes(1, &value);
    }

    uint8_t swi_send_bytes(uint8_t count, uint8_t *buffer)
    {
        uint8_t i, bit_mask;

        RX_TX_DELAY;

        for (i = 0; i < count; i++) {
            for (bit_mask = 1; bit_mask > 0; bit_mask <<= 1) {
                if (bit_mask & buffer[i]) {
                    XGpio_DiscreteClear(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_1;
                    XGpio_DiscreteWrite(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_7;
                }
                else {
                    XGpio_DiscreteClear(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_1;
                    XGpio_DiscreteWrite(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_1;
                    XGpio_DiscreteClear(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_1;
                    XGpio_DiscreteWrite(&GpioPIN, PIN_CHANNEL, PIN);
                    BIT_DELAY_5;
                }
            }
        }
        return 0;
    }

My result: https://i.stack.imgur.com/zu1vV.jpg

We are not a debugging service. Use a debugger or simulator. Hint: using floating point for such functions and on such platforms is a really bad idea. And names starting with two underscores are reserved for the implementation in all scopes. Names starting with one underscore are reserved at file-scope. — too honest for this site, Sep 04 '17 at 11:02
That "assembler" code is still C code. You should also disassemble binary to verify the delay code was inlined as expected without adding any housekeeping instructions from C compiler to manage the function call/return (from the way it is written it should mostly work as expected, but using pure ASM would be safer in this regard, the C compiler is not required to inline that, `__inline__` is just hint/recommendation. — Ped7g, Sep 04 '17 at 11:11
@Olaf, of course, I know you are not a debugging service. I debugged my code step by step and everything is working fine. When I am debugging with breakpoints or with step over I have an additional execution of the function. I really do not know what's going on. Thank you very much for the advice. — Joanna14071, Sep 04 '17 at 11:41
@Ped7g, Generally I am working in VHDL and this is my first experience with assembler. If you have a link to a good tutorial how to debug the assembly code, you can send, I read readily. Thank you very much for your help. I'll try to do it. — Joanna14071, Sep 04 '17 at 11:51
Well, as you already know how to step over single instructions in debugger, that's it, I'm not sure what exactly you need (depending on which debugger you use, you may find further tricks how to use it more efficiently). At the moment from your question it is not clear which loop/for what value is showing additional execution and what exactly is happening to you. (you have to consider 99% of readers will not even try to compile+execute your code, so you have to pretty much visualize what is going on for us). — Ped7g, Sep 04 '17 at 12:37
But some general hints: in `_delay_loop...` functions I don't see any benefit of having two of them, and both using non-native input type (I guess microblaze is 32b?). Rather use just one delay with 32b input. `rsub` looks like it will be translated to use register (for the `%[input]` part), and registers are 32b, so maybe the compiler throws in some additional instruction to make sure that code works as `uint8_t`, like doing AND with mask `0x000000FF` first? See in debugger on CPU instruction level view (disassembly), if there are additional instructions and how many. And those loops look 3c. — Ped7g, Sep 04 '17 at 12:45

Ped7g · Accepted Answer · 2017-09-04T17:09:48.973

Thinking about it more I don't really see how those loops should be Nx4 cycles.

static __inline__ void      /* exactly 4 cycles */
_delay_loop_1_x( uint8_t __n )
{                                        /* cycles per loop */
    __asm__ volatile (                        
       "   addik r11, r0, 1             \n\t"  /*    1   */
       "1: rsub %[input], r11, %[input] \n\t"  /*    1   */
       "   beqi %[input], 2f            \n\t"  /*    1   */
       "2: bnei %[input], 1b            \n\t"  /*    1   */
       :                                       /*  ----- */
       : [input]"r" (__n)                      /*  ----- */
       : "r11", "cc"                           /*    4   */
   );
}

Once the C part is over (prologue of function) and the code will start execute the ASM part, I see:

for n = 1 (and let's say compiler will use `r1` for `input`):

The executed instructions (step by step) will go I guess like this:

addik r11, r0, 1             ; r11 = 1 (r0 == fixed zero, right?)
rsub r1, r11, r1             ; r1 = r1 - r11 (i.e. r1 = 0 in this example)
beqi r1, 2f                  ; r1 is zero, so branch to "2" will be taken
bnei r1, 1b                  ; r1 == 0, branch not taken

Then any remaining instructions from C will follow (epilogue of function).

If each instruction is 1 cycle (there's lot of dependencies, so on high frequency modern CPU that would be very unlikely, but if microblaze is low frequency simple RISC architecture with no multi-stage pipeline, then it may work like that), then you have 4 cycles.

for n = 2

addik r11, r0, 1             ; r11 = 1
rsub r1, r11, r1             ; r1 = 1
beqi r1, 2f                  ; r1 != 0, branch not taken
bnei r1, 1b                  ; r1 != 0, branch taken
rsub r1, r11, r1             ; r1 = 0
beqi r1, 2f                  ; r1 == 0, branch taken to bnei
bnei r1, 1b                  ; r1 == 0, branch not taken

That's 7 instructions, not 8. For 8 you need to bnei to the addik to re-set r11 again to 1 for the delay purposes (even if the value is already set in r11).

In any case this makes me wonder whether the timing of this CPU is really that simple (1 instruction = 1 cycle, even when branching), and why don't you simplify the loop to simple countdown (and use rather native 32b type for input):

1: raddik %[input], %[input], -1
   bnei   %[input], 1b

Then you have 2-cycles delay loop.

But the main code using the delays... has lot of C code, which will translate to machine code instructions too, and those needs some time to execute as well, so it's not clear what you are measuring, and why, and if you did take into account execution delay caused by those additional instructions.

UPDATE

About ticks -= 1 crash.. no idea, doesn't make sense, so you have to go into debugger and find out the true reason on machine level.

But the whole thing doesn't make much sense, as you try to split cycle delay argument ticks_d by C expressions, like that block of if (padding == ...). And those conditional tests will take many more cycles then the actual padding value, so there's no point to call the NOPx(); to delay even further, as you are already delayed by several tens of cycles more.

Also now I finally found that double Olaf mentioned, that alone (conversion to integer for the remaining math) of course incurs huge performance penalty, probably even in hundreds of cycles. So the whole void delay_cycles(const double ticks_d) will actually delay many many more cycles than you expect, the ASM part will be negligible in the total time (except that inner loop eating about ~3 cycles per the argument value, that one may ramp up the total delay considerably if the initial argument was big enough).

If you really need instruction-cycle accuracy (not that common, last time I did need this was at 8 bit computers around 1990), you have to write it without C in pure asm, and count also the preparation/logic instructions into the delay - written ideally in fixed-exec-time manner, so calling delay_cycles function will incur fixed overhead for any ticks_d argument value.

You can then count that fixed overhead, and start by doing ticks -= <overhead>; first to have exact delay later. And you must switch from double to integers, as the Microblaze CPU doesn't have FPU unit, so the floating values are emulated by integer math, just the initial conversion to int int ticks = (int)(ticks_d); must cost quite some cycles, you should see that in debugger, if you have disassembly view and step into it by single instruction.

Unfortunately this is rather comment than answer, but it's way too long for simple comment under question. :/ — Ped7g, Sep 04 '17 at 13:08
thank you! Unfortunately, the command 'addik %[loops], r0, -1' does not work. I probably can not use a negative value. And in the 'rsub' command I have to use only registers. But I am very grateful for your help :) — Joanna14071, Sep 04 '17 at 16:06
@Joanna14071 what is the exact error? According to [this](http://www.eit.lth.se/fileadmin/eit/courses/eit070/Laborationer/MicroBlazeIS.pdf) the `ADDIK Rd, Ra, Imm` should do `Rd=Ra+signExtend32(Imm)` where Imm is 16b value, so if your assembler is just being dumb rejecting `-1` due to sign, then use the unsigned variant of the same thing `0xFFFF`. Otherwise it should work, according to that pdf. Mind you, I don't have the actual SW, so I'm just trying to use my asm experience from other platforms, completely depending on google results (which may be inaccurate too). — Ped7g, Sep 04 '17 at 16:49
BTW, I had wrong example of that `addik`, using x86 like arguments, fixed it in the "answer". Yours `addik %[loops], r0, -1` is wrong, that would create every time value `-1` if `r0` is hard zero, to decrement value you have to add -1 to the value, i.e. `addik %[loops], %[loops], -1` — Ped7g, Sep 04 '17 at 16:52
You are absolutely right. The command "1: addik %[loops], %[loops], -1 \n\t" works. Thank you for your help! — Joanna14071, Sep 05 '17 at 08:29
you found a solution! The problem resulted from data conversion. I am very grateful :) Thank you. — Joanna14071, Sep 05 '17 at 08:54

Using Inline Assembly in C for Xilinx Microblaze

1 Answers1

for n = 1 (and let's say compiler will use r1 for input):

for n = 2

for n = 1 (and let's say compiler will use `r1` for `input`):