1

So i'm trying to rewrite a function from c into assembly, this was more of an exercise into writing assembly in C rather than making this more efficient.

The problem I am having is I have working code in three asm() blocks but I can't seem to combine them. I think there must be something i'm missing when combining them.

This is currently the code that works:

137         __asm__ __volatile__ (
138             "mov R0, #1\n\t"
139             "mov R1, %[bb]\n\t"
140             "and R0, R1\n\t"
141             "cmp R0, #1\n\t"    // (b & 1) == 1
142             "bne aftereor\n\t"
143             "eor %[pp], %[pp], %[aa]\n\t"
144             "aftereor:\n\t"
145             "mov %[hbs], %[aa]\n\t"
146             "mov R0, #128 \n\t"
147             "and %[hbs], R0 \n\t"
148             "lsl %[aa], %[aa], #1\n\t"
149             : [pp]"+l" (p),[aa]"+l" (a),[hbs]"=l" (hi_bit_set)
150             : [bb]"l" (b)
151             :
152         );
153         __asm__ __volatile__ (
154             "cmp %[hbs], #128 \n\t"
155             "bne brancha \n\t"
156             "mov R2, #0x1b\n\t"
157             "eor %[aa], %[aa], R2\n\t"
158             "brancha:\n\t"
159             : [aa]"+l" (a)
160             : [hbs]"l" (hi_bit_set)
161             :
162          );
163         __asm__ __volatile__ (
164             "lsr %[bb], %[bb], #1"
165             : [bb]"+l" (b)
166             :
167             :
168         );

This is the C code I am attempting to rewrite in assembly:

if((b & 1) == 1) {
    p ^= a;
}
hi_bit_set = (a & 0x80);
a <<= 1;
if(hi_bit_set == 0x80) {
    a ^= 0x1b;
}
b >>= 1;

Both of the above pieces of code work as expected. However, my problem is when combining the three blocks of assembly into one. For example, the following code does not work as expected for some reason.

137         __asm__ __volatile__ (
138             "mov R0, #1\n\t"
139             "mov R1, %[bb]\n\t"
140             "and R0, R1\n\t"
141             "cmp R0, #1\n\t"    // (b & 1) == 1
142             "bne aftereor\n\t"
143             "eor %[pp], %[pp], %[aa]\n\t"
144             "aftereor:\n\t"
145             "mov %[hbs], %[aa]\n\t"
146             "mov R0, #128 \n\t"
147             "and %[hbs], R0 \n\t"
148             "lsl %[aa], %[aa], #1\n\t"
149             "cmp %[hbs], #128 \n\t"
150             "bne brancha \n\t"
151             "mov R2, #0x1b\n\t"
152             "eor %[aa], %[aa], R2\n\t"
153             "brancha:\n\t"
154             "lsr %[bb], %[bb], #1"
155             : [pp]"+l" (p),[aa]"+l" (a),[hbs]"+l" (hi_bit_set),[bb]"+l" (b)
156             :
157             :
158         );

The only changes I made were combining the 2nd and 3rd blocks into the first, changing the variables 'hi_bit_set' and 'b' to be both read and write. To my understanding this seems fine to me. However this is not producing the correct result so i'm guessing I have missed something.

Thanks in advance for your help.

Dave_Peachy
  • 498
  • 3
  • 12

1 Answers1

2

Did you look at 'early clobber'? The compiler will assign the same registers for input and output and you need to hold some for longer and need them to be separate.

Also, you don't tell the compiler you use 'R0', 'R1', and 'R2' as explicit registers. You should create a 'tmp1' variable and pass it as an input; call it 'R0' and you can use the register asm to actually assign it (or list them as clobbers).

The code is full of many potential optimizations and could probably be 50% the size. However, I will stay faithful to your original but specify registers to make it work.

void foo(uint32_t a, uint32_t b, uint32_t p)
{
  register uint32_t tmp1 asm ("r0");
  register uint32_t tmp2 asm ("r1");
  register uint32_t tmp3 asm ("r2");
  uint32_t hi_bit_set;

    __asm__ __volatile__ (
         "mov R0, #1\n\t"
         "mov R1, %[bb]\n\t"
         "and R0, R1\n\t"
         "cmp R0, #1\n\t"    // (b & 1) == 1
         "bne aftereor\n\t"
         "eor %[pp], %[pp], %[aa]\n\t"
         "aftereor:\n\t"
         "mov %[hbs], %[aa]\n\t"
         "mov R0, #128 \n\t"
         "and %[hbs], R0 \n\t"
         "lsl %[aa], %[aa], #1\n\t"
         "cmp %[hbs], #128 \n\t"
         "bne brancha \n\t"
         "mov R2, #0x1b\n\t"
         "eor %[aa], %[aa], R2\n\t"
         "brancha:\n\t"
         "lsr %[bb], %[bb], #1"
         : [pp]"+l" (p),[aa]"+l" (a),[hbs]"+l" (hi_bit_set),[bb]"+l" (b)
         :
         : "r0", "r1", "r2"  /* THIS IS IMPORTANT! */
     );
}

This output appears good whereas if I do not include the clobber registers, the compiler uses 'R0', etc for other purposes. A primary job of a compiler is to manage the registers. Pushing/poping on a stack is bad (spills), but so are un-needed MOV instructions.

Providing a full function that compiles is always good when asking a question. I tried to make one and you can see how 'GCC' translates your function. You can use the 'carry flag' instead of constants to extract set bit information.

#include <stdint.h>

uint32_t foo(uint32_t *A, uint32_t *B, uint32_t p)
{
  uint32_t a = *A;
  uint32_t b = *B;

  /* codes starts here... */

  if((b & 1) == 1) {
    p ^= a;
  }
  a <<= 1;
  if(a & 0x100) {
     a ^= 0x1b;
  }
  b >>= 1;

  /* codes is done. */


  *A = a;
  *B = b;
  return p;
}

Here is gcc's 16-bit thumb output,

foo(unsigned long*, unsigned long*, unsigned long):
        push    {r4, r5, lr}  ; save callee reg
        ldr     r4, [r1]      ; get 'B' pointer, your [bb] is r4.
        ldr     r3, [r0]      ; get 'A' pointer, your [aa] is r3.

       ; codes starts here...

        lsls    r5, r4, #31
        bpl     .L2
        eors    r2, r3
.L2:
        lsls    r3, r3, #1
        lsls    r5, r3, #23
        bpl     .L3
        movs    r5, #27
        eors    r3, r5
.L3:
        lsrs    r4, r4, #1

       ; code is done

        str     r3, [r0]      ; saving [aa]
        movs    r0, r2        ; r0 is 'pp'
        str     r4, [r1]      ; save [bb] value.
        pop     {r4, r5, pc}  ; restore to callers value.

An assembler coder may prefer local labels for the '.L2' and '.L3' labels above.

artless noise
  • 21,212
  • 6
  • 68
  • 105
  • Hey, thanks for the reply. I understand the clobbers now, I guess I just missed that bit. Regarding your second part of the answer, I am limited to 16-bit thumb instructions so can't do the conditional execution optimisation you suggested. – Dave_Peachy Jan 27 '17 at 15:57
  • Additionally it seems that adding ""r0", "r1", "r2"" to the clobber list makes the compiler move the values of "r0", "r1" and "r2" into other registers before running my code. Is there not a way to instruct the compiler to use any register instead of specific ones? Surely this would be more efficient as we don't have to move registers around before and after the code block. – Dave_Peachy Jan 27 '17 at 16:09
  • [Here is the code](https://godbolt.org/g/Nkv02f) with thumb instructions. The main code is 9 instructions w. 16 bit thumb code. There are ways to get the compiler not to move. For instance list tmp1, tmp2, tmp3 as inputs/outputs and don't name the registers. However, thumb can use `lsls` to get the status of a bit so you don't need to load constants in a register which is why you have all the tmpX values. – artless noise Jan 30 '17 at 13:55