I am compiling a C function that repeatedly reads a memory location and writes the values into a memory buffer, and I am trying to get the compiler to generate a single `STM` instruction instead of multiple `STR`s.
The target CPU is a Cortex-M0+, which has neither an instruction prefetch unit nor a cache, so my assumption is that a single `STM` instruction is more economical than multiple `STR`s in terms of instruction fetch cycles.
I am aware of the `-fldm-stm` option, but it only enables the feature; it is not a hint that makes the compiler prefer these instructions.
The reference code is:
#include <stdint.h>
#define port (0x12345678U)
extern uint32_t buf[16];
/*
 * Fills buf with 16 words, each obtained by a separate read of the
 * 32-bit location at address `port`.
 *
 * Every read goes through a volatile-qualified pointer, so all 16 reads
 * must be performed individually and in order.
 */
void myfunc(void)
{
uint32_t *p = buf;
for (uint8_t i=0; i<16; i++)
{
/* One volatile read of `port`, stored to *p; p then advances by one element. */
*(p++) = *(volatile uint32_t *)(port);
}
}
Compile options: -O3 -fldm-stm --target=arm-arm-none-eabi -mcpu=cortex-m0+ -mthumb
Update 1: Considering some good tips in the comments, I changed the code and options, adding a loop-unroll pragma and optimizing for size:
#include <stdint.h>
#define port (0x12345678U)
extern uint32_t buf[16];
/*
 * Fills buf with 16 words, each obtained by a separate read of the
 * 32-bit location at address `port`.
 *
 * Same loop as the reference version, with an unroll pragma added in
 * front of it.
 */
void myfunc(void)
{
uint32_t *p = buf;
/* Loop-unroll request for the for-loop below; 4 is the requested unroll count. */
#pragma unroll (4)
for (uint8_t i=0; i<16; i++)
{
/* One volatile read of `port`, stored to *p; p then advances by one element. */
*(p++) = *(volatile uint32_t *)(port);
}
}
Compile options: -Os -fldm-stm --target=arm-arm-none-eabi -mcpu=cortex-m0+ -mthumb
The compiler still does not use the `STM` instruction.
UPDATE 2: More tweaking, and I am now able to get much closer to the construct I am looking for:
#include <stdint.h>
#define port (0x12345678U)
extern uint32_t buf[16];
/*
 * Fills buf with 16 words, each obtained by a separate read of the
 * 32-bit location at address `port`.
 *
 * The loop is unrolled by hand: each iteration performs four volatile
 * reads into local variables and then four consecutive stores, and the
 * counter advances by 4, giving four iterations in total.
 */
void myfunc(void)
{
/* `register` is only a storage-class hint; the compiler picks the actual registers. */
register uint32_t r0, r1, r2, r3;
uint32_t *p = buf;
for (uint8_t i=0; i<16; i+=4)
{
/* Four back-to-back volatile reads of the same address (the casts to uint32_t do not change the value). */
r0 = (uint32_t) (*(volatile uint32_t *)(port));
r1 = (uint32_t) (*(volatile uint32_t *)(port));
r2 = (uint32_t) (*(volatile uint32_t *)(port));
r3 = (uint32_t) (*(volatile uint32_t *)(port));
/* Four stores to consecutive elements of buf, in the order the values were read. */
*(p++) = r0;
*(p++) = r1;
*(p++) = r2;
*(p++) = r3;
}
}
Compiler Explorer now emits the following loop body:
.LBB0_1:
ldr r3, [r2]
ldr r4, [r2]
ldr r5, [r2]
ldr r6, [r2]
stm r1!, {r3, r4, r5, r6} ;; Bingo!
adds r1, #0 ;; Why do we need this line?
adds r0, r0, #4
cmp r0, #12
blo .LBB0_1
It is not clear to me why the `adds r1, #0` instruction I marked above is required. Any ideas?