How to get the ARM compiler to use the STM instruction instead of STR?

255 Views Asked by At

Compiling a C function that reads a memory location repeatedly and writes into a memory buffer, I am trying to get the compiler to generate code using STM instruction instead of multiple STRs.

The target CPU is a Cortex-M0+, which has neither an instruction prefetch unit nor a cache, so the assumption is that a single STM instruction is more economical than multiple STRs in terms of instruction fetch cycles.

I am aware of the -fldm-stm option, but it only enables the feature; it is not a hint that makes the compiler prefer STM over STR.

The reference code is:

#include <stdint.h>

#define port (0x12345678U)
extern uint32_t buf[16];

/*
 * Reference version: reads the 32-bit word at address `port` 16 times and
 * stores the values into consecutive elements of buf[0..15].
 */
void myfunc(void)
{
    /* Write cursor into buf; advanced by one word per store. */
    uint32_t *p = buf;

    for (uint8_t i=0; i<16; i++)
    {
        /* The volatile-qualified access makes every iteration perform its
         * own read of `port`; the value is stored through p, which is then
         * post-incremented. */
        *(p++) = *(volatile uint32_t *)(port);
    }
}

Compile options: -O3 -fldm-stm --target=arm-arm-none-eabi -mcpu=cortex-m0+ -mthumb


Update 1: Considering some good tips in the comments, I changed the code and options, adding a loop-unroll pragma and optimizing for size:

#include <stdint.h>

#define port (0x12345678U)
extern uint32_t buf[16];

/*
 * Update 1 version: same behavior as the reference code (16 volatile reads
 * of `port` stored into buf[0..15]), with an unroll pragma placed ahead of
 * the loop.
 */
void myfunc(void)
{
    /* Write cursor into buf; advanced by one word per store. */
    uint32_t *p = buf;

/* Loop-unroll pragma with a count of 4, intended for the loop that follows. */
#pragma unroll (4)

    for (uint8_t i=0; i<16; i++)
    {
        /* One volatile read of `port` per iteration, stored through p. */
        *(p++) = *(volatile uint32_t *)(port);
    }
}

Compile options: -Os -fldm-stm --target=arm-arm-none-eabi -mcpu=cortex-m0+ -mthumb

Still the compiler won't use the STM instruction.


UPDATE 2: More tweaking, and I am now able to get much closer to the construct I am looking for:

#include <stdint.h>

#define port (0x12345678U)
extern uint32_t buf[16];

/*
 * Update 2 version: still performs 16 volatile reads of `port` and stores
 * them into buf[0..15], but the work is batched by hand: each loop pass does
 * four reads into temporaries, followed by four consecutive stores.
 */
void myfunc(void)
{
    /* Four temporaries holding one batch of values read from `port`. */
    register uint32_t r0, r1, r2, r3;
    /* Write cursor into buf; advanced by one word per store. */
    uint32_t *p = buf;

    /* i advances by 4, so the body runs four times (i = 0, 4, 8, 12). */
    for (uint8_t i=0; i<16; i+=4)
    {
         /* Four separate volatile reads of `port`, all done before any store. */
         r0 = (uint32_t) (*(volatile uint32_t *)(port));
         r1 = (uint32_t) (*(volatile uint32_t *)(port));
         r2 = (uint32_t) (*(volatile uint32_t *)(port));
         r3 = (uint32_t) (*(volatile uint32_t *)(port));
        /* Four back-to-back stores to ascending addresses, in read order. */
        *(p++) = r0;
        *(p++) = r1;
        *(p++) = r2;
        *(p++) = r3;
    }
}

Compiler Explorer now emits the following loop body:

.LBB0_1:
    ldr     r3, [r2]
    ldr     r4, [r2]
    ldr     r5, [r2]
    ldr     r6, [r2]
    stm     r1!, {r3, r4, r5, r6}  ;; Bingo!
    adds    r1, #0                 ;; Why do we need this line?
    adds    r0, r0, #4
    cmp     r0, #12
    blo     .LBB0_1

It is not clear to me why the `adds r1, #0` instruction that I pointed out above is required. Any idea?

0

There are 0 best solutions below