Why does gcc choose the most basic memset() implementation?

574 Views Asked by At

My bare-metal program manually calls memset() to zero entire/aligned 4k pages like this (I'm not using uint64_t, but another 8-byte thing):

  uint64_t something[512] __attribute__((aligned(4096)));
  memset(something, 0x0, 4096);

I'm compiling with a command similar to this:

%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
     -O3 \
     -std=gnu99 \
     -nostartfiles \
     --specs=nano.specs \
     -march=armv8.1-a \
     -Wl,--gc-sections \
     -Tlinker_script.lds \
     my_code.c \
     -o my_code.elf

When I disassemble and look at the memset() that is linked in, it's this basic/generic/one-byte-at-a-time implementation:

000000004010399c <memset>:
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (n--)
    4010399c:   d2800003    mov x3, #0x0                    // #0
    401039a0:   eb03005f    cmp x2, x3
    401039a4:   54000041    b.ne    401039ac <memset+0x10>  // b.any
    *s++ = (char) c;

  return m;
}
    401039a8:   d65f03c0    ret
    *s++ = (char) c;
    401039ac:   38236801    strb    w1, [x0, x3]
    401039b0:   91000463    add x3, x3, #0x1
    401039b4:   17fffffb    b   401039a0 <memset+0x4>

I'm expecting an aarch64 optimized version that uses stp or vector instructions. My compiler has a /path/to/gcc-11.1.0/newlib-nano subdirectory.

I've removed --specs=nano.specs and fiddled around with a variety of options, but I'm not sure what I can do here...

HOW CAN I GET THE OPTIMIZED memset() IMPLEMENTATION?

Note that I used /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x to extract the libc_a.memset.o file from lots of different *.a files, but they were all empty: /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a, /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a, /path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a, /path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a, etc... Not the byte-by-byte implementation, just empty. I did this to look for a good implementation, but I clearly don't understand what is going on here...

1

There is 1 best solution below

2
0___________ On BEST ANSWER

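You can see the code here. Your library was compiled with PREFER_SIZE_OVER_SPEED defined (or with -Os, which makes GCC define __OPTIMIZE_SIZE__), so the word-at-a-time path is compiled out and only the byte loop remains. You need to recompile your library without that definition.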

https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c

/*
FUNCTION
    <<memset>>---set an area of memory
INDEX
    memset
ANSI_SYNOPSIS
    #include <string.h>
    void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
TRAD_SYNOPSIS
    #include <string.h>
    void *memset(<[dst]>, <[c]>, <[length]>)
    void *<[dst]>;
    int <[c]>;
    size_t <[length]>;
DESCRIPTION
    This function converts the argument <[c]> into an unsigned
    char and fills the first <[length]> characters of the array
    pointed to by <[dst]> to the value.
RETURNS
    <<memset>> returns the value of <[dst]>.
PORTABILITY
    <<memset>> is ANSI C.
    <<memset>> requires no supporting OS subroutines.
QUICKREF
    memset ansi pure
*/

#include <string.h>

/* Size in bytes of the word used for the fast fill path.  */
#define LBLOCKSIZE (sizeof(long))

/* Nonzero if X is not a multiple of LBLOCKSIZE.  The argument is
   parenthesized so that the cast and the mask apply to the whole
   expression, whatever operators it contains.  */
#define UNALIGNED(X)   ((long)(X) & (LBLOCKSIZE - 1))

/* Nonzero if LEN is smaller than one word, i.e. no whole-word store
   is possible.  */
#define TOO_SMALL(LEN) ((LEN) < (LBLOCKSIZE))

/* Store the value C, converted to char, into each of the first N bytes
   starting at M, and return M.

   Unless PREFER_SIZE_OVER_SPEED or __OPTIMIZE_SIZE__ is defined, the
   fill proceeds in three phases: single bytes until the destination is
   aligned to LBLOCKSIZE, whole unsigned long words while at least one
   word remains, and single bytes for whatever is left.  Otherwise only
   the final byte loop is compiled.  */
_PTR
_DEFUN (memset, (m, c, n),
    _PTR m _AND
    int c _AND
    size_t n)
{
  char *dst = (char *) m;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  int shift;
  unsigned long pattern;
  unsigned long *word;
  /* Keep only the low eight bits in an unsigned variable so that no
     sign extension leaks into the replicated pattern.  */
  unsigned int fill = c & 0xff;

  /* Head: advance byte by byte until DST is word-aligned, giving up
     early if the requested length runs out first.  */
  for (; UNALIGNED (dst); n--)
    {
      if (n == 0)
        return m;
      *dst++ = (char) c;
    }

  if (!TOO_SMALL (n))
    {
      /* DST is aligned here and at least one full word remains.  */
      word = (unsigned long *) dst;

      /* Replicate the fill byte into every byte of PATTERN: first to
         16 bits, then to 32, then keep doubling up to the width of
         unsigned long.  */
      pattern = fill | (fill << 8);
      pattern |= (pattern << 16);
      shift = 32;
      while (shift < LBLOCKSIZE * 8)
        {
          pattern |= (pattern << shift);
          shift <<= 1;
        }

      /* Main loop, four words per iteration.  */
      while (n >= 4 * LBLOCKSIZE)
        {
          word[0] = pattern;
          word[1] = pattern;
          word[2] = pattern;
          word[3] = pattern;
          word += 4;
          n -= 4 * LBLOCKSIZE;
        }

      /* Up to three remaining whole words.  */
      for (; n >= LBLOCKSIZE; n -= LBLOCKSIZE)
        *word++ = pattern;

      /* Hand the leftover bytes to the byte loop below.  */
      dst = (char *) word;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Tail (or the whole fill when the fast path is compiled out).  */
  for (; n != 0; n--)
    *dst++ = (char) c;

  return m;
}