L3 cache misses when iterating over a small amount of data

95 Views Asked by At

I'm optimizing a critical part of my program. The algorithm basically iterates over part of the rows of a matrix (M) and applies H += alpha * M[x].

The matrix (M) is 8 MB in size and I'm only iterating over the first quarter of its rows (2 MB), so it should fit completely in the L3 cache. But I still see a large number of L3 misses in perf, proportional to the number of iterations (e.g. 6*max_iter).

So my question is: why do these L3 cache misses happen, and how can I avoid them?

my CPU is AMD epyc 9654 which should be in Zen4 architecture and has 32MB L3 cache per CCX. 96 cores and I disabled hyperthreading.

I also did some system-level tuning:

# cat /proc/cmdline
BOOT_IMAGE=(hd0,gpt2)/vmlinuz-4.18.0-477.27.1.el8_8.x86_64 root=/dev/mapper/rhel-root ro crashkernel=auto rd.lvm.lv=rhel/root rhgb quiet hugepagesz=1G hugepages=4 processor.max_cstate=0 rcu_nocbs=88-95 nohz=on nohz_full=88-95 intel_idle.max_cstate=0 mce=ignore_ce nmi_watchdog=0 transparent_hugepage=never pcie_aspm=performance audit=0 nosoftlockup iommu=off intel_iommu=off nopti nospec_store_bypass_disable nospectre_v2 nospectre_v1 noibrs noibpb nopti lltf=off nospec_store_bypass_disable no_stf_barrier mds=off mitigations=off noht pcie_port_pm=off ipv6.disable=1 ipmi_si.force_kipmid=0 selinux=0 intel_pstat=disable isolcpus=88-95
# perf stat -C 88 -e l3_lookup_state.l3_miss,cache-misses,cs,L1-icache-load-misses taskset -c 88 ./a.out 1 1024 150000

 Performance counter stats for 'CPU(s) 88':

            883961      l3_lookup_state.l3_miss
          44439939      cache-misses
                10      cs
             49068      L1-icache-load-misses

       3.967808823 seconds time elapsed

# perf stat -C 88 -e l3_lookup_state.l3_miss,cache-misses,cs,L1-icache-load-misses taskset -c 88 ./a.out 1 1024 1500000

 Performance counter stats for 'CPU(s) 88':

           8271822      l3_lookup_state.l3_miss
         440835405      cache-misses
                12      cs
            474544      L1-icache-load-misses

      38.803435718 seconds time elapsed

here's the latest simplified code

// Computes H[i] += diff * XX[i] for cnt doubles, using AVX2 FMA on
// 32 doubles (eight YMM vectors) per iteration, with a scalar tail.
//
// Requirements:
//  - H and XX must be 32-byte aligned: _mm256_load_pd/_mm256_store_pd
//    fault on unaligned addresses.
//  - cnt is the element count (doubles), not bytes; negative cnt is a no-op.
//
// The target attribute lets this function compile and use AVX2/FMA
// intrinsics even when the translation unit is built without
// -mavx2 -mfma (gcc/clang extension).
__attribute__((target("avx2,fma")))
static inline void add(double *H, double diff, double *XX, ssize_t cnt) {
  __m256d fac = _mm256_set1_pd(diff);

  // BUG FIX: the original loop used `size_t i` compared against the
  // signed `ssize_t cnt`; a negative cnt would be converted to a huge
  // unsigned value and the loop would run far off the end of both
  // buffers. Keep the induction variable signed, and stop the vector
  // loop before a partial group of 32 (the original also overran when
  // cnt was not a multiple of 32).
  ssize_t i = 0;
  for (; i + 32 <= cnt; i += 32) {
    __m256d h1 = _mm256_load_pd(H + 0);
    __m256d h2 = _mm256_load_pd(H + 4);
    __m256d h3 = _mm256_load_pd(H + 8);
    __m256d h4 = _mm256_load_pd(H + 12);

    __m256d x1 = _mm256_load_pd(XX + 0);
    __m256d x2 = _mm256_load_pd(XX + 4);
    __m256d x3 = _mm256_load_pd(XX + 8);
    __m256d x4 = _mm256_load_pd(XX + 12);

    __m256d h5 = _mm256_load_pd(H + 16);
    __m256d h6 = _mm256_load_pd(H + 20);
    __m256d h7 = _mm256_load_pd(H + 24);
    __m256d h8 = _mm256_load_pd(H + 28);

    __m256d x5 = _mm256_load_pd(XX + 16);
    __m256d x6 = _mm256_load_pd(XX + 20);
    __m256d x7 = _mm256_load_pd(XX + 24);
    __m256d x8 = _mm256_load_pd(XX + 28);

    _mm256_store_pd(H + 0, _mm256_fmadd_pd(fac, x1, h1));
    _mm256_store_pd(H + 4, _mm256_fmadd_pd(fac, x2, h2));
    _mm256_store_pd(H + 8, _mm256_fmadd_pd(fac, x3, h3));
    _mm256_store_pd(H + 12, _mm256_fmadd_pd(fac, x4, h4));

    _mm256_store_pd(H + 16, _mm256_fmadd_pd(fac, x5, h5));
    _mm256_store_pd(H + 20, _mm256_fmadd_pd(fac, x6, h6));
    _mm256_store_pd(H + 24, _mm256_fmadd_pd(fac, x7, h7));
    _mm256_store_pd(H + 28, _mm256_fmadd_pd(fac, x8, h8));

    H += 32;
    XX += 32;
  }

  // Scalar tail for the last cnt % 32 elements (previously skipped or
  // overrun, depending on cnt).
  for (; i < cnt; ++i) {
    *H++ += diff * *XX++;
  }
}

// Repeats the accumulation max_iter times: for each of the first N/4
// rows of the matrix starting at XX (rows are contiguous, stride N
// doubles), does H += alpha * row via add().
void test(size_t max_iter, size_t N, double *H, double *XX, double alpha) {
  const size_t quarter_rows = N / 4;  // only the first quarter of the matrix
  for (size_t iter = 0; iter != max_iter; ++iter) {
    double *row = XX;
    for (size_t r = 0; r != quarter_rows; ++r) {
      add(H, alpha, row, N);
      row += N;  // advance to the next row
    }
  }
}

// Benchmark driver.
//
// Usage: ./a.out N max_iter
//
// Allocates (N+1) x N doubles backed by 1 GiB huge pages; the first N
// doubles serve as the accumulator H, and the following N x N doubles
// are the matrix. Requires the kernel to have been booted with
// hugepagesz=1G hugepages=<n>.
int main(int argc, char** argv) {
  // BUG FIX: the original dereferenced argv[1]/argv[2] unchecked —
  // undefined behavior when run with fewer than two arguments.
  if (argc < 3) {
    fprintf(stderr, "usage: %s N max_iter\n", argv[0]);
    return 1;
  }
  const size_t N = std::stoul(argv[1]);
  const size_t max_iter = std::stoul(argv[2]);
  const size_t bytes = N * (N + 1) * sizeof(double);

  // (30 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT selects 2^30 = 1 GiB pages.
  double *DATA = static_cast<double *>(
      mmap(nullptr, bytes, PROT_READ | PROT_WRITE,
           MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
               ((30 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT),
           -1, 0));
  // BUG FIX: mmap signals failure with MAP_FAILED, not a hand-rolled
  // (double *)-1 cast; also fixed the "failied" typo and dropped the
  // trailing ": " (perror appends its own separator).
  if (MAP_FAILED == DATA) {
    perror("failed to mmap");
    return -1;
  }

  // Pin the pages in RAM. Best-effort: report but don't abort, since
  // the benchmark still runs (just without the pinning guarantee).
  if (mlock(DATA, bytes) != 0) {
    perror("mlock");
  }

  // Row 0 (first N doubles) is H; the matrix starts at DATA + N.
  test(max_iter, N, DATA, DATA + N, 3.14);
  return 0;
}

More perf data as @Jérôme Richard suggested

# perf stat -e ls_any_fills_from_sys.dram_io_far,ls_any_fills_from_sys.dram_io_near,ls_any_fills_from_sys.dram_io_all,ls_hw_pf_dc_fills.all,ls_hw_pf_dc_fills.dram_io_near,ls_hw_pf_dc_fills.dram_io_far,cache-misses  -C 8 .
/a.out 1 1920 30000
T = 1, N = 1920
BLOCK_SIZE = 29568000
tid: 0. XX = 0x7f1cc0012c00
tid:0 elapsed: 2742097495ns

 Performance counter stats for 'CPU(s) 8':

              5273      ls_any_fills_from_sys.dram_io_far                                     (85.71%)
           1522996      ls_any_fills_from_sys.dram_io_near                                     (85.71%)
           1644633      ls_any_fills_from_sys.dram_io_all                                     (85.71%)
         142883985      ls_hw_pf_dc_fills.all                                         (85.71%)
            318971      ls_hw_pf_dc_fills.dram_io_near                                     (85.71%)
               561      ls_hw_pf_dc_fills.dram_io_far                                     (85.72%)
          36719500      cache-misses                                                  (85.72%)

       2.925452367 seconds time elapsed

# perf stat -e ls_any_fills_from_sys.dram_io_far,ls_any_fills_from_sys.dram_io_near,ls_any_fills_from_sys.dram_io_all,ls_hw_pf_dc_fills.all,ls_hw_pf_dc_fills.dram_io_near,ls_hw_pf_dc_fills.dram_io_far,cache-misses  -C 8 ./a.out 1 1920 60000
T = 1, N = 1920
BLOCK_SIZE = 29568000
tid: 0. XX = 0x7f68c0012c00
tid:0 elapsed: 5485570742ns

 Performance counter stats for 'CPU(s) 8':

              7406      ls_any_fills_from_sys.dram_io_far                                     (85.71%)
           3011325      ls_any_fills_from_sys.dram_io_near                                     (85.71%)
           2979624      ls_any_fills_from_sys.dram_io_all                                     (85.71%)
         280931278      ls_hw_pf_dc_fills.all                                         (85.71%)
            560910      ls_hw_pf_dc_fills.dram_io_near                                     (85.71%)
              1490      ls_hw_pf_dc_fills.dram_io_far                                     (85.72%)
          72840549      cache-misses                                                  (85.71%)

       5.669287832 seconds time elapsed
0

There are 0 best solutions below