I'm optimizing a critical part of my program. The algorithm basically iterates over part of the rows of a matrix M and applies H += alpha * M[x].
The matrix M is 8 MB in size and I'm only iterating over its first quarter of rows (2 MB), so it should fit completely in the L3 cache. Yet I still see a large number of L3 misses in perf, proportional to the number of iterations (roughly 6 per iteration, i.e. about 6 * max_iter).
So my question is: why do these L3 cache misses happen, and how can I avoid them?
My CPU is an AMD EPYC 9654, which is Zen 4 and has 32 MB of L3 cache per CCX. It has 96 cores, and I disabled hyperthreading (SMT).
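For context, here is a quick back-of-the-envelope check of the working set (a sketch written for this post; N = 1024 matches the perf runs below):

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t N = 1024;                                // matrix dimension used in the runs below
    const std::size_t xx_bytes = (N / 4) * N * sizeof(double); // quarter of the rows, streamed each iteration
    const std::size_t h_bytes  = N * sizeof(double);           // accumulator row H, rewritten on every call
    std::printf("XX slice: %zu KiB, H: %zu KiB\n", xx_bytes >> 10, h_bytes >> 10);
    // Prints "XX slice: 2048 KiB, H: 8 KiB" -- about 2 MiB in total,
    // far below the 32 MiB of L3 on a single CCX.
}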
I also did some system-level tuning:
# cat /proc/cmdline
BOOT_IMAGE=(hd0,gpt2)/vmlinuz-4.18.0-477.27.1.el8_8.x86_64 root=/dev/mapper/rhel-root ro crashkernel=auto rd.lvm.lv=rhel/root rhgb quiet hugepagesz=1G hugepages=4 processor.max_cstate=0 rcu_nocbs=88-95 nohz=on nohz_full=88-95 intel_idle.max_cstate=0 mce=ignore_ce nmi_watchdog=0 transparent_hugepage=never pcie_aspm=performance audit=0 nosoftlockup iommu=off intel_iommu=off nopti nospec_store_bypass_disable nospectre_v2 nospectre_v1 noibrs noibpb nopti lltf=off nospec_store_bypass_disable no_stf_barrier mds=off mitigations=off noht pcie_port_pm=off ipv6.disable=1 ipmi_si.force_kipmid=0 selinux=0 intel_pstat=disable isolcpus=88-95
# perf stat -C 88 -e l3_lookup_state.l3_miss,cache-misses,cs,L1-icache-load-misses taskset -c 88 ./a.out 1 1024 150000
Performance counter stats for 'CPU(s) 88':
883961 l3_lookup_state.l3_miss
44439939 cache-misses
10 cs
49068 L1-icache-load-misses
3.967808823 seconds time elapsed
# perf stat -C 88 -e l3_lookup_state.l3_miss,cache-misses,cs,L1-icache-load-misses taskset -c 88 ./a.out 1 1024 1500000
Performance counter stats for 'CPU(s) 88':
8271822 l3_lookup_state.l3_miss
440835405 cache-misses
12 cs
474544 L1-icache-load-misses
38.803435718 seconds time elapsed
In both runs the L3 miss count comes out to roughly 6 per iteration (883961 / 150000 ≈ 5.9 and 8271822 / 1500000 ≈ 5.5), which is where the 6 * max_iter figure above comes from.
Here's the latest simplified code:
#include <immintrin.h>
#include <sys/mman.h>
#include <cstddef>
#include <cstdio>
#include <string>

// H[i] += diff * XX[i] over cnt doubles, processing 32 elements (4 cache lines)
// per loop iteration. Loads and FMAs are interleaved by hand; _mm256_load_pd
// requires 32-byte alignment, which the huge-page mmap in main() guarantees.
static inline void add(double *H, double diff, double *XX, size_t cnt) {
    __m256d fac = _mm256_set1_pd(diff);
    for (size_t i = 0; i < cnt; i += 32) {
        __m256d h1 = _mm256_load_pd(H + 0);
        __m256d h2 = _mm256_load_pd(H + 4);
        __m256d h3 = _mm256_load_pd(H + 8);
        __m256d h4 = _mm256_load_pd(H + 12);
        __m256d x1 = _mm256_load_pd(XX + 0);
        __m256d x2 = _mm256_load_pd(XX + 4);
        __m256d x3 = _mm256_load_pd(XX + 8);
        __m256d x4 = _mm256_load_pd(XX + 12);
        __m256d h5 = _mm256_load_pd(H + 16);
        __m256d h6 = _mm256_load_pd(H + 20);
        __m256d h7 = _mm256_load_pd(H + 24);
        __m256d h8 = _mm256_load_pd(H + 28);
        __m256d x5 = _mm256_load_pd(XX + 16);
        __m256d x6 = _mm256_load_pd(XX + 20);
        __m256d x7 = _mm256_load_pd(XX + 24);
        __m256d x8 = _mm256_load_pd(XX + 28);
        _mm256_store_pd(H + 0,  _mm256_fmadd_pd(fac, x1, h1));
        _mm256_store_pd(H + 4,  _mm256_fmadd_pd(fac, x2, h2));
        _mm256_store_pd(H + 8,  _mm256_fmadd_pd(fac, x3, h3));
        _mm256_store_pd(H + 12, _mm256_fmadd_pd(fac, x4, h4));
        _mm256_store_pd(H + 16, _mm256_fmadd_pd(fac, x5, h5));
        _mm256_store_pd(H + 20, _mm256_fmadd_pd(fac, x6, h6));
        _mm256_store_pd(H + 24, _mm256_fmadd_pd(fac, x7, h7));
        _mm256_store_pd(H + 28, _mm256_fmadd_pd(fac, x8, h8));
        H += 32;
        XX += 32;
    }
}
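// For reference, a plain scalar equivalent of add() (added for this post to
// show what the intrinsics compute; it is not the code that was profiled):
static inline void add_scalar(double *H, double diff, const double *XX, size_t cnt) {
    for (size_t i = 0; i < cnt; i++)
        H[i] += diff * XX[i]; // one fused multiply-add per element
}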
// Each outer iteration re-reads and rewrites the same H row (N doubles) while
// streaming through the first N/4 rows of XX.
void test(size_t max_iter, size_t N, double *H, double *XX, double alpha) {
    for (size_t iter = 0; iter < max_iter; iter++) {
        for (size_t x = 0; x < N / 4; x++) {
            double *curXX = XX + x * N;
            add(H, alpha, curXX, N);
        }
    }
}
int main(int argc, char **argv) {
    // The full program takes T (thread count), N and max_iter; this simplified
    // single-threaded version ignores argv[1] (T = 1 in all runs shown here).
    size_t N = std::stoul(argv[2]);
    size_t max_iter = std::stoul(argv[3]);
    // One H row plus N matrix rows, backed by 1 GiB huge pages
    // (30 = log2(1 GiB), i.e. the MAP_HUGE_1GB flag spelled out).
    double *DATA = (double *)mmap(nullptr, N * (N + 1) * sizeof(double),
                                  PROT_READ | PROT_WRITE,
                                  MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB |
                                      ((30 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT),
                                  -1, 0);
    if (MAP_FAILED == DATA) {
        perror("failed to mmap");
        return -1;
    }
    mlock(DATA, N * (N + 1) * sizeof(double));
    test(max_iter, N, DATA, DATA + N, 3.14); // H = first row, XX = the N rows after it
}
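For completeness: the code uses AVX2/FMA intrinsics, so it has to be built with those extensions enabled; something like the following (an illustrative command line, not necessarily my exact flags):

# g++ -O3 -mavx2 -mfma -o a.out main.cpp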
More perf data, as @Jérôme Richard suggested:
# perf stat -e ls_any_fills_from_sys.dram_io_far,ls_any_fills_from_sys.dram_io_near,ls_any_fills_from_sys.dram_io_all,ls_hw_pf_dc_fills.all,ls_hw_pf_dc_fills.dram_io_near,ls_hw_pf_dc_fills.dram_io_far,cache-misses -C 8 ./a.out 1 1920 30000
T = 1, N = 1920
BLOCK_SIZE = 29568000
tid: 0. XX = 0x7f1cc0012c00
tid:0 elapsed: 2742097495ns
Performance counter stats for 'CPU(s) 8':
5273 ls_any_fills_from_sys.dram_io_far (85.71%)
1522996 ls_any_fills_from_sys.dram_io_near (85.71%)
1644633 ls_any_fills_from_sys.dram_io_all (85.71%)
142883985 ls_hw_pf_dc_fills.all (85.71%)
318971 ls_hw_pf_dc_fills.dram_io_near (85.71%)
561 ls_hw_pf_dc_fills.dram_io_far (85.72%)
36719500 cache-misses (85.72%)
2.925452367 seconds time elapsed
# perf stat -e ls_any_fills_from_sys.dram_io_far,ls_any_fills_from_sys.dram_io_near,ls_any_fills_from_sys.dram_io_all,ls_hw_pf_dc_fills.all,ls_hw_pf_dc_fills.dram_io_near,ls_hw_pf_dc_fills.dram_io_far,cache-misses -C 8 ./a.out 1 1920 60000
T = 1, N = 1920
BLOCK_SIZE = 29568000
tid: 0. XX = 0x7f68c0012c00
tid:0 elapsed: 5485570742ns
Performance counter stats for 'CPU(s) 8':
7406 ls_any_fills_from_sys.dram_io_far (85.71%)
3011325 ls_any_fills_from_sys.dram_io_near (85.71%)
2979624 ls_any_fills_from_sys.dram_io_all (85.71%)
280931278 ls_hw_pf_dc_fills.all (85.71%)
560910 ls_hw_pf_dc_fills.dram_io_near (85.71%)
1490 ls_hw_pf_dc_fills.dram_io_far (85.72%)
72840549 cache-misses (85.71%)
5.669287832 seconds time elapsed
Again everything scales linearly with max_iter: roughly 50 ls_any_fills_from_sys.dram_io_all and ~1200 cache-misses per iteration in both runs.