I wrote a fixed-size memory pool (the default block size is the page size, 4 KiB) and a microbenchmark for performance testing. There are two problems:
- A double free in the memory pool's destructor.
- When allocate() returns, the page may already be in use by another thread, which means the concurrency control failed, but I can't understand why this happens.
Below are the AddressSanitizer reports and the source code of the memory pool and the microbenchmark. I would really appreciate it if someone could help.
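To make problem 2 concrete, this is the property I expect from the pool (an illustrative sketch only, not the real benchmark; pool and PAGE_SIZE are as in the code below):

// Illustrative only: the invariant I expect allocate()/deallocate() to provide.
void* p = pool.allocate();      // p should now be owned exclusively by this thread
std::memset(p, 42, PAGE_SIZE);  // no other thread's allocate() should return p here
pool.deallocate(p);             // only after this may another thread receive p again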
My memory pool code:
/*phy_page_pool.h*/
#include <atomic>
#include <cassert>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
#include <cstdio>
#include <cstdlib>
struct Page {
  char head_padding[16];
  std::atomic<Page*> next;
  char data_padding[0];
};
class MemoryPool {
 public:
  // default pool size = 1 GiB per pool = 262144 pages * 4096 B
  // default page size = 4 KiB
  MemoryPool(size_t num_pools = 8, size_t pagesPerPool = 262144, size_t page_size = 4096)
      : num_pools_(num_pools), page_size_(page_size) {
    local_pools_.resize(num_pools);
    local_remain_pages_.resize(num_pools);
    pages_per_pool_.resize(num_pools, 0);
    for (size_t i = 0; i < num_pools; i++) {
      local_pools_[i] = new std::atomic<Page*>();
      local_remain_pages_[i] = new std::atomic<size_t>(0);
    }
    for (size_t pid = 0; pid < num_pools * pagesPerPool; pid++) {
      Page* page = static_cast<Page*>(std::aligned_alloc(page_size, page_size));
      page->next.store(nullptr);
      assert((size_t)page % page_size == 0);
      if (page != nullptr) {
        size_t pool_idx = ((size_t)page / page_size) % num_pools;
        pages_per_pool_[pool_idx]++;
        page->next.store(
            local_pools_[pool_idx]->load(std::memory_order_relaxed),
            std::memory_order_relaxed);
        local_pools_[pool_idx]->store(page, std::memory_order_relaxed);
        (*local_remain_pages_[pool_idx])++;
      } else {
        throw std::bad_alloc();
      }
    }
    for (size_t i = 0; i < num_pools; ++i) {
      assert(*local_remain_pages_[i] == pages_per_pool_[i]);
    }
  }
  ~MemoryPool() {
    size_t i = 0;
    for (size_t i = 0; i < num_pools_; i++) {
      assert(*local_remain_pages_[i] == pages_per_pool_[i]);
    }
    for (std::atomic<Page*>* localPool : local_pools_) {
      Page* page = localPool->load(std::memory_order_seq_cst);
      size_t free_cnt = 0;
      while (page != nullptr) {
        //printf("pool[%zu]: free page, cnt(%zu): %p\n", i, free_cnt, page);
        free_cnt++;
        Page* next = page->next.load(std::memory_order_seq_cst);
        std::free(page);
        page = next;
      }
      //printf("actual free: %zu, expected free: %zu", free_cnt, local_remain_pages_[i]->load());
      assert(free_cnt == local_remain_pages_[i]->load());
      i++;
    }
    for (size_t i = 0; i < num_pools_; i++) {
      delete local_pools_[i];
      delete local_remain_pages_[i];
    }
  }
  void* allocate() {
    size_t richest_pool_idx = 0;
    size_t max_remain_pages = 0;
    // pick the (approximately) richest pool
    for (size_t i = 0; i < num_pools_; i++) {
      if (*local_remain_pages_[i] > max_remain_pages) {
        max_remain_pages = *local_remain_pages_[i];
        richest_pool_idx = i;
      }
    }
    Page* page = local_pools_[richest_pool_idx]->load(std::memory_order_seq_cst);
    Page* next = page->next.load(std::memory_order_seq_cst);
    assert((size_t)page != 0x2a2a2a2a2a2a2a2a);  // in the test, we memset pages with 0x2a
    /*
    while (page != nullptr &&
           !local_pools_[richest_pool_idx]->compare_exchange_weak(
               page, page->next.load(std::memory_order_relaxed),
               std::memory_order_acquire, std::memory_order_relaxed))
      ;
    */
    while (page != nullptr && (size_t)page != 0x2a2a2a2a2a2a2a2a &&
           !local_pools_[richest_pool_idx]->compare_exchange_strong(
               page, page->next.load(std::memory_order_seq_cst))) {
      page = local_pools_[richest_pool_idx]->load(std::memory_order_seq_cst);
      next = page->next.load(std::memory_order_seq_cst);
    }
    (*local_remain_pages_[richest_pool_idx])--;
    assert((size_t)page != 0x2a2a2a2a2a2a2a2a);
    assert((size_t)next != 0x2a2a2a2a2a2a2a2a);
    return page;
  }
  void deallocate(void* ptr) {
    size_t idx = (size_t)ptr / page_size_ % num_pools_;
    if (ptr == nullptr) {
      return;
    }
    Page* page = static_cast<Page*>(ptr);
    Page* expected = local_pools_[idx]->load(std::memory_order_seq_cst);
    page->next.store(expected, std::memory_order_seq_cst);
    /*
    while (!local_pools_[idx]->compare_exchange_weak(
        expected, page, std::memory_order_acquire,
        std::memory_order_relaxed))
      ;
    */
    while (!local_pools_[idx]->compare_exchange_strong(
        expected, page)) {
      Page* expected = local_pools_[idx]->load(std::memory_order_seq_cst);
      page->next.store(expected, std::memory_order_seq_cst);
    }
    (*local_remain_pages_[idx])++;
  }
 private:
  size_t num_pools_;
  size_t page_size_;
  std::vector<size_t> pages_per_pool_;
  std::vector<std::atomic<Page*>*> local_pools_;
  std::vector<std::atomic<size_t>*> local_remain_pages_;
};
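For reference, this is how pages end up in a given pool: the constructor bins each page by its address, so the per-pool counts depend on whatever aligned_alloc happens to return. A standalone sketch of the same arithmetic (the address value is made up):

// Same computation as pool_idx in the constructor and idx in deallocate().
size_t page_size = 4096, num_pools = 8;
size_t addr = 0x7f1440a03000;                      // hypothetical page address
size_t pool_idx = (addr / page_size) % num_pools;  // 0x7f1440a03 % 8 == 3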
The benchmark code:
/*file: mem_pool_benchmark.cc*/
#include <iostream>
#include <memory>
#include <thread>
#include <atomic>
#include <cassert>
#include <vector>
#include <chrono>
#include <cstring>
#include <cstdio>
#include <cstdlib>
#include "phy_page_pool.h"
constexpr size_t PAGE_SIZE = 4096;

void memory_pool_test(MemoryPool& pool, size_t ops) {
  void* ptr[ops];
  for (size_t i = 0; i < ops; ++i) {
    void* page = pool.allocate();
    ptr[i] = page;
    assert(page != nullptr);
    std::memset(page, 42, PAGE_SIZE);
  }
  for (size_t i = 0; i < ops; ++i) {
    pool.deallocate(ptr[i]);
  }
}

void malloc_test(size_t ops) {
  void* ptr[ops];
  for (size_t i = 0; i < ops; ++i) {
    void* page = std::malloc(PAGE_SIZE);
    ptr[i] = page;
    assert(page != nullptr);
    std::memset(page, 42, PAGE_SIZE);
  }
  for (size_t i = 0; i < ops; ++i) {
    free(ptr[i]);
  }
}

void bench(size_t num_pools, size_t pages_per_pool, size_t num_threads, size_t iterations) {
  printf("=========num pools: %zu, num threads: %zu, ops / thread: %zu\n", num_pools, num_threads, iterations);
  MemoryPool pool(num_pools, pages_per_pool);
  std::vector<std::thread> threads;
  assert(iterations <= pages_per_pool);
  // malloc
  auto start = std::chrono::high_resolution_clock::now();
  for (size_t i = 0; i < num_threads; ++i) {
    threads.emplace_back(malloc_test, iterations);
  }
  for (auto& t : threads) {
    t.join();
  }
  auto end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "malloc time: " << duration << " ms\n";
  double ops_per_sec = num_threads * iterations / duration * 1000;
  std::cout << "malloc throughput: " << ops_per_sec << "/s\n";
  start = std::chrono::high_resolution_clock::now();
  threads.clear();
  for (size_t i = 0; i < num_threads; ++i) {
    threads.emplace_back(memory_pool_test, std::ref(pool), iterations);
  }
  for (auto& t : threads) {
    t.join();
  }
  end = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
  std::cout << "MemoryPool time: " << duration << " ms\n";
  ops_per_sec = num_threads * iterations / duration * 1000;
  std::cout << "MemoryPool throughput: " << ops_per_sec << "/s\n";
}

int main() {
  // pool size = 16GiB * 1;
  // max access size if not free = 12G;
  // it's enough
  size_t total_pages = 4194304 / 2;  // 8 GiB
  for (int pools = 2; pools <= 32; pools *= 2) {
    //bench(pools, total_pages/pools, 1, 10*1000);
    //bench(pools, total_pages/pools, 2, 10*1000);
    bench(pools, total_pages/pools, 4, 10*1000);
    bench(pools, total_pages/pools, 8, 10*1000);
    bench(pools, total_pages/pools, 16, 10*1000);
    bench(pools, total_pages/pools, 32, 10*1000);
  }
  return 0;
}
The compiler version and compile command:
clang version 10.0.0-4ubuntu1
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
clang++ -std=c++17 -fsanitize=address -g mem_pool_benchmark.cc -lpthread
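For completeness, the ThreadSanitizer variant of the same build would presumably be the following (all logs shown below are from the ASan build only):
clang++ -std=c++17 -fsanitize=thread -g mem_pool_benchmark.cc -lpthread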
Log for problem 2:
=========num pools: 2, num threads: 4, ops / thread: 10000
malloc time: 38 ms
malloc throughput: 1.052e+06/s
MemoryPool time: 11 ms
MemoryPool throughput: 3.636e+06/s
=========num pools: 2, num threads: 8, ops / thread: 10000
malloc time: 58 ms
malloc throughput: 1.379e+06/s
AddressSanitizerAddressSanitizer:DEADLYSIGNAL
:DEADLYSIGNAL
=================================================================
==525445==ERROR: AddressSanitizer: SEGV on unknown address (pc 0x0000004cb297 bp 0x7f04deee8230 sp 0x7f04deee80e0 T24)
==525445==The signal is caused by a READ memory access.
a.out: ./phy_page_pool.h:110: void *MemoryPool::allocate(): Assertion `(size_t)next != 0x2a2a2a2a2a2a2a2a' failed.
a.out: ./phy_page_pool.h:110: void *MemoryPool::allocate(): Assertion `(size_t)next != 0x2a2a2a2a2a2a2a2a' failed.
==525445==Hint: this fault was caused by a dereference of a high value address (see register values below). Dissassemble the provided pc to learn which register was used.
[1] 525445 abort (core dumped) ./a.out
Log for problem 1:
=========num pools: 2, num threads: 4, ops / thread: 10000
malloc time: 38 ms
malloc throughput: 1.052e+06/s
MemoryPool time: 11 ms
MemoryPool throughput: 3.636e+06/s
=================================================================
==525587==ERROR: AddressSanitizer: heap-use-after-free on address 0x6254ff4c8010 at pc 0x0000004cb2ad bp 0x7ffc80dedff0 sp 0x7ffc80dedfe8
READ of size 8 at 0x6254ff4c8010 thread T0
#0 0x4cb2ac in std::__atomic_base<Page*>::load(std::memory_order) const /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/atomic_base.h:740:9
#1 0x4cb2ac in std::atomic<Page*>::load(std::memory_order) const /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/atomic:519:21
#2 0x4caa9c in MemoryPool::~MemoryPool() /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:62:41
#3 0x4c7e30 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:77:1
#4 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
#5 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16
#6 0x41c47d in _start (/home/hawk/Codes/user-level-mmap/src/a.out+0x41c47d)
0x6254ff4c8010 is located 16 bytes inside of 4096-byte region [0x6254ff4c8000,0x6254ff4c9000)
freed by thread T0 here:
#0 0x49493d in free (/home/hawk/Codes/user-level-mmap/src/a.out+0x49493d)
#1 0x4caab2 in MemoryPool::~MemoryPool() /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:63:17
#2 0x4c7e30 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:77:1
#3 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
#4 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16
previously allocated by thread T0 here:
#0 0x4954e2 in aligned_alloc (/home/hawk/Codes/user-level-mmap/src/a.out+0x4954e2)
#1 0x4c9020 in MemoryPool::MemoryPool(unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:29:45
#2 0x4c7415 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:45:16
#3 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
#4 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16
SUMMARY: AddressSanitizer: heap-use-after-free /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/atomic_base.h:740:9 in std::__atomic_base<Page*>::load(std::memory_order) const
Shadow bytes around the buggy address:
0x0c4b1fe90fb0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4b1fe90fc0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4b1fe90fd0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4b1fe90fe0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
0x0c4b1fe90ff0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
=>0x0c4b1fe91000: fd fd[fd]fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c4b1fe91010: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c4b1fe91020: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c4b1fe91030: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c4b1fe91040: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
0x0c4b1fe91050: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
Shadow gap: cc
==525587==ABORTING
I have added some asserts and logging to the code, but I cannot find the bug. I have also analyzed the lock-free algorithm and still cannot find it.
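For reference, this is the textbook CAS-based lock-free stack pop I compared my allocate() against (my own sketch, the names are mine; I may be misunderstanding something here):

// Sketch of the standard lock-free pop pattern I had in mind when writing allocate().
Page* pop(std::atomic<Page*>& head) {
  Page* old_head = head.load(std::memory_order_acquire);
  while (old_head != nullptr &&
         !head.compare_exchange_weak(old_head,
                                     old_head->next.load(std::memory_order_relaxed),
                                     std::memory_order_acquire,
                                     std::memory_order_relaxed)) {
    // on failure, compare_exchange_weak reloads the current head into old_head
  }
  return old_head;  // nullptr means the list was empty
}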