Why do I still encounter concurrency problems in a lock-free list even though I use the standard lock-free algorithm?


I wrote a fixed-size memory pool (the default block size is the page size, 4 KiB) and a microbenchmark to test its performance. There are two problems:

  1. A double-free problem in the memory pool's destructor.
  2. When the allocate function returns, the page it hands back may already be in use by another thread, which means the concurrency control failed. I can't understand why this happens (see the sketch below).

Below are the AddressSanitizer reports and the source code of the memory pool and the microbenchmark. I would really appreciate any help.
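To make problem 2 concrete: as far as I understand it, allocate and deallocate are just the standard CAS-based pop/push on a singly linked free list. Below is a reduced, standalone sketch of that pattern with simplified, made-up names (one list, no pool selection, no counters); it passes a single-threaded sanity check, and the loops have the same shape as the ones in the real pool further down.

/* free_list_sketch.cc -- simplified illustration, not part of the pool */
#include <atomic>
#include <cassert>
#include <cstdio>

struct Node {
    std::atomic<Node*> next{nullptr};
};

std::atomic<Node*> head{nullptr};

// push: link the node in front of the current head and CAS it in.
void push(Node* n) {
    Node* old_head = head.load();
    do {
        n->next.store(old_head);
    } while (!head.compare_exchange_strong(old_head, n));
}

// pop: take the current head and swing head to its successor.
Node* pop() {
    Node* old_head = head.load();
    while (old_head != nullptr &&
           !head.compare_exchange_strong(old_head, old_head->next.load())) {
        // on failure, old_head already holds the current head; just retry
    }
    return old_head;
}

int main() {
    Node a, b;
    push(&a);
    push(&b);
    assert(pop() == &b);
    assert(pop() == &a);
    assert(pop() == nullptr);
    std::puts("single-threaded sanity check passed");
    return 0;
}

If even this reduced pattern can go wrong under concurrent pop and push, that would explain what I am seeing, but I cannot see where it breaks.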

My memory pool code:

/*phy_page_pool.h*/

#include <atomic>
#include <cassert>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
#include <cstdio>
#include <cstdlib>  // std::aligned_alloc, std::free
#include <new>      // std::bad_alloc

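// Free-list node embedded at the start of every page: `next` sits at offset 16
// (right after head_padding). Once a page has been handed out, the caller may
// overwrite these bytes; the benchmark memsets the whole page with 42 (0x2a).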
struct Page {
    char head_padding[16];
    std::atomic<Page*> next;
    char data_padding[0];
};

class MemoryPool {
   public:
       // default per-pool size = 262144 pages * 4 KiB = 1 GiB (8 GiB total with 8 pools)
       // default page size = 4 KiB
    MemoryPool(size_t num_pools = 8, size_t pagesPerPool = 262144, size_t page_size = 4096)
        : num_pools_(num_pools), page_size_(page_size) {
        local_pools_.resize(num_pools);
        local_remain_pages_.resize(num_pools);
        pages_per_pool_.resize(num_pools, 0);
        for (size_t i = 0; i < num_pools; i++) {
            local_pools_[i] = new std::atomic<Page*>();
            local_remain_pages_[i] = new std::atomic<size_t>(0);
        }
        for (size_t pid = 0; pid < num_pools * pagesPerPool; pid++) {
            Page* page = static_cast<Page*>(std::aligned_alloc(page_size, page_size));
            if (page != nullptr) {
                assert((size_t)page % page_size == 0);
                page->next.store(nullptr);
                size_t pool_idx = ((size_t)page / page_size) % num_pools;
                pages_per_pool_[pool_idx]++;
                page->next.store(
                    local_pools_[pool_idx]->load(std::memory_order_relaxed),
                    std::memory_order_relaxed);
                local_pools_[pool_idx]->store(page, std::memory_order_relaxed);
                (*local_remain_pages_[pool_idx])++;
            } else {
                throw std::bad_alloc();
            }
        }

        for (size_t i = 0; i < num_pools; ++i) {
            assert(*local_remain_pages_[i] == pages_per_pool_[i]);
        }
    }

    ~MemoryPool() {
        size_t i = 0;
        for (size_t i = 0; i < num_pools_; i++) {
            assert(*local_remain_pages_[i] == pages_per_pool_[i]);
        }

        for (std::atomic<Page*>* localPool : local_pools_) {
            Page* page = localPool->load(std::memory_order_seq_cst);
            size_t free_cnt = 0;
            while (page != nullptr) {
                //printf("pool[%zu]: free page, cnt(%zu): %p\n", i, free_cnt, page);
                free_cnt++;
                Page* next = page->next.load(std::memory_order_seq_cst);
                std::free(page);
                page = next;
            }

            //printf("actual free: %zu, expected free: %zu", free_cnt, local_remain_pages_[i]->load());
            assert(free_cnt == local_remain_pages_[i]->load());
            i++;
        }
        for (size_t i = 0; i < num_pools_; i++) {
            delete local_pools_[i];
            delete local_remain_pages_[i];
        }
    }

    void* allocate() {
        size_t richest_pool_idx = 0;
        size_t max_remain_pages = 0;
        // pick the approximately richest pool
        for (size_t i = 0; i < num_pools_; i++) {
            if (*local_remain_pages_[i] > max_remain_pages) {
                max_remain_pages = *local_remain_pages_[i];
                richest_pool_idx = i;
            }
        }

        Page* page = local_pools_[richest_pool_idx]->load(std::memory_order_seq_cst);
        Page* next = page->next.load(std::memory_order_seq_cst);
        assert((size_t)page != 0x2a2a2a2a2a2a2a2a); // in test, we write 0x2a to page
        /*
        while (page != nullptr &&
               !local_pools_[richest_pool_idx]->compare_exchange_weak(
                   page, page->next.load(std::memory_order_relaxed),
                   std::memory_order_acquire, std::memory_order_relaxed))
            ;
            */
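        // Pop the current head of the chosen pool: if the CAS fails because the
        // head changed, reload the head and its successor and try again.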
        
        while (page != nullptr && (size_t)page != 0x2a2a2a2a2a2a2a2a &&
               !local_pools_[richest_pool_idx]->compare_exchange_strong(
                   page, page->next.load(std::memory_order_seq_cst))) {
            page = local_pools_[richest_pool_idx]->load(std::memory_order_seq_cst);
            next = page->next.load(std::memory_order_seq_cst);
        }


        (*local_remain_pages_[richest_pool_idx])--;

        assert((size_t)page != 0x2a2a2a2a2a2a2a2a);
        assert((size_t)next != 0x2a2a2a2a2a2a2a2a);
        return page;
    }

    void deallocate(void* ptr) {
        size_t idx = (size_t)ptr / page_size_ % num_pools_;

        if (ptr == nullptr) {
            return;
        }
        Page* page = static_cast<Page*>(ptr);
        Page* expected = local_pools_[idx]->load(std::memory_order_seq_cst);
        page->next.store(expected, std::memory_order_seq_cst);
        /*
        while (!local_pools_[idx]->compare_exchange_weak(
            expected, page, std::memory_order_acquire,
            std::memory_order_relaxed))
            ;
            */
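        // Push the page back as the new head of its pool: link it to the current
        // head and retry the CAS until it succeeds.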
        
        while (!local_pools_[idx]->compare_exchange_strong(
                    expected, page)) {
            Page* expected = local_pools_[idx]->load(std::memory_order_seq_cst);
            page->next.store(expected, std::memory_order_seq_cst);
        }
            

        (*local_remain_pages_[idx])++;
    }

   private:
    size_t num_pools_;
    size_t page_size_;
    std::vector<size_t> pages_per_pool_;
    std::vector<std::atomic<Page*>*> local_pools_;
    std::vector<std::atomic<size_t> *> local_remain_pages_;
};

The benchmark code:

/*file: mem_pool_benchmark.cc*/
#include <iostream>
#include <memory>
#include <thread>
#include <atomic>
#include <cassert>
#include <vector>
#include <chrono>
#include <cstring>
#include <cstdio>
#include "phy_page_pool.h"

constexpr size_t PAGE_SIZE = 4096;

void memory_pool_test(MemoryPool& pool, size_t ops) {
    std::vector<void*> ptr(ops);
    
    for (size_t i = 0; i < ops; ++i) {
        void* page = pool.allocate();
        ptr[i] = page;
        assert(page != nullptr);
        std::memset(page, 42, PAGE_SIZE);
    }

    for (size_t i = 0; i < ops; ++i) {
        pool.deallocate(ptr[i]);
    }
}

void malloc_test(size_t ops) {
    std::vector<void*> ptr(ops);
    for (size_t i = 0; i < ops; ++i) {
        void* page = std::malloc(PAGE_SIZE);
        ptr[i] = page;
        assert(page != nullptr);
        std::memset(page, 42, PAGE_SIZE);
    }

    for (size_t i = 0; i < ops; ++i) {
        free(ptr[i]);
    }
}

void bench(size_t num_pools, size_t pages_per_pool, size_t num_threads, size_t iterations) {
    printf("=========num pools: %zu, num threads: %zu, ops / thread: %zu\n", num_pools, num_threads, iterations);
    MemoryPool pool(num_pools, pages_per_pool);
    std::vector<std::thread> threads;

    assert(iterations <= pages_per_pool);

    // malloc
    auto start = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < num_threads; ++i) {
        threads.emplace_back(malloc_test, iterations);
    }
    for (auto& t : threads) {
        t.join();
    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    std::cout << "malloc time: " << duration << " ms\n";
    double ops_per_sec = num_threads * iterations / duration * 1000;
    std::cout << "malloc throuputs: " << ops_per_sec << "/s\n";

    start = std::chrono::high_resolution_clock::now();
    threads.clear();
    for (size_t i = 0; i < num_threads; ++i) {
        threads.emplace_back(memory_pool_test, std::ref(pool), iterations);
    }
    for (auto& t : threads) {
        t.join();
    }
    end = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    std::cout << "MemoryPool time: " << duration << " ms\n";
    ops_per_sec = num_threads * iterations / duration * 1000;
    std::cout << "MemoryPool throuputs: " << ops_per_sec << "/s\n";
}

int main() {
    // total pool memory = 2097152 pages * 4 KiB = 8 GiB;
    // even 32 threads * 10000 pages only hold about 1.2 GiB at once, so this is enough
    size_t total_pages = 4194304 / 2; // 2097152 pages = 8 GiB

    for (int pools = 2; pools <= 32; pools *= 2) {
        //bench(pools, total_pages/pools, 1, 10*1000);
        //bench(pools, total_pages/pools, 2, 10*1000);
        bench(pools, total_pages/pools, 4, 10*1000);
        bench(pools, total_pages/pools, 8, 10*1000);
        bench(pools, total_pages/pools, 16, 10*1000);
        bench(pools, total_pages/pools, 32, 10*1000);
    }

    return 0;
}

Compiler version and compile command:

clang version 10.0.0-4ubuntu1 
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin


clang++ -std=c++17  -fsanitize=address  -g  mem_pool_benchmark.cc -lpthread

Log for problem 2:

=========num pools: 2, num threads: 4, ops / thread: 10000
malloc time: 38 ms
malloc throughput: 1.052e+06/s
MemoryPool time: 11 ms
MemoryPool throughput: 3.636e+06/s
=========num pools: 2, num threads: 8, ops / thread: 10000
malloc time: 58 ms
malloc throughput: 1.379e+06/s
AddressSanitizerAddressSanitizer:DEADLYSIGNAL
:DEADLYSIGNAL
=================================================================
==525445==ERROR: AddressSanitizer: SEGV on unknown address (pc 0x0000004cb297 bp 0x7f04deee8230 sp 0x7f04deee80e0 T24)
==525445==The signal is caused by a READ memory access.
a.out: ./phy_page_pool.h:110: void *MemoryPool::allocate(): Assertion `(size_t)next != 0x2a2a2a2a2a2a2a2a' failed.
a.out: ./phy_page_pool.h:110: void *MemoryPool::allocate(): Assertion `(size_t)next != 0x2a2a2a2a2a2a2a2a' failed.
==525445==Hint: this fault was caused by a dereference of a high value address (see register values below).  Dissassemble the provided pc to learn which register was used.
[1]    525445 abort (core dumped)  ./a.out

Log for problem 1:

=========num pools: 2, num threads: 4, ops / thread: 10000
malloc time: 38 ms
malloc throughput: 1.052e+06/s
MemoryPool time: 11 ms
MemoryPool throughput: 3.636e+06/s
=================================================================
==525587==ERROR: AddressSanitizer: heap-use-after-free on address 0x6254ff4c8010 at pc 0x0000004cb2ad bp 0x7ffc80dedff0 sp 0x7ffc80dedfe8
READ of size 8 at 0x6254ff4c8010 thread T0
    #0 0x4cb2ac in std::__atomic_base<Page*>::load(std::memory_order) const /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/atomic_base.h:740:9
    #1 0x4cb2ac in std::atomic<Page*>::load(std::memory_order) const /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/atomic:519:21
    #2 0x4caa9c in MemoryPool::~MemoryPool() /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:62:41
    #3 0x4c7e30 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:77:1
    #4 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
    #5 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16
    #6 0x41c47d in _start (/home/hawk/Codes/user-level-mmap/src/a.out+0x41c47d)

0x6254ff4c8010 is located 16 bytes inside of 4096-byte region [0x6254ff4c8000,0x6254ff4c9000)
freed by thread T0 here:
    #0 0x49493d in free (/home/hawk/Codes/user-level-mmap/src/a.out+0x49493d)
    #1 0x4caab2 in MemoryPool::~MemoryPool() /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:63:17
    #2 0x4c7e30 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:77:1
    #3 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
    #4 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16

previously allocated by thread T0 here:
    #0 0x4954e2 in aligned_alloc (/home/hawk/Codes/user-level-mmap/src/a.out+0x4954e2)
    #1 0x4c9020 in MemoryPool::MemoryPool(unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/./phy_page_pool.h:29:45
    #2 0x4c7415 in bench(unsigned long, unsigned long, unsigned long, unsigned long) /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:45:16
    #3 0x4c8022 in main /home/hawk/Codes/user-level-mmap/src/mem_pool_benchmark.cc:88:3
    #4 0x7f1440aa0082 in __libc_start_main /build/glibc-wuryBv/glibc-2.31/csu/../csu/libc-start.c:308:16

SUMMARY: AddressSanitizer: heap-use-after-free /usr/bin/../lib/gcc/x86_64-linux-gnu/9/../../../../include/c++/9/bits/atomic_base.h:740:9 in std::__atomic_base<Page*>::load(std::memory_order) const
Shadow bytes around the buggy address:
  0x0c4b1fe90fb0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
  0x0c4b1fe90fc0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
  0x0c4b1fe90fd0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
  0x0c4b1fe90fe0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
  0x0c4b1fe90ff0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
=>0x0c4b1fe91000: fd fd[fd]fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0c4b1fe91010: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0c4b1fe91020: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0c4b1fe91030: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0c4b1fe91040: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x0c4b1fe91050: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
Shadow byte legend (one shadow byte represents 8 application bytes):
  Addressable:           00
  Partially addressable: 01 02 03 04 05 06 07 
  Heap left redzone:       fa
  Freed heap region:       fd
  Stack left redzone:      f1
  Stack mid redzone:       f2
  Stack right redzone:     f3
  Stack after return:      f5
  Stack use after scope:   f8
  Global redzone:          f9
  Global init order:       f6
  Poisoned by user:        f7
  Container overflow:      fc
  Array cookie:            ac
  Intra object redzone:    bb
  ASan internal:           fe
  Left alloca redzone:     ca
  Right alloca redzone:    cb
  Shadow gap:              cc
==525587==ABORTING

I have added some asserts and logging to the code, but I cannot find the bug. I have also walked through the lock-free algorithm on paper and still cannot find it.
