avoid memory errors with AVX intinsics

71 Views Asked by At

I've been trying to speed up some neural network computations using AVX instructions. However, I keep running into the following error "Unhandled exception at [...]: Access violation reading location [...]".

I tried to isolate the issue but as the memory seems to be corrupted somewhere, the error doesn't show up at the same place each time, and i suspect it be misleading. Does anyone know what might cause the issue?

Here is some reproducible code:

#include <immintrin.h>
#include <cmath>
#include <iostream>
#include <array>
#include <vector>

inline const int num_avx_registers = 16;
inline const int floats_per_reg = 4;

inline const int HKP_size = 100;
inline constexpr int acc_size = 256;

class NNLayer {
    public:
    alignas(32) float* weight;
    alignas(32) float* bias;

    NNLayer(){
        weight = new float[HKP_size * acc_size]; // flattened 2D array.
        bias = new float[acc_size];


        // initialize the weights and bias with test values
        for (uint32_t i=0; i<HKP_size * acc_size; i++){
            weight[i] = 1.F;
        }

        for (int i=0; i<acc_size; i++){
            bias[i] = static_cast<float>(i);
        }
    }

    ~NNLayer(){
        delete[] weight;
        delete[] bias;
    }
};

class Accumulator {
    public:
    alignas(32) std::array<float, acc_size> accumulator_w;
    alignas(32) std::array<float, acc_size> accumulator_b;

    std::array<float, acc_size>& Accumulator::operator[](bool color){
        return color ? accumulator_w : accumulator_b;
    }
};

class NNUE {
    public:
    Accumulator accumulator;
    NNLayer first_layer = NNLayer();

    void compute_accumulator(const std::vector<int> active_features, bool color){
        // we have 256 floats to process.
        // there are 16 avx registers, and each can hold 4 floats.
        // therefore we need to do 256/64 = 4 passes to the registers.

        constexpr int c_size = num_avx_registers * floats_per_reg; //chunk size
        constexpr int num_chunks = acc_size / c_size;
        
        static_assert(acc_size % c_size == 0);

        __m256 avx_regs[num_avx_registers];

        // we process 1/4th of the whole data at each loop.
        // we add c_idx to the indexes pick up where we left off at the last chunk.
        for (int c_idx = 0; c_idx < num_chunks*c_size; c_idx += c_size){ // chunk index

            // load the bias from memory
            for (int i = 0; i < num_avx_registers; i++){
                avx_regs[i] = _mm256_load_ps(&first_layer.bias[c_idx + i*floats_per_reg]);
            }

            // add the active weights
            for (const int &a: active_features){
                for (int i = 0; i < num_avx_registers; i++){
                    // a*acc_size is to get the a-th row of the flattened 2D array.
                    avx_regs[i] = _mm256_add_ps(
                        avx_regs[i],
                        _mm256_load_ps(&first_layer.weight[a*acc_size + c_idx + i*floats_per_reg])
                        );
                }
            }

            //store the result in the accumulator
            for (int i = 0; i < num_avx_registers; i++){
                _mm256_store_ps(&accumulator[color][c_idx + i*floats_per_reg], avx_regs[i]);
            }
        }
    }
};

int main(){
    NNUE nnue;

    std::vector<int> act_f = {2, 1, 70, 62};
    nnue.compute_accumulator(act_f, true);

    std::cout << "still alive\n";
    return 0;
}
1

There are 1 best solutions below

4
chtz On

alignas(32) float* weight; only aligns the address where the pointer is stored, not the pointed-to-memory. If you want to create aligned memory with new in C++17 you can write:

weight = new (std::align_val_t(32)) float[HKP_size * acc_size];