Determinant of a small complex matrix within a GPU thread

192 Views Asked by At

The problem is to compute the determinant of a complex matrix (5x5, 4x4, and 3x3) from a device function within a particular thread. Efficiency is crucial, as the original problem involves computing thousands of such matrices in a for loop within each thread.

Here is a working example using direct expansion (run with arg: 0) and cofactor expansion (run with arg: 1) methods:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5

__host__ __device__ static __inline__  cuDoubleComplex cuCfms( cuDoubleComplex x, 
                                                               cuDoubleComplex y, 
                                                               cuDoubleComplex d){
    double real_res;
    double imag_res;
    
    real_res = (cuCreal(x) *  cuCreal(y)) - cuCreal(d);
    imag_res = (cuCreal(x) *  cuCimag(y)) - cuCimag(d);
            
    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
     
    return make_cuDoubleComplex(real_res, imag_res);
}

__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v), 
                                 cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
                                                               cuDoubleComplex y,
                                                               cuDoubleComplex u,
                                                               cuDoubleComplex v,
                                                               cuDoubleComplex w) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}


__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
    cuDoubleComplex det;

    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);


    cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]), 
                                                   cuCadd(  cuC_5neg(B[13],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]), 
                                                    cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );

    det1 = cuCasas(det11, det12, det13, det14);


    cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );

    det2 = cuCasas(det21, det22, det23, det24);


    cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );


    det3 = cuCasas(det31, det32, det33, det34);


    cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det4 = cuCasas(det41, det42, det43, det44);


    cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det5 = cuCasas(det51, det52, det53, det54);

    det = cuCasasa(det1,det2,det3,det4,det5);

    return det;
}


#define N 5

__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);

    // Calculate the determinant using cofactor expansions
    det1 = cuCmul(matrix[0], 
                  cuCadd( cuCsub( cuCmul( matrix[5], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[9],matrix[15]),
                                                 cuCmul(matrix[11],matrix[13]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[9],matrix[14]),
                                         cuCmul(matrix[10],matrix[13]) ) ) ) );



    det2 = cuCmul(matrix[1], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[14]),
                                         cuCmul(matrix[10],matrix[12]) ) ) ) );


    det3 = cuCmul(matrix[2], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    
    det4 = cuCmul(matrix[3], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[14]), 
                                                  cuCmul(matrix[10],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[14]),
                                                 cuCmul(matrix[10],matrix[12]) ) ) )
                         ,cuCmul(matrix[6], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );

    return det;
}


__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);


    // Calculate the determinant using cofactor expansions
    for (int col = 0; col < N; col++) {
        cuDoubleComplex submatrix[N-1][N-1];
        int subrow = 0, subcol = 0;

        // Create the submatrix by excluding the current row and column
        for (int row = 1; row < N; row++) {
            for (int c = 0; c < N; c++) {
                if (c == col) {
                    continue;
                }
                submatrix[subrow][subcol] = matrix[row * N + c];
                subcol++;
            }
            subrow++;
            subcol = 0;
        }

        // Calculate the cofactor
        cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
        cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));


        // Accumulate the cofactors to calculate the determinant
        det = cuCadd(det, cofactor);
    }


    return det;
}

__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex cf_sign[5]={make_cuDoubleComplex(1.0,0.0),
                                make_cuDoubleComplex(-1.0,0.0),
                                make_cuDoubleComplex(1.0,0.0),
                                make_cuDoubleComplex(-1.0,0.0),
                                make_cuDoubleComplex(1.0,0.0),
                                };

    cuDoubleComplex det;
    if (method == 0) {
        det = direct_det5(matrix);    
    } else {
        det = det_5x5(matrix);
    }

    return det;
}


__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {

    // some other computations, irrelevant to current problem

    uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
    
    cuDoubleComplex det_sum, det;
    det_sum = make_cuDoubleComplex(0.0,0.0);
    for (int i = 0; i < total; i++) {
        det = gpu_device_kernel(i, method);
        det_sum = cuCadd(det_sum, det);
    }

    det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

    d_res[0] = det_sum;

    return;
}

int main(int argc, char *argv[]) {

    if (argc != 2) {
        printf("Usage: %s <num1> <num2>\n", argv[0]);
        return 1;
    }

    int method;
    method = atoi(argv[1]);
    
    double complex *res;
    res = (double complex*)malloc(sizeof(double complex));
    memset(res, 0.0,sizeof(double complex));


    cuDoubleComplex *d_res;
    cudaMalloc(&d_res, sizeof(cuDoubleComplex) );

    int *d_method;
    cudaMalloc(&d_method, sizeof(int) );
    cudaMemcpy(d_method, &method, sizeof(int), cudaMemcpyHostToDevice);


    clock_t direct_start, direct_end;
    direct_start = clock();

    gpu_global_kernel <<<1,1>>>(d_res, *d_method);

    cudaDeviceSynchronize();

    direct_end = clock();

    cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );

    printf("RESULT: %.16E + %.16E i\n", creal(res[0]), cimag(res[0]));

    double direct_time = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);

    if (method == 0) {
        printf("DIRECT EXEC: %.5f s \n", direct_time);
    } else {
        printf("COFACTOR EXEC: %.5f s \n", direct_time);        
    }    

    cudaDeviceReset();

    return 0;
}

However, I am wondering if these implementation is even efficient.

I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but an alternative approach, such as LU/Cholesky decomposition, that can be called from the device function is what I am seeking. Unfortunately, packages like LAPACK are not available for CUDA, but maybe a similar compatible function from some other packages could work within the CUDA device environment.

Any suggestions or function examples would be greatly appreciated!

1

There are 1 best solutions below

4
Abator Abetor On

Since you are including a C++ header (<cstring>) I am assuming its a C++ code, even though it looks very C-like.

I took the general generated code that you provided for expansion and implemented it directly as is by simply defining the operators *,+,- for the cuDoubleComplex .

I call this method "generated" in the code below. Additionally, I made some slight modifications to your code. I added a program parameter to specify the number of GPU threads to have a proper GPU utilization. I also use separate kernels for each method. Method 0 is the original method, method 1 is your direct method, method 2 is my "generated" method.

I compiled the code with nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o main and see the following timings on V100 Pcie for 32000 GPU threads.

Configuration Timing
original 145.01487 s
direct 67.07063 s
generated 24.02771 s

The code:

#include <stdio.h>
#include <complex.h>
#include <cstring>

#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#define N 5


__host__ __device__
bool operator==(cuDoubleComplex l, cuDoubleComplex r){
    return cuCreal(l) == cuCreal(r) && cuCimag(l) == cuCimag(r);
}

__host__ __device__
cuDoubleComplex operator*(cuDoubleComplex l, cuDoubleComplex r){
    return cuCmul(l,r);
}

__host__ __device__
cuDoubleComplex operator+(cuDoubleComplex l, cuDoubleComplex r){
    return cuCadd(l,r);
}

__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l, cuDoubleComplex r){
    return cuCsub(l,r);
}

__host__ __device__
cuDoubleComplex operator-(cuDoubleComplex l){
    return make_cuDoubleComplex(-cuCreal(l), -cuCimag(l));
}


template<class Matrix>
__device__ 
cuDoubleComplex generated_det_5x5(Matrix B){
    return 
       B[0*5+0]*(B[1*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
     - B[1*5+0]*(B[0*5+1]*(B[2*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[2*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[2*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[2*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[4*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[4*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[3*5+3]*B[2*5+4])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[2*5+3]*B[3*5+2])))
     + B[2*5+0]*(B[0*5+1]*(B[1*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[1*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[1*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))-B[1*5+1]*(B[0*5+2]*(B[3*5+3]*B[4*5+4]-B[3*5+4]*B[4*5+3])-B[0*5+3]*(B[3*5+2]*B[4*5+4]-B[4*5+2]*B[3*5+4])+B[0*5+4]*(B[3*5+2]*B[4*5+3]-B[4*5+2]*B[3*5+3]))+B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[1*5+3]*B[3*5+2])))
     - B[3*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[1*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[4*5+4]-B[2*5+4]*B[4*5+3])-B[0*5+3]*(B[2*5+2]*B[4*5+4]-B[4*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[4*5+3]-B[4*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[4*5+4]-B[4*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[4*5+4]-B[4*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[4*5+3]-B[4*5+2]*B[1*5+3]))-B[4*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])))
     + B[4*5+0]*(B[0*5+1]*(B[1*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[1*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[1*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))-B[1*5+1]*(B[0*5+2]*(B[2*5+3]*B[3*5+4]-B[2*5+4]*B[3*5+3])-B[0*5+3]*(B[2*5+2]*B[3*5+4]-B[3*5+2]*B[2*5+4])+B[0*5+4]*(B[2*5+2]*B[3*5+3]-B[3*5+2]*B[2*5+3]))+B[2*5+1]*(B[0*5+2]*(B[1*5+3]*B[3*5+4]-B[3*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[3*5+4]-B[3*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[3*5+3]-B[3*5+2]*B[1*5+3]))-B[3*5+1]*(B[0*5+2]*(B[1*5+3]*B[2*5+4]-B[2*5+3]*B[1*5+4])-B[0*5+3]*(B[1*5+2]*B[2*5+4]-B[2*5+2]*B[1*5+4])+B[0*5+4]*(B[1*5+2]*B[2*5+3]-B[1*5+3]*B[2*5+2])));
}








__host__ __device__ static __inline__  cuDoubleComplex cuCfms( cuDoubleComplex x, 
                                                               cuDoubleComplex y, 
                                                               cuDoubleComplex d){
    double real_res;
    double imag_res;
    
    real_res = (cuCreal(x) *  cuCreal(y)) - cuCreal(d);
    imag_res = (cuCreal(x) *  cuCimag(y)) - cuCimag(d);
            
    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
     
    return make_cuDoubleComplex(real_res, imag_res);
}

__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v), 
                                 cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
                                                              cuDoubleComplex y,
                                                              cuDoubleComplex u,
                                                              cuDoubleComplex v) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
                                                               cuDoubleComplex y,
                                                               cuDoubleComplex u,
                                                               cuDoubleComplex v,
                                                               cuDoubleComplex w) {

    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w), 
                                 cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}


__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
                                   cuDoubleComplex c, cuDoubleComplex d,
                                   cuDoubleComplex e) {
     // B[12]*(B[18]*B[24] - B[19]*B[23])
    //a*b*c -d*e
    return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}

__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
    cuDoubleComplex det;

    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);


    cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]), 
                                                   cuCadd(  cuC_5neg(B[13],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]), 
                                                    cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]), 
                                                            cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]), 
                                                    cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]), 
                                                            cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );

    det1 = cuCasas(det11, det12, det13, det14);


    cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
    cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
    cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );

    det2 = cuCasas(det21, det22, det23, det24);


    cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
    cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );


    det3 = cuCasas(det31, det32, det33, det34);


    cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det4 = cuCasas(det41, det42, det43, det44);


    cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                       cuCadd(  cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
    cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
    cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]), 
                                                        cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]), 
                                                                cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );


    det5 = cuCasas(det51, det52, det53, det54);

    det = cuCasasa(det1,det2,det3,det4,det5);

    return det;
}



__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
    cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);

    // Calculate the determinant using cofactor expansions
    det1 = cuCmul(matrix[0], 
                  cuCadd( cuCsub( cuCmul( matrix[5], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[9],matrix[15]),
                                                 cuCmul(matrix[11],matrix[13]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[9],matrix[14]),
                                         cuCmul(matrix[10],matrix[13]) ) ) ) );



    det2 = cuCmul(matrix[1], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[10],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[14]) ))
                                 ,cuCmul(matrix[6], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[14]),
                                         cuCmul(matrix[10],matrix[12]) ) ) ) );


    det3 = cuCmul(matrix[2], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[15]), 
                                                  cuCmul(matrix[11],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[15]),
                                                 cuCmul(matrix[11],matrix[12]) ) ) )
                         ,cuCmul(matrix[7], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    
    det4 = cuCmul(matrix[3], 
                  cuCadd( cuCsub( cuCmul( matrix[4], 
                                          cuCsub( cuCmul(matrix[9],matrix[14]), 
                                                  cuCmul(matrix[10],matrix[13]) ))
                                 ,cuCmul(matrix[5], 
                                         cuCsub( cuCmul(matrix[8],matrix[14]),
                                                 cuCmul(matrix[10],matrix[12]) ) ) )
                         ,cuCmul(matrix[6], 
                                 cuCsub( cuCmul(matrix[8],matrix[13]),
                                         cuCmul(matrix[9],matrix[12]) ) ) ) );
    
    det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );

    return det;
}


__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
    cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);


    // Calculate the determinant using cofactor expansions
    #pragma unroll
    for (int col = 0; col < N; col++) {
        cuDoubleComplex submatrix[N-1][N-1];
        int subrow = 0, subcol = 0;

        // Create the submatrix by excluding the current row and column
        #pragma unroll
        for (int row = 1; row < N; row++) {
            #pragma unroll
            for (int c = 0; c < N; c++) {
                if (c == col) {
                    continue;
                }
                submatrix[subrow][subcol] = matrix[row * N + c];
                subcol++;
            }
            subrow++;
            subcol = 0;
        }

        // Calculate the cofactor
        cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
        cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));


        // Accumulate the cofactors to calculate the determinant
        det = cuCadd(det, cofactor);
    }


    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_original(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = det_5x5(matrix);

    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_direct(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = direct_det5(matrix);    

    return det;
}

__device__ cuDoubleComplex gpu_device_kernel_generated(int i) {


    // some random data, depending on `i`, but for simplification say constant data
    cuDoubleComplex matrix[5*5] = {
            make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
            make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
            make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
            make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
            make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
    };

    cuDoubleComplex det = generated_det_5x5(matrix);    

    return det;
}


__global__ void gpu_global_kernel_original(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_original(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}


__global__ void gpu_global_kernel_direct(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_direct(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}


__global__ void gpu_global_kernel_generated(cuDoubleComplex *d_res, int numGpuThreads) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if(tid < numGpuThreads){

        // some other computations, irrelevant to current problem

        uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands 
        
        cuDoubleComplex det_sum, det;
        det_sum = make_cuDoubleComplex(0.0,0.0);
        for (int i = 0; i < total; i++) {
            det = gpu_device_kernel_generated(i);
            det_sum = cuCadd(det_sum, det);
        }

        det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );

        d_res[tid] = det_sum;
    }
}

int main(int argc, char *argv[]) {

    if (argc < 2) {
        printf("Usage: %s <method> [<numGpuThreads>]\n", argv[0]);
        printf("method: 0 = original, 1 = direct, 2 = generated(no explicit cucomplex)\n");
        return 1;
    }

    int method;
    method = atoi(argv[1]);

    int numGpuThreads = 1;
    if(argc > 2){
        numGpuThreads = atoi(argv[2]);
    }
    
    cuDoubleComplex *res;
    res = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex) * numGpuThreads);
    memset(res, 0.0,sizeof(cuDoubleComplex) * numGpuThreads);


    cuDoubleComplex *d_res;
    cudaMalloc(&d_res, sizeof(cuDoubleComplex) * numGpuThreads );

    clock_t direct_start, direct_end;
    direct_start = clock();
    dim3 block = 128;
    dim3 grid = (numGpuThreads + block.x -1) / block.x;
    switch(method){
        case 0: printf("original\n"); gpu_global_kernel_original<<<grid,block>>>(d_res, numGpuThreads); break;
        case 1: printf("direct\n"); gpu_global_kernel_direct<<<grid,block>>>(d_res, numGpuThreads); break;
        case 2: printf("generated\n"); gpu_global_kernel_generated<<<grid,block>>>(d_res, numGpuThreads); break;
    }

    cudaDeviceSynchronize();

    direct_end = clock();

    //just copy the first result, not all numGpuThreads results
    cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );

    printf("RESULT: %.16E + %.16E i\n", cuCreal(res[0]), cuCimag(res[0]));

    double timing = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);

    printf("time %.5f s\n", timing);

    cudaDeviceReset();

    return 0;
}