CUDA: Matrix inversion slower on GPU than on CPU

340 Views Asked by At

I have a stable, simple Gauss-Jordan algorithm for calculating the matrix inversion on the CPU. I tried to transfer this algorithm to the GPU, it works fine, but the speed has dropped significantly, about 10 times. I understand that I am not well versed in C++ and CUDA, and I would be glad to hear some advice. Thanks in advance.

Console:

matrix size = 88; CPU time = 6531 mcs; GPU time = 52806 mcs;

Here is the code for the CPU:

void GaussJordanInverse(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    //create temp
    //create singular matrix
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++) {
            temp[i * size + j] = original[i * size + j];
            singular[i * size + j] = 0; 
        }
        singular[i * size + i] = 1;
    }
    //create big matrix
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            bigmatrix[i * (size * 2) + j] = temp[i * size + j];
            bigmatrix[i * (size * 2) + j + size] = singular[i * size + j];
        }
    }
    //direct
    for (int k = 0; k < size; k++)
    {
        for (int i = 0; i < 2 * size; i++) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k + 1; i < size; i++) {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 0; j < 2 * size; j++)bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++)
            {
                temp[i * size + j] = bigmatrix[i * (size * 2) + j];
            }
        }
    }
    //indirect
    for (int k = size - 1; k > -1; k--)
    {
        for (int i = 2 * size - 1; i > -1; i--) bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k - 1; i > -1; i--)
        {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 2 * size - 1; j > -1; j--)
                bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
    }
    //cut
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            singular[i * size + j] = bigmatrix[i * (size * 2) + j + size];
            original[i * size + j] = singular[i * size + j];
        }
    }
}

Here is the code for the GPU:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#ifndef __CUDACC__  
#define __CUDACC__
#endif
#include <device_functions.h>
#include <stdio.h>
#include <chrono>
#include "matrixconversion.h"
using namespace std;
using namespace std::chrono;

__global__ void create(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    //create singular matrix
    if (x < size && y < size)
    {
        temp[x * size + y] = original[x * size + y];
        if (x == y) singular[x * size + y] = 1;
        //singular[x * size + y] = 0;
        //create big matrix
        bigmatrix[x * (2 * size) + y] = temp[x * size + y];
        bigmatrix[x * (2 * size) + y + size] = singular[x * size + y];
    }
}
__global__ void direct_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    if (x >= k + 1 && x < size * 2) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 0; j < 2 * size; j++) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
    if (x < size) {
        for (int j = 0; j < size; j++) {
            temp[x * size + j] = bigmatrix[x * (size * 2) + j];
        }
    }
}
__global__ void indirect_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    if (x < k&& x >= 0) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 2 * size - 1; j > -1; j--) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
}

__global__ void cut(float* original, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < size && y < size) {
        singular[x * size + y] = bigmatrix[x * (size * 2) + y + size];
        original[x * size + y] = singular[x * size + y];
    }
}

void GPUInverse(float* copy, float* original, int size, long int* time) {
    Copy1DFloat(copy, original, size);
    //set kernels
    int tr = 16;
    int bl = (int)ceil(size / tr);
    dim3 grid_size(16, 16);
    dim3 block_size(bl, bl);
    //create temp
    float* d_copy, * d_temp, * d_singular, * d_bigmatrix;
    //memory
    cudaMalloc((void**)&d_copy, size * size * sizeof(float));
    cudaMalloc((void**)&d_temp, size * size * sizeof(float));
    cudaMalloc((void**)&d_singular, size * size * sizeof(float));
    cudaMalloc((void**)&d_bigmatrix, size * (size * 2) * sizeof(float));
    cudaMemset(d_copy, 0, size * size * sizeof(float));
    cudaMemset(d_temp, 0, size * size * sizeof(float));
    cudaMemset(d_singular, 0, size * size * sizeof(float));
    cudaMemset(d_bigmatrix, 0, size * (2 * size) * sizeof(float));
    //to device
    cudaMemcpy(d_copy, copy, size * size * sizeof(float), cudaMemcpyHostToDevice);

    auto start = high_resolution_clock::now();
    //fill temp
    create <<<grid_size, block_size >>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();
    //direct
    for (int k = 0; k < size; k++) {
        direct_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //indirect
    for (int k = size - 1; k > -1; k--) {
        indirect_kernel <<<(2 * size + 255) / 256, 256 >>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //cut
    cut <<<grid_size, block_size >>> (d_copy, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();

    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    *time = duration.count();
    //from device
    cudaMemcpy(copy, d_copy, size * size * sizeof(float), cudaMemcpyDeviceToHost);
    //clean

    cudaFree(d_copy);
    cudaFree(d_temp);
    cudaFree(d_singular);
    cudaFree(d_bigmatrix);
}
1

There are 1 best solutions below

0
Gagy Krayper On

Thank you paleonix for your advices. Indeed, the more data you have, the better you can see the difference in speedup between the CPU and GPU. In addition, the removal of cudaDeviceSynchronize() helped to speed up the work even more. Now the console looks like this:

matrix size = 88; CPU time = 7096 mcs; GPU time = 418 mcs;
matrix size = 1000; CPU time = 9834535 mcs; GPU time = 4347 mcs;