I have a stable, simple Gauss-Jordan algorithm for computing the matrix inverse on the CPU. I tried to port this algorithm to the GPU; it produces the correct result, but it has become much slower, about 10 times. I know I am not well versed in C++ and CUDA, so I would be glad to hear some advice. Thanks in advance.
Console:
matrix size = 88; CPU time = 6531 mcs; GPU time = 52806 mcs;
Here is the code for the CPU:
void GaussJordanInverse(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    //create temp and the singular (identity) matrix
    for (int i = 0; i < size; i++)
    {
        for (int j = 0; j < size; j++) {
            temp[i * size + j] = original[i * size + j];
            singular[i * size + j] = 0;
        }
        singular[i * size + i] = 1;
    }
    //create big (augmented) matrix [A | I]
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            bigmatrix[i * (size * 2) + j] = temp[i * size + j];
            bigmatrix[i * (size * 2) + j + size] = singular[i * size + j];
        }
    }
    //direct (forward) pass
    for (int k = 0; k < size; k++)
    {
        for (int i = 0; i < 2 * size; i++)
            bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k + 1; i < size; i++) {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 0; j < 2 * size; j++)
                bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++)
            {
                temp[i * size + j] = bigmatrix[i * (size * 2) + j];
            }
        }
    }
    //indirect (backward) pass
    for (int k = size - 1; k > -1; k--)
    {
        for (int i = 2 * size - 1; i > -1; i--)
            bigmatrix[k * (size * 2) + i] = bigmatrix[k * (size * 2) + i] / temp[k * size + k];
        for (int i = k - 1; i > -1; i--)
        {
            float K = bigmatrix[i * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
            for (int j = 2 * size - 1; j > -1; j--)
                bigmatrix[i * (size * 2) + j] = bigmatrix[i * (size * 2) + j] - bigmatrix[k * (size * 2) + j] * K;
        }
    }
    //cut the inverse out of the right half
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++)
        {
            singular[i * size + j] = bigmatrix[i * (size * 2) + j + size];
            original[i * size + j] = singular[i * size + j];
        }
    }
}
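For reference, this is roughly how I measure the CPU time; BenchmarkCPU is just a placeholder name and the random fill stands in for my actual test data:

#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>

//rough sketch of the CPU benchmark (placeholder harness)
void BenchmarkCPU(int size) {
    std::vector<float> original(size * size), temp(size * size);
    std::vector<float> singular(size * size), bigmatrix(size * (size * 2));
    for (int i = 0; i < size * size; i++)
        original[i] = (float)rand() / RAND_MAX; //placeholder test data
    auto start = std::chrono::high_resolution_clock::now();
    GaussJordanInverse(original.data(), temp.data(), singular.data(), bigmatrix.data(), size);
    auto stop = std::chrono::high_resolution_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::cout << "matrix size = " << size << "; CPU time = " << us << " mcs; ";
}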
Here is the code for the GPU:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <device_functions.h>
#include <stdio.h>
#include <chrono>
#include "matrixconversion.h"
using namespace std;
using namespace std::chrono;
__global__ void create(float* original, float* temp, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    //create singular matrix
    if (x < size && y < size)
    {
        temp[x * size + y] = original[x * size + y];
        if (x == y) singular[x * size + y] = 1;
        //singular[x * size + y] = 0;
        //create big matrix
        bigmatrix[x * (2 * size) + y] = temp[x * size + y];
        bigmatrix[x * (2 * size) + y + size] = singular[x * size + y];
    }
}
__global__ void direct_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    //normalize the pivot row
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    //eliminate the rows below the pivot (bigmatrix only has size rows)
    if (x >= k + 1 && x < size) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 0; j < 2 * size; j++) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
    //copy the left half back into temp
    if (x < size) {
        for (int j = 0; j < size; j++) {
            temp[x * size + j] = bigmatrix[x * (size * 2) + j];
        }
    }
}
__global__ void indirect_kernel(float* temp, float* bigmatrix, int size, int k) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    //normalize the pivot row
    if (x < 2 * size) {
        bigmatrix[k * (size * 2) + x] /= temp[k * size + k];
    }
    __syncthreads();
    //eliminate the rows above the pivot
    if (x < k && x >= 0) {
        float K = bigmatrix[x * (size * 2) + k] / bigmatrix[k * (size * 2) + k];
        for (int j = 2 * size - 1; j > -1; j--) {
            bigmatrix[x * (size * 2) + j] -= bigmatrix[k * (size * 2) + j] * K;
        }
    }
    __syncthreads();
}
__global__ void cut(float* original, float* singular, float* bigmatrix, int size) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < size && y < size) {
        singular[x * size + y] = bigmatrix[x * (size * 2) + y + size];
        original[x * size + y] = singular[x * size + y];
    }
}
void GPUInverse(float* copy, float* original, int size, long int* time) {
    Copy1DFloat(copy, original, size);
    //launch configuration: 16x16 threads per block, enough blocks to cover the matrix
    int tr = 16;
    int bl = (size + tr - 1) / tr;
    dim3 block_size(tr, tr);
    dim3 grid_size(bl, bl);
    //device buffers
    float* d_copy, * d_temp, * d_singular, * d_bigmatrix;
    cudaMalloc((void**)&d_copy, size * size * sizeof(float));
    cudaMalloc((void**)&d_temp, size * size * sizeof(float));
    cudaMalloc((void**)&d_singular, size * size * sizeof(float));
    cudaMalloc((void**)&d_bigmatrix, size * (size * 2) * sizeof(float));
    cudaMemset(d_copy, 0, size * size * sizeof(float));
    cudaMemset(d_temp, 0, size * size * sizeof(float));
    cudaMemset(d_singular, 0, size * size * sizeof(float));
    cudaMemset(d_bigmatrix, 0, size * (2 * size) * sizeof(float));
    //to device
    cudaMemcpy(d_copy, copy, size * size * sizeof(float), cudaMemcpyHostToDevice);
    auto start = high_resolution_clock::now();
    //build the augmented matrix [A | I]
    create <<<grid_size, block_size>>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();
    //direct (forward) pass
    for (int k = 0; k < size; k++) {
        direct_kernel <<<(2 * size + 255) / 256, 256>>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //indirect (backward) pass
    for (int k = size - 1; k > -1; k--) {
        indirect_kernel <<<(2 * size + 255) / 256, 256>>> (d_temp, d_bigmatrix, size, k);
        cudaDeviceSynchronize();
    }
    //cut the inverse out of the right half
    cut <<<grid_size, block_size>>> (d_copy, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize();
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    *time = duration.count();
    //from device
    cudaMemcpy(copy, d_copy, size * size * sizeof(float), cudaMemcpyDeviceToHost);
    //clean
    cudaFree(d_copy);
    cudaFree(d_temp);
    cudaFree(d_singular);
    cudaFree(d_bigmatrix);
}
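Copy1DFloat comes from matrixconversion.h (not shown here); it just copies the size x size input into the output buffer so that the original matrix is not overwritten. Roughly equivalent to the following sketch (the real header may differ):

//rough equivalent of Copy1DFloat from matrixconversion.h: flat element-wise copy
void Copy1DFloat(float* dst, float* src, int size) {
    for (int i = 0; i < size * size; i++)
        dst[i] = src[i];
}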
Thank you paleonix for your advice. Indeed, the larger the matrices, the more clearly the GPU's speedup over the CPU shows. In addition, removing the cudaDeviceSynchronize() calls helped to speed up the work even more (a sketch of the updated launch section is below). Now the console looks like this:
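For anyone who finds this later, inside GPUInverse the timed section now looks roughly like this. Kernels launched into the same (default) stream execute in order, so the per-iteration synchronization is unnecessary; a single cudaDeviceSynchronize() right before stopping the timer is enough:

    auto start = high_resolution_clock::now();
    create <<<grid_size, block_size>>> (d_copy, d_temp, d_singular, d_bigmatrix, size);
    //direct and indirect passes without per-iteration synchronization;
    //launches on the same stream run in order, so kernel k always sees the result of kernel k-1
    for (int k = 0; k < size; k++) {
        direct_kernel <<<(2 * size + 255) / 256, 256>>> (d_temp, d_bigmatrix, size, k);
    }
    for (int k = size - 1; k > -1; k--) {
        indirect_kernel <<<(2 * size + 255) / 256, 256>>> (d_temp, d_bigmatrix, size, k);
    }
    cut <<<grid_size, block_size>>> (d_copy, d_singular, d_bigmatrix, size);
    cudaDeviceSynchronize(); //single synchronization before stopping the timer
    auto stop = high_resolution_clock::now();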