I am interested in the question why the algorithm written for OpenCL works correctly for small-dimensional matrices, but it does not work correctly with large-dimensional matrices. Moreover, for the same input data, the inverse matrix is different each time on OpenCL. The usual algorithm for the CPU calculates everything correctly, using the Gauss method. The GPU algorithm for large matrices gives different incorrect answers each time.
I wondered if there might be a problem in synchronizing threads? However, I have added synchronization both at the host level and in the kernel itself, using barriers. But all to no avail. I am attaching the source code for CPU and GPU (OpenCL), please help.
Code for CPU (the matrix has already been augmented to the expanded unit matrix on the right):
void doInverse() {
for (int i = 0; i < size; i++) getUpperTriangularMatrix(i);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
for (int i = 0; i < size; i++) {
mainDiagonal(i);
processingTopRows(i);
}
}
}
void getUpperTriangularMatrix(int i) {
double elem = matrix[i][i];
int iConst = i;
i++;
for (i; i < size; i++) {
double elem2 = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= matrix[iConst][j] * elem2 / elem;
}
}
void mainDiagonal(int i) {
double count = matrix[i][i];
for (int j = 0; j < size * 2; j++) matrix[i][j] /= count;
}
void processingTopRows(int i) {
int iConst = i;
i--;
for (i; i >= 0; i--) {
double elem = matrix[i][iConst];
for (int j = 0; j < size * 2; j++) matrix[i][j] -= elem * matrix[iConst][j];
}
}
bool isNullDeterminantMatrix() {
for (int i = 0; i < size; i++)
if (!matrix[i][i]) return true;
return false;
}
Code for GPU (the matrix has already been augmented to the expanded unit matrix on the right):
cl_event event;
size_t globalSize[2] = { size, size * 2 };
cl_event eventRead = NULL;
cl_kernel kernel = clCreateKernel(program, "getUpperTriangularMatrix", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: creating kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel);
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
if (isNullDeterminantMatrix()) printf("This matrix does not have an inverse matrix\n");
else {
cl_kernel kernel2 = clCreateKernel(program, "mainDiagonal_processingTopRows", &clStatus);
//if (clStatus != CL_SUCCESS) error("Error: create kernel\n", ERROR_CREATING_KERNEL);
clStatus = clSetKernelArg(kernel2, 0, sizeof(matrix_clmem), (void*)&matrix_clmem);
//if (clStatus != CL_SUCCESS) error("Error: set arg 0", ERROR_SET_KERNEL_ARG);
clStatus = clSetKernelArg(kernel2, 1, sizeof(int), (int*)&size);
//if (clStatus != CL_SUCCESS) error("Error: set arg 1", ERROR_SET_KERNEL_ARG);
clStatus = clEnqueueNDRangeKernel(commandQueue, kernel2, 2, NULL, globalSize, NULL, 0, NULL, &event);
//if (clStatus != CL_SUCCESS) error("Error: enqueue\n", ERROR_ENQUEUE);
clWaitForEvents(1, &event);
clStatus = clEnqueueReadBuffer(commandQueue, matrix_clmem, CL_TRUE, 0, size * size * 2 * sizeof(cl_double), matrix, 0, 0, NULL);
//if (clStatus != CL_SUCCESS) error("Error: read buffer\n", ERROR_READ_BUFFER);
clWaitForEvents(1, &eventRead);
clStatus = clReleaseKernel(kernel2);
}
clStatus = clFlush(commandQueue);
clStatus = clFinish(commandQueue);
Code for kernel (GPU):
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void getUpperTriangularMatrix(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
for (int ii = 0; ii < size; ii++) {
if (i + 1 < size && i >= ii) {
double elem = matrix[ii * size * 2 + ii];
double elem2 = matrix[(i + 1) * size * 2 + ii];
matrix[(i + 1) * size * 2 + j] -= matrix[ii * size * 2 + j] * elem2 / elem;
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
__kernel void mainDiagonal_processingTopRows(__global double *matrix, int size) {
const int i = get_global_id(0);
const int j = get_global_id(1);
double count = matrix[i * size * 2 + i];
matrix[i * size * 2 + j] /= count;
for (int ii = 0; ii < size; ii++) {
if (i - 1 >= 0 && i <= ii) {
double elem = matrix[(i - 1) * size * 2 + ii];
matrix[(i - 1) * size * 2 + j] -= elem * matrix[ii * size * 2 + j];
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
}