My code performs the convolution operation (the operation is described in this article: https://prvnk10.medium.com/the-convolution-operation-48d72a382f5a). It compiles and runs, but it is failing some test cases.
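For reference, the per-cell result I expect (my own assumption, based on the linked article: zero padding at the borders, a k x k filter centered on each cell, and no flipping of the filter) is:

ans[i][j] = sum over 0 <= x < k, 0 <= y < k of mat[i - k/2 + x][j - k/2 + y] * filter[x][y]

where out-of-range input elements count as 0 and k/2 is integer division. Here is my full program: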
#include <cuda.h>
#include <iostream>
using namespace std;

#define TILE_SIZE 16
#define FILTER_SIZE 3

// CUDA kernel for convolution
__global__ void convolutionKernel(long int* inputMatrix, long int* outputMatrix, long int* filter, int m, int n, int k) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ long int tile[TILE_SIZE + FILTER_SIZE - 1][TILE_SIZE + FILTER_SIZE - 1];

    // Load data into shared memory with padding
    int tileRow = threadIdx.y;
    int tileCol = threadIdx.x;
    int inputRow = row - FILTER_SIZE / 2;
    int inputCol = col - FILTER_SIZE / 2;
    if (inputRow >= 0 && inputRow < m && inputCol >= 0 && inputCol < n) {
        tile[tileRow][tileCol] = inputMatrix[inputRow * n + inputCol];
    } else {
        tile[tileRow][tileCol] = 0;
    }
    __syncthreads();

    // Convolution computation using shared memory
    long int result = 0;
    for (int i = 0; i < k; ++i) {
        for (int j = 0; j < k; ++j) {
            result += tile[tileRow + i][tileCol + j] * filter[i * k + j];
        }
    }

    // Write result to the output matrix
    if (row < m && col < n) {
        outputMatrix[row * n + col] = result;
    }
}
int main() {
    int m, n, k;
    cin >> m >> n >> k;

    long int* h_mat = new long int[m * n];
    long int* h_filter = new long int[k * k];
    long int* h_ans = new long int[m * n];

    for (long int i = 0; i < m * n; i++) {
        cin >> h_mat[i];
    }
    for (long int i = 0; i < k * k; i++) {
        cin >> h_filter[i];
    }

    // Allocate memory on the device (GPU)
    long int *d_mat, *d_filter, *d_ans;
    cudaMalloc((void**)&d_mat, m * n * sizeof(long int));
    cudaMalloc((void**)&d_filter, k * k * sizeof(long int));
    cudaMalloc((void**)&d_ans, m * n * sizeof(long int));

    // Transfer data from host to device
    cudaMemcpy(d_mat, h_mat, m * n * sizeof(long int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, h_filter, k * k * sizeof(long int), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 blockDim(16, 16);
    dim3 gridDim((n + blockDim.x - 1) / blockDim.x, (m + blockDim.y - 1) / blockDim.y);

    // Launch the CUDA kernel
    convolutionKernel<<<gridDim, blockDim>>>(d_mat, d_ans, d_filter, m, n, k);

    // Wait for the kernel to finish and check for errors
    cudaDeviceSynchronize();
    cudaError_t cudaError = cudaGetLastError();
    if (cudaError != cudaSuccess) {
        cerr << "CUDA error: " << cudaGetErrorString(cudaError) << endl;
    }

    // Transfer result from device to host
    cudaMemcpy(h_ans, d_ans, m * n * sizeof(long int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            cout << h_ans[i * n + j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_mat);
    cudaFree(d_filter);
    cudaFree(d_ans);

    // Free host memory
    delete[] h_mat;
    delete[] h_filter;
    delete[] h_ans;

    return 0;
}
It gives the correct output for some test cases but the wrong output for others, and it also takes very long to produce output for very large inputs. What could be causing the wrong results, and how can I make it faster?
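For reference, here is a plain CPU sketch of the formula above, i.e. of what I believe each output cell should contain. This reflects my own assumption about the expected semantics, and convolutionCPU is just a name I made up for illustration, not part of my submission:

// Straightforward CPU version of the same operation: zero padding at the
// borders, k x k filter centered on each cell, no filter flip.
void convolutionCPU(const long int* mat, const long int* filter, long int* ans,
                    int m, int n, int k) {
    int half = k / 2;
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            long int result = 0;
            for (int i = 0; i < k; ++i) {
                for (int j = 0; j < k; ++j) {
                    int r = row - half + i; // input row under filter cell (i, j)
                    int c = col - half + j; // input column under filter cell (i, j)
                    if (r >= 0 && r < m && c >= 0 && c < n) {
                        result += mat[r * n + c] * filter[i * k + j];
                    }
                }
            }
            ans[row * n + col] = result;
        }
    }
}

This is the behaviour I am trying to reproduce on the GPU.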