Performing a convolution operation in CUDA

My code performs the convolution operation (the operation itself is described in the provided link: https://prvnk10.medium.com/the-convolution-operation-48d72a382f5a). It compiles and runs, but it fails some test cases.

#include <chrono>
#include <fstream>
#include <iostream>
#include <stdio.h>
#include <cuda.h>

using namespace std;

typedef long long ll;

#define TILE_SIZE 16
#define FILTER_SIZE 3

// CUDA kernel for convolution
__global__ void convolutionKernel(long int* inputMatrix, long int* outputMatrix, long int* filter, int m, int n, int k) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ long int tile[TILE_SIZE + FILTER_SIZE - 1][TILE_SIZE + FILTER_SIZE - 1];

    // Load data into shared memory with padding
    int tileRow = threadIdx.y;
    int tileCol = threadIdx.x;

    int inputRow = row - FILTER_SIZE / 2;
    int inputCol = col - FILTER_SIZE / 2;

    if (inputRow >= 0 && inputRow < m && inputCol >= 0 && inputCol < n) {
        tile[tileRow][tileCol] = inputMatrix[inputRow * n + inputCol];
    } else {
        tile[tileRow][tileCol] = 0;
    }

    __syncthreads();

    // Convolution computation using shared memory
    long int result = 0;

    for (int i = 0; i < k; ++i) {
        for (int j = 0; j < k; ++j) {
            result += tile[tileRow + i][tileCol + j] * filter[i * k + j];
        }
    }

    // Write result to the output matrix
    if (row < m && col < n) {
        outputMatrix[row * n + col] = result;
    }
}


int main() {
    int m, n, k;
    cin >> m >> n >> k;

    long int* h_mat = new long int[m * n];
    long int* h_filter = new long int[k * k];
    long int* h_ans = new long int[m * n];

    for (long int i = 0; i < m * n; i++) {
        cin >> h_mat[i];
    }

    for (long int i = 0; i < k * k; i++) {
        cin >> h_filter[i];
    }

    // Allocate memory on the device (GPU)
    long int* d_mat, *d_filter, *d_ans;
    cudaMalloc((void**)&d_mat, m * n * sizeof(long int));
    cudaMalloc((void**)&d_filter, k * k * sizeof(long int));
    cudaMalloc((void**)&d_ans, m * n * sizeof(long int));

    // Transfer data from host to device
    cudaMemcpy(d_mat, h_mat, m * n * sizeof(long int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, h_filter, k * k * sizeof(long int), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 blockDim(16, 16);
    dim3 gridDim((n + blockDim.x - 1) / blockDim.x, (m + blockDim.y - 1) / blockDim.y);
    
    // Launch the CUDA kernel
    convolutionKernel<<<gridDim, blockDim>>>(d_mat, d_ans, d_filter, m, n, k);
    
    // Wait for the kernel to finish and check for errors
    cudaDeviceSynchronize();
    cudaError_t cudaError = cudaGetLastError();
    if (cudaError != cudaSuccess) {
        cerr << "CUDA error: " << cudaGetErrorString(cudaError) << endl;
    }

    // Transfer result from device to host
    cudaMemcpy(h_ans, d_ans, m * n * sizeof(long int), cudaMemcpyDeviceToHost);

    // Print the result matrix
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            cout << h_ans[i * n + j] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(d_mat);
    cudaFree(d_filter);
    cudaFree(d_ans);

    // Free host memory
    delete[] h_mat;
    delete[] h_filter;
    delete[] h_ans;

    return 0;
}
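
To make the expected behaviour concrete, here is a small hand-worked example (the numbers are my own, and the expected output assumes zero padding with the filter centered on each output element, which is what I intend the kernel to compute). The program reads m, n and k, then the m x n matrix, then the k x k filter:

3 3 3
1 2 3
4 5 6
7 8 9
1 1 1
1 1 1
1 1 1

Expected output (with an all-ones filter, each element is the sum of its zero-padded 3 x 3 neighbourhood):

12 21 16
27 45 33
24 39 28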

It gives the correct output for some test cases but the wrong output for others. It also takes a very long time to produce output for very large inputs.
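
For reference, here is a plain CPU sketch of the operation I am trying to implement (my own reference code, assuming zero padding, the filter centered on each output element, and an odd k; the name cpuConvolution is only for illustration). The kernel is supposed to produce the same values as this, just faster:

void cpuConvolution(const long int* in, const long int* filter,
                    long int* out, int m, int n, int k) {
    int off = k / 2;  // assumes k is odd
    for (int r = 0; r < m; ++r) {
        for (int c = 0; c < n; ++c) {
            long int sum = 0;
            for (int i = 0; i < k; ++i) {
                for (int j = 0; j < k; ++j) {
                    int rr = r - off + i;  // input row under filter cell (i, j)
                    int cc = c - off + j;  // input column under filter cell (i, j)
                    // out-of-range cells are treated as zero (zero padding)
                    if (rr >= 0 && rr < m && cc >= 0 && cc < n) {
                        sum += in[rr * n + cc] * filter[i * k + j];
                    }
                }
            }
            out[r * n + c] = sum;
        }
    }
}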

