Using cublasGemmBatchedEx


I am trying to use cublasGemmBatchedEx to perform matrix multiplication. Here is my code.

#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i][k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
    
}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
    for (int i = 0; i < batch_size; i++){
        h_A[i] = (float*)malloc(M * K * sizeof(float));
        h_B[i] = (float*)malloc(K * N * sizeof(float));
        h_C[i] = (float*)malloc(M * N * sizeof(float));
    }

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i][j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i][j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i][j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];

    for (int i = 0; i < batch_size; i++){
        cudaMalloc(&d_A[i], sizeof(float)* M * K);
        cudaMalloc(&d_B[i], sizeof(float)* K * N);
        cudaMalloc(&d_C[i], sizeof(float)* M * N);
    }

    cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;

    cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K, 
                            &alpha,
                            (const void**)d_A, CUDA_R_32F, lda, 
                            (const void**)d_B, CUDA_R_32F, ldb, 
                            &beta,
                            (void**)d_C, CUDA_R_32F, ldc, 
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

    cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }
    
    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
    cudaFreeHost(h_C);

}

This is the result when I ran this code.

A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

C =
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0

0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0

The problem is that I don't get the expected result; the output is all zeros. Is there a problem with my code?

Answer by Robert Crovella:

The matrix arguments that you pass to this cuBLAS function need to be arrays of device pointers, where each device pointer is properly allocated, properly copied to the device, and points to a device allocation that has been populated with the matrix data.

There are several problems with your attempt to do this. The central one is these lines:

cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);

You cannot copy an array of independent allocations with a single cudaMemcpy call, and furthermore you are sizing the copy as if it were matrix contents (float) when what you are actually copying is pointer data (float *).
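What the call actually needs is two levels of copies: each matrix copied into its own device buffer, and then the array of device pointers itself copied into a device-side pointer array. A minimal sketch of that indirection for the A operand (the same pattern applies to B and C, and it is exactly what the full example below does):

// 1. per-matrix buffers: copy each host matrix into its own device allocation
for (int i = 0; i < batch_size; i++){
    cudaMalloc(&d_A[i], sizeof(float) * M * K);
    cudaMemcpy(d_A[i], h_A[i], sizeof(float) * M * K, cudaMemcpyHostToDevice);
}
// 2. pointer array: copy the host array of device pointers to the device
float **d_dA;
cudaMalloc(&d_dA, sizeof(float *) * batch_size);
cudaMemcpy(d_dA, d_A, sizeof(float *) * batch_size, cudaMemcpyHostToDevice);
// d_dA (not d_A) is what gets passed to cublasGemmBatchedEx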

The following example has various issues fixed:

$ cat t2167.cu
#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i][k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
    for (int i = 0; i < batch_size; i++){
        h_A[i] = (float*)malloc(M * K * sizeof(float));
        h_B[i] = (float*)malloc(K * N * sizeof(float));
        h_C[i] = (float*)malloc(M * N * sizeof(float));
    }

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i][j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i][j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i][j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];

    for (int i = 0; i < batch_size; i++){
        cudaMalloc(&d_A[i], sizeof(float)* M * K);
        cudaMemcpy(d_A[i], h_A[i], sizeof(float)*M*K, cudaMemcpyHostToDevice);
        cudaMalloc(&d_B[i], sizeof(float)* K * N);
        cudaMemcpy(d_B[i], h_B[i], sizeof(float)*N*K, cudaMemcpyHostToDevice);
        cudaMalloc(&d_C[i], sizeof(float)* M * N);
        cudaMemcpy(d_C[i], h_C[i], sizeof(float)*N*M, cudaMemcpyHostToDevice);
    }
    // build device-side copies of the arrays of device pointers;
    // these (d_dA/d_dB/d_dC), not d_A/d_B/d_C, are what the cublas call takes
    float **d_dA, **d_dB, **d_dC;
    cudaMalloc(&d_dA, sizeof(float *)*batch_size);
    cudaMalloc(&d_dB, sizeof(float *)*batch_size);
    cudaMalloc(&d_dC, sizeof(float *)*batch_size);
    cudaMemcpy(d_dA, d_A, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dB, d_B, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dC, d_C, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                            &alpha,
                            (const void**)d_dA, CUDA_R_32F, lda,
                            (const void**)d_dB, CUDA_R_32F, ldb,
                            &beta,
                            (void**)d_dC, CUDA_R_32F, ldc,
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
    for (int i = 0; i < batch_size; i++)
        cudaMemcpy(h_C[i], d_C[i], sizeof(float)*M*N, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }

    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_dA);
    cudaFree(d_dB);
    cudaFree(d_dC);
    for (int i = 0; i < batch_size; i++){
      free(h_A[i]);
      free(h_B[i]);
      free(h_C[i]);
      cudaFree(d_A[i]);
      cudaFree(d_B[i]);
      cudaFree(d_C[i]);}

}
$ nvcc -o t2167 t2167.cu -lcublas
$ compute-sanitizer ./t2167
========= COMPUTE-SANITIZER
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3

B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7

C =
0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66

0 0 0 0
22 22 22 22
44 44 44 44
66 66 66 66

========= ERROR SUMMARY: 0 errors
$
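As a sanity check on the numbers: with column-major storage, row r of each A matrix is the constant r, and every column of B is (4,5,6,7), so C(r,c) = r*(4+5+6+7) = 22r, which is exactly the 0/22/44/66 pattern printed for each batch.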

Given that cublasGemmBatchedEx() requires all the matrices across the batch to be of identical sizes, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization: concatenate the matrices so that a single allocation per operand covers the whole batch. This won't work for every use case, but may be of interest:

#include <iostream>
#include <cublas_v2.h>

#define M 4
#define N 4
#define K 4


//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float *A, int rows, int cols, int batch_size) {
    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < rows; j++){
            for(int k = 0; k < cols; k++){
                std::cout << A[i*rows*cols+k * rows + j] << " ";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

}

int main(int argc, char* argv[])
{
    // Linear dimension of matrices
    int batch_size = 2;

    float *h_A, *h_B, *h_C;
    h_A = (float*)malloc(M * K * sizeof(float) * batch_size);
    h_B = (float*)malloc(K * N * sizeof(float) * batch_size);
    h_C = (float*)malloc(M * N * sizeof(float) * batch_size);

    for (int i = 0; i < batch_size; i++){
        for (int j = 0; j < M * K; j++)
            h_A[i*M*K+j] = j%4;
        for (int j = 0; j < K * N; j++)
            h_B[i*K*N+j] = j%4 + 4;
        for (int j = 0; j < M * N; j++)
            h_C[i*M*N+j] = 0;
    }

    std::cout << "A =" << std::endl;
    print_matrix(h_A, M, K, batch_size);
    std::cout << "B =" << std::endl;
    print_matrix(h_B, K, N, batch_size);

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, sizeof(float)* M * K*batch_size);
    cudaMemcpy(d_A, h_A, sizeof(float)*M*K*batch_size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B, sizeof(float)* K * N*batch_size);
    cudaMemcpy(d_B, h_B, sizeof(float)*N*K*batch_size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C, sizeof(float)* M * N*batch_size);
    cudaMemcpy(d_C, h_C, sizeof(float)*N*M*batch_size, cudaMemcpyHostToDevice);
    // each batch entry is just an offset into the single concatenated allocation
    float *h_dA[batch_size], *h_dB[batch_size], *h_dC[batch_size];
    for (int i = 0; i < batch_size; i++){
      h_dA[i] = d_A+i*M*K;
      h_dB[i] = d_B+i*K*N;
      h_dC[i] = d_C+i*M*N;}
    float **d_dA, **d_dB, **d_dC;
    cudaMalloc(&d_dA, sizeof(float *)*batch_size);
    cudaMalloc(&d_dB, sizeof(float *)*batch_size);
    cudaMalloc(&d_dC, sizeof(float *)*batch_size);
    cudaMemcpy(d_dA, h_dA, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dB, h_dB, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dC, h_dC, sizeof(float*)* batch_size, cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);

    // Set up the matrix dimensions and batch size
    int lda = M;
    int ldb = K;
    int ldc = M;

    // Set the alpha and beta parameters for the gemm operation
    float alpha = 1.0f;
    float beta = 0.0f;
    status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                            &alpha,
                            (const void**)d_dA, CUDA_R_32F, lda,
                            (const void**)d_dB, CUDA_R_32F, ldb,
                            &beta,
                            (void**)d_dC, CUDA_R_32F, ldc,
                            batch_size,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
    cudaMemcpy(h_C, d_C, sizeof(float)*M*N*batch_size, cudaMemcpyDeviceToHost);

    if (status == CUBLAS_STATUS_SUCCESS) {
        std::cout << "C =" << std::endl;
        print_matrix(h_C, M, N, batch_size);
    } else {
        std::cout << status << std::endl;
    }

    // Destroy the handle
    cublasDestroy(handle);


    cudaFree(d_dA);
    cudaFree(d_dB);
    cudaFree(d_dC);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);
}
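Since the concatenated layout puts consecutive matrices at a fixed element offset from one another, the pointer arrays (d_dA, d_dB, d_dC) can be dropped entirely in favor of cublasGemmStridedBatchedEx, which takes a single base pointer plus an element stride per operand. A sketch of the equivalent call for the code above (untested here; the strides are just the per-matrix element counts):

status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
                        &alpha,
                        d_A, CUDA_R_32F, lda, (long long)M * K, // stride between consecutive A matrices
                        d_B, CUDA_R_32F, ldb, (long long)K * N, // stride between consecutive B matrices
                        &beta,
                        d_C, CUDA_R_32F, ldc, (long long)M * N, // stride between consecutive C matrices
                        batch_size,
                        CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);

This also removes three cudaMalloc/cudaMemcpy pairs and the per-batch pointer bookkeeping.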