I am trying to use cublasGemmBatchedEx to perform matrix multiplication. Here is my code.
#include <iostream>
#include <cublas_v2.h>
#define M 4
#define N 4
#define K 4
//nvcc -lcublas -o matmul_gemmBatchedEx matmul_gemmBatchedEx.cu
void print_matrix(float **A, int rows, int cols, int batch_size) {
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < rows; j++){
for(int k = 0; k < cols; k++){
std::cout << A[i][k * rows + j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main(int argc, char* argv[])
{
// Linear dimension of matrices
int batch_size = 2;
float *h_A[batch_size], *h_B[batch_size], *h_C[batch_size];
for (int i = 0; i < batch_size; i++){
h_A[i] = (float*)malloc(M * K * sizeof(float));
h_B[i] = (float*)malloc(K * N * sizeof(float));
h_C[i] = (float*)malloc(M * N * sizeof(float));
}
for (int i = 0; i < batch_size; i++){
for (int j = 0; j < M * K; j++)
h_A[i][j] = j%4;
for (int j = 0; j < K * N; j++)
h_B[i][j] = j%4 + 4;
for (int j = 0; j < M * N; j++)
h_C[i][j] = 0;
}
std::cout << "A =" << std::endl;
print_matrix(h_A, M, K, batch_size);
std::cout << "B =" << std::endl;
print_matrix(h_B, K, N, batch_size);
float *d_A[batch_size], *d_B[batch_size], *d_C[batch_size];
for (int i = 0; i < batch_size; i++){
cudaMalloc(&d_A[i], sizeof(float)* M * K);
cudaMalloc(&d_B[i], sizeof(float)* K * N);
cudaMalloc(&d_C[i], sizeof(float)* M * N);
}
cudaMemcpy(d_A, h_A, sizeof(float)* M * K * batch_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, sizeof(float)* K * N * batch_size, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
// Set up the matrix dimensions and batch size
int lda = M;
int ldb = K;
int ldc = M;
// Set the alpha and beta parameters for the gemm operation
float alpha = 1.0f;
float beta = 0.0f;
cublasStatus_t status = cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M, N, K,
&alpha,
(const void**)d_A, CUDA_R_32F, lda,
(const void**)d_B, CUDA_R_32F, ldb,
&beta,
(void**)d_C, CUDA_R_32F, ldc,
batch_size,
CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
cudaMemcpy(h_C,d_C,sizeof(float) * M * N * batch_size, cudaMemcpyDeviceToHost);
if (status == CUBLAS_STATUS_SUCCESS) {
std::cout << "C =" << std::endl;
print_matrix(h_C, M, N, batch_size);
} else {
std::cout << status << std::endl;
}
// Destroy the handle
cublasDestroy(handle);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaFreeHost(h_A);
cudaFreeHost(h_B);
cudaFreeHost(h_C);
}
This is the result when I ran this code.
A =
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
0 0 0 0
1 1 1 1
2 2 2 2
3 3 3 3
B =
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
4 4 4 4
5 5 5 5
6 6 6 6
7 7 7 7
C =
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
The problem is I don't get a expected result. It comes full of zeros. Is there any problem with my code?
The matrix arguments that you pass to this cublas function need to be an array of device pointers, where each device pointer is properly allocated, properly copied, and has a proper population of its allocation.
There are at least several problems with your attempt to do this. The central problem is around these lines:
You cannot copy an array of independent allocations that way, and furthermore you are copying pointer data using sizes and types as if it were matrix contents (
float *vs.float).The following example has various issues fixed:
given that
cublasGemmBatchedEx()requires that all the matrices across the batch be of identical sizes, and given that we are starting with a clean slate for this exercise, we can use a somewhat simpler realization by concatenating matrices so we can do single allocations for the batch. This won't work for every use case, but may be of interest: