I previously posted a question regarding matrix-vector multiplication in CUDA and about writing my own kernel. After doing this, I decided to implement my problem using CUBLAS as suggested by some users (thanks @Robert Crovella ) on SO in the hopes of achieving higher performance (my project is performance driven).
Just to clarify: I want to multiply a NxN matrix with a 1xN vector.
I've been looking at the code pasted below for a couple of days now and I cant figure out why the multiplication is giving me an incorrect result. I fear that i am causing problems by using < vector > arrays (this is part of a much larger system that uses these data types). I don't mean to use this thread as a debugging tool but I think this will also be helpful to other users trying to achieve this as I have not come across a particularly comprehensive source on the internet for my particular problem (and for the cublas v2 API). Thanks in advance!
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>
//#include "timenow.cu"
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// random data filler
void fillvector(float *data, int N){
for(int i=0; i<N; i++){
data[i] = float(rand() % 10);
}
}
//printer
void printer(bool printOut, float *data, int N){
if(printOut == true){
for(int i=0; i<N; i++){
printf("%2.1f ", data[i]);
}
printf("\n");
}
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
int main(){
bool printOut = true;
int N;
std::cout << "Enter N: " ;
std::cin >> N;
std::vector<float> x0;
x0.resize(N);
std::vector<float> p;
p.resize(N*N);
// matrix A
std::vector<float> A[N];
for(int i=0;i<N;i++){
A[i].resize(N);
fillvector(A[i].data(), N);
printer(printOut, A[i].data(), N);
}
printf("\n");
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);
printf("\nStarting CUDA computation...");
///double startTime = timenow();
// device pointers
float *d_A, *d_p, *d_b, *d_x0, *d_v, *d_temp;
cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");
// might need to flatten A...
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
//daMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasSetMatrix(N, N, sizeof(float), &A, N, d_A, N);
cudaCheckErrors("cuda memcpy of A or x0 fail");
float *temp;
temp = (float *)malloc(N*sizeof(temp));
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_N, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("returning to host failed");
printf("\n");
printer(printOut, temp, N);
/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
printf("%2.1f ",v[i]);
}*/
printf("\nFinished CUDA computations...");
//double endTime = timenow();
//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);
cudaFree(d_temp);
cudaFree(d_A);
cudaFree(d_p);
cudaFree(d_x0);
return 0;
}
We don't reference the first element of a vector this way:
Instead you should do this:
And likewise for your
SetMatrixcall referencingA:Your
GetVectorcall has 2 errors:You have your
tempandd_tempparameters reversed (you are copying from device to host) and you should not take the address oftemp: it is already a pointer. So do this:You're not doing proper error checking on all cublas calls, such as your get/set matrix/vector calls. Use the same method you are using on other cublas calls for these also.
You are creating
Aas an array of vectors. This won't work withcublasSetMatrix. Instead we need to createAas a flat vector, of sufficient size (N*N) to store the entire matrix.Finally, cublas expects the matrices it uses to be stored in column-major order. If you pass C-style arrays in row-major order, you should use the transpose for that matrix in
cublasSgemv:The following code has these various problems fixed: