I am struggling with a problem when using pybind to call cuda function in python. The problem is when I tried to get an element of the input, it shows Segmentation fault.
Here is my code:
- main python script
import torch
import matmul_cuda
M,K,N = 32,32,32
A = torch.randint(1,10,(M,K), dtype=torch.int32).cuda()
B = torch.randint(1,10,(K,N), dtype=torch.int32).cuda()
C = torch.zeros((M, N), dtype=torch.float32).cuda()
matmul_cuda.torch_launch_matmul(A, B, C, M, K, N)
- jit script
from torch.utils.cpp_extension import load
mapping_cuda = load(
name='matmul_cuda',
sources=['matmul_cuda.cpp', 'matmul_cuda_kernel.cu'],
build_directory="./build",
verbose=True
)
- cpp file
#include <torch/extension.h>
#include "matmul_cuda_kernel.cuh"
void torch_launch_matmul(
torch::Tensor &tensor_A,
torch::Tensor &tensor_B,
torch::Tensor &tensor_C,
int M,
int K,
int N
)
{
launch_matmul(
(int*) tensor_A.data_ptr(),
(int*) tensor_B.data_ptr(),
(float*) tensor_C.data_ptr(),
M,
K,
N
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("torch_launch_matmul", &torch_launch_matmul, "torch_launch_matmul (cuda)");
}
- cuda file
#include <iostream>
#define TILE_DIM 32
__global__ void matmul_kernel(int* pfMatrixA, int* pfMatrixB, float* pfMatrixC, int m, int k, int n)
{
int nRow = blockIdx.y * blockDim.y + threadIdx.y;
int nCol = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
for(int i =0; i < k; i++)
{
sum += pfMatrixA[nRow * k + i] * pfMatrixB[i * n + nCol];
}
pfMatrixC[nRow * n + nCol] = sum;
}
void launch_matmul(
int* array_A,
int* array_B,
float* array_C,
int M,
int K,
int N
)
{
int x = array_A[0]; // Segmentation fault
dim3 block_size(TILE_DIM, TILE_DIM);
dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
matmul_kernel<<<grid_size, block_size>>>(array_A, array_B, array_C, M, K, N);
}
If I remove the code int x = array_A[0]; in cuda file, it works again.
I wonder if it is because the array_A is on GPU then I cannot get the element x? If so, how can I get the elements from input tensor in cuda code?