pybind Segmentation fault when getting elements of inputs

31 Views Asked by At

I am struggling with a problem when using pybind to call cuda function in python. The problem is when I tried to get an element of the input, it shows Segmentation fault.

Here is my code:

  1. main python script
import torch
import matmul_cuda

M,K,N = 32,32,32
A = torch.randint(1,10,(M,K), dtype=torch.int32).cuda()
B = torch.randint(1,10,(K,N), dtype=torch.int32).cuda()
C = torch.zeros((M, N), dtype=torch.float32).cuda()

matmul_cuda.torch_launch_matmul(A, B, C, M, K, N)
  1. jit script
from torch.utils.cpp_extension import load
mapping_cuda = load(
    name='matmul_cuda', 
    sources=['matmul_cuda.cpp', 'matmul_cuda_kernel.cu'], 
    build_directory="./build",
    verbose=True
    )
  1. cpp file
#include <torch/extension.h>
#include "matmul_cuda_kernel.cuh"

void torch_launch_matmul(
                        torch::Tensor &tensor_A,
                        torch::Tensor &tensor_B,
                        torch::Tensor &tensor_C,
                        int M,
                        int K,
                        int N
                        ) 
{
    launch_matmul(
              (int*) tensor_A.data_ptr(),
              (int*) tensor_B.data_ptr(),
              (float*) tensor_C.data_ptr(),
              M,
              K,
              N
    );
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("torch_launch_matmul", &torch_launch_matmul, "torch_launch_matmul (cuda)");
}
  1. cuda file
#include <iostream>
#define TILE_DIM 32
__global__ void matmul_kernel(int* pfMatrixA, int* pfMatrixB, float* pfMatrixC, int m, int k, int n)
{
    int nRow = blockIdx.y * blockDim.y + threadIdx.y;
    int nCol = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;

    for(int i =0; i < k; i++)
    {
        sum += pfMatrixA[nRow * k + i] * pfMatrixB[i * n + nCol];
    }
    pfMatrixC[nRow * n + nCol] = sum;
}

void launch_matmul(
                    int* array_A,
                    int* array_B,
                    float* array_C,
                    int M,
                    int K,
                    int N
                    )
{
    int x = array_A[0]; // Segmentation fault
    dim3 block_size(TILE_DIM, TILE_DIM);
    dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
    matmul_kernel<<<grid_size, block_size>>>(array_A, array_B, array_C, M, K, N);
}

If I remove the code int x = array_A[0]; in cuda file, it works again.

I wonder if it is because the array_A is on GPU then I cannot get the element x? If so, how can I get the elements from input tensor in cuda code?

0

There are 0 best solutions below