I have written a basic program where a chunk of data is loaded into pinned CPU memory, transferred to the GPU chunk by chunk with asynchronous copies, and then each chunk is processed by a kernel. Each chunk gets its own stream.
The issue I am facing: the transfer of the second chunk and the computation on the first chunk do not run concurrently; they run sequentially. For all the other chunks, the overlap happens as expected.
Any suggestions on how I can get the transfer of the second chunk and the computation on the first chunk to overlap?
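To be concrete, the pattern I expect is the usual two-stream copy/compute overlap; a minimal sketch (dummy_kernel and the launch configuration here are placeholders, not my actual kernel):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void dummy_kernel(uint64_t *data, uint32_t n) // stand-in for real work
{
    uint32_t j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < n)
        data[j] += 1;
}

// Expected timeline: the copy of chunk 1 (stream s1) runs while
// dummy_kernel processes chunk 0 (stream s0).
void expected_overlap(uint64_t *h_pinned, uint64_t *d_buf0, uint64_t *d_buf1,
                      uint32_t chunk, cudaStream_t s0, cudaStream_t s1)
{
    cudaMemcpyAsync(d_buf0, h_pinned, chunk * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, s0);
    dummy_kernel<<<(chunk + 511) / 512, 512, 0, s0>>>(d_buf0, chunk);
    cudaMemcpyAsync(d_buf1, h_pinned + chunk, chunk * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, s1);
}

This is exactly the overlap I do see for every later chunk pair.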
I have uploaded screenshots of two traces (one for 5 iterations and another for 10 iterations).
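(The timelines are Nsight Systems captures; something along the lines of nsys profile -t cuda -o trace ./app reproduces the same view.)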
(This is on a 16 GB V100; tried with nvcc 11.7 and 12.3.)
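For reference, the build is a plain nvcc compile, something like the following (file and binary names are arbitrary):

nvcc -O2 -o overlap overlap.cu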
Code:
#include <bits/stdc++.h>
#include "driver_types.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define checkCudaErrors(call)                                                \
    do                                                                       \
    {                                                                        \
        cudaError_t err_ = (call); /* evaluate the call exactly once */     \
        if (err_ != cudaSuccess)                                             \
        {                                                                    \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
__global__ void intialize_mark(char *mark, uint32_t num_e)
{
    // Flatten the 2D grid into a linear element index.
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x;
    j = j * blockDim.x + threadIdx.x; // blockDim.x is 512 at launch
    if (j >= num_e)
        return;
    mark[j] = 0;
}

__global__ void intialize_mask(char *mask, uint32_t num_e)
{
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x;
    j = j * blockDim.x + threadIdx.x;
    if (j >= num_e)
        return;
    mask[j] = 0;
}
int main()
{
    cudaFree(nullptr); // force CUDA context creation up front

    uint32_t numIterations = 10;
    std::ios_base::sync_with_stdio(false);
    std::cin.tie(0);
    std::cout.tie(0);

    uint32_t numNodes = 1000000;
    uint64_t numEdges = 20000000;

    // Pinned host buffer so the async copies can actually run asynchronously.
    uint64_t *edgelist;
    checkCudaErrors(cudaHostAlloc(&edgelist, numEdges * sizeof(uint64_t), cudaHostAllocDefault));

    std::random_device rd;
    std::mt19937_64 gen(rd());
    std::uniform_int_distribution<uint64_t> dis;
    for (uint64_t i = 0; i < numEdges; i++)
    {
        edgelist[i] = dis(gen);
    }

    uint32_t num_threads = 512;                   // -> number of threads per block
    uint32_t num_blocks_n = (numNodes / 512) + 1; // -> number of blocks for nodes
    uint32_t num_blocks_e = (numEdges / 512) + 1; // -> number of blocks for edges
    uint32_t nny = (num_blocks_n / 1000) + 1;     // -> y dimension for nodes
    uint32_t nnx = 1000;                          // -> x dimension for nodes
    uint32_t ney = (num_blocks_e / 1000) + 1;     // -> y dimension for edges
    uint32_t nex = 1000;                          // -> x dimension for edges
    dim3 grid_n(nnx, nny);        // -> grid for nodes
    dim3 grid_e(nex, ney);        // -> grid for edges
    dim3 threads(num_threads, 1); // -> threads per block

    uint64_t numEdgesIteration = (numEdges + numIterations - 1) / numIterations; // -> edges per iteration

    // Shared scratch buffers plus a double-buffered edge list.
    char *d_mark;
    char *mask;
    uint64_t *d_edgeList1; // buffer being computed on
    uint64_t *d_edgeList2; // buffer being filled by the next copy
    checkCudaErrors(cudaMalloc(&d_mark, numEdgesIteration * sizeof(char)));
    checkCudaErrors(cudaMalloc(&mask, numEdgesIteration * sizeof(char)));
    checkCudaErrors(cudaMalloc(&d_edgeList1, numEdgesIteration * sizeof(uint64_t)));
    checkCudaErrors(cudaMalloc(&d_edgeList2, numEdgesIteration * sizeof(uint64_t)));

    // One stream per chunk.
    std::vector<cudaStream_t> cudaStreamArr(numIterations);
    for (uint32_t i = 0; i < numIterations; i++)
    {
        checkCudaErrors(cudaStreamCreate(&cudaStreamArr[i]));
    }

    uint64_t currentNumEdges;
    // Iteration i: copy chunk i on stream i while computing on chunk i-1 on
    // stream i-1; one extra iteration drains the pipeline.
    for (uint32_t i = 0; i <= numIterations; i++)
    {
        if (i < numIterations)
        {
            uint64_t chunkEdges = min(numEdges, (i + 1) * numEdgesIteration) - i * numEdgesIteration;
            if (chunkEdges > 0)
            {
                checkCudaErrors(cudaMemcpyAsync(d_edgeList2,
                                                edgelist + i * numEdgesIteration,
                                                chunkEdges * sizeof(uint64_t),
                                                cudaMemcpyHostToDevice,
                                                cudaStreamArr[i]));
            }
        }
        if (i > 0)
        {
            currentNumEdges = min(numEdges, i * numEdgesIteration) - (i - 1) * numEdgesIteration;
            intialize_mark<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(d_mark, currentNumEdges);
            intialize_mask<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(mask, currentNumEdges);
        }
        cudaDeviceSynchronize(); // drain both streams before swapping buffers
        swap(d_edgeList1, d_edgeList2);
    }

    for (uint32_t i = 0; i < numIterations; i++)
    {
        checkCudaErrors(cudaStreamDestroy(cudaStreamArr[i]));
    }
    checkCudaErrors(cudaFree(d_mark));
    checkCudaErrors(cudaFree(mask));
    checkCudaErrors(cudaFree(d_edgeList1));
    checkCudaErrors(cudaFree(d_edgeList2));
    checkCudaErrors(cudaFreeHost(edgelist));
    return 0;
}
I also tried using three streams (one for HostToDevice copies, one for DeviceToHost copies, and one for computation), but the issue is the same.
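Roughly, that variant looked like the sketch below (identifiers and the double-buffering details are illustrative, reconstructed rather than copy-pasted): dedicated H2D/compute/D2H streams, with events ordering each chunk's copy before its kernel and keeping a copy from overwriting a buffer the compute stream is still reading.

#include <cstdint>
#include <cuda_runtime.h>

void three_stream_pipeline(const uint64_t *h_src, uint64_t *d_buf[2],
                           uint32_t numChunks, uint32_t chunkLen,
                           void (*launch)(uint64_t *, uint32_t, cudaStream_t))
{
    cudaStream_t h2d, compute, d2h;
    cudaStreamCreate(&h2d);
    cudaStreamCreate(&compute);
    cudaStreamCreate(&d2h); // unused in this sketch; results stayed on-device

    cudaEvent_t copied[2], consumed[2];
    for (int b = 0; b < 2; b++)
    {
        cudaEventCreateWithFlags(&copied[b], cudaEventDisableTiming);
        cudaEventCreateWithFlags(&consumed[b], cudaEventDisableTiming);
        cudaEventRecord(consumed[b], compute); // both buffers start out free
    }

    for (uint32_t i = 0; i < numChunks; i++)
    {
        int b = i % 2; // double buffering
        // Don't overwrite the buffer until the kernel reading it is done.
        cudaStreamWaitEvent(h2d, consumed[b], 0);
        cudaMemcpyAsync(d_buf[b], h_src + (uint64_t)i * chunkLen,
                        chunkLen * sizeof(uint64_t), cudaMemcpyHostToDevice, h2d);
        cudaEventRecord(copied[b], h2d);

        // The kernel waits only for its own chunk's copy, not the whole device.
        cudaStreamWaitEvent(compute, copied[b], 0);
        launch(d_buf[b], chunkLen, compute);
        cudaEventRecord(consumed[b], compute);
    }
    cudaStreamSynchronize(compute);

    for (int b = 0; b < 2; b++)
    {
        cudaEventDestroy(copied[b]);
        cudaEventDestroy(consumed[b]);
    }
    cudaStreamDestroy(h2d);
    cudaStreamDestroy(compute);
    cudaStreamDestroy(d2h);
}

The trace looks the same with this version: everything overlaps except the very first copy/compute pair.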

