I have written a basic program where a chunk of data is loaded into pinned CPU memory, transferred to the GPU chunk by chunk with asynchronous copies, and then each chunk is processed by a kernel. Each chunk gets its own stream.
The issue I am facing: the transfer of the second chunk and the computation on the first chunk do not run concurrently; they run sequentially. For all the other chunks, the overlap happens as expected.
Any suggestions on how I can get the transfer of the second chunk and the computation on the first chunk to overlap?
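To be concrete, the pattern I expect is the usual two-stream copy/compute overlap; a minimal sketch (dummy_kernel and the launch configuration here are placeholders, not my actual kernel):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void dummy_kernel(uint64_t *data, uint32_t n) // stand-in for real work
{
    uint32_t j = blockIdx.x * blockDim.x + threadIdx.x;
    if (j < n)
        data[j] += 1;
}

// Expected timeline: the copy of chunk 1 (stream s1) runs while
// dummy_kernel processes chunk 0 (stream s0).
void expected_overlap(uint64_t *h_pinned, uint64_t *d_buf0, uint64_t *d_buf1,
                      uint32_t chunk, cudaStream_t s0, cudaStream_t s1)
{
    cudaMemcpyAsync(d_buf0, h_pinned, chunk * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, s0);
    dummy_kernel<<<(chunk + 511) / 512, 512, 0, s0>>>(d_buf0, chunk);
    cudaMemcpyAsync(d_buf1, h_pinned + chunk, chunk * sizeof(uint64_t),
                    cudaMemcpyHostToDevice, s1);
}

This is exactly the overlap I do see for every later chunk pair.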
I have uploaded screenshots of two traces (one for 5 iterations and another for 10 iterations).
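(The timelines are Nsight Systems captures; something along the lines of nsys profile -t cuda -o trace ./app reproduces the same view.)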
(This is on a 16 GB V100; tried with nvcc 11.7 and 12.3.)
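For reference, the build is a plain nvcc compile, something like the following (file and binary names are arbitrary):

nvcc -O2 -o overlap overlap.cu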
Code:
#include <bits/stdc++.h>
#include "driver_types.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define checkCudaErrors(call)                                                \
    do                                                                       \
    {                                                                        \
        cudaError_t err_ = (call); /* evaluate the call exactly once */     \
        if (err_ != cudaSuccess)                                             \
        {                                                                    \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
__global__ void intialize_mark(char *mark, uint32_t num_e)
{
    // Flatten the 2D grid into a linear element index.
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x;
    j = j * blockDim.x + threadIdx.x; // blockDim.x is 512 at launch
    if (j >= num_e)
        return;
    mark[j] = 0;
}

__global__ void intialize_mask(char *mask, uint32_t num_e)
{
    uint32_t j = blockIdx.y * gridDim.x + blockIdx.x;
    j = j * blockDim.x + threadIdx.x;
    if (j >= num_e)
        return;
    mask[j] = 0;
}
int main()
{
    cudaFree(nullptr); // force CUDA context creation up front

    uint32_t numIterations = 10;
    std::ios_base::sync_with_stdio(false);
    std::cin.tie(0);
    std::cout.tie(0);

    uint32_t numNodes = 1000000;
    uint64_t numEdges = 20000000;

    // Pinned host buffer so the async copies can actually run asynchronously.
    uint64_t *edgelist;
    checkCudaErrors(cudaHostAlloc(&edgelist, numEdges * sizeof(uint64_t), cudaHostAllocDefault));

    std::random_device rd;
    std::mt19937_64 gen(rd());
    std::uniform_int_distribution<uint64_t> dis;
    for (uint64_t i = 0; i < numEdges; i++)
    {
        edgelist[i] = dis(gen);
    }

    uint32_t num_threads = 512;                   // -> number of threads per block
    uint32_t num_blocks_n = (numNodes / 512) + 1; // -> number of blocks for nodes
    uint32_t num_blocks_e = (numEdges / 512) + 1; // -> number of blocks for edges
    uint32_t nny = (num_blocks_n / 1000) + 1;     // -> y dimension for nodes
    uint32_t nnx = 1000;                          // -> x dimension for nodes
    uint32_t ney = (num_blocks_e / 1000) + 1;     // -> y dimension for edges
    uint32_t nex = 1000;                          // -> x dimension for edges
    dim3 grid_n(nnx, nny);        // -> grid for nodes
    dim3 grid_e(nex, ney);        // -> grid for edges
    dim3 threads(num_threads, 1); // -> threads per block

    uint64_t numEdgesIteration = (numEdges + numIterations - 1) / numIterations; // -> edges per iteration

    // Shared scratch buffers plus a double-buffered edge list.
    char *d_mark;
    char *mask;
    uint64_t *d_edgeList1; // buffer being computed on
    uint64_t *d_edgeList2; // buffer being filled by the next copy
    checkCudaErrors(cudaMalloc(&d_mark, numEdgesIteration * sizeof(char)));
    checkCudaErrors(cudaMalloc(&mask, numEdgesIteration * sizeof(char)));
    checkCudaErrors(cudaMalloc(&d_edgeList1, numEdgesIteration * sizeof(uint64_t)));
    checkCudaErrors(cudaMalloc(&d_edgeList2, numEdgesIteration * sizeof(uint64_t)));

    // One stream per chunk.
    std::vector<cudaStream_t> cudaStreamArr(numIterations);
    for (uint32_t i = 0; i < numIterations; i++)
    {
        checkCudaErrors(cudaStreamCreate(&cudaStreamArr[i]));
    }

    uint64_t currentNumEdges;
    // Iteration i: copy chunk i on stream i while computing on chunk i-1 on
    // stream i-1; one extra iteration drains the pipeline.
    for (uint32_t i = 0; i <= numIterations; i++)
    {
        if (i < numIterations)
        {
            uint64_t chunkEdges = min(numEdges, (i + 1) * numEdgesIteration) - i * numEdgesIteration;
            if (chunkEdges > 0)
            {
                checkCudaErrors(cudaMemcpyAsync(d_edgeList2,
                                                edgelist + i * numEdgesIteration,
                                                chunkEdges * sizeof(uint64_t),
                                                cudaMemcpyHostToDevice,
                                                cudaStreamArr[i]));
            }
        }
        if (i > 0)
        {
            currentNumEdges = min(numEdges, i * numEdgesIteration) - (i - 1) * numEdgesIteration;
            intialize_mark<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(d_mark, currentNumEdges);
            intialize_mask<<<grid_e, threads, 0, cudaStreamArr[i - 1]>>>(mask, currentNumEdges);
        }
        cudaDeviceSynchronize(); // drain both streams before swapping buffers
        swap(d_edgeList1, d_edgeList2);
    }

    for (uint32_t i = 0; i < numIterations; i++)
    {
        checkCudaErrors(cudaStreamDestroy(cudaStreamArr[i]));
    }
    checkCudaErrors(cudaFree(d_mark));
    checkCudaErrors(cudaFree(mask));
    checkCudaErrors(cudaFree(d_edgeList1));
    checkCudaErrors(cudaFree(d_edgeList2));
    checkCudaErrors(cudaFreeHost(edgelist));
    return 0;
}
I also tried using three streams (one for HostToDevice copies, one for DeviceToHost copies, and one for computation), but the issue is the same.
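Roughly, that variant looked like the sketch below (identifiers and the double-buffering details are illustrative, reconstructed rather than copy-pasted): dedicated H2D/compute/D2H streams, with events ordering each chunk's copy before its kernel and keeping a copy from overwriting a buffer the compute stream is still reading.

#include <cstdint>
#include <cuda_runtime.h>

void three_stream_pipeline(const uint64_t *h_src, uint64_t *d_buf[2],
                           uint32_t numChunks, uint32_t chunkLen,
                           void (*launch)(uint64_t *, uint32_t, cudaStream_t))
{
    cudaStream_t h2d, compute, d2h;
    cudaStreamCreate(&h2d);
    cudaStreamCreate(&compute);
    cudaStreamCreate(&d2h); // unused in this sketch; results stayed on-device

    cudaEvent_t copied[2], consumed[2];
    for (int b = 0; b < 2; b++)
    {
        cudaEventCreateWithFlags(&copied[b], cudaEventDisableTiming);
        cudaEventCreateWithFlags(&consumed[b], cudaEventDisableTiming);
        cudaEventRecord(consumed[b], compute); // both buffers start out free
    }

    for (uint32_t i = 0; i < numChunks; i++)
    {
        int b = i % 2; // double buffering
        // Don't overwrite the buffer until the kernel reading it is done.
        cudaStreamWaitEvent(h2d, consumed[b], 0);
        cudaMemcpyAsync(d_buf[b], h_src + (uint64_t)i * chunkLen,
                        chunkLen * sizeof(uint64_t), cudaMemcpyHostToDevice, h2d);
        cudaEventRecord(copied[b], h2d);

        // The kernel waits only for its own chunk's copy, not the whole device.
        cudaStreamWaitEvent(compute, copied[b], 0);
        launch(d_buf[b], chunkLen, compute);
        cudaEventRecord(consumed[b], compute);
    }
    cudaStreamSynchronize(compute);

    for (int b = 0; b < 2; b++)
    {
        cudaEventDestroy(copied[b]);
        cudaEventDestroy(consumed[b]);
    }
    cudaStreamDestroy(h2d);
    cudaStreamDestroy(compute);
    cudaStreamDestroy(d2h);
}

The trace looks the same with this version: everything overlaps except the very first copy/compute pair.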

