I am writing a particle-based code in which every particle's neighbor list is generated once from the initial condition and never changes during the simulation. The particle positions do change, however, so after a long time the 'neighbors' can end up quite far apart. The neighbor lists represent 'bonds' to the initial configuration, which is why they stay fixed.
I restrict the maximum number of neighbors to MAX_NEIGHBOR, so with N particles there is an array Neighbors[N*MAX_NEIGHBOR]. For the i'th particle, the indices of its neighbors are Neighbors[i*MAX_NEIGHBOR : (i+1)*MAX_NEIGHBOR] (Python range notation).
To access a per-particle quantity (Quantity[N]) for each neighbor, the basic approach looks like this:
int index = blockIdx.x; // one block per particle; its threads handle the neighbors
if (index >= N) return;
int neighborIndex = Neighbors[MAX_NEIGHBOR*index + threadIdx.x]; // coalesced
double quantity = Quantity[neighborIndex]; // cannot be coalesced
Evidently the access to Quantity is not coalesced. Since Neighbors[N*MAX_NEIGHBOR] is constant throughout the simulation, I expect this can be exploited somehow, but I do not see how to do it technically.
I can think of generating another array NeighborsQuantity[N*MAX_NEIGHBOR] so that the kernel can read it in a coalesced manner, but building NeighborsQuantity would itself require non-coalesced reads in another kernel.
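To make that concrete, the one-off gather would look something like this (a sketch only; gatherQuantityD is a hypothetical name, and I assume blockDim.x == MAX_NEIGHBOR and that slots past numNeighbors[i] may hold invalid indices, hence the guard):

__global__ void gatherQuantityD(double *NeighborsQuantity, // output, [N*MAX_NEIGHBOR]
                                const double *Quantity,    // input,  [N]
                                const int *Neighbors,      // [N*MAX_NEIGHBOR]
                                const int *numNeighbors,   // [N]
                                int N) {
    int i = blockIdx.x;                          // one block per particle
    if (i >= N) return;
    if (threadIdx.x >= numNeighbors[i]) return;  // skip padded slots
    int j = Neighbors[MAX_NEIGHBOR*i + threadIdx.x];               // coalesced read
    NeighborsQuantity[MAX_NEIGHBOR*i + threadIdx.x] = Quantity[j]; // gather read, coalesced write
}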
Texture memory is mentioned for this kind of scattered read in some documents, but going through the CUDA Programming Guide it seems that texture memory does not support double3-type variables.
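One workaround I have seen mentioned, though I have not verified it myself, is to skip textures entirely: on compute capability 3.5+ the read-only data cache (the same hardware path textures use) can serve plain global loads, either by declaring the pointers const ... __restrict__ or by using __ldg() on supported types. Applied to the gather above:

__global__ void gatherQuantityRO(double *NeighborsQuantity,
                                 const double * __restrict__ Quantity,  // const + __restrict__ lets the
                                 const int    * __restrict__ Neighbors, // compiler use the read-only cache
                                 const int    * __restrict__ numNeighbors,
                                 int N) {
    int i = blockIdx.x;
    if (i >= N || threadIdx.x >= numNeighbors[i]) return;
    int j = Neighbors[MAX_NEIGHBOR*i + threadIdx.x];
    NeighborsQuantity[MAX_NEIGHBOR*i + threadIdx.x] = __ldg(&Quantity[j]); // explicit read-only load
}

The same qualifiers could be added to oldPos and Vol in the kernel below; __ldg() has no double4 overload as far as I can tell, but the qualifiers alone give the compiler the option.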
The following is the kernel I am actually using. It accesses Pos and Vol of the neighbor particles. Vel is also read but plays no role yet; I put it there to prepare for more sophisticated models later.
__global__ void calcBondAccelD(double4 *DVel_Dt,      // output
                               double4 *oldPos,       // inputs : oldPos[numParticles]
                               double4 *oldVel,       //          oldVel[numParticles]
                               double  *Rho,          //          Rho[numParticles]
                               double  *Vol,          //          Vol[numParticles]
                               double  *BondStretch,  //          BondStretch[numParticles*MAX_NEIGHBOR]
                               double  *Mu,           //          Mu[numParticles*MAX_NEIGHBOR]
                               int     *Type,         //          Type[numParticles]
                               int     *Neighbors,    //          Neighbors[numParticles*MAX_NEIGHBOR]
                               int     *numNeighbors, //          numNeighbors[numParticles]
                               int      numParticles) {
    int sortedIndex = blockIdx.x * blockDim.x + threadIdx.x; // one particle per thread
    if (sortedIndex >= numParticles) return;
    if ((Type[sortedIndex] != 0) && (Type[sortedIndex] != 1)) return; // bonds only for main particles

    double3 pos   = make_double3(oldPos[sortedIndex]);
    double3 vel   = make_double3(oldVel[sortedIndex]); // unused so far; kept for future models
    double3 force = make_double3(0.0);

    // loop-invariant bond constant, hoisted out of the neighbor loop
    double _delta = params.horizon;
    double _c     = 18.*params.modBulk / (CUDART_PI * _delta*_delta*_delta*_delta);

    // examine neighbor list
    for (int i = 0; i < numNeighbors[sortedIndex]; i++) {
        int neighborIndex = Neighbors[MAX_NEIGHBOR * sortedIndex + i];
        double3 pos2   = make_double3(oldPos[neighborIndex]); // oldPos non-coalesced
        double3 relPos = pos2 - pos;
        double  dist   = length(relPos);
        force += _c * BondStretch[sortedIndex*MAX_NEIGHBOR + i] * Mu[sortedIndex*MAX_NEIGHBOR + i]
                    * Vol[neighborIndex] * relPos / dist;     // Vol non-coalesced
    }
    // write the acceleration back to the original (unsorted) location
    DVel_Dt[sortedIndex] = make_double4(force / Rho[sortedIndex], 0.0);
}
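For reference, the matching launch would be something like this (the block size is a free tuning parameter; 256 is just an example):

int threads = 256;
int blocks  = (numParticles + threads - 1) / threads;
calcBondAccelD<<<blocks, threads>>>(DVel_Dt, oldPos, oldVel, Rho, Vol,
                                    BondStretch, Mu, Type, Neighbors,
                                    numNeighbors, numParticles);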
I admit this is the most naive implementation one can think of. While this kernel is practical enough on a 3090 Ti for my current applications with 100,000~1,000,000 particles, I want it to be faster.
I have been trying variants of it using shared memory and so on, but the naive one still performs the best.
One of my variants is shown below; I plugged a parallel reduction into it. While the memory access is still non-coalesced, it gave a slight speed-up. But since the neighbor lists are fixed, I think the memory access could still be handled better.
Any suggestions on improving this kernel?
__global__ void calcBondAccelD2(double4 *DVel_Dt, // output
                                double4 *oldPos,  // inputs
                                double4 *oldVel,
                                double  *Rho,
                                double  *Vol,
                                double  *BondStretch,
                                double  *Mu,
                                int     *Type,
                                int     *Neighbors,
                                int     *numNeighbors,
                                int      numParticles) {
    int sortedIndex = blockIdx.x; // one particle per block; its threads handle the neighbors
    __shared__ double _s[256][6];
    if (sortedIndex >= numParticles) return;                          // uniform across the block
    if ((Type[sortedIndex] != 0) && (Type[sortedIndex] != 1)) return; // bonds only for main particles

    double3 pos = make_double3(oldPos[sortedIndex]);
    double3 vel = make_double3(oldVel[sortedIndex]); // unused so far; kept for future models
    double  rho = Rho[sortedIndex];
    double3 force = make_double3(0.0);

    int _numNeighbor = numNeighbors[sortedIndex];
    double3 pos2 = make_double3(0.0);
    double  vol2 = 0.0;
    if (threadIdx.x < _numNeighbor) { // padded slots may hold invalid indices, so guard the gathers
        int neighborIndex = Neighbors[MAX_NEIGHBOR * sortedIndex + threadIdx.x]; // coalesced
        pos2 = make_double3(oldPos[neighborIndex]); // non-coalesced
        vol2 = Vol[neighborIndex];                  // non-coalesced
    }
    _s[threadIdx.x][0] = BondStretch[sortedIndex*MAX_NEIGHBOR + threadIdx.x];
    _s[threadIdx.x][1] = Mu         [sortedIndex*MAX_NEIGHBOR + threadIdx.x];
    _s[threadIdx.x][2] = 0.0;
    _s[threadIdx.x][3] = 0.0;
    _s[threadIdx.x][4] = 0.0;
    __syncthreads();

    if (threadIdx.x < _numNeighbor) {
        double3 relPos = pos2 - pos;
        double  dist   = length(relPos);
        double  _delta = params.horizon;
        double  _c     = 18.*params.modBulk / (CUDART_PI * _delta*_delta*_delta*_delta);
        force = _c * _s[threadIdx.x][0] * _s[threadIdx.x][1] * vol2 * relPos / dist;
        _s[threadIdx.x][2] = force.x;
        _s[threadIdx.x][3] = force.y;
        _s[threadIdx.x][4] = force.z;
    }
    __syncthreads();

    // parallel reduction - WARNING!! blockDim.x must be fixed to 256 (== MAX_NEIGHBOR) !!!
    // slots beyond _numNeighbor were zero-initialized, so the adds need no extra guard;
    // __syncthreads() sits outside the if so the whole block reaches it
    for (int stride = 128; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            _s[threadIdx.x][2] += _s[threadIdx.x + stride][2];
            _s[threadIdx.x][3] += _s[threadIdx.x + stride][3];
            _s[threadIdx.x][4] += _s[threadIdx.x + stride][4];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0) {
        force.x = _s[0][2];
        force.y = _s[0][3];
        force.z = _s[0][4];
        // write the acceleration back to the original (unsorted) location
        DVel_Dt[sortedIndex] = make_double4(force / rho, 0.0);
    }
}
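Another variant I have been sketching (untested; warpReduceSum and blockReduceSum are my own hypothetical helpers, and the _sync shuffles need CUDA 9+ and compute capability 3.0+) replaces the shared-memory reduction with warp shuffles, so the partial sums travel through registers instead. Every thread of the 256-thread block must call blockReduceSum, with threads past numNeighbors passing make_double3(0.0); thread 0 then writes force / rho to DVel_Dt[sortedIndex] exactly as in calcBondAccelD2.

__inline__ __device__ double3 warpReduceSum(double3 v) {
    // tree reduction within one warp via shuffles; lane 0 ends up with the warp's sum
    for (int offset = 16; offset > 0; offset >>= 1) {
        v.x += __shfl_down_sync(0xffffffff, v.x, offset);
        v.y += __shfl_down_sync(0xffffffff, v.y, offset);
        v.z += __shfl_down_sync(0xffffffff, v.z, offset);
    }
    return v;
}

__inline__ __device__ double3 blockReduceSum(double3 v) {
    // 256 threads = 8 warps; lane 0 of each warp stores its partial, warp 0 finishes
    __shared__ double3 partial[8];
    int lane = threadIdx.x & 31;
    int warp = threadIdx.x >> 5;
    v = warpReduceSum(v);
    if (lane == 0) partial[warp] = v;
    __syncthreads();
    v = (threadIdx.x < 8) ? partial[threadIdx.x] : make_double3(0.0);
    if (warp == 0) v = warpReduceSum(v);
    return v; // valid in thread 0 only
}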