MPI_Sendrecv gets stuck when I try to implement all-to-all communication with hypercubic permutation

#include <mpi.h>
#include <cmath>
#include <iostream>
#include <vector>

// Hypercubic all_to_all function adapted to the matrix transpose operation
void HPC_Alltoall_H(void * sbuf, int scount, MPI_Datatype stype, 
                            void * rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    int n = static_cast<int>(std::sqrt(scount * size));
    int blockSize = n/size;
    // std::cout << blockSize << " ";
    int* stored_message = static_cast<int *>(sbuf);
    std::vector<int> temp(stored_message, stored_message + scount);

    std::vector<int> in_sendBuffer(blockSize * blockSize*size/2);
    std::vector<int> in_recvBuffer(blockSize * blockSize*size/2);

    int dims = (int)log2(size); // Calculate the number of dimensions

    for (int i = dims - 1; i >= 0; --i) {
        int diff = 1 << i;
        int partner = rank ^ (diff); // Calculate the partner's rank for this phase

        // For processors with smaller rank
        if (rank  < partner) {
            // select data from temp to in_send
            for (int j = 0; j < size/2/diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    in_sendBuffer.push_back(temp[j * 2 * diff * blockSize * blockSize + diff * blockSize * blockSize + k]);
                    
                }
            } 
            std::cout << rank << partner << std::endl;

            // Perform the data exchange with the partner
            MPI_Sendrecv(in_sendBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, rank, 0,
                     in_recvBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, partner, 0,
                     comm, MPI_STATUS_IGNORE);

            std::cout << "Finish small rank send select"  << std::endl;

            // select data from in_recv to temp
            for (int j = 0; j < size/2/diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    temp[j * 2 * diff * blockSize * blockSize + diff * blockSize * blockSize + k] = in_recvBuffer[j * diff * blockSize * blockSize + k];
                }
            } 

            std::cout << "Finish small rank recv select"  << std::endl;

        } else {
            // select data from temp to in_send
            for (int j = 0; j < size/2/diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    in_sendBuffer.push_back(temp[j * 2 * diff * blockSize * blockSize + k]);
                }
            } 

            // Perform the data exchange with the partner
            MPI_Sendrecv(in_sendBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, rank, 0,
                     in_recvBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, partner, 0,
                     comm, MPI_STATUS_IGNORE);
            std::cout << "Finish large rank send select"  << std::endl;


            // select data from in_recv to temp
            for (int j = 0; j < size/2/diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    temp[j * 2 * diff * blockSize * blockSize + k] = in_recvBuffer[j * diff * blockSize * blockSize + k];
                }
            } 
            std::cout << "Finish large rank recv select"  << std::endl;

        }

    }
    std::cout << "Finish temp"  << std::endl;


    for (int i = 0; i < blockSize * blockSize * size; i++) {
        static_cast<int *>(rbuf)[i] = temp[i];
    }

    std::cout << "Finish."  << std::endl;

}

I am trying to implement the hypercubic permutation for an all-to-all collective communication to achieve a matrix transpose, but when I run the code it gets stuck at the MPI_Sendrecv call.

The rank and partner are matched correctly and the buffer sizes are also correct, so I cannot figure out what is causing MPI_Sendrecv to get stuck.

I printed the rank and partner numbers, and they are matched correctly for the 8-processor run (see the algorithm visualization).
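For reference, here is a minimal, self-contained pairwise-exchange sketch of the pattern I am trying to follow (this is not my transpose logic, just the shape of the MPI_Sendrecv call). It assumes the number of processes is a power of two; in MPI_Sendrecv the fourth argument is the destination rank and the ninth is the source rank, and in this sketch both are set to the hypercube partner.

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int dims = 0;
    while ((1 << dims) < size) ++dims;      // log2(size) for a power-of-two size

    std::vector<int> sendBuf(4, rank);      // dummy payload
    std::vector<int> recvBuf(4, -1);

    for (int i = dims - 1; i >= 0; --i) {
        int partner = rank ^ (1 << i);      // hypercube partner for this phase
        // Both the destination (4th argument) and the source (9th argument)
        // are the partner rank, so the sends and receives match up pairwise.
        MPI_Sendrecv(sendBuf.data(), static_cast<int>(sendBuf.size()), MPI_INT, partner, 0,
                     recvBuf.data(), static_cast<int>(recvBuf.size()), MPI_INT, partner, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        std::printf("phase %d: rank %d exchanged with rank %d\n", i, rank, partner);
    }

    MPI_Finalize();
    return 0;
}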
