As titled, I think the receive displacements (rdispls) should be the cumulative sum of recv_counts, so that the data received from process i are placed at recv_data + rdispls[i] * extent(recv_type). To make this clear, I tried to rewrite the example in Python. It gives a wrong answer, apparently because of a wrong recv_disp (and maybe a wrong send_disp as well):
Example: https://rookiehpc.org/mpi/docs/mpi_alltoallv/index.html
My attempt in Python (here I did not reorder recv_data on process 0 the way the example does):
from mpi4py import MPI
import numpy as np
# Communicator handle plus this process's position within it.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
assert size == 3  # the per-rank payload tables below are written for 3 ranks

# Send data
# Each rank owns a small payload and, for every element, the rank that should
# receive it. The destinations listed here are already in non-decreasing
# order, so elements bound for the same rank are contiguous in send_data.
if rank == 0:
    payload, destinations = [0, 100, 200], [0, 1, 1]
elif rank == 1:
    payload, destinations = [300, 400, 500], [2, 2, 2]
else:
    payload, destinations = [600], [0]
send_data = np.array(payload, dtype=np.int32)
send_rank = np.array(destinations, dtype=np.int32)
send_length = len(send_data)

# send_counts[r] = number of elements addressed to rank r. np.add.at
# accumulates once per occurrence, so repeated destinations are all counted.
send_counts = np.zeros(size, dtype=np.int32)
np.add.at(send_counts, send_rank, 1)

# Exclusive prefix sum of the counts: send_disp[r] is the index in send_data
# where the run of elements for rank r begins.
send_disp = np.cumsum(send_counts, dtype=np.int32) - send_counts
# Receive data
# Exchange the per-destination counts so that every rank learns how many
# elements each peer will send it: after this call recv_counts[i] holds the
# value of send_counts[rank] on rank i.
recv_counts = np.zeros(size, dtype=np.int32)
comm.Alltoall(send_counts, recv_counts)
recv_length = np.sum(recv_counts)
recv_data = np.zeros(recv_length, dtype=np.int32)
# Exclusive prefix sum of recv_counts: the elements arriving from rank i are
# stored starting at recv_data[recv_disp[i]], so incoming runs are packed
# back to back without overlapping.
recv_disp = np.zeros(size, dtype=np.int32)
recv_disp[1:] = np.cumsum(recv_counts)[:-1]
# Alltoallv
# The send side must be described with the send displacements and the receive
# side with the receive displacements. Passing recv_disp for both makes a rank
# read its outgoing runs from the wrong offsets of send_data (possibly past
# the end of the buffer), which corrupts the data its peers receive.
comm.Alltoallv([send_data, send_counts, send_disp, MPI.INT],
               [recv_data, recv_counts, recv_disp, MPI.INT])
# Report this rank's exchange parameters and buffers. Every field is written
# as "<label>:" on one line followed by its value on the next, and the whole
# report is emitted with a single print call.
fields = (
    ("rank", rank),
    ("send_counts", send_counts),
    ("recv_counts", recv_counts),
    ("send_disp", send_disp),
    ("recv_disp", recv_disp),
    ("send_data", send_data),
    ("recv_data", recv_data),
    ("recv_length", recv_length),
)
print("".join(f"{label}:\n{value}\n" for label, value in fields))