I am trying to scatter a pandas DataFrame to spawned child processes.
Here is what I have tried.
master.py
from mpi4py import MPI
import numpy as np
import sys
import pandas as pd
# Start five copies of the current Python interpreter running child.py;
# the communicator returned by Spawn is how this parent talks to them.
comm = MPI.COMM_SELF.Spawn(
    sys.executable,
    args=['child.py'],
    maxprocs=5,
)
class Model:
    """Build a small id/draws table and hand it out to the spawned workers.

    Uses the module-level ``comm`` returned by ``Spawn``.
    """

    def __init__(self, num_draw):
        """Remember ``num_draw``, the number of rows generated per id."""
        self.num_draw = num_draw

    def test(self):
        """Scatter the table to the workers and print what they gather back.

        The table has 14 ids, each repeated ``num_draw`` times, alongside a
        running ``draws`` counter. It is cut into one contiguous row chunk
        per worker; chunk sizes follow the ``np.array_split`` rule (the
        first ``len % n`` chunks get one extra row).
        """
        num_ids = 14
        # Named ``ids`` so the builtin ``id`` is not shadowed.
        ids = np.arange(num_ids)
        data = pd.DataFrame({
            "id": np.repeat(ids, self.num_draw),
            "draws": np.arange(self.num_draw * num_ids),
        })

        # scatter() needs exactly one item per receiving process, so take the
        # count from the communicator rather than repeating the spawn size.
        num_workers = comm.Get_remote_size()

        # Slice by row position instead of calling np.array_split on the
        # frame itself: that route goes through DataFrame.swapaxes, which
        # pandas has deprecated.
        base, extra = divmod(len(data), num_workers)
        chunks = []
        stop = 0
        for worker in range(num_workers):
            start = stop
            stop = start + base + (1 if worker < extra else 0)
            chunks.append(data.iloc[start:stop])

        comm.scatter(chunks, root=MPI.ROOT)

        # The parent has no partial result of its own, so it passes None and
        # only collects what the workers send.
        sum_gather = comm.gather(None, root=MPI.ROOT)
        print('parent', sum_gather)
# Run one scatter/gather round with 500 draws per id.
model = Model(500)
model.test()

# Release the connection to the spawned workers.
comm.Disconnect()
child.py
from mpi4py import MPI
# Communicator linking this worker to the process that spawned it.
comm = MPI.Comm.Get_parent()
rank = comm.Get_rank()
size = comm.Get_size()

# Receive this worker's piece of the scatter; the worker sends nothing,
# so the send argument is None.
data = comm.scatter(None, root=0)
def li(data):
    """Return the result of calling ``data.sum()``."""
    return data.sum()
# Reduce the received chunk and report it before sending it back.
data_sum = li(data)
print("child rank", rank, "after scatter", data_sum)

# Send the partial result back via root 0, then drop the connection.
comm.gather(data_sum, root=0)
comm.Disconnect()
The program hangs without an error message. If I use model=Model(5) instead, it works, so perhaps this is related to the size of the DataFrame? How can I scatter a large DataFrame?