I am making the experimental FedAvg simulation using the Pytorch RPC, but the server side throws errors when I run it. It seems that it is my coding problem, but I don't know what the problem is. Here are some related code snippets:
.
.
.
#Start training
if args.rank == 0:
for e in range(args.epoch):
processes = []
q = mp.Queue()
print("Server's Epoch:"+str(e+1))
weight = copy.deepcopy(model.state_dict())
for r in range(args.world_size):
p = mp.Process(
target=run_worker,
args=(
r,
model,
args.lr,
train_loader[r],
device,
args.epoch,
weight,
q))
processes.append(p)
p.start()
for p in processes:
p.join()
.
.
.
And for the function run_worker:
def run_worker(rank, model, lr, train_loader, device, epoch, weight, q):
out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
q.put([rank, out_weight])
But the error shows as below:
Server initialized!
Server's Epoch:1
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
Process Process-2:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
Process Process-3:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/pi/FYP/FedAvg_RPC.py", line 96, in run_worker
out_weight = rpc.rpc_sync(f"Worker{rank}", train, args=(rank, model, lr, train_loader, device, epoch, weight))
File "/usr/local/lib/python3.9/dist-packages/torch/distributed/rpc/api.py", line 75, in wrapper
raise RuntimeError(
RuntimeError: RPC has not been initialized. Call torch.distributed.rpc.init_rpc first.
So what is my coding problem on RPC? Here is my setting: python 3.9.2
torch==1.8.0a0+37c1f4a
torchvision==0.9.0a0+01dfa8e