I wanted to use DistributedDataParallel to implement single-machine, multi-GPU training for my model, but I encountered some problems in the process.
The specific implementation code is:
def _train_one_epoch(self, epoch):
    score_AM = AverageMeter()
    loss_AM = AverageMeter()

    train_num_episode = self.trainer_params['train_episodes']
    episode = 0
    loop_cnt = 0
    while episode < train_num_episode:
        remaining = train_num_episode - episode
        batch_size = min(self.trainer_params['train_batch_size'], remaining)

        # load all data for the DataLoader
        dis_up, dis_down = self.env.load_problems(batch_size)
        # (batch, node, node) -> (batch, node, node, 2)
        batch_data = torch.stack([dis_up, dis_down], dim=-1)
        # per-process batch size (3 GPUs / processes)
        single_batch_size = batch_size // 3

        # create a DataLoader with a DistributedSampler so each process gets its own shard
        sampler = torch.utils.data.DistributedSampler(batch_data)
        batch_dataloader = torch.utils.data.DataLoader(
            batch_data, batch_size=single_batch_size, shuffle=False, sampler=sampler)
        sampler.set_epoch(epoch)

        for batch_idx, batch in enumerate(batch_dataloader):
            batch_up = batch[:, :, :, 0].to(self.device)
            batch_down = batch[:, :, :, 1].to(self.device)
            # avg_score, avg_loss = self._train_one_batch(batch_size)
            current_gpu = torch.cuda.current_device()
            avg_score, avg_loss = self._train_one_batch(batch_up, batch_down, current_gpu)
            score_AM.update(avg_score, batch_size)
            loss_AM.update(avg_loss, batch_size)
            dist.barrier()

        episode += batch_size
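
For reference, this is the DistributedSampler + DataLoader + DDP pattern I am trying to reproduce, as a minimal self-contained sketch. The toy dataset, the Linear model, and the torchrun launch below are placeholders for illustration only, not my actual env/model code:

# Minimal sketch of per-process DDP training (placeholder data and model).
# Example launch: torchrun --nproc_per_node=3 sketch.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


def main():
    # torchrun sets LOCAL_RANK / RANK / WORLD_SIZE for each spawned process
    local_rank = int(os.environ["LOCAL_RANK"])
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)

    # toy data standing in for the (batch, node, node, 2) problem tensors
    data = torch.randn(60, 10, 10, 2)
    dataset = TensorDataset(data)

    model = torch.nn.Linear(10, 10).to(device)
    model = DDP(model, device_ids=[local_rank])
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(2):
        # each rank iterates over a disjoint shard; set_epoch reshuffles per epoch
        sampler = DistributedSampler(dataset, shuffle=True)
        sampler.set_epoch(epoch)
        loader = DataLoader(dataset, batch_size=4, sampler=sampler)

        for (batch,) in loader:
            batch_up = batch[..., 0].to(device)   # (b, node, node)
            loss = model(batch_up).mean()         # placeholder loss
            optimizer.zero_grad()
            loss.backward()                       # DDP all-reduces gradients here
            optimizer.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

The key point of the sketch is that init_process_group, set_device, and the DDP wrap happen once per process before the epoch loop, while the sampler / set_epoch pair is recreated per epoch rather than per mini-batch inside the epoch.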
The error reported is as follows: