I am currently porting an existing (and working) training script that I wrote to a multi-GPU machine, and I am encountering the following problem. The code detects all 8 GPUs (I am using torchrun to execute the file) and completes the first epoch as expected. However, after that the code just stops without any error message. The code is pretty lengthy, but here is a snippet of the most important part:
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from ignite.engine import Engine, Events
# WildfireMetnet and WildfireDataset are project-specific modules (imports omitted here)

LEARNING_RATE = 10e-5
BATCH_SIZE = 32
BACKEND = "nccl"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# torchrun sets LOCAL_RANK for every spawned process
dist.init_process_group(BACKEND)
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
model = WildfireMetnet(
    forecast_steps=1,
    input_size=64,
    num_input_timesteps=9,
    upsampler_channels=128,
    lstm_channels=32,
    encoder_channels=64,
    center_crop_size=1,
    input_channels=18,
    output_channels=1,
).to(local_rank)
model = nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True)
dataset = WildfireDataset()
train_sampler = DistributedSampler(dataset, shuffle=True)
test_sampler = DistributedSampler(dataset, shuffle=True)
val_sampler = DistributedSampler(dataset, shuffle=True)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
    num_workers=8,
    pin_memory=True,
)
test_loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    sampler=test_sampler,
    num_workers=8,
    pin_memory=True,
)
val_loader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    sampler=val_sampler,
    num_workers=8,
    pin_memory=True,
)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5) # TODO: tune this, why not the default value of 1e-4?
criterion = torch.nn.BCELoss()
min_val = -torch.inf
def train_step(engine: Engine, batch: tuple[torch.Tensor, torch.Tensor]) -> float:
    model.train()
    optimizer.zero_grad()
    features, labels = batch[0].to(local_rank), batch[1].to(local_rank).float()
    features = torch.nan_to_num(features, nan=0.0)
    labels = torch.nan_to_num(labels, nan=0.0)
    out = model(features, 0)
    print(out.mean(), labels.mean())
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    return loss.item()
trainer = Engine(train_step)
def validation_step(engine: Engine, batch: tuple[torch.Tensor, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
    model.eval()
    with torch.no_grad():
        features, labels = batch[0].to(local_rank), batch[1].to(local_rank).float()
        features = torch.nan_to_num(features, nan=0.0)
        labels = torch.nan_to_num(labels, nan=0.0)
        out = model(features, 0)
        rounded = (out >= 0.65).float()
        return rounded, labels
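The evaluator referenced further below is an Ignite Engine built from this validation step with attached metrics. The real metric setup in my script is elided, but it follows roughly this pattern (sketch only, based on the standard Ignite precision/recall/F1 composition):
from ignite.metrics import Loss, Precision, Recall

evaluator = Engine(validation_step)

# binary precision/recall computed from the thresholded outputs returned above
precision = Precision(average=False)
recall = Recall(average=False)
# F1 composed via Ignite's metric arithmetic
f1 = precision * recall * 2 / (precision + recall + 1e-20)

precision.attach(evaluator, "precision")
recall.attach(evaluator, "recall")
f1.attach(evaluator, "f1")
Loss(criterion).attach(evaluator, "loss")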
Hereafter there are a few calls to functions that store metrics in several files. One example is the following:
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    evaluator.run(val_loader)
    metrics = evaluator.state.metrics
    filename = 'evaluation/validation_output_2702_multi.txt'
    if not os.path.exists(filename):
        open(filename, 'w').close()
    with open(filename, 'a') as f:
        print("{},{:.2f},{:.2f},{:.2f},{:.2f}".format(trainer.state.epoch, metrics["loss"], metrics["precision"], metrics["recall"], metrics["f1"]), file=f)
At the very end of the script there is:
trainer.run(train_loader, max_epochs=100)
dist.destroy_process_group()
The code is run on a remote Linux machine with the following configuration:
GPUs: 8x NVIDIA A100 80 GB
GPU Memory: 640 GB total
CPU: Dual AMD Rome 7742, 128 cores total, 2.25 GHz (base), 3.4 GHz (max boost)
System Memory: 2 TB
Internal Storage: 30 TB NVMe
Software:
OS version: DGX-OS 5.4.2 (= Ubuntu 20.04 LTS)
Linux kernel: 5.4.0
NVIDIA driver version: 470.161
CUDA version: 11.4
GLIBC version: 2.31
Docker version: 20.10.21
For PyTorch I am using version 1.13.1, and for Ignite 0.4.10.
I already tried several configurations for the local rank, but none of them resolved the issue. I also set trainer.run to just 2 epochs, and the same problem remained.
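If it helps, the next diagnostic I would add is a per-rank print at the end of every epoch, to see whether all eight processes actually finish the first epoch or whether a specific rank stalls. A minimal sketch (the handler name is made up):
@trainer.on(Events.EPOCH_COMPLETED)
def log_rank_progress(engine):
    # flush so the line shows up even if a later collective op hangs
    print(f"rank {dist.get_rank()} finished epoch {engine.state.epoch}", flush=True)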