I have been training and testing a model using 10-fold cross-validation, which I have implemented myself. The first fold iteration went just fine: I trained on the last nine folds ([1:10]) and tested on the first fold ([0]). I then moved the test set along to the next fold, i.e. training on every fold except the second ([0], [2:10]) and testing on the second fold ([1]). From this second fold onwards, the training time approximately doubled.
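Roughly, the fold rotation looks like this (a simplified sketch of my splitter; the real code works on my own dataset class and yields the (training, testing) pairs that itr iterates over further down):

from torch.utils.data import Subset

def kfold_iter(dataset, k=10):
    # Split the dataset into k contiguous folds and rotate which fold is held out.
    fold_size = len(dataset) // k
    indices = list(range(len(dataset)))
    for i in range(k):
        test_idx = indices[i * fold_size:(i + 1) * fold_size]                    # fold [i] is tested
        train_idx = indices[:i * fold_size] + indices[(i + 1) * fold_size:]      # the remaining folds are trained on
        yield Subset(dataset, train_idx), Subset(dataset, test_idx)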
I am not a PyTorch expert, but I have been trying to debug the problem, first with the cProfile library and then with the PyTorch profiler module. I assumed the problem had to be some transferring of data between the CPU and GPU, but when I profiled the code I could not narrow it down to anything in particular; every operation simply takes longer.
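For reference, this is roughly how I wrapped a few training batches with the PyTorch profiler (a simplified sketch; the exact call sites in my code base differ, and the names match the functions shown below):

import torch
from torch.profiler import profile, schedule, ProfilerActivity

def profile_some_batches(device, dataloader, model, loss_fn, optimizer, num_batches=8):
    # Profile a handful of batches on CPU and GPU and print the hottest ops.
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=schedule(wait=1, warmup=1, active=5),
        record_shapes=True,
    ) as prof:
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = loss_fn(model(X), y)
            loss.backward()
            optimizer.step()
            prof.step()  # advance the profiler schedule
            if batch >= num_batches:
                break
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))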
Some preliminary info:
MODEL: VGG16
LOSS FUNCTION: CrossEntropyLoss
BATCH SIZE: 16 (memory constraints)
BATCH NORMALIZATION: False
LEARNING RATE: 1e-6
My train_loop function, where files is just a class holding some file objects for data collection:
def train_loop(device, dataloader, model, loss_fn, optimizer, epoch=None, files=None):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    correct, train_loss = 0, 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
        # Backpropagation
        loss.backward()
        optimizer.step()
        loss, current = loss.item(), batch * len(X)
        train_loss += loss
        if files is not None:
            files.logfile.write(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]\n")
        print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
        if epoch is not None and files is not None:
            # Log against the global batch index (epoch * batches per epoch + batch)
            files.sumwriter.add_scalar('Training Loss/batch', loss, epoch * len(dataloader) + batch)
            files.training_loss.writerow([epoch * len(dataloader) + batch, loss])
    train_loss /= num_batches
    correct /= size
    if epoch is not None and files is not None:
        files.sumwriter.add_scalar("Train Accuracy", correct, epoch)
        files.sumwriter.add_scalar('Train Loss/epoch', train_loss, epoch)
        files.training_acc.writerow([epoch, correct])
The code that initialises training and loops through the folds:
for training, testing in itr:
    # Create dataloaders
    training_dl = DataLoader(training, batch_size=batch_size, shuffle=True)
    testing_dl = DataLoader(testing, batch_size=batch_size, shuffle=False)  # Don't shuffle test data
    if files:
        files.logfile.write("{:<30}{:>20}\n".format("TAG: ", tag))
        files.logfile.write("{:<30}{:>20}\n".format("TRANSFORM: ", ", ".join(transform).upper()))
        files.logfile.write("{:<30}{:>20}\n".format("LR: ", learning_rate))
        files.logfile.write("{:<30}{:>20}\n".format("EPOCHS: ", epochs))
    train(epochs, device, training_dl, testing_dl, model, loss_fn, optimizer, files, debug=debug)
    if save_model:
        torch.save(model.state_dict(), os.path.join(files.logpath, "model"))
    if kfolds:
        # Reset for next fold
        model = findmodel(model_str, batch_norm, device=device)  # Refresh model to base state with pretrained weights and move it to the GPU
        optimizer = findoptimizer(optimizer_str, model, learning_rate)
        files.refresh()  # Save current files and create new ones for the next fold
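For completeness, findmodel and findoptimizer are thin helpers. Simplified, they do roughly the following (the real versions dispatch on model_str / optimizer_str, and the SGD shown here is only an illustration, not necessarily my actual optimizer):

import torch
import torchvision

def findmodel(model_str, batch_norm, device):
    # For model_str == "vgg16": load ImageNet-pretrained weights, with or without batch norm.
    ctor = torchvision.models.vgg16_bn if batch_norm else torchvision.models.vgg16
    model = ctor(weights="IMAGENET1K_V1")
    return model.to(device)

def findoptimizer(optimizer_str, model, learning_rate):
    # Illustration only; the real helper picks the optimizer named by optimizer_str.
    return torch.optim.SGD(model.parameters(), lr=learning_rate)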
And finally, the train function that drives the epochs:
def train(epochs, device, training_dl, validation_dl, model, loss_fn, optimizer, files=None, debug=False):
    start = dt.datetime.now()
    start_str = start.strftime("%H:%M:%S")
    if files is not None:
        files.logfile.write("{:<30}{:>20}\n".format("MODEL: ", files.model_name))
        files.logfile.write("{:<30}{:>20}\n".format("LOSS FUNCTION: ", files.loss_fn_name))
        files.logfile.write("{:<30}{:>20}\n".format("OPTIMIZER: ", files.optimizer_name))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH SIZE: ", files.batch_size_str))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH NORMALIZATION: ", files.batchnorm))
        files.logfile.write("{:<30}{:>20}\n".format("START: ", start_str))
    test_loop(device, validation_dl, model, loss_fn, epoch=-1, files=files)
    for ep in range(epochs):
        print(f"Epoch {ep + 1}\n-------------------------------")
        if files is not None:
            files.logfile.write(f"Epoch {ep + 1}\n-------------------------------\n")
        train_loop(device, training_dl, model, loss_fn, optimizer, ep, files)
        test_loop(device, validation_dl, model, loss_fn, ep, files)
    if files is not None:
        end = dt.datetime.now()
        elapsed = end - start
        end_str = end.strftime("%H:%M:%S")
        files.logfile.write("{:<30}{:>20}\n".format("END: ", end_str))
        files.logfile.write("{:<30}{!s:>20}\n".format("Total elapsed time: ", elapsed))
    print("Finished !")
I've only provided a snapshot of my current code base, so if there's anything that doesn't make sense, please let me know. Any help would be appreciated.