
I have been training and testing a model using 10-fold cross-validation, which I have implemented myself. The first fold iteration went just fine: I trained on the last nine folds ([1:10]) and tested on the first fold ([0]). After this stage I moved the test set along to the next fold, i.e. trained on every fold bar the second ([0], [2:10]) and tested on the second fold ([1]). Beyond this first fold, the training time approximately doubled.
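
For context, the folds are rotated roughly like this (a simplified sketch rather than my exact code; dataset stands in for my actual dataset object):

from sklearn.model_selection import KFold
from torch.utils.data import Subset

# Each iteration holds out one fold for testing and trains on the other nine;
# the test fold simply moves along by one each time.
kf = KFold(n_splits=10)
itr = [(Subset(dataset, train_idx), Subset(dataset, test_idx))
       for train_idx, test_idx in kf.split(range(len(dataset)))]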

I am not a PyTorch expert, but I have been trying to debug the problem, first with the cProfile library and then with the PyTorch profiler module. I assumed the problem had to be some transfer of data between the CPU and GPU, but when I profile the code I cannot narrow it down to anything in particular; everything just seems to take longer.
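
For what it's worth, the PyTorch profiler run looked roughly like this, wrapped around a single call to train_loop (a sketch of what I tried, not a definitive profiling setup):

from torch.profiler import profile, ProfilerActivity

# Profile one training pass; CUDA activity is included because I suspected
# host-to-device transfers were the culprit.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    train_loop(device, training_dl, model, loss_fn, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))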

Some preliminary info:

MODEL: VGG16

LOSS FUNCTION: CrossEntropyLoss

BATCH SIZE: 16 (memory constraints)

BATCH NORMALIZATION: False

LEARNING RATE: 1e-6
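
For completeness, the setup boils down to something like this (simplified; the SGD optimizer shown here is only illustrative, the real one comes from findoptimizer further down):

import torch
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Plain VGG16 (no batch norm), pretrained weights, moved to the GPU.
model = torchvision.models.vgg16(weights="IMAGENET1K_V1").to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)  # illustrative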

My train_loop function, where files is just a class holding some file objects for data collection:

def train_loop(device, dataloader, model, loss_fn, optimizer, epoch=None, files=None):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    correct, train_loss = 0, 0

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
        # Backpropagation
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), batch * len(X)
        train_loss += loss

        if files is not None:
            files.logfile.write(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\n")

        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

        if epoch is not None and files is not None:
            # Total number of batches
            files.sumwriter.add_scalar('Training Loss/batch', loss, epoch * len(dataloader) + batch)

            files.training_loss.writerow([epoch * len(dataloader) + batch, loss])

    train_loss /= num_batches
    correct /= size

    if epoch is not None and files is not None:
        files.sumwriter.add_scalar("Train Accuracy", correct, epoch)
        files.sumwriter.add_scalar('Train Loss/epoch', train_loss, epoch)
        files.training_acc.writerow([epoch, correct])

The code that initialises training and loops through the folds:

        for training, testing in itr:

            # Create dataloaders
            training_dl = DataLoader(training, batch_size=batch_size, shuffle=True)
            testing_dl = DataLoader(testing, batch_size=batch_size, shuffle=False)  # Don't shuffle test data

            if files:
                files.logfile.write("{:<30}{:>20}\n".format("TAG: ", tag))
                files.logfile.write("{:<30}{:>20}\n".format("TRANSFORM: ", ", ".join(transform).upper()))
                files.logfile.write("{:<30}{:>20}\n".format("LR: ", learning_rate))
                files.logfile.write("{:<30}{:>20}\n".format("EPOCHS: ", epochs))

            train(epochs, device, training_dl, testing_dl, model, loss_fn, optimizer, files, debug=debug)

            if save_model:
                torch.save(model.state_dict(), os.path.join(files.logpath, "model"))

            if kfolds:
                # Reset for next fold
                model = findmodel(model_str, batch_norm, device=device) # Refresh model to base state with pretraining weights and move the model to GPU
                optimizer = findoptimizer(optimizer_str, model, learning_rate) 

                files.refresh() # Save current files and create new ones for the next fold
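
findmodel and findoptimizer are my own helpers; stripped down they do roughly this (simplified, not the actual code):

import torch
import torchvision

def findmodel(model_str, batch_norm, device):
    # Rebuild the network with pretrained weights so each fold starts from
    # the same initial state, then move it to the GPU.
    if model_str == "vgg16":
        arch = torchvision.models.vgg16_bn if batch_norm else torchvision.models.vgg16
        return arch(weights="IMAGENET1K_V1").to(device)
    raise ValueError(f"Unknown model: {model_str}")

def findoptimizer(optimizer_str, model, learning_rate):
    # Fresh optimizer bound to the new model's parameters.
    if optimizer_str == "sgd":
        return torch.optim.SGD(model.parameters(), lr=learning_rate)
    raise ValueError(f"Unknown optimizer: {optimizer_str}")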

and finally the train function that runs the epochs:

def train(epochs, device, training_dl, validation_dl, model, loss_fn, optimizer, files=None, debug=False):
    start = dt.datetime.now()
    start_str = start.strftime("%H:%M:%S")

    if files is not None:
        files.logfile.write("{:<30}{:>20}\n".format("MODEL: ", files.model_name))
        files.logfile.write("{:<30}{:>20}\n".format("LOSS FUNCTION: ", files.loss_fn_name))
        files.logfile.write("{:<30}{:>20}\n".format("OPTIMIZER: ", files.optimizer_name))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH SIZE: ", files.batch_size_str))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH NORMALIZATION: ", files.batchnorm))

        files.logfile.write("{:<30}{:>20}\n".format("START: ", start_str))

    test_loop(device, validation_dl, model, loss_fn, epoch=-1, files=files)
    for ep in range(epochs):
        print(f"Epoch {ep + 1}\n-------------------------------")

        if files is not None:
            files.logfile.write(f"Epoch {ep + 1}\n-------------------------------\n")
        
        train_loop(device, training_dl, model, loss_fn, optimizer, ep, files)
        test_loop(device, validation_dl, model, loss_fn, ep, files)

    if files is not None:
        end = dt.datetime.now()
        elapsed = end - start
        end_str = end.strftime("%H:%M:%S")
        files.logfile.write("{:<30}{:>20}\n".format("END: ", end_str))
        files.logfile.write("{:<30}{!s:>20}\n".format("Total elapsed time: ", elapsed))

    print("Finished !")

I've only provided a snapshot of my current code base, so if there's anything that doesn't make sense please let me know. Any help would be appreciated.

  • Please trim your code to make it easier to find your problem. Follow these guidelines to create a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). – Community Mar 10 '23 at 05:36

0 Answers