
I have been training and testing a model using 10-fold cross-validation, which I have implemented myself. The first fold iteration went just fine: I trained on the last nine folds ([1:10]) and tested on the first fold ([0]). After this stage I moved the test set along to the next fold, i.e. trained on every fold bar the second ([0], [2:10]) and tested on the second fold ([1]). Beyond this first fold, the training time approximately doubled.
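
For context, the folds are rotated roughly like this (a simplified sketch rather than my exact code; dataset stands in for my actual dataset object):

from sklearn.model_selection import KFold
from torch.utils.data import Subset

# Each iteration holds out one fold for testing and trains on the other nine;
# the test fold simply moves along by one each time.
kf = KFold(n_splits=10)
itr = [(Subset(dataset, train_idx), Subset(dataset, test_idx))
       for train_idx, test_idx in kf.split(range(len(dataset)))]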

I am not a PyTorch expert, but I have been trying to debug the problem, first with the cProfile library and then with the PyTorch profiler module. I assumed the problem had to be some transfer of data between the CPU and GPU, but when I profile the code I cannot narrow it down to anything in particular; everything just seems to take longer.
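
For what it's worth, the PyTorch profiler run looked roughly like this, wrapped around a single call to train_loop (a sketch of what I tried, not a definitive profiling setup):

from torch.profiler import profile, ProfilerActivity

# Profile one training pass; CUDA activity is included because I suspected
# host-to-device transfers were the culprit.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    train_loop(device, training_dl, model, loss_fn, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))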

Some preliminary info:

MODEL: VGG16

LOSS FUNCTION: CrossEntropyLoss

BATCH SIZE: 16 (memory constraints)

BATCH NORMALIZATION: False

LEARNING RATE: 1e-6
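
For completeness, the setup boils down to something like this (simplified; the SGD optimizer shown here is only illustrative, the real one comes from findoptimizer further down):

import torch
import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Plain VGG16 (no batch norm), pretrained weights, moved to the GPU.
model = torchvision.models.vgg16(weights="IMAGENET1K_V1").to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)  # illustrative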

My train_loop function, where files is just a class holding some file objects for data collection:

def train_loop(device, dataloader, model, loss_fn, optimizer, epoch=None, files=None):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    correct, train_loss = 0, 0

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()
        # Backpropagation
        loss.backward()
        optimizer.step()

        loss, current = loss.item(), batch * len(X)
        train_loss += loss

        if files is not None:
            files.logfile.write(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\n")

        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

        if epoch is not None and files is not None:
            # Total number of batches
            files.sumwriter.add_scalar('Training Loss/batch', loss, epoch * len(dataloader) + batch)

            files.training_loss.writerow([epoch * len(dataloader) + batch, loss])

    train_loss /= num_batches
    correct /= size

    if epoch is not None and files is not None:
        files.sumwriter.add_scalar("Train Accuracy", correct, epoch)
        files.sumwriter.add_scalar('Train Loss/epoch', train_loss, epoch)
        files.training_acc.writerow([epoch, correct])

The code that initialises training and loops through the folds:

        for training, testing in itr:

            # Create dataloaders
            training_dl = DataLoader(training, batch_size=batch_size, shuffle=True)
            testing_dl = DataLoader(testing, batch_size=batch_size, shuffle=False)  # Don't shuffle test data

            if files:
                files.logfile.write("{:<30}{:>20}\n".format("TAG: ", tag))
                files.logfile.write("{:<30}{:>20}\n".format("TRANSFORM: ", ", ".join(transform).upper()))
                files.logfile.write("{:<30}{:>20}\n".format("LR: ", learning_rate))
                files.logfile.write("{:<30}{:>20}\n".format("EPOCHS: ", epochs))

            train(epochs, device, training_dl, testing_dl, model, loss_fn, optimizer, files, debug=debug)

            if save_model:
                torch.save(model.state_dict(), os.path.join(files.logpath, "model"))

            if kfolds:
                # Reset for next fold
                model = findmodel(model_str, batch_norm, device=device) # Refresh model to base state with pretraining weights and move the model to GPU
                optimizer = findoptimizer(optimizer_str, model, learning_rate) 

                files.refresh() # Save current files and create new ones for the next fold
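
findmodel and findoptimizer are my own helpers; stripped down they do roughly this (simplified, not the actual code):

import torch
import torchvision

def findmodel(model_str, batch_norm, device):
    # Rebuild the network with pretrained weights so each fold starts from
    # the same initial state, then move it to the GPU.
    if model_str == "vgg16":
        arch = torchvision.models.vgg16_bn if batch_norm else torchvision.models.vgg16
        return arch(weights="IMAGENET1K_V1").to(device)
    raise ValueError(f"Unknown model: {model_str}")

def findoptimizer(optimizer_str, model, learning_rate):
    # Fresh optimizer bound to the new model's parameters.
    if optimizer_str == "sgd":
        return torch.optim.SGD(model.parameters(), lr=learning_rate)
    raise ValueError(f"Unknown optimizer: {optimizer_str}")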

and finally the train function that runs the epochs:

def train(epochs, device, training_dl, validation_dl, model, loss_fn, optimizer, files=None, debug=False):
    start = dt.datetime.now()
    start_str = start.strftime("%H:%M:%S")

    if files is not None:
        files.logfile.write("{:<30}{:>20}\n".format("MODEL: ", files.model_name))
        files.logfile.write("{:<30}{:>20}\n".format("LOSS FUNCTION: ", files.loss_fn_name))
        files.logfile.write("{:<30}{:>20}\n".format("OPTIMIZER: ", files.optimizer_name))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH SIZE: ", files.batch_size_str))
        files.logfile.write("{:<30}{:>20}\n".format("BATCH NORMALIZATION: ", files.batchnorm))

        files.logfile.write("{:<30}{:>20}\n".format("START: ", start_str))

    test_loop(device, validation_dl, model, loss_fn, epoch=-1, files=files)
    for ep in range(epochs):
        print(f"Epoch {ep + 1}\n-------------------------------")

        if files is not None:
            files.logfile.write(f"Epoch {ep + 1}\n-------------------------------\n")
        
        train_loop(device, training_dl, model, loss_fn, optimizer, ep, files)
        test_loop(device, validation_dl, model, loss_fn, ep, files)

    if files is not None:
        end = dt.datetime.now()
        elapsed = end - start
        end_str = end.strftime("%H:%M:%S")
        files.logfile.write("{:<30}{:>20}\n".format("END: ", end_str))
        files.logfile.write("{:<30}{!s:>20}\n".format("Total elapsed time: ", elapsed))

    print("Finished !")

I've only provided a snapshot of my current code base, so if there's anything that doesn't make sense please let me know. Any help would be appreciated.

  • Please trim your code to make it easier to find your problem. Follow these guidelines to create a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). – Community Mar 10 '23 at 05:36

0 Answers