I noticed that Fastai appears to be slower than PyTorch Lightning on my machine.
For instance, the code below classifies the CIFAR-10 dataset with a ResNet34 from timm, trained for 20 epochs in both frameworks. The dataloaders were created to be as similar as possible.
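For reference, a minimal way to reproduce the wall-clock comparison is to wrap each framework's fit call in a simple timer (just a sketch; `learn` and `trainer` are defined in the scripts below):

import time
from contextlib import contextmanager

@contextmanager
def wall_clock(label):
    # Simple wall-clock timer to wrap around each framework's fit call.
    start = time.perf_counter()
    yield
    print(f"{label}: {time.perf_counter() - start:.1f} s")

# Usage (sketch):
# with wall_clock("fastai"):
#     learn.fit_one_cycle(20, 0.001)
# with wall_clock("lightning"):
#     trainer.fit(model, train_loader, val_loader)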
Fastai code
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from functools import partial
from fastai.callback.wandb import WandbCallback
from fastai.vision.all import *
import pytorch_lightning as pl
from torch import nn
import timm
import wandb
wandb.init(project="timm", name="fastai")
path = untar_data(URLs.CIFAR)/'train'
test_path = untar_data(URLs.CIFAR)/'test'
device = torch.device("cuda:0")
aug_batch_tfms = [
    Flip(p=0.5),
    RandomResizedCropGPU(
        size=32,
        min_scale=0.8,
        max_scale=1.0,
        ratio=(0.9, 1.1),
        p=0.9
    )
]
db = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=RandomSplitter(valid_pct=0.1, seed=42),
    n_inp=1,
    batch_tfms=aug_batch_tfms + [Normalize.from_stats(*cifar_stats)]
)
dls = db.dataloaders(
    path,
    num_workers=4,
    bs=64
).to(device)
model = timm.create_model(
    model_name='resnet34',
    pretrained=False, in_chans=3, num_classes=10
)
learn = Learner(
    dls=dls, model=model, loss_func=CrossEntropyLossFlat(),
    cbs=[ShowGraphCallback(), WandbCallback(log_preds_every_epoch=True)],
    opt_func=partial(SGD, mom=0.9, wd=1e-4),
    metrics=accuracy
).to_fp16()
# learn.lr_find()
learn.fit_one_cycle(20, 0.001)
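As a quick diagnostic (not part of the run above), the fastai dataloader can be timed on its own to check whether the data pipeline is the bottleneck, e.g.:

import time

# Rough throughput check for the fastai train dataloader alone (diagnostic only).
start = time.perf_counter()
n_imgs = 0
for xb, yb in dls.train:
    n_imgs += xb.shape[0]
print(f"fastai train loader: {n_imgs / (time.perf_counter() - start):.1f} images/s")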
PyTorch Lightning code
I am not familiar with PyTorch Lightning; this training script was written by modifying this tutorial.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.datasets import CIFAR10
from torchvision import transforms
import pytorch_lightning as pl
import timm
import wandb
wandb.init(project="timm", name="lightning")
# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
DATASET_PATH = "../data"
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "../saved_models/tutorial5"
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
train_dataset = CIFAR10(root=DATASET_PATH, train=True, download=True)
DATA_MEANS = (train_dataset.data / 255.0).mean(axis=(0,1,2))
DATA_STD = (train_dataset.data / 255.0).std(axis=(0,1,2))
print("Data mean", DATA_MEANS)
print("Data std", DATA_STD)
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(DATA_MEANS, DATA_STD)
])
# For training, we add some augmentation. Networks are too powerful and would overfit.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
    transforms.ToTensor(),
    transforms.Normalize(DATA_MEANS, DATA_STD)
])
# Loading the training dataset. We need to split it into a training and validation part
# We need to do a little trick because the validation set should not use the augmentation.
train_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=train_transform, download=True)
val_dataset = CIFAR10(root=DATASET_PATH, train=True, transform=test_transform, download=True)
pl.seed_everything(42)
train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
pl.seed_everything(42)
_, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])
# Loading the test set
test_set = CIFAR10(root=DATASET_PATH, train=False, transform=test_transform, download=True)
# We define a set of data loaders that we can use for various purposes later.
train_loader = data.DataLoader(train_set, batch_size=64, shuffle=True, drop_last=True, pin_memory=True, num_workers=4)
val_loader = data.DataLoader(val_set, batch_size=64, shuffle=False, drop_last=False, num_workers=4)
test_loader = data.DataLoader(test_set, batch_size=64, shuffle=False, drop_last=False, num_workers=4)
class CIFARModule(pl.LightningModule):
    def __init__(self, optimizer_name, optimizer_hparams):
        """
        Inputs:
            optimizer_name - Name of the optimizer to use. Currently supported: Adam, SGD
            optimizer_hparams - Hyperparameters for the optimizer, as a dictionary (learning rate, weight decay, etc.)
        """
        super().__init__()
        # Exports the hyperparameters to a YAML file, and create "self.hparams" namespace
        self.save_hyperparameters()
        # Create model
        self.model = timm.create_model(
            model_name='resnet34',
            pretrained=False, in_chans=3, num_classes=10)
        # Create loss module
        self.loss_module = nn.CrossEntropyLoss()
        # Example input for visualizing the graph in Tensorboard
        self.example_input_array = torch.zeros((1, 3, 32, 32), dtype=torch.float32)

    def forward(self, imgs):
        # Forward function that is run when visualizing the graph
        return self.model(imgs)
    def configure_optimizers(self):
        # We will support Adam or SGD as optimizers.
        if self.hparams.optimizer_name == "Adam":
            # AdamW is Adam with a correct implementation of weight decay (see here for details: https://arxiv.org/pdf/1711.05101.pdf)
            optimizer = optim.AdamW(self.parameters(), **self.hparams.optimizer_hparams)
        elif self.hparams.optimizer_name == "SGD":
            optimizer = optim.SGD(self.parameters(), **self.hparams.optimizer_hparams)
        else:
            assert False, f'Unknown optimizer: "{self.hparams.optimizer_name}"'
        # Reduce the learning rate by a factor of 10 after 10 and 15 epochs
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 15], gamma=0.1)
        return [optimizer], [scheduler]
    def training_step(self, batch, batch_idx):
        # "batch" is the output of the training data loader.
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        # Logs the accuracy per epoch (weighted average over batches)
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss)
        return loss  # Return tensor to call ".backward" on

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs)
        loss = self.loss_module(preds, labels)
        acc = (preds.argmax(dim=-1) == labels).float().mean()
        # By default logs it per epoch (weighted average over batches)
        self.log('val_acc', acc)
        self.log('valid_loss', loss)

    def test_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self.model(imgs).argmax(dim=-1)
        acc = (labels == preds).float().mean()
        # By default logs it per epoch (weighted average over batches), and returns it afterwards
        self.log('test_acc', acc)
def train_model(model_name='resnet34', save_name=None, **kwargs):
    """
    Inputs:
        model_name - Name of the model you want to run; here it is only used to name the checkpoint/logging directory
        save_name (optional) - If specified, this name will be used for creating the checkpoint and logging directory
    """
    if save_name is None:
        save_name = model_name
    # Create a PyTorch Lightning trainer
    wandb_logger = WandbLogger(project="CIFAR10", tags=['accuracy', 'loss'])
    trainer = pl.Trainer(
        default_root_dir=os.path.join(CHECKPOINT_PATH, save_name),  # Where to save models
        accelerator="gpu",  # We run on a GPU (if possible)
        devices=1,  # How many GPUs/CPUs we want to use (1 is enough for the notebooks)
        max_epochs=20,  # How many epochs to train for if no patience is set
        callbacks=[
            ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),  # Save the best checkpoint based on the maximum val_acc recorded. Saves only weights and not optimizer
            LearningRateMonitor("epoch"),  # Log learning rate every epoch
        ],
        enable_progress_bar=True,  # Set to False if you do not want a progress bar
        logger=wandb_logger,
    )
    trainer.logger._log_graph = True  # If True, we plot the computation graph in tensorboard
    trainer.logger._default_hp_metric = None  # Optional logging argument that we don't need
    pl.seed_everything(42)  # To be reproducible
    model = CIFARModule(**kwargs)
    trainer.fit(model, train_loader, val_loader)
    # Load best checkpoint after training
    model = CIFARModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    # Test best model on validation and test set
    val_result = trainer.test(model, val_loader, verbose=False)
    test_result = trainer.test(model, test_loader, verbose=False)
    result = {"test": test_result[0]["test_acc"], "val": val_result[0]["test_acc"]}
    return model, result
resnet_model, resnet_results = train_model(
    model_name="ResNet",
    optimizer_name="SGD",
    optimizer_hparams={"lr": 0.1, "momentum": 0.9, "weight_decay": 1e-4},
)
The duration of the training process is shown in the plot below.
Moreover, I noticed that the GPU utilization of the fastai run (GPU 0) is lower than that of the PyTorch Lightning run (GPU 1).
What could be the reason for this difference?
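One asymmetry I am already aware of: the fastai Learner is wrapped with to_fp16() (mixed precision) while the Lightning Trainer above runs in full precision. A sketch of how the Lightning run could be switched to mixed precision for a fairer comparison (only the precision argument changes; the other Trainer arguments from train_model stay the same):

import pytorch_lightning as pl

# Sketch: same Trainer as in train_model above, but with AMP enabled to mirror
# fastai's to_fp16(); callbacks, logger, and default_root_dir omitted here.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=20,
    precision="16-mixed",  # automatic mixed precision, analogous to Learner.to_fp16()
)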
PyTorch Lightning version: 2.0.7
Fastai version: 2.7.12