
I wanted to test the free IPU runtime on Paperspace, so I created a free account and selected the HuggingFace + IPU notebook.

I then put together the following very simple notebook that uses PyTorch Lightning to perform classification on MNIST (the simplest possible example):

!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()

        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        probs = self(x)
        # we return the accuracy here because validation_step/test_step run on the IPU devices.
        # Outputs from the step functions are sent to the host device, where we calculate the
        # metrics in validation_epoch_end and test_epoch_end.
        acc = self.accuracy(probs, y)
        return acc

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = self.accuracy(logits, y)
        return acc

    def accuracy(self, logits, y):
        # poptorch currently doesn't implicitly convert bools to tensors,
        # so we calculate the accuracy explicitly here. Once this is fixed in
        # poptorch we can use the Accuracy metric.
        acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
        return acc

    def validation_epoch_end(self, outputs) -> None:
        # since training_step/validation_step/test_step run on the IPU device,
        # we must log the averaged metric outside the step functions.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
    

training_batch_size = 10


dm = DataLoader(
    torchvision.datasets.MNIST('mnist_data/',
                               train=True,
                               download=True,
                               transform=torchvision.transforms.Compose([
                                   torchvision.transforms.ToTensor(),
                                   torchvision.transforms.Normalize(
                                       (0.1307, ), (0.3081, ))
                               ])),
    batch_size=training_batch_size,
    shuffle=True)

model = LitClassifier()

print(model)
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")

trainer.fit(model, datamodule=dm)

The code crashes with what looks like an internal library error:

LitClassifier(
  (l1): Linear(in_features=784, out_features=128, bias=True)
  (l2): Linear(in_features=128, out_features=10, bias=True)
)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: True, using: 4 IPUs
HPU available: False, using: 0 HPUs
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-329fa233a013> in <module>
     83 trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
     84 
---> 85 trainer.fit(model, datamodule=dm)

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    694         """
    695         self.strategy.model = model
--> 696         self._call_and_handle_interrupt(
    697             self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    698         )

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
    648                 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
    649             else:
--> 650                 return trainer_fn(*args, **kwargs)
    651         # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
    652         except KeyboardInterrupt as exception:

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    733             ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
    734         )
--> 735         results = self._run(model, ckpt_path=self.ckpt_path)
    736 
    737         assert self.state.stopped

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
   1089         self._callback_connector._attach_model_logging_functions()
   1090 
-> 1091         verify_loop_configurations(self)
   1092 
   1093         # hook

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in verify_loop_configurations(trainer)
     57     _check_on_pretrain_routine(model)
     58     # TODO: Delete CheckpointHooks off LightningDataModule in v1.8
---> 59     _check_datamodule_checkpoint_hooks(trainer)
     60     _check_setup_method(trainer)
     61 

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in _check_datamodule_checkpoint_hooks(trainer)
    291 
    292 def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None:
--> 293     if is_overridden(method_name="on_save_checkpoint", instance=trainer.datamodule):
    294         rank_zero_deprecation(
    295             "`LightningDataModule.on_save_checkpoint` was deprecated in"

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/model_helpers.py in is_overridden(method_name, instance, parent)
     32             parent = pl.Callback
     33         if parent is None:
---> 34             raise ValueError("Expected a parent")
     35 
     36     instance_attr = getattr(instance, method_name, None)

ValueError: Expected a parent

Is this a version incompatibility between the libraries? I searched Google for this error but found only this question: pytorch - model_helpers.py in is_overridden > raise ValueError("Expected a parent"). I do not think it applies to my case, because I only use the built-in DataLoader and inherit from pl.LightningModule for my network. The example in the Graphcore docs, https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/example.html, works, but it does not use PyTorch Lightning.

Is there a way to make this service behave correctly with Lightning?

Caridorc

1 Answer


To make PyTorch Lightning handle the dataloader correctly in this case, it must be defined inside the LitClassifier class (via the train_dataloader and test_dataloader hooks) rather than passed to trainer.fit as datamodule=. A plain DataLoader is not a LightningDataModule, so Lightning's configuration validator fails on it with "Expected a parent".
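Alternatively, as a minimal sketch of my own (not part of the original answer; the MNISTDataModule name is hypothetical), the loader could be wrapped in a pl.LightningDataModule, which is what the datamodule= argument actually expects:

import pytorch_lightning as pl
import torchvision
from torch.utils.data import DataLoader

class MNISTDataModule(pl.LightningDataModule):
    # hypothetical wrapper: gives the plain DataLoader the LightningDataModule
    # interface that trainer.fit(model, datamodule=...) expects
    def __init__(self, batch_size: int = 100):
        super().__init__()
        self.batch_size = batch_size
        self.transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ])

    def train_dataloader(self):
        return DataLoader(
            torchvision.datasets.MNIST('mnist_data/', train=True, download=True,
                                       transform=self.transform),
            batch_size=self.batch_size, shuffle=True)

# trainer.fit(model, datamodule=MNISTDataModule()) would then be a valid call

The full answer below instead defines the hook methods directly on the LightningModule: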

!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning


import torch
from torch.nn import functional as F

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch

class LitClassifier(pl.LightningModule):
    def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
        super().__init__()
        self.save_hyperparameters()

        self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
        self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        probs = self(x)
        # we return the accuracy here because validation_step/test_step run on the IPU devices.
        # Outputs from the step functions are sent to the host device, where we calculate the
        # metrics in validation_epoch_end and test_epoch_end.
        acc = self.accuracy(probs, y)
        return acc

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        acc = self.accuracy(logits, y)
        return acc

    def accuracy(self, logits, y):
        # poptorch currently doesn't implicitly convert bools to tensors,
        # so we calculate the accuracy explicitly here. Once this is fixed in
        # poptorch we can use the Accuracy metric.
        acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
        return acc

    def validation_epoch_end(self, outputs) -> None:
        # since training_step/validation_step/test_step run on the IPU device,
        # we must log the averaged metric outside the step functions.
        self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)

    def test_epoch_end(self, outputs) -> None:
        self.log("test_acc", torch.stack(outputs).mean())

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        
    def train_dataloader(self):
        training_batch_size = 100
        return DataLoader(
            torchvision.datasets.MNIST('mnist_data/',
                                       train=True,
                                       download=True,
                                       transform=torchvision.transforms.Compose([
                                           torchvision.transforms.ToTensor(),
                                           torchvision.transforms.Normalize(
                                               (0.1307, ), (0.3081, ))
                                       ])),
            batch_size=training_batch_size,
            num_workers=240,
            shuffle=True)

    def test_dataloader(self):
        val_batch_size = 100
        return DataLoader(
            torchvision.datasets.MNIST('mnist_data/',
                                       train=False,
                                       download=True,
                                       transform=torchvision.transforms.Compose([
                                           torchvision.transforms.ToTensor(),
                                           torchvision.transforms.Normalize(
                                               (0.1307, ), (0.3081, ))
                                       ])),
            batch_size=val_batch_size,
            num_workers=240,
            shuffle=False)




model = LitClassifier()

print(model)
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")

trainer.fit(model)
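
As a further sketch of my own (not from the original answer): since the root cause is that datamodule= expects a LightningDataModule, the standalone DataLoader from the question should also work if it is passed through the train_dataloaders parameter visible in the traceback's fit signature:

# dm is the plain DataLoader built at module level in the question
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
trainer.fit(model, train_dataloaders=dm)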
Caridorc