I wanted to test the free IPU runtime on Paperspace, as such I made a free account and selected the HuggingFace + IPU notebook.
Afterwards I created the following very simple notebook with Pytorch Lightning to perform classification on MNIST (the simplest possible example):
!python3 -m pip install torchvision==0.11.1
!python3 -m pip install pytorch_lightning
import torch
from torch.nn import functional as F
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torchvision
import poptorch
class LitClassifier(pl.LightningModule):
def __init__(self, hidden_dim: int = 128, learning_rate: float = 0.0001):
super().__init__()
self.save_hyperparameters()
self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim)
self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10)
def forward(self, x):
x = x.view(x.size(0), -1)
x = torch.relu(self.l1(x))
x = torch.relu(self.l2(x))
return x
def training_step(self, batch, batch_idx):
x, y = batch
y_hat = self(x)
loss = F.cross_entropy(y_hat, y)
return loss
def validation_step(self, batch, batch_idx):
x, y = batch
probs = self(x)
# we currently return the accuracy as the validation_step/test_step is run on the IPU devices.
# Outputs from the step functions are sent to the host device, where we calculate the metrics in
# validation_epoch_end and test_epoch_end for the test_step.
acc = self.accuracy(probs, y)
return acc
def test_step(self, batch, batch_idx):
x, y = batch
logits = self(x)
acc = self.accuracy(logits, y)
return acc
def accuracy(self, logits, y):
# currently IPU poptorch doesn't implicit convert bools to tensor
# hence we use an explicit calculation for accuracy here. Once fixed in poptorch
# we can use the accuracy metric.
acc = torch.sum(torch.eq(torch.argmax(logits, -1), y).to(torch.float32)) / len(y)
return acc
def validation_epoch_end(self, outputs) -> None:
# since the training step/validation step and test step are run on the IPU device
# we must log the average loss outside the step functions.
self.log("val_acc", torch.stack(outputs).mean(), prog_bar=True)
def test_epoch_end(self, outputs) -> None:
self.log("test_acc", torch.stack(outputs).mean())
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
training_batch_size = 10
dm = DataLoader(
torchvision.datasets.MNIST('mnist_data/',
train=True,
download=True,
transform=torchvision.transforms.Compose([
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
(0.1307, ), (0.3081, ))
])),
batch_size=training_batch_size,
shuffle=True)
model = LitClassifier()
print(model)
trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
trainer.fit(model, datamodule=dm)
The code crashes with what looks like an internal error of the library with:
LitClassifier(
(l1): Linear(in_features=784, out_features=128, bias=True)
(l2): Linear(in_features=128, out_features=10, bias=True)
)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: True, using: 4 IPUs
HPU available: False, using: 0 HPUs
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-24-329fa233a013> in <module>
83 trainer = pl.Trainer(max_epochs=2, accelerator="ipu", devices="auto")
84
---> 85 trainer.fit(model, datamodule=dm)
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
694 """
695 self.strategy.model = model
--> 696 self._call_and_handle_interrupt(
697 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
698 )
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
648 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
649 else:
--> 650 return trainer_fn(*args, **kwargs)
651 # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
652 except KeyboardInterrupt as exception:
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
733 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
734 )
--> 735 results = self._run(model, ckpt_path=self.ckpt_path)
736
737 assert self.state.stopped
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1089 self._callback_connector._attach_model_logging_functions()
1090
-> 1091 verify_loop_configurations(self)
1092
1093 # hook
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in verify_loop_configurations(trainer)
57 _check_on_pretrain_routine(model)
58 # TODO: Delete CheckpointHooks off LightningDataModule in v1.8
---> 59 _check_datamodule_checkpoint_hooks(trainer)
60 _check_setup_method(trainer)
61
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/configuration_validator.py in _check_datamodule_checkpoint_hooks(trainer)
291
292 def _check_datamodule_checkpoint_hooks(trainer: "pl.Trainer") -> None:
--> 293 if is_overridden(method_name="on_save_checkpoint", instance=trainer.datamodule):
294 rank_zero_deprecation(
295 "`LightningDataModule.on_save_checkpoint` was deprecated in"
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/utilities/model_helpers.py in is_overridden(method_name, instance, parent)
32 parent = pl.Callback
33 if parent is None:
---> 34 raise ValueError("Expected a parent")
35
36 instance_attr = getattr(instance, method_name, None)
ValueError: Expected a parent
Is this a problem of incompatibility of the versions of the libraries? I tried searching on Google this error but found only this question: pytorch - Model_heplers.py in is_overridden > raise ValueError(“Expected a parent”) but I do not think that it is my case because I just use built-in Dataloader
and inherited from pl.LightningModule
for my network. The example in paperspace docs: https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/example.html works but it does not use Pytorch Lightning.
Is there a way to make this service behave correctly with Lightning?