I am running a CNN with PyTorch on my new machine, which has 3 NVIDIA GPUs, and I am getting the error below:
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
File "code.py", line 342, in <module>
trainer.fit(model)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 514, in fit
self.dispatch()
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 554, in dispatch
self.accelerator.start_training(self)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 111, in start_training
self._results = trainer.run_train()
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 615, in run_train
self.run_sanity_check(self.lightning_module)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in run_sanity_check
_, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 733, in run_evaluation
output = self.evaluation_loop.evaluation_step(batch, batch_idx, dataloader_idx)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/trainer/evaluation_loop.py", line 164, in evaluation_step
output = self.trainer.accelerator.validation_step(args)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 178, in validation_step
return self.training_type_plugin.validation_step(*args)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 290, in validation_step
return self.model(*args, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 705, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 63, in forward
output = self.module.validation_step(*inputs, **kwargs)
File "code.py", line 314, in validation_step
pred = self.forward(x)
File "code.py", line 259, in forward
x = self.conv0(x) #([12, 600, 600])
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/container.py", line 119, in forward
input = module(input)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 399, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/.local/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 395, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
NVIDIA-SMI:
The code runs without any issue on another machine with driver version 450.51.06 and CUDA version 11. You can see the nvidia-smi output of the new machine above. I checked the answers and comments on other questions similar to this issue, and none of them resolved my problem.