System exit 1 error after obtaining NaN losses from finetuning Mask R-CNN in Pytorch

Question

I am following this tutorial from Pytorch for Finetuning a pre-trained model on my own dataset. I have my annotation in the COCO format in a json file, so, I first implemented the dataloader as follows:

import torch
import json
from torch.utils.data import Dataset
from pycocotools.coco import COCO
from PIL import Image
import os
import numpy as np
from torchvision import transforms
import Config
import transforms as T
from torchvision.transforms import functional as F

class CustomDataset(Dataset):
    def __init__(self, root, json_file, transform=None):
        self.root = root
        with open(json_file) as f:
            self.data = json.load(f)
        self.transform = transform
        self.image_ids = [img["id"] for img in self.data["images"]]
        self.imgs = list(sorted(os.listdir(os.path.join(root, "Images"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "Masks"))))

    def __getitem__(self, idx):
        # Get image ID
        img_id = self.image_ids[idx]
        img = next(image for image in self.data["images"] if image["id"] == img_id)
        
        img_path = os.path.join(self.root, "Images")
        mask_path = os.path.join(self.root, "Masks")
        
        # Load image
        image = Image.open(os.path.join(img_path, img['file_name'])).convert("RGB")
        
        # extract annotations from the json file
        annotations = [ann for ann in self.data["annotations"] if ann["image_id"] == img_id]
        
        # extract labels from annotations
        labels = [ann["label"] for ann in annotations]
        # convert labels to integers
        labels = [label for label in labels]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        
        # extract boxes and convert them to format [x1, y1, x2, y2]
        boxes = [ann["bbox"] for ann in annotations]
        boxes = [[bbox[0], bbox[1], bbox[2], bbox[3]] for bbox in boxes]
        num_objects = len(boxes)
        
        # read the mask and include the number of objects in the first dimension 
        mask = np.array(Image.open(os.path.join(mask_path, img['file_name'])).convert("L"))
        # Check if mask is empty
        if mask.size == 0:
            mask = np.zeros((num_objects, 1, 1), dtype=np.uint8)
        else:
            mask = np.expand_dims(mask, axis=0)
            mask = np.repeat(mask, num_objects, axis=0)
        
        # convert the binary mask array to a torch tensor
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objects,), dtype=torch.int64)
        
        # convert bboxes to tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        
        # calculate the area of the bounding box
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        
        # convert id to tensor
        image_id = torch.tensor([idx])

        # create target dictionary
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = mask
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        
        # apply the transform if any
        if self.transform is not None:
            image, target = self.transform(image, target)
        
        return image, target
    
    def __len__(self):
        return len(self.imgs)

and I am using this code for training:

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from engine import train_one_epoch
import utils
import transforms as T
from dataloader import CustomDataset
import Config
import torch
import utils
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torchvision.transforms import functional as F

def get_instance_segmentation_model(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)

    return model

def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(T.PILToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


json_path = 'annotations.json'

# use our dataset and defined transformations
dataset = CustomDataset(root = Config.Dataset_dir, json_file=json_path, transform = get_transform(train=True))

# for image, target in dataset:
#     print(image.shape)
# split the dataset in train and test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-500])
dataset_test = torch.utils.data.Subset(dataset, indices[-500:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)

device = Config.DEVICE

# # our dataset has two classes only - background and person
num_classes = 2

# get the model using our helper function
model = get_instance_segmentation_model(num_classes)
# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.1,
                            momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)
# let's train it for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
#     evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)

This training code is as stated in the tutorial is using some helper functions which can be accessed from here. I have run the training code and the training is working for the first 10 samples in the data, but then it gives the following error:

Epoch: [0]  [   0/2759]  eta: 13:29:50  lr: 0.000200  loss: -136.8811 (-136.8811)  loss_classifier: 0.9397 (0.9397)  loss_box_reg: 0.0017 (0.0017)  loss_mask: -137.9142 (-137.9142)  loss_objectness: 0.0859 (0.0859)  loss_rpn_box_reg: 0.0057 (0.0057)  time: 17.6117  data: 10.0775
Loss is nan, stopping training
{'loss_classifier': tensor(nan, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(nan, grad_fn=<DivBackward0>), 'loss_mask': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_objectness': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(nan, grad_fn=<DivBackward0>)}
An exception has occurred, use %tb to see the full traceback.

SystemExit: 1

This error is raised from the engine.py train_one_epoch function, especially from this part of the function:

    with torch.cuda.amp.autocast(enabled=scaler is not None):
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

    # reduce losses over all GPUs for logging purposes
    loss_dict_reduced = utils.reduce_dict(loss_dict)
    losses_reduced = sum(loss for loss in loss_dict_reduced.values())

    loss_value = losses_reduced.item()

    if not math.isfinite(loss_value):
        print(f"Loss is {loss_value}, stopping training")
        print(loss_dict_reduced)
        sys.exit(1)

Which indicates that the losses returned after the first loop are NaN ... What could be wrong here please? I am running out of ideas and don't know what's going wrong anymore.

System exit 1 error after obtaining NaN losses from finetuning Mask R-CNN in Pytorch

0 Answers0