CUDA out of memory with a larger dataset but not with a smaller one, despite the same batch size

Question

I'm training a Faster R-CNN model with the detectron2 framework. When I train the model on a dataset of only 15 images, it works on my RTX 3060; however, when I train on a dataset of 3000 images, CUDA runs out of memory, even though I use a batch size of 1 for both datasets. I do not understand why this is happening, as the model should only be processing one image at a time regardless of the dataset size. It seems as if the model is trying to load all 3000 images at once, which I would imagine is unnecessary when the batch size is set to 1.

I thought it might be because some image in the larger dataset has far more bounding boxes, or something similar. However, it runs out of memory immediately, which makes me think that is not the reason.

The code I'm using is simply from the detectron2 documentation:

import torch

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import numpy as np
import glob
import os, json, cv2, random

from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer

def train():
    """Fine-tune the model-zoo Faster R-CNN R50-FPN (1x) on ``visdrone-test``.

    Builds a detectron2 config from the model-zoo YAML, applies the dataset,
    solver and ROI-head overrides below, creates ``cfg.OUTPUT_DIR`` if needed
    and runs ``DefaultTrainer`` for ``cfg.SOLVER.MAX_ITER`` iterations.
    ``resume=False`` is passed, so no previous run is resumed.

    The dataset name must already be registered before this is called.
    Returns nothing.
    """
    zoo_entry = "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml"

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(zoo_entry))

    # --- model -----------------------------------------------------------
    # Let training initialize from model zoo
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 256
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 10

    # --- data ------------------------------------------------------------
    cfg.DATASETS.TRAIN = ("visdrone-test",)
    cfg.DATASETS.TEST = ()  # no evaluation dataset configured
    cfg.DATALOADER.NUM_WORKERS = 1

    # --- solver ----------------------------------------------------------
    cfg.SOLVER.IMS_PER_BATCH = 1
    cfg.SOLVER.BASE_LR = 0.00025
    cfg.SOLVER.MAX_ITER = 300
    cfg.SOLVER.STEPS = []  # empty schedule: presumably no LR decay steps

    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)

    runner = DefaultTrainer(cfg)
    runner.resume_or_load(resume=False)
    runner.train()

if __name__ == "__main__":
    torch.cuda.empty_cache()

    # Register the train/val/test splits of VisDrone2019-DET; each split has
    # a COCO-style labels.json next to its images/ directory.
    for split in ("train", "val", "test"):
        split_root = f"D:/data/VisDrone2019-DET-{split}"
        register_coco_instances(
            f"visdrone-{split}",
            {},
            f"{split_root}/labels.json",
            f"{split_root}/images/",
        )

    # Look up the validation split's metadata and dataset entries; neither is
    # used by train() below.
    sample_metadata = MetadataCatalog.get("visdrone-val")
    dataset_dicts = DatasetCatalog.get("visdrone-val")

    train()

CUDA out of memory with a larger dataset but not with a smaller one, despite the same batch size

0 Answers