I tried to run code for DiffusionInst based on Detectron2 (source code: https://github.com/chenhaoxing/DiffusionInst). During my training, my python process has always been killed (at 10000-20000 iteration epochs, which is insufficient for diffisioninst training).
I only rewrite the code for dataloader, in order to adapt to my own dataset. My new code for dataloader:
class DiffusionInstDatasetMapper:
"""
A callable which takes a dataset dict in Detectron2 Dataset format,
and map it into a format used by DiffusionInst.
The callable currently does the following:
1. Read the image from "file_name"
2. Applies geometric transforms to the image and annotation
3. Find and applies suitable cropping to the image and annotation
4. Prepare image and annotation to Tensors
"""
def __init__(self, cfg, is_train=True):
if cfg.INPUT.CROP.ENABLED and is_train:
self.crop_gen = [
# T.ResizeShortestEdge([400, 500, 600], sample_style="choice"),
T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE),
]
else:
self.crop_gen = None
self.tfm_gens = build_transform_gen(cfg, is_train)
logging.getLogger(__name__).info(
"Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen))
)
self.img_format = cfg.INPUT.FORMAT
self.is_train = is_train
def __call__(self, dataset_dict):
"""
Args:
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
Returns:
dict: a format that builtin models in detectron2 accept
"""
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
# image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
## crop roi
'''lst = dataset_dict['file_name'].split('-')
image = sitk.ReadImage('-'.join(lst[:-2]))
image = sitk.GetArrayFromImage(image)
above, below = int(lst[-2]), int(lst[-1])
image = image[:, above:below, :]'''
## no crop roi
image = sitk.ReadImage(dataset_dict["file_name"],sitk.sitkFloat32)
image = sitk.GetArrayFromImage(image)
# print('**********************',image.shape,'************************')
image = (image - image.min()) / (image.max() - image.min()) * 255
#print(image.dtype)
image = image.transpose(1, 2, 0).astype(np.uint8)
image = np.repeat(image, 3, axis=2)
#print(image.dtype)
utils.check_image_size(dataset_dict, image)
#origshape = image.shape
if self.crop_gen is None:
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
else:
image, transforms = T.apply_transform_gens(
self.tfm_gens + self.crop_gen, image
)
#print('orig', origshape, '\t\tresized', image.shape)
image_shape = image.shape[:2] # h, w
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
# Therefore it's important to use torch.Tensor.
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
del image
gc.collect()
if not self.is_train:
# USER: Modify this if you want to keep them for some reason.
dataset_dict.pop("annotations", None)
return dataset_dict
if "annotations" in dataset_dict:
# USER: Modify this if you want to keep them for some reason.
# import pdb;pdb.set_trace()
for anno in dataset_dict["annotations"]:
# anno.pop("segmentation", None)
anno.pop("keypoints", None)
# USER: Implement additional transformations if you have other types of data
annos = [
utils.transform_instance_annotations(obj, transforms, image_shape)
for obj in dataset_dict.pop("annotations")
if obj.get("iscrowd", 0) == 0
]
instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
dataset_dict["instances"] = utils.filter_empty_instances(instances)
del instances
gc.collect()
return dataset_dict
And the information about the oom-killer:
[2599547.303018] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599547.303084] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599547.303133] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
[2599547.305957] Memory cgroup out of memory: Kill process 1041771 (python) score 1198 or sacrifice child
[2599547.307810] Killed process 1041771 (python) total-vm:36436532kB, anon-rss:10288264kB, file-rss:104888kB
[2599718.702250] python invoked oom-killer: gfp_mask=0x24000c0, order=0, oom_score_adj=995
[2599718.702299] [<ffffffff8119bfae>] oom_kill_process+0x1fe/0x3c0
[2599718.702333] Task in /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e/8b4a8d5c2c1a082f93b1610173beb70bbc19fb1a1c2e28150d2d912ed9b95b10 killed as a result of limit of /kubepods/burstable/podd09a5032-8b07-11ed-bb60-ac1f6b9ec91e
I set IMS_PER_BATCH to 1, and used a dataset which contains only 1 image, but the oom problem still occurred.
I wonder know what should i do to prevent oom problem?