How to filter COCO dataset classes & annotations for custom dataset?

Question

I was able to filter the images using the code below with the COCO API. I ran this code once for each class I needed; this example is for the category person, and I did the same for car, etc.

What I want to do now is filter the annotations of the dataset (instances_train2017.json) and save the result as JSON, in the same format as instances_train2017.json.

# Fetch every category record of the dataset (no id filter) and show the names
cats = coco.loadCats(coco.getCatIds())
nms = [entry['name'] for entry in cats]
print('COCO categories: \n{}\n'.format(str.join(' ', nms)))

# Resolve the wanted category name(s) to ids, then to the images containing them
catIds = coco.getCatIds(catNms=['person'])
imgIds = coco.getImgIds(catIds=catIds)
print("imgIds: ", len(imgIds))
# Full image records (they carry 'coco_url' and 'file_name', used further below)
images = coco.loadImgs(imgIds)

# Download the image file of every selected image record
for im in images:
    print("im: ", im)
    # A timeout keeps a single stalled connection from hanging the whole run
    response = requests.get(im['coco_url'], timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as an image
    response.raise_for_status()
    with open('customCoco/images/train2017/' + im['file_name'], 'wb') as handler:
        handler.write(response.content)

I tried to use the COCO API, but that doesn't give me the COCO format I want (like instances_train2017.json).

# Look up the annotations of the selected categories, one image at a time
for im in images:
    annIds = coco.getAnnIds(
        imgIds=im['id'],
        catIds=catIds,
        iscrowd=None,
    )
    anns = coco.loadAnns(annIds)
    print("anns: ", anns)

I found this post: https://github.com/cocodataset/cocoapi/issues/271, but it saves the result in CSV format, which I don't want; I want the same file, just filtered.

corticalhazard · Answer 1 · 2021-09-24T02:30:49.700

I recommend Jalagarto's coco_utils / COCO API wrapper: https://github.com/Jalagarto/coco_utils, which generates both images and annotations. In the code below, I've extended it to work for multiple classes.

Other resources:

Useful Tutorial/Article

COCO dataset format

Visualization of Image/Annotation

xD Hope this helps!!

"""
1. saves images/annotations from categories
2. creates new json by filtering the main json file

coco_categories = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

Expected Directory:
script.py
COCO[
    annotations
    val2017
    train2017
]
"""

from pycocotools.coco import COCO
import requests
import os
from os.path import join
from tqdm import tqdm
import json


class coco_category_filter:
    """
    Downloads the images of the requested categories & filters the json
    to only keep the annotations of these categories.

    Attributes set by the constructor:
        coco:   the COCO object built from the annotation json
        categ:  the requested category name(s), kept for log messages
        catIds: ids of the requested categories
        images: image records that contain at least one requested category
    """

    def __init__(self, json_path, _categ):
        """
        json_path: path of the COCO annotation json to load
        _categ: category name or list of category names to keep
        """
        self.coco = COCO(json_path)  # instantiate coco class
        self.categ = _categ
        self.images = self.get_imgs_from_json(_categ)

    def get_imgs_from_json(self, _categ):
        """
        Returns the image records that contain any of the desired categories.

        Also stores the matching category ids in ``self.catIds``.
        Raises AssertionError when no category id matches ``_categ``.
        """
        # Get category ids
        self.catIds = self.coco.getCatIds(catNms=_categ)
        assert len(self.catIds) > 0, "[ERROR] cannot find category index for {}".format(_categ)
        print("catIds: ", self.catIds)
        # Query one category at a time and merge the results, so an image is
        # kept when it holds ANY requested category (logical OR); the set
        # drops images that were returned for several categories.
        imgIds = set()
        for c in self.catIds:
            imgIds.update(self.coco.getImgIds(catIds=c))
        # Sorted so the image order is reproducible between runs
        images = self.coco.loadImgs(sorted(imgIds))
        print(f"{len(images)} images of '{_categ}' instances")
        return images

    def save_imgs(self, imgs_dir):
        """
        Downloads every selected image from its 'coco_url' into ``imgs_dir``.

        The directory is created when missing. A failed request raises a
        ``requests`` exception instead of leaving a corrupt file behind.
        """
        print("Saving the images with required categories ...")
        os.makedirs(imgs_dir, exist_ok=True)
        # Save the images into a local folder
        for im in tqdm(self.images):
            # Timeout: one stalled connection must not hang the whole download
            response = requests.get(im['coco_url'], timeout=30)
            # Never write an HTTP error page to disk under an image file name
            response.raise_for_status()
            with open(os.path.join(imgs_dir, im['file_name']), 'wb') as handler:
                handler.write(response.content)

    def filter_json_by_category(self, json_dir):
        """
        Writes ``<json_dir>/coco_annotation.json`` holding only the selected
        images, the annotations of the selected categories and those
        categories themselves. Ids are kept as they are in the source json.

        'info' and 'licenses' are copied over when the source json has them
        and written as empty containers otherwise.
        """
        # {'supercategory': 'person', 'id': 1, 'name': 'person'}
        print("Filtering the annotations ... ")
        # Sets make every membership test below O(1) instead of a list scan
        imgs_ids = {x['id'] for x in self.images}  # ids of the prefiltered images
        catIds = set(self.catIds)
        dataset = self.coco.dataset
        ### Filter images
        new_imgs = [x for x in dataset['images'] if x['id'] in imgs_ids]
        ### Filter annotations
        new_annots = [x for x in dataset['annotations'] if x['category_id'] in catIds]
        ### Reorganize the ids (note for reordering subset 1-N)
        # Optional: self.modify_ids(new_imgs, new_annots) renumbers the ids,
        # but it rewrites the dicts in place, so pass copies when enabling it.
        ### Filter categories
        new_categories = [x for x in dataset['categories'] if x['id'] in catIds]
        print("new_categories: ", new_categories)
        data = {
            "info": dataset.get('info', {}),
            "licenses": dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories,
        }
        print("saving json: ")
        with open(os.path.join(json_dir, "coco_annotation.json"), 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        Renumbers image and annotation ids to 1..N, in list order.

        images: list of image dictionaries
        annotations: list of annotation dictionaries; every 'image_id' must
            be the id of one of ``images`` (KeyError otherwise)

        The dictionaries are modified IN PLACE and the same two lists are
        returned:
            images[n]['id']                 -> n + 1
            annotations[n]['id']            -> n + 1
            annotations[n]['image_id']      -> new id of the image it belongs to
        """
        print("Reinitialicing images and annotation IDs ...")
        ### Images
        map_old_to_new_id = {}  # necessary for the annotations!
        for new_id, im in enumerate(images, start=1):
            map_old_to_new_id[im['id']] = new_id  # remember old id -> new id
            im['id'] = new_id
        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            ann['image_id'] = map_old_to_new_id[ann['image_id']]  # follow the image renumbering
        return images, annotations


def main(subset, year, root_dir, categories, experiment):
    """
    Filters one COCO subset down to ``categories``.

    Reads  <root_dir>/annotations/instances_<subset><year>.json and writes
    the images to <root_dir>/<experiment>/images and the filtered json to
    <root_dir>/<experiment>/annotations.
    """
    # Input annotation file (local path)
    json_name = 'instances_' + subset + year + '.json'
    json_file = join(root_dir, 'annotations/' + json_name)

    # Output folders, created up front
    out_root = join(root_dir, experiment)
    img_dir = join(out_root, 'images')
    json_dir = join(out_root, 'annotations')
    for out_dir in (img_dir, json_dir):
        os.makedirs(out_dir, exist_ok=True)

    # Select the images, download them, then write the filtered json
    category_filter = coco_category_filter(json_file, categories)
    category_filter.save_imgs(img_dir)
    category_filter.filter_json_by_category(json_dir)


if __name__ == '__main__':
    # Script entry point: filter the 2017 validation split
    main(
        'val',  # subset: val - train
        '2017',  # year
        './datasets/COCO',  # root_dir
        ['person', 'bicycle', 'car'],  # categories, can be multiple
        "my_custom_dataset",  # experiment (name of the output folder)
    )

score 1 · Answer 2 · answered Mar 02 '22 at 11:11

@corticalhazard's answer works well, but the code throws an error when requests fails with "Max retries exceeded with url". So I refactored the code based on this awesome solution to solve the issue.

Here is the code:

from pycocotools.coco import COCO
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import os
from os.path import join
from tqdm import tqdm
import json

class coco_category_filter:
    """
    Downloads images of one category & filters jsons
    to only keep annotations of this category
    """

    def __init__(self, json_path, imgs_dir, categ='person'):
        """
        json_path: path of the COCO annotation json to load
        imgs_dir: folder the images are downloaded to
        categ: name of the single category to keep
        """
        self.coco = COCO(json_path)  # instantiate coco class
        self.json_path = json_path
        self.imgs_dir = imgs_dir
        self.categ = categ
        self.images = self.get_imgs_from_json()

    def get_imgs_from_json(self):
        """
        Returns the image records that contain the desired category and
        stores the category ids in ``self.catIds``.

        Raises ValueError when the category name is not in the json.
        """
        catIds = self.coco.getCatIds(catNms=[self.categ])
        print("catIds: ", catIds)
        if not catIds:
            # An empty id list applies no category filter to the image query,
            # so every image of the dataset would be selected.
            raise ValueError(f"category '{self.categ}' not found in '{self.json_path}'")
        # Get the corresponding image ids and images using loadImgs
        imgIds = self.coco.getImgIds(catIds=catIds)
        images = self.coco.loadImgs(imgIds)
        print(f"{len(images)} images in '{self.json_path}' with '{self.categ}' instances")
        self.catIds = catIds  # list
        return images

    def save_imgs(self):
        """
        Downloads every selected image from its 'coco_url' into
        ``self.imgs_dir`` (created when missing).

        Connection errors are retried up to 3 times with a back-off; a request
        that still fails, or an HTTP error status, raises a ``requests``
        exception instead of leaving a corrupt file behind.
        """
        print("Saving the images with required categories ...")
        os.makedirs(self.imgs_dir, exist_ok=True)
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        # The with-block closes the pooled connections when the loop is done
        with requests.Session() as session:
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # Save the images into a local folder
            for im in tqdm(self.images):
                # Timeout: one stalled connection must not hang the download
                response = session.get(im['coco_url'], timeout=30)
                # Never write an HTTP error page under an image file name
                response.raise_for_status()
                with open(os.path.join(self.imgs_dir, im['file_name']), 'wb') as handler:
                    handler.write(response.content)

    def filter_json_by_category(self, new_json_path):
        """
        Writes a new json to ``new_json_path`` with only the selected images,
        the annotations of the desired category and that category, with image
        and annotation ids renumbered from 1.

        The loaded dataset (``self.coco`` / ``self.images``) is left
        untouched, so the method can be called repeatedly.
        """
        # {'supercategory': 'person', 'id': 1, 'name': 'person'}
        print("Filtering the annotations ... ")
        json_parent = os.path.split(new_json_path)[0]
        if json_parent:  # a bare file name has no folder to create
            os.makedirs(json_parent, exist_ok=True)
        # Sets make every membership test below O(1) instead of a list scan
        imgs_ids = {x['id'] for x in self.images}  # ids of imgs with the category
        catIds = set(self.catIds)
        dataset = self.coco.dataset
        # Shallow copies: modify_ids rewrites ids in place, and the original
        # dicts are the very objects held by self.coco and self.images.
        ### Filter images
        new_imgs = [dict(x) for x in dataset['images'] if x['id'] in imgs_ids]
        ### Filter annotations
        new_annots = [dict(x) for x in dataset['annotations'] if x['category_id'] in catIds]
        ### Reorganize the ids
        new_imgs, new_annots = self.modify_ids(new_imgs, new_annots)
        ### Filter categories
        new_categories = [x for x in dataset['categories'] if x['id'] in catIds]
        print("new_categories: ", new_categories)
        data = {
            "info": dataset.get('info', {}),
            "licenses": dataset.get('licenses', []),
            "images": new_imgs,
            "annotations": new_annots,
            "categories": new_categories,
        }
        print("saving json: ")
        with open(new_json_path, 'w') as f:
            json.dump(data, f)

    def modify_ids(self, images, annotations):
        """
        Renumbers image and annotation ids to 1..N, in list order.

        images: list of image dictionaries
        annotations: list of annotation dictionaries; every 'image_id' must
            be the id of one of ``images`` (KeyError otherwise)

        The dictionaries are modified IN PLACE and the same two lists are
        returned, with 'image_id' of each annotation following the new id of
        its image.
        """
        print("Reinitialicing images and annotation IDs ...")
        ### Images
        old_new_imgs_ids = {}  # necessary for the annotations!
        for new_id, im in enumerate(images, start=1):
            old_new_imgs_ids[im['id']] = new_id  # remember old id -> new id
            im['id'] = new_id
        ### Annotations
        for new_id, ann in enumerate(annotations, start=1):
            ann['id'] = new_id
            ann['image_id'] = old_new_imgs_ids[ann['image_id']]  # follow the image renumbering
        return images, annotations


def main(subset, year, root_dir, category='person'):
    """
    Downloads the images of ``category`` and writes the filtered json.

    Reads  instances_<subset><year>.json from the PARENT folder of root_dir,
    saves the images to <root_dir>/<category>_<subset> and the new json to
    <root_dir>/annotations/<subset>.json.
    """
    parent_dir = os.path.split(root_dir)[0]
    json_file = join(parent_dir, 'instances_' + subset + year + '.json')  # local path
    new_json_file = join(root_dir, 'annotations', subset + ".json")
    imgs_dir = join(root_dir, category + '_' + subset)
    category_filter = coco_category_filter(json_file, imgs_dir, categ=category)
    category_filter.save_imgs()
    category_filter.filter_json_by_category(new_json_file)


if __name__ == '__main__':
    # Script entry point: keep only 'person' from the 2017 training split
    main('train', '2017', './datasets/COCO/annotations', category='person')

score 0 · Answer 3 · answered Jun 17 '22 at 08:32

The solution is correct, but there is a logic error in the modify_ids function: it changes the image_id in the annotation list based on the list index, which is not correct and returns corrupted annotations, since that image id does not belong to the annotation. I made some modifications.

 def modify_ids(self, images, annotations):
     """
     Renumbers the image ids to 1..N (list order) and points every
     annotation at the new id of its image.

     images: list of image dictionaries
     annotations: list of annotation dictionaries

     The dictionaries are modified IN PLACE. Returns ``images`` and a new
     list with the annotations whose image is in ``images``, in their
     original order; annotations of other images are left out. Annotation
     'id' values are not changed.
     """
     print("Reinitialicing images and annotation IDs ...")
     ### Images
     old_new_imgs_ids = {}  # old image id -> new image id
     for new_id, im in enumerate(images, start=1):
         old_new_imgs_ids[im['id']] = new_id
         im['id'] = new_id
     ### Annotations
     # One dict lookup per annotation: keeps ALL annotations of an image
     # (not only the first one) without rescanning the list for every image.
     new_annotations = []
     for ann in annotations:
         new_id = old_new_imgs_ids.get(ann['image_id'])
         if new_id is None:
             continue  # belongs to an image outside this subset
         ann['image_id'] = new_id
         new_annotations.append(ann)
     return images, new_annotations

How to filter COCO dataset classes & annotations for custom dataset?

3 Answers3