I tried to use detectron2 to do object detection in videos, but the inference speed was slow, about 6.96 it/s, and GPU usage was low. I think there must be a way to improve the speed, and I need some help. Here is the nvidia-smi output while inference was running:
Wed Aug 2 15:00:27 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04 Driver Version: 515.43.04 CUDA Version: 11.7 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A10 Off | 00000000:00:06.0 Off | 0 |
| 0% 67C P0 98W / 150W | 1155MiB / 23028MiB | 16% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 8742 C ...nvs/detectron2/bin/python 1153MiB |
+-----------------------------------------------------------------------------+
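GPU utilization sits around 16%, so I suspect the GPU is mostly waiting on my code. To check whether the model itself or the surrounding pipeline (video decoding, visualization, the ffmpeg pipe) is the bottleneck, would timing the predictor alone on one repeated frame be a fair test? Something like this sketch (the file name and iteration count are placeholders):

import time
import cv2
import torch
from detectron2.engine import DefaultPredictor

predictor = DefaultPredictor(cfg)  # cfg loaded the same way as in my inference script
frame = cv2.imread("sample_frame.jpg")  # placeholder: any single frame from the video

predictor(frame)  # warm-up so CUDA initialization isn't counted
torch.cuda.synchronize()

n = 50
start = time.perf_counter()
for _ in range(n):
    predictor(frame)
torch.cuda.synchronize()  # make sure all GPU work is finished before stopping the clock
elapsed = time.perf_counter() - start
print(f"model-only speed: {n / elapsed:.2f} it/s")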
I'm new to Python and detectron2. I will describe what I did below.
I used Labelme to label the objects. Then I used labelme2coco.py to generate a COCO-format dataset:
python labelme2coco.py --output ./train-dataset-real.json ./real_train_source
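For reference, the generated JSON can be sanity-checked with a few lines (the file name matches the command above):

import json

with open("train-dataset-real.json") as f:
    coco = json.load(f)

print("images:", len(coco["images"]))
print("annotations:", len(coco["annotations"]))
print("categories:", [c["name"] for c in coco["categories"]])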
- Here is the code I used to train. I ran it on my personal PC. Should I do the training on the server instead?
#!/usr/bin/env python3
import os
import pickle

from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.utils.logger import setup_logger

from utils import get_train_cfg  # defined in another file, shown below

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
setup_logger()

# https://github.com/facebookresearch/detectron2/blob/main/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
# COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
configuration_file_path = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
checkpoint_url = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
output_dir = "train/output/object_detection_garbage_real"
num_classes = 2
device = "cuda"
train_dataset_name = "garbage can"
train_images_path = "train/real_train_source"
train_json_path = "train/train-dataset-real.json"
test_dataset_name = "garbage can"
cfg_save_path = "OD_cfg_real.pickle"

register_coco_instances(name=train_dataset_name, metadata={},
                        json_file=train_json_path, image_root=train_images_path)

def main():
    cfg = get_train_cfg(configuration_file_path, checkpoint_url, train_dataset_name,
                        test_dataset_name, num_classes, device, output_dir)
    # Save the config so the inference script can load the exact same settings.
    with open(cfg_save_path, 'wb') as f:
        pickle.dump(cfg, f, protocol=pickle.HIGHEST_PROTOCOL)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()
    # trainer.test(COCOEvaluator)

if __name__ == '__main__':
    main()
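A side note on the pickle: detectron2 configs are yacs CfgNode objects, so I believe the config could also be stored as plain YAML instead of a pickle (a sketch, assuming CfgNode.dump() behaves as in yacs):

# Save: write the config out as YAML instead of pickling it.
with open("OD_cfg_real.yaml", "w") as f:
    f.write(cfg.dump())

# Load: start from a default config and merge the saved values back in.
from detectron2.config import get_cfg
cfg = get_cfg()
cfg.merge_from_file("OD_cfg_real.yaml")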
# in another Python file (utils.py)
from detectron2 import model_zoo
from detectron2.config import get_cfg

def get_train_cfg(config_file_path, checkpoint_url, train_dataset_name,
                  test_dataset_name, num_classes, device, output_url):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(config_file_path))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(checkpoint_url)
    cfg.DATASETS.TRAIN = (train_dataset_name,)
    cfg.DATASETS.TEST = (test_dataset_name,)
    cfg.DATALOADER.NUM_WORKERS = 2
    # This is the real "batch size" commonly known to deep learning people.
    cfg.SOLVER.IMS_PER_BATCH = 2
    cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
    # The tutorial used 300 iterations for a toy dataset; I raised it to 2700 for mine.
    cfg.SOLVER.MAX_ITER = 2700
    cfg.SOLVER.STEPS = []  # do not decay learning rate
    # The "RoIHead batch size"; I kept the default of 512 (128 would train faster).
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
    # Number of classes (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets).
    # NOTE: this is the number of classes; some popular unofficial tutorials incorrectly use num_classes + 1 here.
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = num_classes
    cfg.MODEL.DEVICE = device
    cfg.OUTPUT_DIR = output_url
    return cfg
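For context on MAX_ITER: with IMS_PER_BATCH = 2, 2700 iterations means the trainer sees 5400 images in total, so the number of passes over the training set depends on its size. A quick back-of-the-envelope check (the dataset size is a placeholder):

# Rough epoch count for the solver settings above.
ims_per_batch = 2
max_iter = 2700
num_train_images = 500  # placeholder: the size of my real training set
epochs = max_iter * ims_per_batch / num_train_images
print(f"~{epochs:.1f} passes over the training set")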
- I used the output file model_final.pth to do the detection:
with open(cfg_save_path, 'rb') as f:
    cfg = pickle.load(f)

model_path = "/xxx/output/xxx"
cfg.MODEL.WEIGHTS = os.path.join(model_path, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.86
train_dataset_name = "garbage can"
MetadataCatalog.get(train_dataset_name).thing_classes = ["garbage can", "wheel"]
# ... omit some code
video_handler.on_video(cfg, video_input, video_output)
- I studied the detectron2 demo code to write my own:
class VideoHandler:
    def on_video(self, cfg, video_input, output):
        video_handler = CountPredictor(cfg)
        video = cv2.VideoCapture(video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))  # for the tqdm progress bar
        # counter (a LaoZhongCountHelper) is set up in code omitted here
        process = None
        if self.output:
            # Pipe raw BGR frames to an ffmpeg subprocess for x265 encoding.
            process = sp.Popen(shlex.split(
                f'ffmpeg -y -s {width}x{height} -pixel_format bgr24 -f rawvideo -r {fps} -i pipe: -vcodec libx265 -pix_fmt yuv420p -crf 25 {output}'),
                stdin=sp.PIPE)
        try:
            for data_obj in tqdm.tqdm(video_handler.run_on_video(video, counter), total=num_frames):
                # omit my business code
                if self.output:
                    vis_frame = self.handle_debug(counter, black_percent, data_obj)
                    self.write_output_frame(vis_frame, counter, output, process, self.basename, black_percent)
                if self.display:
                    if cv2.waitKey(1) == 27:  # stop on Esc
                        break
            video.release()
            if self.output:
                process.stdin.close()
                process.wait()
            else:
                cv2.destroyAllWindows()
            print("final_bucket_count[" + str(self.final_bucket_count) + "]")
            # draw_picture(counter)
        except KeyboardInterrupt:
            print('Stopped by keyboard interrupt')
            process.stdin.close()
            process.wait()
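One more thing I suspect: libx265 is a CPU-only encoder, and writing to process.stdin blocks while ffmpeg encodes, so encoding may be throttling the loop. Would switching to the GPU's hardware encoder help? A hypothetical variant of the command (h264_nvenc and its -cq quality option depend on how the local ffmpeg was built):

# Hypothetical: NVENC hardware encoding instead of libx265.
process = sp.Popen(shlex.split(
    f'ffmpeg -y -s {width}x{height} -pixel_format bgr24 -f rawvideo -r {fps} '
    f'-i pipe: -vcodec h264_nvenc -pix_fmt yuv420p -cq 25 {output}'),
    stdin=sp.PIPE)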
- And here is my CountPredictor:
class CountPredictor:
    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def process_predictions(self, video_visualizer, frame, predictions):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictions = predictions["instances"].to(self.cpu_device)
        data_obj = video_visualizer.draw_instance_predictions(frame, predictions)
        return data_obj

    def run_on_video(self, video, counter: LaoZhongCountHelper):
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
        frame_gen = self._frame_from_video(video)
        for frame in frame_gen:
            counter.origin_frame = frame
            data_obj = self.process_predictions(video_visualizer, frame, self.predictor(frame))
            yield data_obj

    def _frame_from_video(self, video):
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break
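From reading the demo code, DefaultPredictor preprocesses and runs exactly one frame per call, which I suspect explains the low GPU utilization. Would feeding several frames to the underlying model in one forward pass help? Here is a sketch of what I mean, based on how DefaultPredictor preprocesses images (batched_inference is my own hypothetical helper, and it assumes cfg.INPUT.FORMAT is "BGR", the default):

import torch

def batched_inference(predictor, frames):
    # Sketch only: reuse DefaultPredictor's resize transform (predictor.aug)
    # and its underlying model to run a whole list of BGR frames at once.
    inputs = []
    for frame in frames:
        height, width = frame.shape[:2]
        image = predictor.aug.get_transform(frame).apply_image(frame)
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs.append({"image": image, "height": height, "width": width})
    with torch.no_grad():
        return predictor.model(inputs)  # one output dict per input frame

The demo's parallel=True path (AsyncPredictor) also looks relevant, since its docstring says visualization can be slow and it moves the model into separate processes, but I have not tried it yet.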
I am hoping to speed up inference. Any pointers would be appreciated.