I tried to use detectron2 to do object detection in videos, but the inference speed was slow, about 6.96 it/s, and GPU usage was low. I think there must be a way to improve the speed, and I need some help. Here is the nvidia-smi output while inference was running:
Wed Aug 2 15:00:27 2023
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04 Driver Version: 515.43.04 CUDA Version: 11.7 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA A10 Off | 00000000:00:06.0 Off | 0 |
| 0% 67C P0 98W / 150W | 1155MiB / 23028MiB | 16% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 8742 C ...nvs/detectron2/bin/python 1153MiB |
+-----------------------------------------------------------------------------+
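GPU utilization sits around 16%, so I suspect the GPU is mostly waiting on my code. To check whether the model itself or the surrounding pipeline (video decoding, visualization, the ffmpeg pipe) is the bottleneck, would timing the predictor alone on one repeated frame be a fair test? Something like this sketch (the file name and iteration count are placeholders):

import time
import cv2
import torch
from detectron2.engine import DefaultPredictor

predictor = DefaultPredictor(cfg)  # cfg loaded the same way as in my inference script
frame = cv2.imread("sample_frame.jpg")  # placeholder: any single frame from the video

predictor(frame)  # warm-up so CUDA initialization isn't counted
torch.cuda.synchronize()

n = 50
start = time.perf_counter()
for _ in range(n):
    predictor(frame)
torch.cuda.synchronize()  # make sure all GPU work is finished before stopping the clock
elapsed = time.perf_counter() - start
print(f"model-only speed: {n / elapsed:.2f} it/s")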
I'm new to Python and detectron2. I will describe what I did below.
I used Labelme to label the objects. Then I used labelme2coco.py to generate a COCO-format dataset:
python labelme2coco.py --output ./train-dataset-real.json ./real_train_source
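For reference, the generated JSON can be sanity-checked with a few lines (the file name matches the command above):

import json

with open("train-dataset-real.json") as f:
    coco = json.load(f)

print("images:", len(coco["images"]))
print("annotations:", len(coco["annotations"]))
print("categories:", [c["name"] for c in coco["categories"]])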
- Here is the code I used to train. I ran it on my personal PC. Should I do the training on the server instead?
#!/usr/bin/env python3
import os
import pickle

from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer
from detectron2.utils.logger import setup_logger

from utils import get_train_cfg  # defined in another file, shown below

os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
setup_logger()

# https://github.com/facebookresearch/detectron2/blob/main/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml
# COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml
configuration_file_path = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
checkpoint_url = "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"
output_dir = "train/output/object_detection_garbage_real"
num_classes = 2
device = "cuda"
train_dataset_name = "garbage can"
train_images_path = "train/real_train_source"
train_json_path = "train/train-dataset-real.json"
test_dataset_name = "garbage can"
cfg_save_path = "OD_cfg_real.pickle"

register_coco_instances(name=train_dataset_name, metadata={},
                        json_file=train_json_path, image_root=train_images_path)

def main():
    cfg = get_train_cfg(configuration_file_path, checkpoint_url, train_dataset_name,
                        test_dataset_name, num_classes, device, output_dir)
    # Save the config so the inference script can load the exact same settings.
    with open(cfg_save_path, 'wb') as f:
        pickle.dump(cfg, f, protocol=pickle.HIGHEST_PROTOCOL)
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()
    # trainer.test(COCOEvaluator)

if __name__ == '__main__':
    main()
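A side note on the pickle: detectron2 configs are yacs CfgNode objects, so I believe the config could also be stored as plain YAML instead of a pickle (a sketch, assuming CfgNode.dump() behaves as in yacs):

# Save: write the config out as YAML instead of pickling it.
with open("OD_cfg_real.yaml", "w") as f:
    f.write(cfg.dump())

# Load: start from a default config and merge the saved values back in.
from detectron2.config import get_cfg
cfg = get_cfg()
cfg.merge_from_file("OD_cfg_real.yaml")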
# in another Python file (utils.py)
from detectron2 import model_zoo
from detectron2.config import get_cfg

def get_train_cfg(config_file_path, checkpoint_url, train_dataset_name,
                  test_dataset_name, num_classes, device, output_url):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(config_file_path))
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(checkpoint_url)
    cfg.DATASETS.TRAIN = (train_dataset_name,)
    cfg.DATASETS.TEST = (test_dataset_name,)
    cfg.DATALOADER.NUM_WORKERS = 2
    # This is the real "batch size" commonly known to deep learning people.
    cfg.SOLVER.IMS_PER_BATCH = 2
    cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
    # The tutorial used 300 iterations for a toy dataset; I raised it to 2700 for mine.
    cfg.SOLVER.MAX_ITER = 2700
    cfg.SOLVER.STEPS = []  # do not decay learning rate
    # The "RoIHead batch size"; I kept the default of 512 (128 would train faster).
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
    # Number of classes (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets).
    # NOTE: this is the number of classes; some popular unofficial tutorials incorrectly use num_classes + 1 here.
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = num_classes
    cfg.MODEL.DEVICE = device
    cfg.OUTPUT_DIR = output_url
    return cfg
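For context on MAX_ITER: with IMS_PER_BATCH = 2, 2700 iterations means the trainer sees 5400 images in total, so the number of passes over the training set depends on its size. A quick back-of-the-envelope check (the dataset size is a placeholder):

# Rough epoch count for the solver settings above.
ims_per_batch = 2
max_iter = 2700
num_train_images = 500  # placeholder: the size of my real training set
epochs = max_iter * ims_per_batch / num_train_images
print(f"~{epochs:.1f} passes over the training set")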
- I used the output file model_final.pth to do the detection:
with open(cfg_save_path, 'rb') as f:
    cfg = pickle.load(f)

model_path = "/xxx/output/xxx"
cfg.MODEL.WEIGHTS = os.path.join(model_path, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.86
train_dataset_name = "garbage can"
MetadataCatalog.get(train_dataset_name).thing_classes = ["garbage can", "wheel"]
# ... omit some code
video_handler.on_video(cfg, video_input, video_output)
- I studied the detectron2 demo code to write my own:
class VideoHandler:
    def on_video(self, cfg, video_input, output):
        video_handler = CountPredictor(cfg)
        video = cv2.VideoCapture(video_input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video.get(cv2.CAP_PROP_FPS)
        num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))  # for the tqdm progress bar
        # counter (a LaoZhongCountHelper) is set up in code omitted here
        process = None
        if self.output:
            # Pipe raw BGR frames to an ffmpeg subprocess for x265 encoding.
            process = sp.Popen(shlex.split(
                f'ffmpeg -y -s {width}x{height} -pixel_format bgr24 -f rawvideo -r {fps} -i pipe: -vcodec libx265 -pix_fmt yuv420p -crf 25 {output}'),
                stdin=sp.PIPE)
        try:
            for data_obj in tqdm.tqdm(video_handler.run_on_video(video, counter), total=num_frames):
                # omit my business code
                if self.output:
                    vis_frame = self.handle_debug(counter, black_percent, data_obj)
                    self.write_output_frame(vis_frame, counter, output, process, self.basename, black_percent)
                if self.display:
                    if cv2.waitKey(1) == 27:  # stop on Esc
                        break
            video.release()
            if self.output:
                process.stdin.close()
                process.wait()
            else:
                cv2.destroyAllWindows()
            print("final_bucket_count[" + str(self.final_bucket_count) + "]")
            # draw_picture(counter)
        except KeyboardInterrupt:
            print('Stopped by keyboard interrupt')
            process.stdin.close()
            process.wait()
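One more thing I suspect: libx265 is a CPU-only encoder, and writing to process.stdin blocks while ffmpeg encodes, so encoding may be throttling the loop. Would switching to the GPU's hardware encoder help? A hypothetical variant of the command (h264_nvenc and its -cq quality option depend on how the local ffmpeg was built):

# Hypothetical: NVENC hardware encoding instead of libx265.
process = sp.Popen(shlex.split(
    f'ffmpeg -y -s {width}x{height} -pixel_format bgr24 -f rawvideo -r {fps} '
    f'-i pipe: -vcodec h264_nvenc -pix_fmt yuv420p -cq 25 {output}'),
    stdin=sp.PIPE)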
- And here is my CountPredictor:
class CountPredictor:
    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        """
        Args:
            cfg (CfgNode):
            instance_mode (ColorMode):
            parallel (bool): whether to run the model in different processes from visualization.
                Useful since the visualization logic can be slow.
        """
        self.metadata = MetadataCatalog.get(
            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
        )
        self.cpu_device = torch.device("cpu")
        self.instance_mode = instance_mode
        self.parallel = parallel
        if parallel:
            num_gpu = torch.cuda.device_count()
            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
        else:
            self.predictor = DefaultPredictor(cfg)

    def process_predictions(self, video_visualizer, frame, predictions):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        predictions = predictions["instances"].to(self.cpu_device)
        data_obj = video_visualizer.draw_instance_predictions(frame, predictions)
        return data_obj

    def run_on_video(self, video, counter: LaoZhongCountHelper):
        video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
        frame_gen = self._frame_from_video(video)
        for frame in frame_gen:
            counter.origin_frame = frame
            data_obj = self.process_predictions(video_visualizer, frame, self.predictor(frame))
            yield data_obj

    def _frame_from_video(self, video):
        while video.isOpened():
            success, frame = video.read()
            if success:
                yield frame
            else:
                break
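From reading the demo code, DefaultPredictor preprocesses and runs exactly one frame per call, which I suspect explains the low GPU utilization. Would feeding several frames to the underlying model in one forward pass help? Here is a sketch of what I mean, based on how DefaultPredictor preprocesses images (batched_inference is my own hypothetical helper, and it assumes cfg.INPUT.FORMAT is "BGR", the default):

import torch

def batched_inference(predictor, frames):
    # Sketch only: reuse DefaultPredictor's resize transform (predictor.aug)
    # and its underlying model to run a whole list of BGR frames at once.
    inputs = []
    for frame in frames:
        height, width = frame.shape[:2]
        image = predictor.aug.get_transform(frame).apply_image(frame)
        image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
        inputs.append({"image": image, "height": height, "width": width})
    with torch.no_grad():
        return predictor.model(inputs)  # one output dict per input frame

The demo's parallel=True path (AsyncPredictor) also looks relevant, since its docstring says visualization can be slow and it moves the model into separate processes, but I have not tried it yet.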
I am hoping to speed up inference. Any pointers would be appreciated.