I am using Ray Serve to run two APIs: image classification (using Detectron2) and OCR (using PaddleOCR). I tried to autoscale the APIs, but under bulk testing with locust I get only 1-2 RPS when running image classification plus OCR, and at most 15 RPS with image classification alone. Average response time is 2 seconds.
How can I scale this to improve my RPS?
System config: 32 GB RAM, 16 GB GPU, 12-core CPU
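The bulk test was driven by a locust script along these lines (a minimal sketch; the endpoint path, host, test image path, and base64 payload shape are assumptions about the request format, not fixed by the code below):

locustfile.py (sketch):
import base64
from locust import HttpUser, between, task

class SAMUser(HttpUser):
    wait_time = between(0.1, 0.5)

    def on_start(self):
        # Encode one test image up front and reuse it for every request (path is hypothetical)
        with open("test_image.jpg", "rb") as f:
            self.img = base64.b64encode(f.read()).decode()

    @task
    def classify_and_ocr(self):
        # Each request exercises Detectron2 + PaddleOCR via the /sam route
        self.client.post("/sam", json={"img": self.img})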
main.py:
from fastapi import FastAPI
import os, sys
import ray
import custom_configs as configs
from app.core import route
from ray.serve.drivers import DAGDriver
import datetime
from ray import serve
from ray.serve import Application
app = FastAPI()
app.include_router(route.router)
@app.on_event("startup") # Code to be run when the server starts.
async def startup_event():
    os.environ['RAY_DEDUP_LOGS'] = "0"  # Disable log deduplication (environment variable values must be strings)
# Deploy the application
def app_builder(args) -> Application:
    return DAGDriver.bind({"/sam": route.SAMServices.bind(), "/dam": route.DAMServices.bind()})
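A builder like this is started either with the Serve CLI (serve run main:app_builder) or programmatically; a minimal sketch of the programmatic form, assuming Ray Serve 2.x:

run_app.py (sketch):
from ray import serve
from main import app_builder

# Build the application from the builder function and deploy it on the local Ray cluster
serve.run(app_builder({}))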
route.py:
import pickle
import torch
from app.core.services.sam import SAM
from app.core.services.dam import DAM
from starlette.requests import Request
import logger
from ray import serve
from fastapi_utils.cbv import cbv
from fastapi_utils.inferring_router import InferringRouter
from typing import Dict
from paddleocr import PaddleOCR
from detectron2.engine import DefaultPredictor
router = InferringRouter() # Create a router
@serve.deployment(
    ray_actor_options={"num_cpus": 1, "num_gpus": 0.1},
    autoscaling_config={
        # "min_replicas": 1,
        "initial_replicas": 4,
        "max_replicas": 15,
        # "target_num_ongoing_requests_per_replica": 2,
    },
)
@cbv(router)
class SAMServices:
    def __init__(self):
        self.detectron_model = get_detectron()  # Runs on GPU
        self.paddleocr_model = PaddleOCR(use_angle_cls=True, lang='en', debug=False,
                                         show_log=False, use_gpu=False)  # Runs on CPU; unable to run it on GPU
    async def __call__(self, starlette_request: Request) -> Dict:
        try:
            item = await starlette_request.json()
            # Call the prediction function with the image.
            results = self.detectron_model(item['img'])["instances"]
            # If the results are correct, pass the image to PaddleOCR.
            text_results = self.paddleocr_model.ocr(item['img'])
            torch.cuda.empty_cache()
            return {"text": text_results}
        except Exception as e:
            # To handle CUDA out-of-memory errors - rerun the process on CPU (currently the error is swallowed)
            pass
@serve.deployment
@cbv(router)
class DAMServices:
    def __init__(self):
        self.detectron_model = get_detectron()  # Runs on GPU
        self.paddleocr_model = PaddleOCR(use_angle_cls=True, lang='en', debug=False,
                                         show_log=False, use_gpu=False)  # Runs on CPU; unable to run it on GPU
    async def __call__(self, starlette_request: Request) -> Dict:
        try:
            item = await starlette_request.json()
            output = []
            # Call the prediction function on each image.
            for img in item['imgs']:
                results = self.detectron_model(img)["instances"]
                # If the results are correct, pass the image to PaddleOCR.
                text_results = self.paddleocr_model.ocr(img)
                output.append(text_results)
            torch.cuda.empty_cache()
            return {"vals": output}
        except Exception as e:
            # To handle CUDA out-of-memory errors - rerun the process on CPU (currently the error is swallowed)
            pass
def get_detectron():
    with open('train_cfg.pickle', 'rb') as f:
        cfg = pickle.load(f)
    cfg.MODEL.DEVICE = "cuda"
    cfg.MODEL.WEIGHTS = "model_final.pth"
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
    predictor = DefaultPredictor(cfg)
    predictor.model.share_memory()  # Intended to share the model weights among the Detectron2 replicas
    return predictor
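For reference, the two routes are called roughly like this (a minimal sketch; the host/port, test image path, and base64 encoding are assumptions, since the handlers above just read whatever is in item['img'] / item['imgs']):

client.py (sketch):
import base64
import requests

with open("test_image.jpg", "rb") as f:  # hypothetical test image
    img = base64.b64encode(f.read()).decode()

# Single-image route handled by SAMServices
print(requests.post("http://127.0.0.1:8000/sam", json={"img": img}).json())

# Multi-image route handled by DAMServices
print(requests.post("http://127.0.0.1:8000/dam", json={"imgs": [img, img]}).json())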