I am using Ray Serve to run two APIs: image classification (using Detectron2) and OCR (using PaddleOCR). I tried to autoscale the APIs, but under bulk testing with locust I get only 1-2 RPS when running image classification plus OCR, and at most 15 RPS with image classification alone. Average response time is 2 seconds.
How can I scale this to improve my RPS?
System config: 32 GB RAM, 16 GB GPU, 12-core CPU
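The bulk test was driven by a locust script along these lines (a minimal sketch; the endpoint path, host, test image path, and base64 payload shape are assumptions about the request format, not fixed by the code below):

locustfile.py (sketch):
import base64
from locust import HttpUser, between, task

class SAMUser(HttpUser):
    wait_time = between(0.1, 0.5)

    def on_start(self):
        # Encode one test image up front and reuse it for every request (path is hypothetical)
        with open("test_image.jpg", "rb") as f:
            self.img = base64.b64encode(f.read()).decode()

    @task
    def classify_and_ocr(self):
        # Each request exercises Detectron2 + PaddleOCR via the /sam route
        self.client.post("/sam", json={"img": self.img})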
main.py:
from fastapi import FastAPI
import os, sys
import ray
import custom_configs as configs
from app.core import route
from ray.serve.drivers import DAGDriver
import datetime
from ray import serve
from ray.serve import Application
app = FastAPI()
app.include_router(route.router)
@app.on_event("startup") # Code to be run when the server starts.
async def startup_event():
    os.environ['RAY_DEDUP_LOGS'] = "0"  # Disable log deduplication (environment variable values must be strings)
# Deploy the application
def app_builder(args) -> Application:
    return DAGDriver.bind({"/sam": route.SAMServices.bind(), "/dam": route.DAMServices.bind()})
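A builder like this is started either with the Serve CLI (serve run main:app_builder) or programmatically; a minimal sketch of the programmatic form, assuming Ray Serve 2.x:

run_app.py (sketch):
from ray import serve
from main import app_builder

# Build the application from the builder function and deploy it on the local Ray cluster
serve.run(app_builder({}))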
route.py:
import pickle
import torch
from app.core.services.sam import SAM
from app.core.services.dam import DAM
from starlette.requests import Request
import logger
from ray import serve
from fastapi_utils.cbv import cbv
from fastapi_utils.inferring_router import InferringRouter
from typing import Dict
from paddleocr import PaddleOCR
from detectron2.engine import DefaultPredictor
router = InferringRouter() # Create a router
@serve.deployment(
    ray_actor_options={"num_cpus": 1, "num_gpus": 0.1},
    autoscaling_config={
        # "min_replicas": 1,
        "initial_replicas": 4,
        "max_replicas": 15,
        # "target_num_ongoing_requests_per_replica": 2,
    },
)
@cbv(router)
class SAMServices:
    def __init__(self):
        self.detectron_model = get_detectron()  # Runs on GPU
        self.paddleocr_model = PaddleOCR(use_angle_cls=True, lang='en', debug=False,
                                         show_log=False, use_gpu=False)  # Runs on CPU; unable to run it on GPU
    async def __call__(self, starlette_request: Request) -> Dict:
        try:
            item = await starlette_request.json()
            # Call the prediction function with the image.
            results = self.detectron_model(item['img'])["instances"]
            # If the results are correct, pass the image to PaddleOCR.
            text_results = self.paddleocr_model.ocr(item['img'])
            torch.cuda.empty_cache()
            return {"text": text_results}
        except Exception as e:
            # To handle CUDA out-of-memory errors - rerun the process on CPU (currently the error is swallowed)
            pass
@serve.deployment
@cbv(router)
class DAMServices:
    def __init__(self):
        self.detectron_model = get_detectron()  # Runs on GPU
        self.paddleocr_model = PaddleOCR(use_angle_cls=True, lang='en', debug=False,
                                         show_log=False, use_gpu=False)  # Runs on CPU; unable to run it on GPU
    async def __call__(self, starlette_request: Request) -> Dict:
        try:
            item = await starlette_request.json()
            output = []
            # Call the prediction function on each image.
            for img in item['imgs']:
                results = self.detectron_model(img)["instances"]
                # If the results are correct, pass the image to PaddleOCR.
                text_results = self.paddleocr_model.ocr(img)
                output.append(text_results)
            torch.cuda.empty_cache()
            return {"vals": output}
        except Exception as e:
            # To handle CUDA out-of-memory errors - rerun the process on CPU (currently the error is swallowed)
            pass
def get_detectron():
    with open('train_cfg.pickle', 'rb') as f:
        cfg = pickle.load(f)
    cfg.MODEL.DEVICE = "cuda"
    cfg.MODEL.WEIGHTS = "model_final.pth"
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
    predictor = DefaultPredictor(cfg)
    predictor.model.share_memory()  # Intended to share the model weights among the Detectron2 replicas
    return predictor
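For reference, the two routes are called roughly like this (a minimal sketch; the host/port, test image path, and base64 encoding are assumptions, since the handlers above just read whatever is in item['img'] / item['imgs']):

client.py (sketch):
import base64
import requests

with open("test_image.jpg", "rb") as f:  # hypothetical test image
    img = base64.b64encode(f.read()).decode()

# Single-image route handled by SAMServices
print(requests.post("http://127.0.0.1:8000/sam", json={"img": img}).json())

# Multi-image route handled by DAMServices
print(requests.post("http://127.0.0.1:8000/dam", json={"imgs": [img, img]}).json())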