
My training script for the model:

import os
import random

import numpy as np
import torch

seed = 42

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed)


from detectron2 import model_zoo
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
from detectron2.data.catalog import Metadata

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("experiment",)
cfg.DATASETS.TEST = ("test",)
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
cfg.MODEL.DEVICE = "cuda"
cfg.SOLVER.IMS_PER_BATCH = 2
num_gpu = 1
bs = (num_gpu * 2)
cfg.SOLVER.BASE_LR = 0.02 * bs / 16
cfg.SOLVER.MAX_ITER = 7500   
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
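
For context, the "experiment" and "test" datasets referenced above are registered before training. A minimal sketch of how that registration could look, assuming COCO-format annotations and placeholder paths (not my exact files):

from detectron2.data.datasets import register_coco_instances

# Placeholder annotation/image paths for illustration only.
register_coco_instances("experiment", {}, "annotations/train.json", "images/train")
register_coco_instances("test", {}, "annotations/test.json", "images/test")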

My inference script on server-1 is:

import cv2
import os

from detectron2.engine import DefaultPredictor

# Reuses the cfg object built in the training script above.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7

cfg.SEED = 42
predictor = DefaultPredictor(cfg)

img = cv2.imread('filename.jpg')
outputs = predictor(img)
print(outputs["instances"])

pred_classes = outputs['instances'].pred_classes.tolist()
classes = ["Handwritten", "Logo", "Markings", "Signature"]

for pred_class in pred_classes:
    print('*'*10)
    print(classes[pred_class])
    print('*'*10)

# True if any detected class is "Handwritten"
print(any(classes[pred_class] == "Handwritten" for pred_class in pred_classes))
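
For reference, the predictions on server-1 can also be spot-checked visually with detectron2's Visualizer. A minimal sketch, assuming the "experiment" dataset metadata registered for training:

from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

# Draw the predicted boxes and classes on the image and save it for inspection.
v = Visualizer(img[:, :, ::-1], metadata=MetadataCatalog.get("experiment"), scale=1.0)
vis = v.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2.imwrite("prediction.jpg", vis.get_image()[:, :, ::-1])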

My inference script on server-2 is:

import time
from typing import Any, Dict

import cv2
import numpy as np
from fastapi import FastAPI, File, Form, UploadFile
from requests.exceptions import HTTPError

from detectron2.config import get_cfg
from detectron2.data.catalog import Metadata
from detectron2.engine import DefaultPredictor


class Handwritten:
    """
    Detects handwritten pages in a PDF chart.

    Attributes
    ----------
    path_of_weights : str
        Path to the trained weights file.
    """

    def __init__(self, path_of_weights: str) -> None:
        """Initialize the Handwritten class.

        Parameters
        ----------
        path_of_weights : str
            Path to the trained weights file.
        """
        self.cfg = get_cfg()
        self.cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7
        self.cfg.MODEL.ROI_HEADS.NUM_CLASSES = 4
        self.cfg.MODEL.WEIGHTS = path_of_weights
        self.cfg.MODEL.DEVICE = "cpu"
        self.cfg.SEED = 42
        self.predictor = DefaultPredictor(self.cfg)
        self.metadata = Metadata()
        self.metadata.set(
            thing_classes=["Handwritten", "Logo", "Markings", "Signature"],
            thing_dataset_id_to_contiguous_id={0: 0, 1: 1, 2: 2, 3: 3},
        )

    def __call__(self, img: Any) -> Any:
        """Return the predicted output classes for the image."""
        self.outputs = self.predictor(img)
        return self.outputs["instances"]

    def detect_hw(self, image: Any) -> bool:
        """Detect handwritten entities in the image and classify the page accordingly.

        Parameters
        ----------
        image : Any
            Image matrix of a page.

        Returns
        -------
        bool
            True if the page is handwritten, False otherwise.
        """
        outputs = self.__call__(image)
        pred_classes = outputs.pred_classes.tolist()
        classes = ["Handwritten", "Logo", "Markings", "Signature"]

        # The page counts as handwritten if any detection is of class "Handwritten".
        return any(classes[pred_class] == "Handwritten" for pred_class in pred_classes)


app = FastAPI()
path_of_weights = "model/model_final.pth"
model = Handwritten(path_of_weights)

@app.post("/cv/predict", status_code=200)
def predict(
    page_no: int = Form(...), dimensions: list = Form(...), image: UploadFile = File(...)
) -> Dict[str, int]:
    """Predict whether the image is a handwritten page.

    Parameters
    ----------
    page_no : int
        Page number of the given input page.
    dimensions : list
        Height and width of the page.
    image : UploadFile
        Image of the page as a byte stream.
    """
    image_bytes = image.file.read()
    decoded_image = cv2.imdecode(np.frombuffer(image_bytes, np.uint8), -1)
    height, width = int(dimensions[0]), int(dimensions[1])
    prediction_time = time.time()
    # Note: cv2.resize expects dsize as (width, height).
    pg_image = cv2.resize(decoded_image, (height, width))
    try:
        # Check if page is handwritten
        hw_result = model.detect_hw(pg_image)

        # If handwritten, consider for output
        if hw_result:
            hw_pages = page_no

        else:
            hw_pages = -99

        prediction_info = {
            "hw_pages": hw_pages,
            "prediction_time": prediction_time,
        }
        #_logger.info(f"prediction info: {prediction_info}")
    except HTTPError as e:
        # do something
        raise
    
    return {"hw_pages": hw_pages}

While the model keeps giving good results on server-1, it is somehow very erratic on server-2. The weights and the seed are the same, so I am unable to understand this difference in behavior between the two setups.

The model is trained on server-1.

Server-1 is a g4dn.2xlarge; server-2 is a g4dn.xlarge.
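
To rule out environment drift between the two instances, the library versions can be compared with a quick check like this (a sketch; run on both servers):

import cv2
import detectron2
import numpy as np
import torch
import torchvision

print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)
print("detectron2:", detectron2.__version__)
print("opencv:", cv2.__version__)
print("numpy:", np.__version__)
print("cuda available:", torch.cuda.is_available())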

Is there something wrong with what I am doing?
