docker_images/doctr/app/pipelines/object_detection.py

from typing import Any, Dict, List

import torch
from app.pipelines import Pipeline
from doctr.models.obj_detection.factory import from_hub
from PIL import Image
from torchvision.transforms import Compose, ConvertImageDtype, PILToTensor


class ObjectDetectionPipeline(Pipeline):
    def __init__(self, model_id: str):
        # Load the detection model from the Hub and switch it to eval mode.
        self.model = from_hub(model_id).eval()
        self.transform = Compose(
            [
                PILToTensor(),
                ConvertImageDtype(torch.float32),
            ]
        )
        # Prefer the class names stored in the model config; fall back to
        # generic "LABEL_i" names when none are provided.
        self.labels = self.model.cfg.get("classes")
        if self.labels is None:
            self.labels = [f"LABEL_{i}" for i in range(self.model.num_classes)]

    def __call__(self, inputs: Image.Image) -> List[Dict[str, Any]]:
        """
        Args:
            inputs (:obj:`PIL.Image`):
                The raw image as a PIL object. No transformation is applied
                to the input beforehand; all necessary transformations
                happen here.
        Return:
            A :obj:`list` of dicts with the keys "label", "score" and "box".
        """
        im = inputs.convert("RGB")
        # Convert to a float tensor and add a batch dimension.
        inputs = self.transform(im).unsqueeze(0)

        with torch.inference_mode():
            out = self.model(inputs)[0]

        # One entry per detection: label name, confidence score and pixel box.
        return [
            {
                "label": self.labels[idx],
                "score": score.item(),
                "box": {
                    "xmin": int(round(box[0].item())),
                    "ymin": int(round(box[1].item())),
                    "xmax": int(round(box[2].item())),
                    "ymax": int(round(box[3].item())),
                },
            }
            for idx, score, box in zip(out["labels"], out["scores"], out["boxes"])
        ]
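
Usage sketch: the snippet below shows how the pipeline above might be exercised. The model id and image path are illustrative assumptions, not values prescribed by this file; substitute a real docTR object-detection checkpoint from the Hub and a real image.

# Minimal usage sketch for ObjectDetectionPipeline (hypothetical values).
# "mindee/fasterrcnn_mobilenet_v3_large_fpn" and "sample.jpg" are placeholders.
from PIL import Image

from app.pipelines.object_detection import ObjectDetectionPipeline

pipeline = ObjectDetectionPipeline(model_id="mindee/fasterrcnn_mobilenet_v3_large_fpn")
detections = pipeline(Image.open("sample.jpg"))
for det in detections:
    print(det["label"], round(det["score"], 3), det["box"])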