mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/grade.py
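# Grader for the VinBigData Chest X-ray Abnormalities Detection competition:
# COCO-style mean average precision at a single IoU threshold of 0.4,
# following Peter's reference kernel (linked in the class docstring below).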

import numpy as np
import pandas as pd
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

from mlebench.grade_helpers import InvalidSubmissionError


def prepare_for_metric(submission: pd.DataFrame, answers: pd.DataFrame) -> dict:
    """Prepare the submission and answers for the metric."""
    required_answer_columns = ["class_id", "x_min", "y_min", "x_max", "y_max", "image_id"]
    for col in required_answer_columns:
        assert col in answers.columns, f"Answers must have a `{col}` column"

    for col in ["x_min", "y_min", "x_max", "y_max"]:
        assert (
            not answers[col].isnull().values.any()
        ), f"Answers must not contain any NaN values in `{col}` column, but got {answers[col].isnull().values}"

    required_submission_columns = ["PredictionString", "image_id"]
    for col in required_submission_columns:
        if col not in submission.columns:
            raise InvalidSubmissionError(f"Submission must have a `{col}` column")

    if set(submission["image_id"]) != set(answers["image_id"]):
        raise InvalidSubmissionError(
            "The set of image_ids in the submission must match the set of image_ids in the answers"
        )

    return {"true_df": answers, "pred_df": submission}


def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
    inputs = prepare_for_metric(submission, answers)
    vineval = VinBigDataEval(inputs["true_df"])
    cocoEvalResults = vineval.evaluate(inputs["pred_df"])
    # stats[0] is the mean AP at the single IoU threshold (0.4) configured in `evaluate`.
    return cocoEvalResults.stats[0]


class VinBigDataEval:
    """Helper class for calculating the competition metric.

    You should remove the duplicated annotations from the `true_df` dataframe
    before using this script. Otherwise it may give incorrect results.

        >>> vineval = VinBigDataEval(valid_df)
        >>> cocoEvalResults = vineval.evaluate(pred_df)

    Arguments:
        true_df: pd.DataFrame
            Clean (no duplicates) training/validation dataframe.

    Authors:
        Peter (https://kaggle.com/pestipeti)

    See:
        https://www.kaggle.com/pestipeti/competition-metric-map-0-4
    """

    def __init__(self, true_df):
        self.true_df = true_df
        self.image_ids = true_df["image_id"].unique()
        self.annotations = {
            "type": "instances",
            "images": self.__gen_images(self.image_ids),
            "categories": self.__gen_categories(self.true_df),
            "annotations": self.__gen_annotations(self.true_df, self.image_ids),
        }
        self.predictions = {
            "images": self.annotations["images"].copy(),
            "categories": self.annotations["categories"].copy(),
            "annotations": None,
        }

    def __gen_categories(self, df):
        print("Generating category data...")

        if "class_name" not in df.columns:
            df["class_name"] = df["class_id"]

        cats = df[["class_name", "class_id"]]
        cats = cats.drop_duplicates().sort_values(by="class_id").values

        results = []
        for cat in cats:
            results.append(
                {
                    "id": cat[1],
                    "name": cat[0],
                    "supercategory": "none",
                }
            )

        return results

    def __gen_images(self, image_ids):
        print("Generating image data...")

        results = []
        for idx, image_id in enumerate(image_ids):
            # Add image identification; only the integer index is used as the COCO image id.
            results.append(
                {
                    "id": idx,
                }
            )

        return results

    def __gen_annotations(self, df, image_ids):
        print("Generating annotation data...")

        # NOTE: boxes are kept as [x_min, y_min, x_max, y_max] for both the ground
        # truth and the predictions (see `__gen_predictions`), so the two sides use
        # a consistent convention, matching the reference kernel.
        k = 0
        results = []

        for idx, image_id in enumerate(image_ids):
            # Add image annotations
            for i, row in df[df["image_id"] == image_id].iterrows():
                results.append(
                    {
                        "id": k,
                        "image_id": idx,
                        "category_id": row["class_id"],
                        "bbox": np.array([row["x_min"], row["y_min"], row["x_max"], row["y_max"]]),
                        "segmentation": [],
                        "ignore": 0,
                        "area": (row["x_max"] - row["x_min"]) * (row["y_max"] - row["y_min"]),
                        "iscrowd": 0,
                    }
                )
                k += 1

        return results

    def __decode_prediction_string(self, pred_str):
        # Each prediction is six numbers: class_id, confidence, x_min, y_min, x_max, y_max.
        data = list(map(float, pred_str.split(" ")))
        data = np.array(data)

        return data.reshape(-1, 6)

    def __gen_predictions(self, df, image_ids):
        print("Generating prediction data...")

        k = 0
        results = []

        for i, row in df.iterrows():
            image_id = row["image_id"]
            preds = self.__decode_prediction_string(row["PredictionString"])

            for j, pred in enumerate(preds):
                results.append(
                    {
                        "id": k,
                        "image_id": int(np.where(image_ids == image_id)[0]),
                        "category_id": int(pred[0]),
                        "bbox": np.array([pred[2], pred[3], pred[4], pred[5]]),
                        "segmentation": [],
                        "ignore": 0,
                        "area": (pred[4] - pred[2]) * (pred[5] - pred[3]),
                        "iscrowd": 0,
                        "score": pred[1],
                    }
                )
                k += 1

        return results

    def evaluate(self, pred_df, n_imgs=-1):
        """Evaluate the predicted results.

        Arguments:
            pred_df: pd.DataFrame
                Your predicted results in the competition output format.
            n_imgs: int
                Number of images to use for calculating the result.
                All of the images are used if `n_imgs` <= 0.

        Returns:
            COCOeval object
        """
        if pred_df is not None:
            self.predictions["annotations"] = self.__gen_predictions(pred_df, self.image_ids)

        coco_ds = COCO()
        coco_ds.dataset = self.annotations
        coco_ds.createIndex()

        coco_dt = COCO()
        coco_dt.dataset = self.predictions
        coco_dt.createIndex()

        imgIds = sorted(coco_ds.getImgIds())

        if n_imgs > 0:
            imgIds = np.random.choice(imgIds, n_imgs)

        cocoEval = COCOeval(coco_ds, coco_dt, "bbox")
        cocoEval.params.imgIds = imgIds
        cocoEval.params.useCats = True
        cocoEval.params.iouType = "bbox"
        # Single IoU threshold of 0.4, per the competition metric (mAP@0.4).
        cocoEval.params.iouThrs = np.array([0.4])

        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()

        return cocoEval
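
# --- A minimal smoke-test sketch (not part of the original module). The toy
# DataFrames below are assumptions built from the column checks in
# `prepare_for_metric`: PredictionString holds repeated
# "class_id confidence x_min y_min x_max y_max" groups, and class 14
# ("No finding") is encoded as "14 1 0 0 1 1" per the competition convention.
if __name__ == "__main__":
    answers = pd.DataFrame(
        {
            "image_id": ["img_0", "img_1"],
            "class_id": [0, 14],
            "x_min": [100.0, 0.0],
            "y_min": [100.0, 0.0],
            "x_max": [200.0, 1.0],
            "y_max": [200.0, 1.0],
        }
    )
    submission = pd.DataFrame(
        {
            "image_id": ["img_0", "img_1"],
            "PredictionString": ["0 0.9 100 100 200 200", "14 1 0 0 1 1"],
        }
    )
    # Predictions that exactly match the answers should score a mAP@0.4 of 1.0.
    print(grade(submission, answers))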