mlebench/competitions/siim-covid19-detection/grade.py
from typing import List, Tuple
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
from mlebench.grade_helpers import InvalidSubmissionError
def calculate_iou(
box1: Tuple[float, float, float, float], box2: Tuple[float, float, float, float]
) -> float:
"""
Calculate the Intersection over Union (IoU) of two bounding boxes.
(this image helps: https://machinelearningspace.com/wp-content/uploads/2023/01/IOU2.jpg)
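    For example, two unit squares that overlap on half their area have an
    intersection of 0.5 and a union of 1.5, so their IoU is 0.5 / 1.5:
    >>> round(calculate_iou((0, 0, 1, 1), (0.5, 0, 1.5, 1)), 3)
    0.333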
"""
xmin1, ymin1, xmax1, ymax1 = box1
xmin2, ymin2, xmax2, ymax2 = box2
inter_xmin = max(xmin1, xmin2)
inter_ymin = max(ymin1, ymin2)
inter_xmax = min(xmax1, xmax2)
inter_ymax = min(ymax1, ymax2)
inter_width = max(0, inter_xmax - inter_xmin)
inter_height = max(0, inter_ymax - inter_ymin)
inter_area = inter_width * inter_height
box1_area = (xmax1 - xmin1) * (ymax1 - ymin1)
box2_area = (xmax2 - xmin2) * (ymax2 - ymin2)
union_area = box1_area + box2_area - inter_area
iou = inter_area / union_area if union_area > 0 else 0.0
return iou
def calculate_map(submission_preds: List[List[Tuple]], answers_preds: List[List[Tuple]]) -> float:
"""
Calculate mean Average Precision (mAP) for object detection.
> The challenge uses the standard PASCAL VOC 2010 mean Average Precision (mAP) at IoU > 0.5.
Note that the linked document describes VOC 2012, which differs in some minor ways
(e.g. there is no concept of "difficult" classes in VOC 2010).
The P/R curve and AP calculations remain the same.
(www.kaggle.com/competitions/siim-covid19-detection/overview/evaluation)
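    Concretely, the per-class AP here is computed with sklearn's
    `average_precision_score`, which summarises the precision-recall curve as
    AP = sum_n (R_n - R_{n-1}) * P_n over the predictions ranked by decreasing
    confidence.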
Some choices made here that were not explicitly specified in the challenge description:
1. Treating "none" or "negative" as prediction classes of their own, instead of as the non-positive class
- Justification: Treating them as their own classes is implied by the data format:
- Study level - train_study_level.csv has 4 binary classes including the "negative" class instead of 3
- Image level - We're asked to predict "none" with full bounding boxes instead of withholding a prediction
Also, 3rd place winner says "It probably treats the six classes equally", where the six classes are
"negative", "typical", "indeterminate", "atypical", "none" and "opacity".
(https://www.kaggle.com/competitions/siim-covid19-detection/discussion/240363)
2. Rules for populating y_pairs (see comments below), in particular the (0, 0) case
- Justification: The general rules follow the descriptions of the PASCAL VOC 2010 mAP documented online.
The only custom addition is handling the edge case of (0, 0) false negatives, which was necessary because
if we don't include (0, 0) pairs, both the sample submission and gold submission end up with all values of
y_true being 1, so the AP is undefined.
Behavior of our implementation is consistent with this comment from the organizers:
https://www.kaggle.com/competitions/siim-covid19-detection/discussion/248467#1362916
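    Both `submission_preds` and `answers_preds` are per-image lists of
    (label, confidence, xmin, ymin, xmax, ymax) tuples, as produced by
    `parse_prediction_string`. For instance, an image whose single "opacity"
    prediction (confidence 0.9) overlaps its single "opacity" ground truth at
    IoU > 0.5, followed by an image containing neither, yields y_pairs of
    [(1, 0.9), (0, 0)] for that class and hence an AP of 1.0.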
"""
aps = []
# Group predictions by class - the general idea is to calculate AP for each class separately
# and then average them to get mAP
classes = sorted(list(set(pred[0] for preds in answers_preds for pred in preds)))
for cls in classes:
        # We will populate y_pairs with (y_true, y_score) pairs of:
# (1, confidence) for every predicted box that matches a ground truth box
# (0, confidence) for every predicted box that does not match a ground truth box
# (1, 0) for every ground truth box that is not matched to a predicted box
# (0, 0) when there are neither predicted nor ground truth boxes
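        # For example, an image with one ground-truth "opacity" box and two "opacity"
        # predictions (one overlapping it at IoU > 0.5 with confidence 0.9, one
        # elsewhere with confidence 0.3) contributes [(1, 0.9), (0, 0.3)].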
y_pairs = [] # List of (y_true, y_score) pairs
# Gather all predictions and ground truth boxes related to this class from all samples
for img_preds, img_gts in zip(submission_preds, answers_preds):
y_pairs_ = []
# Get ground truth boxes for this class
gt_boxes = [gt[2:] for gt in img_gts if gt[0] == cls]
            # Sort img_preds by confidence (descending) so higher-confidence predictions claim ground truths first
img_preds.sort(key=lambda x: x[1], reverse=True)
# For each prediction of this class
matched_gt = [False] * len(gt_boxes) # Initialize all ground truths as unmatched
for pred in img_preds:
if pred[0] == cls:
# Find the best matching ground truth box
best_iou = 0
best_gt_idx = -1
for i, gt in enumerate(gt_boxes):
if matched_gt[i]: # Don't reuse matched ground truths
continue
iou = calculate_iou(pred[2:], gt)
if iou > best_iou and iou > 0.5:
best_iou = iou
best_gt_idx = i
pred_confidence = pred[1]
if best_gt_idx != -1:
y_pairs_.append((1, pred_confidence)) # True positive
matched_gt[best_gt_idx] = True
else:
y_pairs_.append((0, pred_confidence)) # False positive
# Add false negatives for unmatched ground truths
y_pairs_.extend([(1, 0)] * matched_gt.count(False))
if len(y_pairs_) == 0:
                # Neither predictions nor ground truths for this class in this image (a "true negative")
y_pairs_.append((0, 0))
y_pairs.extend(y_pairs_)
y_true = [pair[0] for pair in y_pairs]
y_scores = [pair[1] for pair in y_pairs]
if len(y_true) > 0:
assert not all(
y == 1 for y in y_true
), "y_true is all 1s; this shouldn't happen and will result in undefined AP"
ap = average_precision_score(y_true, y_scores)
aps.append(ap)
else:
raise ValueError(f"Unexpected error: y_true is empty for class {cls}")
# Calculate mAP
return np.mean(aps) if aps else 0.0
def parse_prediction_string(
prediction_string: str,
) -> List[Tuple[str, float, float, float, float, float]]:
"""
`prediction_string` should have the form "{label} {confidence} {xmin} {ymin} {xmax} {ymax}" for each bounding box
(so there may be 6 x N space-separated values in total, where N is the number of bounding boxes).
Returns a list of tuples, each with the form (label, confidence, xmin, ymin, xmax, ymax).
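    For example:
    >>> parse_prediction_string("opacity 0.8 10 20 110 220")
    [('opacity', 0.8, 10.0, 20.0, 110.0, 220.0)]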
"""
toks = prediction_string.strip().split()
try:
return [
(
toks[i],
float(toks[i + 1]),
float(toks[i + 2]),
float(toks[i + 3]),
float(toks[i + 4]),
float(toks[i + 5]),
)
for i in range(0, len(toks), 6)
]
    except (ValueError, IndexError) as e:
        raise InvalidSubmissionError(f"Could not parse PredictionString: {e}") from e
def grade(submission: pd.DataFrame, answers: pd.DataFrame) -> float:
"""
Grade the submission against the answers.
`submission` and `answers` are DataFrames with the following columns:
- `id` (str): The study ID
- `PredictionString` (str): Has the form "{label} {confidence} {xmin} {ymin} {xmax} {ymax}" for each bounding box
(so there may be 6 x N space-separated values in total, where N is the number of bounding boxes).
"""
if len(submission) != len(answers):
raise InvalidSubmissionError("Submission and answers must have the same length")
if "id" not in submission.columns:
raise InvalidSubmissionError("Submission must have an 'Id' column")
if "PredictionString" not in submission.columns:
raise InvalidSubmissionError("Submission must have a 'PredictionString' column")
assert "id" in answers.columns, "Answers must have an 'id' column"
assert "PredictionString" in answers.columns, "Answers must have a 'PredictionString' column"
submission = submission.sort_values("id")
answers = answers.sort_values("id")
if (submission["id"].values != answers["id"].values).any():
raise InvalidSubmissionError("Submission and answers must have the same ids")
# Attempt to parse the PredictionString
submission_preds = submission["PredictionString"].apply(parse_prediction_string)
answers_preds = answers["PredictionString"].apply(parse_prediction_string)
# Calculate the mAP
return calculate_map(submission_preds, answers_preds) if len(submission_preds) > 0 else 0.0
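
if __name__ == "__main__":
    # Small illustrative check (toy data, not a real competition split): a
    # submission that reproduces the answers exactly should score a perfect
    # mAP of 1.0. Two studies are used so that each class also appears in an
    # image without it, which produces the (0, 0) pairs that keep the AP well
    # defined (see calculate_map's docstring).
    _answers = pd.DataFrame(
        {
            "id": ["study_a", "study_b"],
            "PredictionString": ["negative 1 0 0 1 1", "typical 1 0 0 1 1"],
        }
    )
    _submission = _answers.copy()
    print(grade(_submission, _answers))  # expected: 1.0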