# lmms_eval/tasks/pope/utils.py
# Add the following functions to your existing utils.py file


def pope_doc_to_visual(doc):
    # Assuming the 'doc' dictionary has a key 'image' holding PIL image data.
    return [doc["image"].convert("RGB")]


def pope_doc_to_text(doc):
    # Assuming the 'doc' dictionary has a key 'question' with the question text.
    question = doc["question"].strip()
    return f"{question}\nAnswer the question using a single word or phrase."


def pope_process_results(doc, results):
    # Normalize case and strip a trailing period so e.g. "Yes." matches "yes";
    # anything else that is not an exact "yes"/"no" match scores 0.
    pred = results[0].lower().strip().rstrip(".")
    gt_ans = doc["answer"].lower().strip()
    assert gt_ans in ["yes", "no"]
    score = 1.0 if pred == gt_ans else 0.0
    # The same per-question record is emitted under every metric key; the harness
    # routes each key to its own aggregation function below.
    record = {"question_id": doc["question_id"], "score": score, "prediction": pred, "ground_truth": gt_ans}
    return {
        "pope_accuracy": record,
        "pope_precision": record,
        "pope_recall": record,
        "pope_f1_score": record,
        "pope_yes_ratio": record,
    }
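
# Example with a hypothetical doc record:
#   pope_process_results({"question_id": 7, "answer": "no", ...}, ["No."])
# returns the record {"question_id": 7, "score": 1.0, "prediction": "no",
# "ground_truth": "no"} under each of the five metric keys above.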


def pope_aggregate_accuracy(results):
    # Mean of the per-question exact-match scores.
    total_score = sum(result["score"] for result in results)
    return total_score / len(results)


def pope_aggregate_precision(results):
    # Precision = TP / (TP + FP), treating "yes" as the positive class.
    true_positives = 0
    false_positives = 0
    for result in results:
        pred = result["prediction"]
        gt = result["ground_truth"]
        if gt == "yes" and pred == "yes":
            true_positives += 1
        elif gt == "no" and pred == "yes":
            false_positives += 1
    denominator = true_positives + false_positives
    return true_positives / denominator if denominator > 0 else 0


def pope_aggregate_recall(results):
    # Recall = TP / (TP + FN), treating "yes" as the positive class.
    true_positives = 0
    false_negatives = 0
    for result in results:
        pred = result["prediction"]
        gt = result["ground_truth"]
        if gt == "yes" and pred == "yes":
            true_positives += 1
        elif gt == "yes" and pred == "no":
            false_negatives += 1
    denominator = true_positives + false_negatives
    return true_positives / denominator if denominator > 0 else 0


def pope_aggregate_f1_score(results):
    # Harmonic mean of precision and recall.
    precision = pope_aggregate_precision(results)
    recall = pope_aggregate_recall(results)
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0


def pope_aggregate_yes_ratio(results):
    # Fraction of ground-truth answers that are "yes".
    yes_count = 0
    no_count = 0
    for result in results:
        gt = result["ground_truth"]
        if gt == "yes":
            yes_count += 1
        elif gt == "no":
            no_count += 1
    total = yes_count + no_count
    return yes_count / total if total > 0 else 0
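

if __name__ == "__main__":
    # Minimal smoke test with synthetic records (hypothetical values, not POPE
    # data), showing how the per-question records feed the aggregators.
    fake_results = [
        {"question_id": 0, "score": 1.0, "prediction": "yes", "ground_truth": "yes"},
        {"question_id": 1, "score": 0.0, "prediction": "yes", "ground_truth": "no"},
        {"question_id": 2, "score": 1.0, "prediction": "no", "ground_truth": "no"},
        {"question_id": 3, "score": 0.0, "prediction": "no", "ground_truth": "yes"},
    ]
    print("accuracy:", pope_aggregate_accuracy(fake_results))    # 2/4 = 0.5
    print("precision:", pope_aggregate_precision(fake_results))  # 1 TP / (1 TP + 1 FP) = 0.5
    print("recall:", pope_aggregate_recall(fake_results))        # 1 TP / (1 TP + 1 FN) = 0.5
    print("f1_score:", pope_aggregate_f1_score(fake_results))    # 2 * 0.25 / 1.0 = 0.5
    print("yes_ratio:", pope_aggregate_yes_ratio(fake_results))  # 2 "yes" labels / 4 = 0.5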