in vision/m4/evaluation/custom_metrics/unfolded_image_captioning_metrics.py
def _compute(self, example_ids, generated_captions, reference_captions):
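    """
    Deduplicate gathered predictions by example id and compute the configured
    captioning metrics. (Docstring added for clarity; it describes only what the
    code below does.)

    Args:
        example_ids: one id per prediction; ids can repeat under distributed
            evaluation and are collapsed to a single copy below.
        generated_captions: one generated caption per example.
        reference_captions: list of reference captions per example (possibly
            empty/None for splits evaluated on an external server).

    Returns:
        A dict mapping metric names to scores, optionally including a
        "server_results" list of {"image_id", "caption"} entries.
    """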
    data_per_id = {}
    for ex_id, gen_cap, ref_caps in zip(example_ids, generated_captions, reference_captions):
        # The `if` condition is a dirty trick to handle distributed evaluation, where some instances
        # can be repeated across a few processes to make the batches even. In that case, we verify
        # that all processes predicted the same thing and keep only one copy of the predictions so
        # the metrics are not skewed. Ideally this "unique" logic should be handled outside of the
        # metric, or maybe in the add_batch call...
        if ex_id not in data_per_id:
            data_per_id[ex_id] = {
                "generated_caption": gen_cap,
                "reference_captions": ref_caps,
            }
        else:
            if data_per_id[ex_id]["generated_caption"] != gen_cap:
                logger.warning(
                    f"Example {ex_id} has different predictions across processes. We have: {gen_cap} and"
                    f" {data_per_id[ex_id]['generated_caption']}"
                )
            if data_per_id[ex_id]["reference_captions"] != ref_caps:
                logger.warning(
                    f"Example {ex_id} has different answers across processes. We have: {ref_caps} and"
                    f" {data_per_id[ex_id]['reference_captions']}"
                )
    # assert list(range(len(data_per_id))) == sorted(data_per_id.keys())
    results = {}
    default_to_save_generations = (
        reference_captions[0] is None or len(reference_captions[0]) == 0
    ) and ImageCaptioningMetrics.DEFAULT_TO_SERVER_RESULTS in self.metrics
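    # When exporting, "server_results" follows the COCO caption submission format:
    # a list of {"image_id": <id>, "caption": <str>} entries.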
    if self.save_generations or default_to_save_generations:
        # If there are no reference captions (e.g. a hidden test split scored by an
        # external server), we fall back to only exporting the generated captions.
        results["server_results"] = [
            {
                "image_id": ex_id,
                "caption": data["generated_caption"],
            }
            for ex_id, data in data_per_id.items()
        ]
        if default_to_save_generations:
            return results
    # Put the results in the format expected by the tokenizer of pycocoevalcap;
    # every caption entry gets a globally unique "id".
    gts = {}
    res = {}
    caption_counter = 0
    for ex_id, data_dict in data_per_id.items():
        res[ex_id] = [{"image_id": ex_id, "caption": data_dict["generated_caption"], "id": caption_counter}]
        caption_counter += 1
        gts[ex_id] = [
            {"image_id": ex_id, "caption": ref_str, "id": caption_counter + idx}
            for idx, ref_str in enumerate(data_dict["reference_captions"])
        ]
        caption_counter += len(data_dict["reference_captions"])
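    # Illustrative shape (hypothetical ids/captions), for one example with two references:
    #   res = {42: [{"image_id": 42, "caption": "a dog running", "id": 0}]}
    #   gts = {42: [{"image_id": 42, "caption": "a dog runs on grass", "id": 1},
    #               {"image_id": 42, "caption": "a brown dog outdoors", "id": 2}]}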
    if self.pycoco_scorers:
        tokenizer = PTBTokenizer()
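        # PTBTokenizer lowercases, strips punctuation, and flattens each entry into a plain
        # tokenized string, yielding {image_id: ["tokenized caption", ...]}.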
        gts = tokenizer.tokenize(gts)
        res = tokenizer.tokenize(res)
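        # Scorers such as Bleu return one score per n-gram order, in which case `method`
        # is a list (e.g. ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]).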
        for scorer, method in self.pycoco_scorers:
            score, scores = scorer.compute_score(gts, res)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    results[m] = sc
                    results[f"{m}_all"] = convert_to_list(scs)
            else:
                results[method] = score
                results[f"{method}_all"] = convert_to_list(scores)
    if self.other_scorers:
        # Build the deduplicated lists once outside the loop; this also avoids
        # shadowing the `generated_captions`/`reference_captions` arguments.
        deduped_generations = [data["generated_caption"] for data in data_per_id.values()]
        deduped_references = [data["reference_captions"] for data in data_per_id.values()]
        for scorer, method in self.other_scorers:
            score = scorer.compute_score(deduped_generations, deduped_references)
            results[method] = score
    return results
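
# A minimal usage sketch (hypothetical names; assumes this metric follows the
# `datasets.Metric`-style interface implied by the `add_batch` mention above):
#
#   metric = UnfoldedImageCaptioningMetrics(save_generations=False)
#   metric.add_batch(
#       example_ids=[0, 1],
#       generated_captions=["a dog running", "a red bus"],
#       reference_captions=[["a dog runs on grass"], ["a red bus on the street"]],
#   )
#   results = metric.compute()  # gathers across processes, then calls _compute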