in evals/eval/evaluate.py [0:0]
def _format_section_score(translator, score, bergamot_res):
    """Render a single score as the text of one results-table cell.

    Args:
        translator: Name of the system the score belongs to.
        score: Numeric score of that system on one dataset.
        bergamot_res: Score of the "bergamot" system on the same dataset, or
            a falsy value (``None``/``0``) when there is no usable baseline.

    Returns:
        "N/A" when the score is exactly 0; the score with its absolute and
        relative difference to the bergamot score for any other system when
        a baseline is available; otherwise the score with two decimals.
    """
    if score == 0:
        return "N/A"
    if translator != "bergamot" and bergamot_res:
        change = score - bergamot_res
        change_perc = change / bergamot_res * 100
        # Negative numbers already carry their own "-" sign.
        sign = "+" if change > 0 else ""
        return f"{score:.2f} ({sign}{change:.2f}, {sign}{change_perc:.2f}%)"
    return f"{score:.2f}"


def _read_comet_outperforms(cometcompare_path):
    """Return the "outperforms" lines of a cometcompare file as a bullet list.

    Every line containing the word "outperforms" is kept, prefixed with
    "- ". Lines keep their original line endings. The file is closed before
    returning.
    """
    with open(cometcompare_path) as cometcompare_file:
        return "".join(
            f"- {line}" for line in cometcompare_file if "outperforms" in line
        )


def build_section(datasets, key, lines, res_dir, evaluation_engine):
    """Append one Markdown report section for ``key`` to ``lines``.

    The section consists of a heading, a table with one column per dataset
    and one row per translator, a link to the plot image, and (for
    non-"avg" COMET sections) the "outperforms" lines found in the
    per-dataset cometcompare files.

    Args:
        datasets: Mapping of dataset name to a mapping of translator name to
            numeric score.
        key: Section identifier; used as the heading, as a sub-directory of
            ``res_dir`` for cometcompare files, and in the image file name.
            The value "avg" disables the cometcompare output.
        lines: List of output lines; it is extended in place.
        res_dir: Results directory holding the ``img`` folder and the
            ``<key>/<dataset>.<key>.cometcompare`` files.
        evaluation_engine: Name of the metric; cometcompare files are only
            read when it equals "comet".

    Returns:
        None. The result is delivered by mutating ``lines``.
    """
    header_cells = " | ".join(datasets)
    separator_cells = " | ".join(["---"] * (len(datasets) + 1))
    lines.append(f"\n## {key}\n")
    lines.append(f"| Translator/Dataset | {header_cells} |")
    lines.append(f"| {separator_cells} |")

    # The input is dataset -> translator; the table needs translator -> dataset.
    inverted_formatted = defaultdict(dict)
    inverted_scores = defaultdict(dict)
    comet_comparisons = {}
    wants_comparisons = evaluation_engine == "comet" and key != "avg"

    for dataset_name, translators in datasets.items():
        bergamot_res = translators.get("bergamot")
        reordered = sorted(translators.items(), key=lambda item: TRANS_ORDER[item[0]])
        comparison_id = f"{dataset_name}.{key}"
        cometcompare_path = f"{res_dir}/{key}/{dataset_name}.{key}.cometcompare"

        for translator, score in reordered:
            inverted_formatted[translator][dataset_name] = _format_section_score(
                translator, score, bergamot_res
            )
            inverted_scores[translator][dataset_name] = score

            # For a non-avg comet report with an existing cometcompare file,
            # include its summary. The membership guard keeps the file from
            # being re-read for every translator of the same dataset.
            if (
                wants_comparisons
                and comparison_id not in comet_comparisons
                and exists(cometcompare_path)
            ):
                comet_comparisons[comparison_id] = _read_comet_outperforms(
                    cometcompare_path
                )

    for translator, scores in inverted_formatted.items():
        # Emit cells in header order so that a translator lacking a score for
        # some dataset does not shift its remaining cells into wrong columns.
        row_cells = " | ".join(scores.get(name, "N/A") for name in datasets)
        lines.append(f"| {translator} | {row_cells} |")

    img_path = os.path.join(res_dir, "img", f"{key}-{evaluation_engine}.png")
    # NOTE(review): plot_lang_pair is defined elsewhere; presumably it writes
    # the chart to img_path - confirm.
    plot_lang_pair(datasets, inverted_scores, img_path, evaluation_engine)

    # Reference the image by its last two path components ("img/<file>.png").
    img_relative_path = "/".join(img_path.split("/")[-2:])
    lines.append(f"\n![Results]({img_relative_path})")

    if comet_comparisons:
        lines.append("### Comparisons between systems")
        lines.append(
            "*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report*"
        )
    for comparison_id, report in comet_comparisons.items():
        lines.append(f"#### [{comparison_id}]({key}/{comparison_id}.cometcompare)")
        lines.append(report)

    lines.append("---")