in evals/eval/evaluate.py [0:0]
# Translators implemented as Python scripts under HOME_DIR/translators, keyed by name.
_PYTHON_TRANSLATOR_SCRIPTS = {
    "google": "google_translate.py",
    "microsoft": "microsoft.py",
    "argos": "argos.py",
    "nllb": "nllb.py",
    "opusmt": "opusmt.py",
}

# Extracts the system-level score from the text printed by `comet-score`.
_COMET_SCORE_PATTERN = re.compile(r"score: (.+)")


def _translator_command(translator, source, target, models_dir):
    """Return ``(cmd, extra_env)`` needed to launch *translator*.

    Raises:
        ValueError: if *translator* is not one of the supported names.
    """
    if translator == "bergamot":
        extra_env = {
            "MODEL_DIR": os.path.join(models_dir, f"{source}{target}"),
            "APP_PATH": BERGAMOT_APP_PATH,
        }
        return ["bash", BERGAMOT_EVAL_PATH], extra_env

    script = _PYTHON_TRANSLATOR_SCRIPTS.get(translator)
    if script is None:
        raise ValueError(f"Translator is not supported: {translator}")
    return ["python3", os.path.join(HOME_DIR, "translators", script)], {}


def _run_to_file(cmd, output_path, input_path=None, env=None):
    """Run *cmd* with stdout redirected to *output_path*, all-or-nothing.

    The output is written to a temporary sibling file and only moved into
    place when the command succeeds. The callers treat an existing output file
    as a valid cached result, so a failed command must never leave an empty or
    truncated file behind under the final name.

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        output_path: Final destination of the command's stdout.
        input_path: Optional file fed to the command's stdin; when omitted the
            child inherits this process's stdin.
        env: Optional environment for the child; when omitted it is inherited.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    partial_path = f"{output_path}.partial"
    stdin_file = open(input_path, "rb") if input_path is not None else None
    try:
        # The child writes straight to the file descriptor, so binary mode is
        # correct for both text and binary producers.
        with open(partial_path, "wb") as stdout_file:
            subprocess.run(cmd, stdin=stdin_file, stdout=stdout_file, env=env, check=True)
        os.replace(partial_path, output_path)
    finally:
        if stdin_file is not None:
            stdin_file.close()
        # Only still present when the command (or the rename) failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def evaluate(pair, set_name, translator, evaluation_engine, gpus, models_dir, results_dir):
    """Translate a dataset with *translator* and return its evaluation score.

    Intermediate files (source text, translation, score) are stored next to the
    prefix returned by ``get_dataset_prefix`` and reused on later calls when
    they already exist. The COMET score is the exception: it is recomputed on
    every call.

    Args:
        pair: ``(source, target)`` language codes.
        set_name: Dataset name. Unless it is listed in ``CUSTOM_DATASETS`` it is
            passed to ``sacrebleu -t`` to fetch the source side.
        translator: One of ``"bergamot"``, ``"google"``, ``"microsoft"``,
            ``"argos"``, ``"nllb"`` or ``"opusmt"``.
        evaluation_engine: ``"bleu"`` (sacrebleu) or ``"comet"`` (comet-score).
            Any other value only reads an already existing result file.
        gpus: Value forwarded to ``comet-score --gpus``.
        models_dir: Directory containing ``<source><target>`` model folders
            (used by the bergamot translator only).
        results_dir: Passed to ``get_dataset_prefix`` to locate result files.

    Returns:
        The score as a float.

    Raises:
        ValueError: if *translator* is not supported.
        subprocess.CalledProcessError: if fetching the source or translating fails.
        Exception: the last scoring error once all retries are exhausted.
    """
    source, target = pair

    my_env = os.environ.copy()
    my_env["SRC"] = source
    my_env["TRG"] = target

    # Reject unknown translators before touching the filesystem.
    cmd, translator_env = _translator_command(translator, source, target, models_dir)
    my_env.update(translator_env)

    eval_prefix = get_dataset_prefix(set_name, pair, results_dir)
    prefix_dir = os.path.dirname(eval_prefix)
    if prefix_dir:  # os.makedirs("") raises, so skip a prefix without a directory part
        os.makedirs(prefix_dir, exist_ok=True)

    source_file = f"{eval_prefix}.{source}"
    reference_file = f"{eval_prefix}.{target}"
    translated_file = f"{eval_prefix}.{translator}.{target}"
    result_file = f"{eval_prefix}.{translator}.{target}.{evaluation_engine}"

    is_custom = set_name in CUSTOM_DATASETS
    lang_pair = f"{source}-{target}"

    # Custom datasets ship their own source file; the others are fetched from sacrebleu.
    if not is_custom and not os.path.exists(source_file):
        _run_to_file(
            ["sacrebleu", "-t", set_name, "-l", lang_pair, "--echo", "src"],
            source_file,
        )

    if not os.path.exists(translated_file):
        _run_to_file(cmd, translated_file, input_path=source_file, env=my_env)

    # One initial attempt plus up to three retries.
    retries = 3
    while True:
        try:
            if evaluation_engine == "bleu":
                dataset_params = [reference_file] if is_custom else ["-t", set_name]
                if not os.path.exists(result_file):
                    _run_to_file(
                        ["sacrebleu", "--score-only", "-q", "-l", lang_pair] + dataset_params,
                        result_file,
                        input_path=translated_file,
                    )
            elif evaluation_engine == "comet":
                if is_custom:
                    dataset_params = ["-s", source_file, "-r", reference_file]
                else:
                    dataset_params = ["-d", f"{set_name}:{lang_pair}"]
                completed_process = subprocess.run(
                    [
                        "comet-score",
                        "--gpus",
                        str(gpus),  # subprocess arguments must be strings
                        "--quiet",
                        "--only_system",
                        "-t",
                        translated_file,
                    ]
                    + dataset_params,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                match = _COMET_SCORE_PATTERN.search(completed_process.stdout)
                if not match:
                    raise Exception("Unable to find Comet score in output")
                with open(result_file, "w") as output_file:
                    output_file.write(f"{match.group(1)}\n")

            with open(result_file, "r") as f:
                raw_score = f.read()
            try:
                return float(raw_score)
            except ValueError:
                # A corrupt cached score would otherwise be reused by every
                # retry; drop it so the next attempt recomputes it.
                os.remove(result_file)
                raise
        except Exception:
            # Not a bare `except`: KeyboardInterrupt/SystemExit must stop the
            # run instead of triggering another attempt.
            traceback.print_exc()
            if retries == 0:
                raise
            retries -= 1
            # Best-effort cache cleanup; its exit status is intentionally ignored.
            subprocess.run(["bash", CLEAN_CACHE_PATH])
            print("Attempt failed, retrying")