def evaluate()

in evals/eval/evaluate.py [0:0]


def _run_to_file(cmd, output_path, stdin_path=None, env=None):
    """Run ``cmd`` with its stdout redirected to ``output_path``.

    The output is first written to a sibling ``.partial`` file and moved into
    place only after the command exits successfully, so a failed command never
    leaves an empty or truncated ``output_path`` behind (callers use the mere
    existence of ``output_path`` as a cache marker).

    Args:
        cmd: Argument list passed to ``subprocess.run``.
        output_path: Final destination of the command's stdout.
        stdin_path: Optional file whose content is fed to the command's stdin.
        env: Optional environment for the child; ``None`` inherits the parent's.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
        OSError: If ``stdin_path`` or the output file cannot be opened.
    """
    partial_path = f"{output_path}.partial"
    stdin_file = open(stdin_path, "rb") if stdin_path is not None else None
    try:
        with open(partial_path, "wb") as output_file:
            subprocess.run(cmd, stdin=stdin_file, stdout=output_file, env=env, check=True)
        os.replace(partial_path, output_path)
    finally:
        if stdin_file is not None:
            stdin_file.close()
        # Still present only when the command (or the rename) failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def evaluate(pair, set_name, translator, evaluation_engine, gpus, models_dir, results_dir):
    """Translate a dataset with ``translator`` and score it with ``evaluation_engine``.

    Intermediate files live next to the prefix returned by
    ``get_dataset_prefix`` and are reused when they already exist: the source
    text (fetched through ``sacrebleu --echo src`` unless ``set_name`` is in
    ``CUSTOM_DATASETS``), the translation, and, for BLEU, the score file.
    The comet score is recomputed on every call.

    Args:
        pair: Two-item sequence ``(source, target)``; the items are exported to
            the translator as the ``SRC`` and ``TRG`` environment variables.
        set_name: Dataset name, either a sacrebleu test set or a member of
            ``CUSTOM_DATASETS``.
        translator: One of ``bergamot``, ``google``, ``microsoft``, ``argos``,
            ``nllb`` or ``opusmt``.
        evaluation_engine: ``"bleu"`` or ``"comet"``.
        gpus: Value passed to ``comet-score --gpus`` (converted with ``str``).
        models_dir: Directory holding ``<source><target>`` model folders; only
            used by the ``bergamot`` translator.
        results_dir: Forwarded to ``get_dataset_prefix``.

    Returns:
        The score parsed from the result file, as a float.

    Raises:
        ValueError: If ``translator`` or ``evaluation_engine`` is not supported.
        Exception: The last scoring error once all retries are exhausted.
    """
    # Fail fast on bad arguments, before any file or subprocess is touched.
    if evaluation_engine not in ("bleu", "comet"):
        raise ValueError(f"Evaluation engine is not supported: {evaluation_engine}")

    source, target = pair

    my_env = os.environ.copy()
    my_env["SRC"] = source
    my_env["TRG"] = target

    # Translators implemented as Python scripts under HOME_DIR/translators.
    python_translators = {
        "google": "google_translate.py",
        "microsoft": "microsoft.py",
        "argos": "argos.py",
        "nllb": "nllb.py",
        "opusmt": "opusmt.py",
    }
    if translator == "bergamot":
        my_env["MODEL_DIR"] = os.path.join(models_dir, f"{source}{target}")
        my_env["APP_PATH"] = BERGAMOT_APP_PATH
        cmd = ["bash", BERGAMOT_EVAL_PATH]
    elif translator in python_translators:
        cmd = ["python3", os.path.join(HOME_DIR, "translators", python_translators[translator])]
    else:
        raise ValueError(f"Translator is not supported: {translator}")

    comet_pattern = re.compile(r"score: (.+)")

    eval_prefix = get_dataset_prefix(set_name, pair, results_dir)
    os.makedirs(os.path.dirname(eval_prefix), exist_ok=True)

    source_file = f"{eval_prefix}.{source}"
    reference_file = f"{eval_prefix}.{target}"
    translated_file = f"{eval_prefix}.{translator}.{target}"
    result_file = f"{eval_prefix}.{translator}.{target}.{evaluation_engine}"

    is_custom = set_name in CUSTOM_DATASETS

    # Custom datasets are expected to already provide the source file.
    if not is_custom and not os.path.exists(source_file):
        _run_to_file(
            ["sacrebleu", "-t", set_name, "-l", f"{source}-{target}", "--echo", "src"],
            source_file,
        )

    if not os.path.exists(translated_file):
        _run_to_file(cmd, translated_file, stdin_path=source_file, env=my_env)

    retries = 3
    while True:
        try:
            if evaluation_engine == "bleu":
                if is_custom:
                    dataset_params = [reference_file]
                else:
                    dataset_params = ["-t", set_name]

                if not os.path.exists(result_file):
                    _run_to_file(
                        ["sacrebleu", "--score-only", "-q", "-l", f"{source}-{target}"]
                        + dataset_params,
                        result_file,
                        stdin_path=translated_file,
                    )
            else:  # "comet"; the engine was validated above
                if is_custom:
                    dataset_params = ["-s", source_file, "-r", reference_file]
                else:
                    dataset_params = ["-d", f"{set_name}:{source}-{target}"]

                completed_process = subprocess.run(
                    [
                        "comet-score",
                        "--gpus",
                        str(gpus),  # argv entries must be strings
                        "--quiet",
                        "--only_system",
                        "-t",
                        translated_file,
                    ]
                    + dataset_params,
                    capture_output=True,
                    text=True,
                    check=True,
                )
                match = comet_pattern.search(completed_process.stdout)
                if match is None:
                    raise RuntimeError("Unable to find Comet score in output")
                with open(result_file, "w") as output_file:
                    output_file.write(f"{match.group(1)}\n")

            try:
                with open(result_file, "r") as f:
                    return float(f.read())
            except ValueError:
                # An unparsable result would otherwise be picked up again as a
                # cached score on every retry; drop it so it gets recomputed.
                os.remove(result_file)
                raise
        except Exception as exc:
            # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit
            # must abort the evaluation rather than trigger a retry.
            traceback.print_exc()
            if isinstance(exc, subprocess.CalledProcessError) and exc.stderr:
                # comet-score runs with captured output; surface its error text.
                print(exc.stderr)
            if retries == 0:
                raise
            retries -= 1
            subprocess.run(["bash", CLEAN_CACHE_PATH])
            print("Attempt failed, retrying")