in pipeline/eval/eval.py [0:0]
def main(args_list: Optional[list[str]] = None) -> None:
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter, # Preserves whitespace in the help text.
)
parser.add_argument(
"--artifacts_prefix",
type=str,
help="The location where the translated results will be saved",
)
parser.add_argument(
"--dataset_prefix", type=str, help="The evaluation datasets prefix, used in the form."
)
parser.add_argument("--src", type=str, help='The source language, e.g "en".')
parser.add_argument("--trg", type=str, help='The target language, e.g "ca".')
parser.add_argument("--marian", type=str, help="The path the to marian binaries.")
parser.add_argument("--marian_config", type=str, help="The marian yaml config for the model.")
parser.add_argument(
"--quantized",
action="store_true",
help="Use a quantized model. This requires the browsermt fork of Marian",
)
parser.add_argument(
"--models",
type=str,
help="The Marian model (or models if its an ensemble) to use for translations",
)
parser.add_argument(
"--vocab_src",
required=False,
type=str,
help="The path to a src vocab file (optional)",
)
parser.add_argument(
"--vocab_trg",
required=False,
type=str,
help="The path to a trg vocab file (optional)",
)
parser.add_argument(
"--shortlist",
required=False,
type=str,
help="The path to a lexical shortlist (optional)",
)
parser.add_argument("--workspace", type=str, help="The preallocated MB for the workspace")
parser.add_argument(
"--gpus",
required=False,
type=str,
help="Which GPUs to use (only for the gpu model variant)",
)
parser.add_argument(
"--model_variant", type=str, help="The model variant to use, (gpu, cpu, quantized)"
)
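    # For reference, a hypothetical invocation (the paths below are illustrative,
    # not taken from a real task; the wmt09/en/ca names come from the examples
    # further down in this file):
    #
    #   python pipeline/eval/eval.py \
    #     --artifacts_prefix artifacts/wmt09 \
    #     --dataset_prefix fetches/wmt09 \
    #     --src en --trg ca \
    #     --marian bin --marian_config decoder.yml \
    #     --models model.npz --model_variant cpu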
    # Add the Weights & Biases CLI args when the wandb module is available.
if WANDB_AVAILABLE:
add_wandb_arguments(parser)
args = parser.parse_args(args_list)
src = args.src
trg = args.trg
dataset_prefix = args.dataset_prefix
artifacts_prefix = args.artifacts_prefix
artifacts_dir = os.path.dirname(artifacts_prefix)
source_file_compressed = f"{dataset_prefix}.{src}.zst"
source_file = f"{artifacts_prefix}.{src}"
target_file_compressed = f"{dataset_prefix}.{trg}.zst"
target_file = f"{artifacts_prefix}.{trg}"
target_ref_file = f"{artifacts_prefix}.{trg}.ref"
marian_decoder = f'"{args.marian}"/marian-decoder'
marian_log_file = f"{artifacts_prefix}.log"
language_pair = f"{src}-{trg}"
metrics_file = f"{artifacts_prefix}.metrics"
metrics_json = f"{artifacts_prefix}.metrics.json"
# Configure Marian for the different model variants.
marian_extra_args = []
if args.model_variant == "quantized":
marian_extra_args = ["--int8shiftAlphaAll"]
elif args.model_variant == "gpu":
if not args.workspace:
raise Exception("The workspace size was not provided")
marian_extra_args = [
'--workspace', args.workspace,
'--devices', args.gpus,
] # fmt: skip
    elif args.model_variant != "cpu":
raise Exception(f"Unsupported model variant {args.model_variant}")
if args.vocab_src and args.vocab_trg:
marian_extra_args = [*marian_extra_args, "--vocabs", args.vocab_src, args.vocab_trg]
if args.shortlist:
        # Pass no extra arguments with the shortlist, so that Marian's defaults
        # are used. This way it doesn't matter whether the shortlist is binary
        # or text, since the two formats take different arguments:
        #   text shortlist args: firstNum bestNum threshold
        #   binary shortlist args (the rest are embedded in the file): bool (check integrity)
marian_extra_args = marian_extra_args + ["--shortlist", args.shortlist]
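    # At this point marian_extra_args is fully assembled. For example (all values
    # illustrative), the "gpu" variant with vocabs and a shortlist might yield:
    #   ["--workspace", "12000", "--devices", "0 1 2 3",
    #    "--vocabs", "vocab.spm", "vocab.spm",
    #    "--shortlist", "lex.s2t.bin"]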
logger.info("The eval script is configured with the following:")
logger.info(f" > artifacts_dir: {artifacts_dir}")
logger.info(f" > source_file_compressed: {source_file_compressed}")
logger.info(f" > source_file: {source_file}")
logger.info(f" > target_file: {target_file}")
logger.info(f" > target_ref_file: {target_ref_file}")
logger.info(f" > marian_decoder: {marian_decoder}")
logger.info(f" > marian_log_file: {marian_log_file}")
logger.info(f" > language_pair: {language_pair}")
logger.info(f" > metrics_file: {metrics_file}")
logger.info(f" > metrics_json: {metrics_json}")
logger.info(f" > marian_extra_args: {marian_extra_args}")
logger.info(f" > gpus: {args.gpus}")
logger.info("Ensure that the artifacts directory exists.")
os.makedirs(artifacts_dir, exist_ok=True)
logger.info("Save the original target sentences to the artifacts")
decompress_file(target_file_compressed, keep_original=False, decompressed_path=target_ref_file)
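    # decompress_file is assumed here to write the decompressed reference
    # translations to target_ref_file and, with keep_original=False, to remove
    # the compressed input afterwards.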
run_bash_oneliner(
f"""
# Decompress the source file, e.g. $fetches/wmt09.en.zst
zstdmt -dc "{source_file_compressed}"
# Tee the source file into the artifacts directory, e.g. $artifacts/wmt09.en
| tee "{source_file}"
# Take the source and pipe it in to be decoded (translated) by Marian.
| {marian_decoder}
--models {args.models}
--config {args.marian_config}
--quiet
--quiet-translation
--log {marian_log_file}
{" ".join(marian_extra_args)}
        # The translations are "tee"ed out to the artifacts, e.g. $artifacts/wmt09.ca
| tee "{target_file}"
"""
)
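    # run_bash_oneliner is assumed to strip the "#" comment lines and join the
    # rest into a single shell pipeline, roughly:
    #   zstdmt -dc fetches/wmt09.en.zst | tee artifacts/wmt09.en \
    #     | marian-decoder --models ... --config ... | tee artifacts/wmt09.ca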
with open(target_ref_file, "r") as file:
target_ref_lines = file.readlines()
with open(target_file, "r") as file:
target_lines = file.readlines()
with open(source_file, "r") as file:
source_lines = file.readlines()
compute_bleu = BLEU(trg_lang=trg)
compute_chrf = CHRF()
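    # Note on the sacrebleu API used below: corpus_score takes the list of
    # hypotheses and a list of reference *sets*, which is why target_ref_lines
    # is wrapped in a single-element list (one reference per hypothesis line).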
logger.info("Computing the BLEU score.")
bleu_score: BLEUScore = compute_bleu.corpus_score(target_lines, [target_ref_lines])
bleu_details = json.loads(
bleu_score.format(signature=compute_bleu.get_signature().format(), is_json=True)
)
logger.info("Computing the chrF score.")
chrf_score: CHRFScore = compute_chrf.corpus_score(target_lines, [target_ref_lines])
chrf_details = json.loads(
chrf_score.format(signature=compute_chrf.get_signature().format(), is_json=True)
)
# The default comet model.
# It should match the model used in https://github.com/mozilla/firefox-translations-models/
comet_model_name = "Unbabel/wmt22-comet-da"
if os.environ.get("COMET_SKIP"):
comet_score = "skipped"
print("COMET_SKIP was set, so the COMET score will not be computed.")
else:
logger.info("Loading COMET")
import comet
# COMET_MODEL_DIR allows tests to place the model in a data directory
comet_checkpoint = comet.download_model(
comet_model_name, saving_directory=os.environ.get("COMET_MODEL_DIR")
)
comet_model = comet.load_from_checkpoint(comet_checkpoint)
comet_data = []
for source, target, target_ref in zip(source_lines, target_lines, target_ref_lines):
comet_data.append({"src": source, "mt": target, "ref": target_ref})
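        # Each entry pairs a source line with its machine translation and the
        # human reference, e.g. (illustrative values):
        #   {"src": "Hello world\n", "mt": "Hola món\n", "ref": "Hola món\n"}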
        # GPU information comes as a space-separated list of device ids, e.g.
        # "0 1 2 3". Split it to get the GPU count. --gpus is optional, so fall
        # back to 0 (CPU) when it was not provided.
        gpu_count = len(args.gpus.split(" ")) if args.gpus else 0
if os.environ.get("COMET_CPU"):
            gpu_count = 0  # Let tests force the CPU code path.
comet_mode = "cpu" if gpu_count == 0 else "gpu"
logger.info(f'Computing the COMET score with "{comet_model_name}" using the {comet_mode}')
comet_results = comet_model.predict(comet_data, gpus=gpu_count)
# Reduce the precision.
comet_score = round(comet_results.system_score, 4)
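        # Besides the corpus-level system_score used above, COMET's predict
        # output also carries per-segment scores (comet_results.scores) should
        # segment-level reporting ever be needed.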
metrics = {
"bleu": {
"score": bleu_details["score"],
# Example details:
# {
# "name": "BLEU",
# "score": 0.4,
# "signature": "nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.0.0",
# "verbose_score": "15.6/0.3/0.2/0.1 (BP = 0.823 ratio = 0.837 hyp_len = 180 ref_len = 215)",
# "nrefs": "1",
# "case": "mixed",
# "eff": "no",
# "tok": "13a",
# "smooth": "exp",
# "version": "2.0.0"
# }
"details": bleu_details,
},
"chrf": {
"score": chrf_details["score"],
# Example details:
# {
# "name": "chrF2",
# "score": 0.64,
# "signature": "nrefs:1|case:mixed|eff:yes|nc:6|nw:0|space:no|version:2.0.0",
# "nrefs": "1",
# "case": "mixed",
# "eff": "yes",
# "nc": "6",
# "nw": "0",
# "space": "no",
# "version": "2.0.0"
# }
"details": chrf_details,
},
"comet": {
"score": comet_score,
"details": {
"model": comet_model_name,
"score": comet_score,
},
},
}
logger.info(f"Writing {metrics_json}")
with open(metrics_json, "w") as file:
file.write(json.dumps(metrics, indent=2))
logger.info(f'Writing the metrics in the older "text" format: {metrics_file}')
with open(metrics_file, "w") as file:
file.write(f"{bleu_details['score']}\n" f"{chrf_details['score']}\n" f"{comet_score}\n")
if WANDB_AVAILABLE:
metric = metric_from_tc_context(
chrf=chrf_details["score"], bleu=bleu_details["score"], comet=comet_score
)
run_client = get_wandb_publisher( # noqa
project_name=args.wandb_project,
group_name=args.wandb_group,
run_name=args.wandb_run_name,
taskcluster_secret=args.taskcluster_secret,
artifacts=args.wandb_artifacts,
publication=args.wandb_publication,
)
if run_client is None:
            # W&B publication may be directly disabled through WANDB_PUBLICATION
return
logger.info(f"Publishing metrics to Weight & Biases ({run_client.extra_kwargs})")
run_client.open()
run_client.handle_metrics(metrics=[metric])
run_client.close()
# Publish an extra row on the group_logs summary run
group_logs_client = WandB( # noqa
project=run_client.wandb.project,
group=run_client.wandb.group,
name="group_logs",
suffix=run_client.suffix,
)
logger.info("Adding metric row to the 'group_logs' run")
group_logs_client.open()
# Restore existing metrics data
data = list_existing_group_logs_metrics(group_logs_client.wandb)
data.append(
[
run_client.wandb.group,
run_client.wandb.name,
metric.importer,
metric.dataset,
metric.augmentation,
]
+ [getattr(metric, attr) for attr in METRIC_KEYS]
)
group_logs_client.wandb.log(
{
"metrics": wandb.Table(
columns=[
"Group",
"Model",
"Importer",
"Dataset",
"Augmenation",
*METRIC_KEYS,
],
data=data,
)
}
)
group_logs_client.close()