in pipeline/train/train.py [0:0]
def get_marian_cmd(self):
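    """Build the Marian training command for this run.

    Returns the full argv list: the marian binary followed by model and training
    configs, validation settings, embedding-tying options, and any extra Marian
    arguments supplied to this task.
    """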
    all_model_metrics = ["chrf", "ce-mean-words", "bleu-detok"]
    validation_metrics = [
        # Place the best model metric first.
        self.best_model_metric.value,
        # Then the remaining metrics follow.
        *[m for m in all_model_metrics if m != self.best_model_metric.value],
    ]
    # Drop the leading "--" element from the extra arguments list.
    extra_args = self.extra_marian_args[1:]
    # A CPU build of Marian is used in tests, and it does not work with these
    # sharding arguments, so only add them when not running on the CPU.
    if "USE_CPU" not in os.environ:
        extra_args.append("--sharding")
        extra_args.append("local")
    if self.model_type == ModelType.student:
        if self.student_model == StudentModel.none:
            raise ValueError("Student configuration is not provided")
        model_name = f"student.{self.student_model.value}"
    else:
        model_name = self.model_type.value
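    # Compare the vocab files by content to decide how embeddings should be tied.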
    if filecmp.cmp(self.src_vocab, self.trg_vocab, shallow=False):
        emb_args = {"tied-embeddings-all": "true"}
    else:
        # When using separate vocabs, tie only the target embeddings and the output
        # embeddings in the output layer; do not tie source and target embeddings.
        emb_args = {"tied-embeddings-all": "false", "tied-embeddings": "true"}
    return [
        str(self.marian_bin),
        *apply_command_args(
            {
                "model": self.artifacts / "model.npz",
                "config": [
                    train_dir / f"configs/model/{model_name}.yml",
                    train_dir
                    / f"configs/training/{self.model_type.value}.{self.training_type.value}.yml",
                ],
                "tempdir": self.temp_dir / "marian-tmp",
                "vocabs": [self.src_vocab, self.trg_vocab],
                "workspace": self.workspace,
                "devices": self.gpus.split(" "),
                "valid-metrics": validation_metrics,
                "valid-sets": str(self.validation_set),
                "valid-translation-output": self.artifacts / "devset.out",
                "valid-log": self.artifacts / "valid.log",
                "log": self.artifacts / "train.log",
                "shuffle": "batches",
                "seed": str(self.seed),
"no-restore-corpus": None,
"valid-reset-stalled": None,
"sync-sgd": None,
"quiet-translation": None,
"overwrite": None,
"keep-best": None,
"tsv": None,
}
),
*apply_command_args(emb_args),
*extra_args,
]
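
# A minimal sketch of the argument expansion assumed above. `apply_command_args`
# is the project's helper; the implementation below is illustrative only, showing
# how a dict could be flattened into Marian-style CLI tokens: list values expand
# into multiple arguments and None values emit the flag by itself.
def apply_command_args_sketch(args: dict) -> list[str]:
    cmd: list[str] = []
    for key, value in args.items():
        cmd.append(f"--{key}")
        if value is None:
            # Bare switch such as --overwrite or --tsv.
            continue
        if isinstance(value, (list, tuple)):
            # e.g. "devices": ["0", "1"] -> --devices 0 1
            cmd.extend(str(v) for v in value)
        else:
            cmd.append(str(value))
    return cmd

# Example:
#   apply_command_args_sketch({"devices": ["0", "1"], "tsv": None})
#   -> ["--devices", "0", "1", "--tsv"]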