# taskcluster/translations_taskgraph/parameters.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
from pathlib import Path
from taskgraph.parameters import extend_parameters_schema
from voluptuous import Extra, Optional, Required
import yaml
logger = logging.getLogger(__name__)
# By default, provide a very minimal config for CI that runs very quickly. This allows
# the pipeline to be validated in CI. The production training configs should override
# all of these values.
def get_ci_training_config(_=None) -> dict:
    """Load the minimal CI training config shipped with the repository.

    Returns a dict of the shape ``{"training_config": <parsed yaml>}`` so it
    can be used directly as the ``defaults_fn`` of ``extend_parameters_schema``.
    The unused positional argument exists only to satisfy that callback
    signature.
    """
    # The repo root is two directories above this file's directory.
    repo_root = Path(__file__).parent.joinpath("..", "..").resolve()
    ci_config_path = repo_root / "taskcluster" / "configs" / "config.ci.yml"
    return {"training_config": yaml.safe_load(ci_config_path.read_text())}
# Extend the standard taskgraph parameter set with a `training_config`
# section. Every training config — including the CI default supplied by
# `get_ci_training_config` — must validate against the voluptuous schema
# below.
extend_parameters_schema(
    {
        Required("training_config"): {
            # The final pipeline stage the task graph should target.
            Required("target-stage"): str,
            # Per-phase extra arguments passed through to marian
            # (arg name -> value), for both training and decoding runs.
            Required("marian-args"): {
                Optional("training-backward"): {str: str},
                Optional("training-teacher"): {str: str},
                Optional("training-student"): {str: str},
                Optional("training-student-finetuned"): {str: str},
                Optional("decoding-backward"): {str: str},
                Optional("decoding-teacher"): {str: str},
            },
            # Core experiment description: language pair, model settings,
            # dataset size limits, and cleaning thresholds.
            Required("experiment"): {
                Required("name"): str,
                # Source and target language codes.
                Required("src"): str,
                Required("trg"): str,
                Required("teacher-ensemble"): int,
                Required("teacher-mode"): str,
                Required("teacher-decoder"): str,
                Required("student-model"): str,
                Optional("corpus-max-sentences"): int,
                # Caps on monolingual sentence counts, overall and per
                # dataset, for each side of the language pair.
                Required("mono-max-sentences-trg"): {
                    Required("total"): int,
                    Required("per-dataset"): int,
                },
                Required("mono-max-sentences-src"): {
                    Required("total"): int,
                    Required("per-dataset"): int,
                },
                # Subword (spm) vocabulary settings.
                Required("spm-sample-size"): int,
                Optional("spm-vocab-size"): int,
                Required("spm-vocab-split"): bool,
                Required("best-model"): str,
                Required("use-opuscleaner"): str,
                Optional("opuscleaner-mode"): str,
                # Bicleaner thresholds: a default plus optional
                # per-dataset overrides.
                Required("bicleaner"): {
                    Required("default-threshold"): float,
                    Optional("dataset-thresholds"): {
                        str: float,
                    },
                },
                # Monocleaner thresholds, same shape per monolingual side.
                Required("monocleaner"): {
                    Required("mono-src"): {
                        Required("default-threshold"): float,
                        Optional("dataset-thresholds"): {
                            str: float,
                        },
                    },
                    Required("mono-trg"): {
                        Required("default-threshold"): float,
                        Optional("dataset-thresholds"): {
                            str: float,
                        },
                    },
                },
                # Minimum HPLT document scores per monolingual side.
                Required("hplt-min-doc-score"): {
                    Required("mono-src"): float,
                    Required("mono-trg"): float,
                },
                # Optionally reuse previously trained models (fetched from
                # `urls`) instead of training from scratch.
                Optional("pretrained-models"): {
                    Optional("train-teacher"): {
                        Required("urls"): [str],
                        Required("mode"): str,
                        Required("type"): str,
                    },
                    Optional("backtranslations-train-backwards-model"): {
                        Required("urls"): [str],
                        Required("mode"): str,
                        Required("type"): str,
                    },
                },
            },
            # Dataset category -> list of dataset identifiers.
            Optional("datasets"): {
                str: [str],
            },
            # Taskcluster-specific tuning knobs.
            Optional("taskcluster"): {
                Optional("split-chunks"): int,
                # Worker class per task, with a required fallback default.
                Required("worker-classes"): {
                    Required("default"): str,
                    Extra: str,
                },
            },
            # Whether to publish training results to Weights & Biases.
            Optional("wandb-publication"): bool,
        },
    },
    # When no explicit training config is provided, fall back to the
    # minimal CI config so the pipeline can still be validated in CI.
    defaults_fn=get_ci_training_config,
)
def deep_setdefault(dict_, defaults):
    """Recursively fill missing keys in `dict_` from `defaults`, in place.

    Nested dicts are merged key-by-key rather than replaced wholesale.
    Values the caller already provided are left untouched, so user-supplied
    settings always take precedence over defaults — matching the module's
    stated intent that production training configs override the CI defaults.

    Args:
        dict_: the dict to update in place.
        defaults: default values to fill in where `dict_` has none.
    """
    for k, v in defaults.items():
        if isinstance(dict_.get(k), dict) and isinstance(v, dict):
            # Both sides are dicts: merge recursively.
            deep_setdefault(dict_[k], v)
        else:
            # Bug fix: the previous `dict_[k] = v` unconditionally clobbered
            # existing non-dict values with the defaults, which is the
            # opposite of "setdefault" semantics. Only fill missing keys.
            dict_.setdefault(k, v)
def get_decision_parameters(graph_config, parameters):
    """Normalize decision parameters before task-graph generation.

    Ensures a `training_config` section exists, fills in missing values from
    the minimal CI config, and forces W&B publication off for scheduled
    (cron) runs.
    """
    parameters.setdefault("training_config", {})
    deep_setdefault(parameters, get_ci_training_config())

    # Cron-triggered pipeline runs exist only to exercise worker-image
    # changes (see https://bugzilla.mozilla.org/show_bug.cgi?id=1937882);
    # their results should _never_ reach W&B, where they would only add
    # clutter of no value.
    is_cron_run = parameters["tasks_for"] == "cron"
    targets_training = parameters["target_tasks_method"] == "train-target-tasks"
    if is_cron_run and targets_training:
        logger.info("Overriding wandb-publication to be False for cron pipeline run")
        parameters["training_config"]["wandb-publication"] = False