taskcluster/translations_taskgraph/parameters.py (113 lines of code) (raw):

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
from pathlib import Path

from taskgraph.parameters import extend_parameters_schema
from voluptuous import Extra, Optional, Required
import yaml

logger = logging.getLogger(__name__)


# By default, provide a very minimal config for CI that runs very quickly. This allows
# the pipeline to be validated in CI. The production training configs should override
# all of these values.
def get_ci_training_config(_=None) -> dict:
    """Load the CI training config from ``taskcluster/configs/config.ci.yml``.

    The path is resolved relative to this file (two directories up from the
    directory containing it).

    Args:
        _: Ignored. Presumably present so the function can be used as the
            ``defaults_fn`` callback below, which may be invoked with an
            argument -- confirm against the Taskgraph API.

    Returns:
        A dict with a single ``"training_config"`` key holding the parsed
        YAML document.
    """
    vcs_path = (Path(__file__).parent / "../..").resolve()
    config_path = vcs_path / "taskcluster/configs/config.ci.yml"

    with config_path.open() as file:
        return {"training_config": yaml.safe_load(file)}


# Register the shape of the `training_config` parameter with Taskgraph. The
# CI config is used as the source of default values.
extend_parameters_schema(
    {
        Required("training_config"): {
            Required("target-stage"): str,
            Required("marian-args"): {
                Optional("training-backward"): {str: str},
                Optional("training-teacher"): {str: str},
                Optional("training-student"): {str: str},
                Optional("training-student-finetuned"): {str: str},
                Optional("decoding-backward"): {str: str},
                Optional("decoding-teacher"): {str: str},
            },
            Required("experiment"): {
                Required("name"): str,
                Required("src"): str,
                Required("trg"): str,
                Required("teacher-ensemble"): int,
                Required("teacher-mode"): str,
                Required("teacher-decoder"): str,
                Required("student-model"): str,
                Optional("corpus-max-sentences"): int,
                Required("mono-max-sentences-trg"): {
                    Required("total"): int,
                    Required("per-dataset"): int,
                },
                Required("mono-max-sentences-src"): {
                    Required("total"): int,
                    Required("per-dataset"): int,
                },
                Required("spm-sample-size"): int,
                Optional("spm-vocab-size"): int,
                Required("spm-vocab-split"): bool,
                Required("best-model"): str,
                Required("use-opuscleaner"): str,
                Optional("opuscleaner-mode"): str,
                Required("bicleaner"): {
                    Required("default-threshold"): float,
                    Optional("dataset-thresholds"): {
                        str: float,
                    },
                },
                Required("monocleaner"): {
                    Required("mono-src"): {
                        Required("default-threshold"): float,
                        Optional("dataset-thresholds"): {
                            str: float,
                        },
                    },
                    Required("mono-trg"): {
                        Required("default-threshold"): float,
                        Optional("dataset-thresholds"): {
                            str: float,
                        },
                    },
                },
                Required("hplt-min-doc-score"): {
                    Required("mono-src"): float,
                    Required("mono-trg"): float,
                },
                Optional("pretrained-models"): {
                    Optional("train-teacher"): {
                        Required("urls"): [str],
                        Required("mode"): str,
                        Required("type"): str,
                    },
                    Optional("backtranslations-train-backwards-model"): {
                        Required("urls"): [str],
                        Required("mode"): str,
                        Required("type"): str,
                    },
                },
            },
            Optional("datasets"): {
                str: [str],
            },
            Optional("taskcluster"): {
                Optional("split-chunks"): int,
                Required("worker-classes"): {
                    Required("default"): str,
                    Extra: str,
                },
            },
            Optional("wandb-publication"): bool,
        },
    },
    defaults_fn=get_ci_training_config,
)


def deep_setdefault(dict_, defaults):
    """Recursively fill keys missing from ``dict_`` with values from ``defaults``.

    This is the nested equivalent of ``dict.setdefault``: a value that is
    already present in ``dict_`` is never replaced. When a key holds a dict
    on both sides, the two dicts are merged recursively with the same rule.

    Args:
        dict_: Mapping to update in place.
        defaults: Mapping of fallback values. Values for missing keys are
            inserted by reference, not copied.

    Returns:
        None. ``dict_`` is mutated in place.
    """
    for key, default in defaults.items():
        if key not in dict_:
            dict_[key] = default
        elif isinstance(dict_[key], dict) and isinstance(default, dict):
            # Both sides are mappings: descend so that only the missing
            # leaves are filled in. Any other combination keeps the value
            # that is already present.
            deep_setdefault(dict_[key], default)


def get_decision_parameters(graph_config, parameters):
    """Populate ``parameters`` with the CI training config defaults.

    Ensures ``parameters["training_config"]`` exists, fills in any values it
    is missing from the CI config, and forces ``wandb-publication`` off for
    cron-triggered ``train-target-tasks`` runs.

    Args:
        graph_config: Unused here; presumably required by the Taskgraph
            decision-parameters hook signature -- confirm against the caller.
        parameters: Mutable parameters mapping, updated in place. Must
            contain ``"tasks_for"`` and ``"target_tasks_method"``.

    Returns:
        None. ``parameters`` is mutated in place.
    """
    parameters.setdefault("training_config", {})
    deep_setdefault(parameters, get_ci_training_config())

    # We run the pipeline on a cron schedule to enable integration testing when
    # worker images change (see https://bugzilla.mozilla.org/show_bug.cgi?id=1937882).
    # These runs should _never_ be sent to W&B to avoid cluttering it up
    # with data of no value.
    if (
        parameters["tasks_for"] == "cron"
        and parameters["target_tasks_method"] == "train-target-tasks"
    ):
        logger.info("Overriding wandb-publication to be False for cron pipeline run")
        parameters["training_config"]["wandb-publication"] = False