in tracking/translations_parser/parser.py [0:0]
def get_extra_marian_config(self) -> dict:
    """
    Read extra configuration files (Marian, OpusTrainer, extra CLI arguments).

    Publication outside of a Taskcluster context (offline mode) cannot access
    the configuration files, only extra-args will be set in this case.

    Returns:
        A dict with the keys ``arguments``, ``model``, ``training``,
        ``datasets`` and ``opustrainer``. Every entry that could not be
        detected, read or parsed is left as ``None``.
    """
    extra_config = {
        "arguments": None,
        "model": None,
        "training": None,
        "datasets": None,
        "opustrainer": None,
    }

    if (
        self.description is None
        or (match := MARIAN_ARGS_REGEX.search(self.description)) is None
    ):
        logger.error(self.description)
        logger.warning(
            "Invalid Marian description, skipping Marian and OpusTrainer configuration detection."
        )
        return extra_config

    logger.info("Reading Marian command line arguments.")
    (arguments_str,) = match.groups()

    # Build args from the command line input text
    args = defaultdict(list)
    key = None
    for token in shlex.split(arguments_str):
        if token.startswith("-"):
            # Only the leading dashes are part of the option syntax
            key = token.lstrip("-")
            # Register the option right away so that flags given without any
            # value are kept (they are flattened to an empty string below)
            args.setdefault(key, [])
            continue
        args[key].append(token)

    # Store arguments used to run Marian, flattening single values
    def flatten(vals):
        if not vals:
            return ""
        elif len(vals) == 1:
            return vals[0]
        return vals

    extra_config["arguments"] = {k: flatten(v) for k, v in args.items()}

    if os.environ.get("TASK_ID") is None:
        logger.info(
            "Extra configuration files can only be retrieved in Taskcluster context, skipping."
        )
        return extra_config

    # Handle Marian model and training YAML configuration files (called as --config or -c).
    # `.get` is used so that looking up a missing option does not insert it in the defaultdict.
    config_paths = [*args.get("config", []), *args.get("c", [])]
    for path in config_paths:
        if path.startswith("configs/training"):
            key = "training"
        elif path.startswith("configs/model"):
            key = "model"
        else:
            continue
        try:
            with open(path, "r", encoding="utf-8") as f:
                extra_config[key] = yaml.safe_load(f.read())
        except Exception as e:
            logger.warning(f"Impossible to parse Marian {key} config at {path}: {e}")

    # Handle OpusTrainer configuration, expected next to the Marian model file
    # (called as --model or -m). Fall back to the default path when the option
    # is absent or has no value, and use the first value if several were given.
    model_values = args.get("model") or args.get("m") or ["./model.npz"]
    model_dir = Path(model_values[0]).parent
    train_conf_path = (model_dir / "config.opustrainer.yml").resolve()
    if not train_conf_path.exists():
        logger.warning(f"OpusTrainer configuration file does not exists at {train_conf_path}.")
    else:
        try:
            with open(train_conf_path, "r", encoding="utf-8") as f:
                extra_config["opustrainer"] = yaml.safe_load(f.read())
        except Exception as e:
            logger.warning(f"Impossible to parse OpusTrainer config at {train_conf_path}: {e}")
        else:
            # Only reached when the OpusTrainer configuration was parsed successfully
            logger.info("Reading datasets statistics from OpusTrainer configuration.")
            try:
                dataset_conf = extra_config.get("opustrainer", {}).get("datasets", {})
                extra_config["datasets"] = {
                    key: get_lines_count(path) for key, path in dataset_conf.items()
                }
            except Exception as e:
                logger.warning(
                    f"OpusTrainer configuration could not be read at {train_conf_path}: {e}."
                )

    return extra_config