def get_extra_marian_config()

in tracking/translations_parser/parser.py [0:0]


    def get_extra_marian_config(self) -> dict:
        """
        Read extra configuration files (Marian, OpusTrainer, extra CLI arguments).
        Publication outside of a Taskcluster context (offline mode) cannot access
        the configuration files, only extra-args will be set in this case.

        The returned dict always holds the following keys, each left to None
        when the corresponding data could not be read:
          - "arguments": Marian command line options parsed from the description.
            A flag without value maps to "", a single value maps to that string
            and multiple values map to a list of strings.
          - "model" / "training": parsed YAML of the files passed via --config/-c
            whose path starts with "configs/model" / "configs/training".
          - "opustrainer": parsed YAML of "config.opustrainer.yml", looked up in
            the directory of the model path (--model/-m, "./model.npz" by default).
          - "datasets": for each entry of the OpusTrainer "datasets" section, the
            value returned by get_lines_count for its path.

        Reading errors are logged as warnings and never raised to the caller.
        """
        extra_config = {
            "arguments": None,
            "model": None,
            "training": None,
            "datasets": None,
            "opustrainer": None,
        }

        if (
            self.description is None
            or (match := MARIAN_ARGS_REGEX.search(self.description)) is None
        ):
            logger.error(self.description)
            logger.warning(
                "Invalid Marian description, skipping Marian and OpusTrainer configuration detection."
            )
            return extra_config

        logger.info("Reading Marian command line arguments.")
        (arguments_str,) = match.groups()

        def is_flag(token: str) -> bool:
            """Tell option names (-c, --config) apart from values such as -1 or -0.5."""
            if not token.startswith("-"):
                return False
            try:
                float(token)
            except ValueError:
                return True
            # A negative number is a value of the current option, not a new option
            return False

        # Build args from the command line input text
        args = defaultdict(list)
        flag = None
        for token in shlex.split(arguments_str):
            if is_flag(token):
                # Only leading dashes belong to the option syntax
                flag = token.lstrip("-")
                # Register the option right away so value-less flags are not lost
                args.setdefault(flag, [])
                continue
            # Tokens met before any option are grouped under the None key
            args[flag].append(token)

        # Store arguments used to run Marian, flattening single values
        def flatten(vals):
            if not vals:
                return ""
            elif len(vals) == 1:
                return vals[0]
            return vals

        extra_config["arguments"] = {k: flatten(v) for k, v in args.items()}

        if os.environ.get("TASK_ID") is None:
            logger.info(
                "Extra configuration files can only be retrieved in Taskcluster context, skipping."
            )
            return extra_config

        # Handle Marian model and training YAML configuration files (called as --config or -c)
        # Both spellings are merged, as they may be mixed in a single command line
        config_paths = [*args.get("config", []), *args.get("c", [])]
        for path in config_paths:
            if path.startswith("configs/training"):
                config_key = "training"
            elif path.startswith("configs/model"):
                config_key = "model"
            else:
                continue
            try:
                with open(path, "r", encoding="utf-8") as f:
                    extra_config[config_key] = yaml.safe_load(f)
            except Exception as e:
                logger.warning(f"Impossible to parse Marian {config_key} config at {path}: {e}")

        # Handle OpusTrainer configuration
        # The model path comes from --model (or -m), the first value is used so that
        # an unexpected number of values cannot break the publication
        model_values = args.get("model") or args.get("m") or ["./model.npz"]
        model_dir = Path(model_values[0]).parent
        train_conf_path = (model_dir / "config.opustrainer.yml").resolve()
        if not train_conf_path.exists():
            logger.warning(f"OpusTrainer configuration file does not exists at {train_conf_path}.")
            return extra_config

        try:
            with open(train_conf_path, "r", encoding="utf-8") as f:
                extra_config["opustrainer"] = yaml.safe_load(f)
        except Exception as e:
            logger.warning(f"Impossible to parse OpusTrainer config at {train_conf_path}: {e}")
            return extra_config

        logger.info("Reading datasets statistics from OpusTrainer configuration.")
        try:
            # An empty or non-mapping YAML document has no .get and is reported below
            dataset_conf = extra_config["opustrainer"].get("datasets", {})
            extra_config["datasets"] = {
                name: get_lines_count(dataset_path) for name, dataset_path in dataset_conf.items()
            }
        except Exception as e:
            logger.warning(
                f"OpusTrainer configuration could not be read at {train_conf_path}: {e}."
            )

        return extra_config