def main()

in pipeline/data/mono_importer.py


def main(args_list: Optional[list[str]] = None) -> None:
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,  # Preserves whitespace in the help text.
    )
    parser.add_argument("--dataset", type=str, help="The key for the dataset")
    parser.add_argument("--language", type=str, help="The BCP 47 language tag of the dataset")
    parser.add_argument("--src", type=bool, help="Source language of a language pair")
    parser.add_argument("--trg", type=bool, help="Target language of a language pair")
    parser.add_argument(
        "--max_sentences", type=int, help="The maximum number of sentences to retain"
    )
    parser.add_argument(
        "--hplt_min_doc_score",
        type=float,
        help="The minimum document score to filter datasets that include this metric",
        default=5.0,
    )
    parser.add_argument(
        "--hplt_max_characters",
        type=int,
        help="The maximum length of the output segments. ",
        default=600,
    )
    parser.add_argument(
        "--hplt_merge_lines",
        # argparse's `type=bool` converts any non-empty string to True, so this
        # is a plain on/off flag instead of a value-taking option.
        action="store_true",
        default=False,
        help="Whether to accumulate lines of the same document in one output segment until `hplt_max_characters` is reached.",
    )
    parser.add_argument(
        "--artifacts", type=Path, help="The location where the dataset will be saved"
    )
    args = parser.parse_args(args_list)

    dataset = Dataset(args.dataset)
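    # Assumption: dataset keys follow a "<importer>_<name>" convention (e.g.
    # "news-crawl_news.2021" or "hplt_mono/v2.0"); `importer` and `name` select
    # the download branch below.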

    file_destination: Path = args.artifacts / f"{dataset.file_safe_name()}.{args.language}.zst"

    logger.info(f"Dataset: {args.dataset}")
    logger.info(f"Language: {args.language}")
    logger.info(f"HPLT Max Sentences: {args.max_sentences}")
    logger.info(f"HPLT Minimum Document Score Threshold: {args.hplt_min_doc_score}")
    logger.info(f"HPLT Merge Lines: {args.hplt_merge_lines}")
    logger.info(f"Artifacts: {args.artifacts}")
    logger.info(f"File Destination: {file_destination}")

    args.artifacts.mkdir(parents=True, exist_ok=True)

    if dataset.importer == "hplt":
        if dataset.name != "mono/v2.0":
            raise ValueError("Only HPLT v2.0 is supported")
        HpltDownloader(
            language=args.language,
            hplt_min_doc_score=args.hplt_min_doc_score,
            max_characters=args.hplt_max_characters,
            max_lines=args.max_sentences,
            file_destination=file_destination,
            merge_lines=args.hplt_merge_lines,
        ).download()

        return

    url = None
    if dataset.importer == "url":
        url = dataset.name
    elif dataset.importer == "news-crawl":
        url = f"http://data.statmt.org/news-crawl/{args.language}/{dataset.name}.{args.language}.shuffled.deduped.gz"
        logger.info("Downloading WMT newscrawl monolingual data")
        logger.info(url)
    elif dataset.importer == "opus":
        url = f"https://object.pouta.csc.fi/OPUS-{dataset.name}/mono/{args.language}.txt.gz"
        logger.info("Downloading OPUS monolingual data")
        logger.info(url)
    else:
        raise ValueError(f'Unsupported importer "{dataset.importer}"')

    logger.info(f"URL: {url}")

    with ExitStack() as stack:
        outfile = stack.enter_context(write_lines(file_destination))
        lines = stack.enter_context(read_lines(url))

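        # Deterministically (via the dataset-name seed) shuffle and keep at
        # most `max_sentences` lines; the download size presumably lets the
        # helper sample the stream without buffering the whole file in memory.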
        for line in shuffle_with_max_lines(
            line_stream=lines,
            seed=dataset.name,
            max_lines=args.max_sentences,
            total_byte_size=get_download_size(url),
        ):
            outfile.write(line)

    if args.language == "zh":
        # TODO: For now, convert everything to Simplified Chinese when Chinese
        # is the source language.
        # See https://github.com/mozilla/firefox-translations-training/issues/896
        handle_chinese_mono(
            file_destination, is_src=args.src == "zh", variant=ChineseType.simplified
        )
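

Example (a minimal sketch; the dataset key, language, and output path are hypothetical values assuming the "<importer>_<name>" key convention, not taken from the source):

# Hypothetical invocation: sample up to 100k German sentences from WMT
# news-crawl into ./artifacts.
main([
    "--dataset", "news-crawl_news.2021",
    "--language", "de",
    "--max_sentences", "100000",
    "--artifacts", "artifacts",
])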