def main()

in pipeline/clean/merge-parallel.py [0:0]


def main() -> None:
    """Command-line entry point: merge, de-duplicate and sample a parallel corpus.

    Steps, in order:
      1. Parse and validate the command-line arguments.
      2. Resolve the source/target datasets via ``get_datasets``.
      3. Run ``DeduplicateCorpus`` with the output paths
         ``<artifacts>/<name>.<src>.zst`` and ``<artifacts>/<name>.<trg>.zst``.
      4. Call ``sample_corpus`` with those same output paths.
      5. Save the filtering statistics with ``stats.save_json()``.

    Exits with status 2 (standard argparse behavior) when a required argument
    is missing or ``--max_lines`` is neither an integer nor the string "None".
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # The locale, glob, artifacts and name arguments are all used unconditionally
    # below, so they are required: a missing one is reported as a usage error
    # instead of surfacing later as a TypeError or a "None"-named output file.
    parser.add_argument(
        "--src",
        type=str,
        required=True,
        help="The source locale",
    )

    parser.add_argument(
        "--trg",
        type=str,
        required=True,
        help="The target locale",
    )

    parser.add_argument(
        "--datasets_glob",
        type=str,
        required=True,
        help="A glob-style path to the parallel datasets, e.g. /path/to/*.zst",
    )

    # Kept as a string so that the literal value "None" can be passed to mean
    # "no limit"; it is converted to Optional[int] right after parsing.
    parser.add_argument(
        "--max_lines",
        type=str,
        default="None",
        help="The (optionally) maximum number of sentences that will be merged.",
    )

    parser.add_argument(
        "--sample_size", type=int, default=10_000, help="Generate a random sample of sentences."
    )

    parser.add_argument(
        "--artifacts",
        type=Path,
        required=True,
        help="The path to the artifacts directory.",
    )

    parser.add_argument(
        "--name",
        type=str,
        required=True,
        help='The final corpus name, e.g. "corpus" will output a "corpus.en.zst" file.',
    )

    args = parser.parse_args()

    # Validate --max_lines before any dataset work starts so that a malformed
    # value fails fast with a usage message rather than a traceback.
    max_lines: Optional[int] = None
    if args.max_lines != "None":
        try:
            max_lines = int(args.max_lines)
        except ValueError:
            parser.error(
                f'argument --max_lines: expected an integer or "None", got {args.max_lines!r}'
            )

    datasets_src, datasets_trg, total_corpus_bytes = get_datasets(
        args.src, args.trg, args.datasets_glob
    )

    # NOTE(review): this header is logged, but no per-dataset lines are logged
    # after it within this function.
    logger.info("Parallel datasets:")

    src_outpath = args.artifacts / f"{args.name}.{args.src}.zst"
    trg_outpath = args.artifacts / f"{args.name}.{args.trg}.zst"

    stats = FilteringStatistics(args.artifacts / args.name)

    deduplicate_corpus = DeduplicateCorpus(
        datasets_src,
        datasets_trg,
        src_outpath,
        trg_outpath,
        stats,
    )

    deduplicate_corpus.run(total_corpus_bytes, max_lines)

    sample_corpus(
        artifacts=args.artifacts,
        name=args.name,
        sample_size=args.sample_size,
        src_outpath=src_outpath,
        trg_outpath=trg_outpath,
    )

    stats.save_json()