in pipeline/clean/merge-mono.py [0:0]
import argparse
import glob
from pathlib import Path

# logger, log_memory, compute_line_hashes, FilteringStatistics, format_bytes,
# get_human_readable_file_size, and filter_and_write_monolingual_data are
# defined or imported elsewhere in this module.


def main() -> None:
    parser = argparse.ArgumentParser(description="Merge monolingual datasets.")
    parser.add_argument(
        "--parallel_corpus",
        type=Path,
        help="The path to the parallel corpus of this language, e.g. $MOZ_FETCHES_DIR/corpus.ca.zst",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="The path to the output compressed file, e.g. /builds/worker/artifacts/mono.ca.zst",
    )
    parser.add_argument(
        "--max_sentences", type=int, help="The maximum number of sentences that will be merged."
    )
    parser.add_argument(
        "--datasets_glob",
        type=str,
        help="A glob-style path to the mono datasets, e.g. /path/to/*.zst",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        default=10_000,
        help="The number of sentences to include in a random sample of the output.",
    )
    args = parser.parse_args()
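    # Example invocation (values are illustrative; paths are taken from the
    # help strings above):
    #   python pipeline/clean/merge-mono.py \
    #     --parallel_corpus $MOZ_FETCHES_DIR/corpus.ca.zst \
    #     --output /builds/worker/artifacts/mono.ca.zst \
    #     --max_sentences 100000000 \
    #     --datasets_glob "/path/to/*.zst" \
    #     --sample_size 10000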
    output_path: Path = args.output
    max_sentences: int = args.max_sentences
    parallel_corpus: Path = args.parallel_corpus

    mono_dataset_paths: list[str] = glob.glob(args.datasets_glob)
    if not mono_dataset_paths:
        raise FileNotFoundError(f"No files found matching glob pattern: {args.datasets_glob}")

    logger.info("Monolingual datasets:")
    total_mono_bytes = 0
    for path in mono_dataset_paths:
        # "num_bytes" avoids shadowing the built-in "bytes" type.
        formatted_size, num_bytes = get_human_readable_file_size(path)
        logger.info(f" - {path} ({formatted_size})")
        total_mono_bytes += num_bytes
    logger.info(f" - {format_bytes(total_mono_bytes)} total")

    # Report the size of the parallel corpus, not of the last mono dataset.
    formatted_size = get_human_readable_file_size(parallel_corpus)[0]
    logger.info("Parallel corpus:")
    logger.info(f" - {parallel_corpus} ({formatted_size})")
    # Ensure the output directory exists.
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compute the line hashes so that the monolingual data can be de-duplicated.
    # It's about 10 bytes per hash in a set, so for a 100 million sentence
    # corpus, it would be ~1 GB in memory.
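    # Monolingual lines whose hashes already appear in the parallel corpus are
    # skipped during merging, so the same sentence never ends up in both datasets.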
    log_memory()
    logger.info(f"Compute hashes of the parallel data: {parallel_corpus}")
    line_hashes = compute_line_hashes(parallel_corpus)

    stats = FilteringStatistics(output_path)

    filter_and_write_monolingual_data(
        mono_datasets=mono_dataset_paths,
        output_path=output_path,
        parallel_hashes=line_hashes,
        max_lines=max_sentences,
        sample_size=args.sample_size,
        stats=stats,
    )
logger.info("Done: Merging monolingual datasets")