in pipeline/clean/merge-mono.py [0:0]
import argparse
import glob
from pathlib import Path

# logger, log_memory, compute_line_hashes, FilteringStatistics, format_bytes,
# get_human_readable_file_size, and filter_and_write_monolingual_data are
# defined or imported elsewhere in this module.


def main() -> None:
    parser = argparse.ArgumentParser(description="Merge monolingual datasets.")
    parser.add_argument(
        "--parallel_corpus",
        type=Path,
        help="The path to the parallel corpus of this language, e.g. $MOZ_FETCHES_DIR/corpus.ca.zst",
    )
    parser.add_argument(
        "--output",
        type=Path,
        help="The path to the output compressed file, e.g. /builds/worker/artifacts/mono.ca.zst",
    )
    parser.add_argument(
        "--max_sentences", type=int, help="The maximum number of sentences that will be merged."
    )
    parser.add_argument(
        "--datasets_glob",
        type=str,
        help="A glob-style path to the mono datasets, e.g. /path/to/*.zst",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        default=10_000,
        help="The number of sentences to include in a random sample of the output.",
    )
    args = parser.parse_args()
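    # Example invocation (values are illustrative; paths are taken from the
    # help strings above):
    #   python pipeline/clean/merge-mono.py \
    #     --parallel_corpus $MOZ_FETCHES_DIR/corpus.ca.zst \
    #     --output /builds/worker/artifacts/mono.ca.zst \
    #     --max_sentences 100000000 \
    #     --datasets_glob "/path/to/*.zst" \
    #     --sample_size 10000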
    output_path: Path = args.output
    max_sentences: int = args.max_sentences
    parallel_corpus: Path = args.parallel_corpus

    mono_dataset_paths: list[str] = glob.glob(args.datasets_glob)
    if not mono_dataset_paths:
        raise FileNotFoundError(f"No files found matching glob pattern: {args.datasets_glob}")

    logger.info("Monolingual datasets:")
    total_mono_bytes = 0
    for path in mono_dataset_paths:
        # "num_bytes" avoids shadowing the built-in "bytes" type.
        formatted_size, num_bytes = get_human_readable_file_size(path)
        logger.info(f" - {path} ({formatted_size})")
        total_mono_bytes += num_bytes
    logger.info(f" - {format_bytes(total_mono_bytes)} total")

    # Report the size of the parallel corpus, not of the last mono dataset.
    formatted_size = get_human_readable_file_size(parallel_corpus)[0]
    logger.info("Parallel corpus:")
    logger.info(f" - {parallel_corpus} ({formatted_size})")
    # Ensure the output directory exists.
    output_dir = output_path.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Compute the line hashes so that the monolingual data can be de-duplicated.
    # It's about 10 bytes per hash in a set, so for a 100 million sentence
    # corpus, it would be ~1 GB in memory.
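    # Monolingual lines whose hashes already appear in the parallel corpus are
    # skipped during merging, so the same sentence never ends up in both datasets.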
    log_memory()
    logger.info(f"Compute hashes of the parallel data: {parallel_corpus}")
    line_hashes = compute_line_hashes(parallel_corpus)

    stats = FilteringStatistics(output_path)

    filter_and_write_monolingual_data(
        mono_datasets=mono_dataset_paths,
        output_path=output_path,
        parallel_hashes=line_hashes,
        max_lines=max_sentences,
        sample_size=args.sample_size,
        stats=stats,
    )
logger.info("Done: Merging monolingual datasets")