in pipeline/clean/merge-parallel.py
import argparse
from pathlib import Path
from typing import Optional

# Assumed context: logger, get_datasets, FilteringStatistics, DeduplicateCorpus,
# and sample_corpus are defined elsewhere in this module (not shown in this excerpt).

def main() -> None:
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--src",
        type=str,
        help="The source locale",
    )
    parser.add_argument(
        "--trg",
        type=str,
        help="The target locale",
    )
    parser.add_argument(
        "--datasets_glob",
        type=str,
        help="A glob-style path to the parallel datasets, e.g. /path/to/*.zst",
    )
    parser.add_argument(
        "--max_lines",
        # Kept as a string so that the literal "None" can be passed to mean "no limit".
        type=str,
        default="None",
        help="The maximum number of sentences that will be merged (optional).",
    )
    parser.add_argument(
        "--sample_size",
        type=int,
        default=10_000,
        help="The number of sentences in the randomly generated sample.",
    )
    parser.add_argument(
        "--artifacts",
        type=Path,
        help="The path to the artifacts directory.",
    )
    parser.add_argument(
        "--name",
        type=str,
        help='The final corpus name, e.g. "corpus" will output a "corpus.en.zst" file.',
    )
    args = parser.parse_args()

    datasets_src, datasets_trg, total_corpus_bytes = get_datasets(
        args.src, args.trg, args.datasets_glob
    )
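    # Based on the usage below, datasets_src and datasets_trg are assumed to be
    # the matched source- and target-side dataset files selected by the glob,
    # and total_corpus_bytes their combined size, presumably used for progress
    # reporting during the merge.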
    logger.info("Parallel datasets:")

    src_outpath = args.artifacts / f"{args.name}.{args.src}.zst"
    trg_outpath = args.artifacts / f"{args.name}.{args.trg}.zst"

    stats = FilteringStatistics(args.artifacts / args.name)
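    # --max_lines arrives as a string; the default literal "None" means that
    # every sentence is merged.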
    max_lines: Optional[int] = None
    if args.max_lines != "None":
        max_lines = int(args.max_lines)
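    # Merge the datasets into a single deduplicated corpus, writing the
    # zst-compressed source and target files while recording filtering stats.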
    deduplicate_corpus = DeduplicateCorpus(
        datasets_src,
        datasets_trg,
        src_outpath,
        trg_outpath,
        stats,
    )
    deduplicate_corpus.run(total_corpus_bytes, max_lines)
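    # Write out a random sample of the merged corpus so the result can be
    # spot-checked by hand.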
    sample_corpus(
        artifacts=args.artifacts,
        name=args.name,
        sample_size=args.sample_size,
        src_outpath=src_outpath,
        trg_outpath=trg_outpath,
    )

    stats.save_json()
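
# A minimal sketch of the usual script entry point; the guard itself is not
# part of this excerpt, and the invocation below is purely illustrative
# (locales, paths, and values are assumptions, not the project's defaults):
#
#   python pipeline/clean/merge-parallel.py \
#     --src en \
#     --trg ru \
#     --datasets_glob "fetches/*.zst" \
#     --max_lines 100000000 \
#     --sample_size 10000 \
#     --artifacts ./artifacts \
#     --name corpus
if __name__ == "__main__":
    main()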