in utils/build-mono-nllb.py [0:0]
def main() -> None:
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Preserves whitespace in the help text.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("language", metavar="LANG", type=str, help="The two/three letter langtag")
    parser.add_argument(
        "--cleanup", action="store_true", help="Delete the intermediate data files"
    )
    args = parser.parse_args()
    lang: str = args.language

    os.makedirs(DATA_PATH, exist_ok=True)

    mono_file = f"{lang}.txt.gz"
    mono_path = DATA_PATH / mono_file
    mono_url = f"https://object.pouta.csc.fi/OPUS-NLLB/v1/mono/{mono_file}"

    parallel_file = f"en-{lang}.txt.zip"
    parallel_path = DATA_PATH / parallel_file
    parallel_url = f"https://object.pouta.csc.fi/OPUS-NLLB/v1/moses/{parallel_file}"

    output_gzip_path = DATA_PATH / f"nllb-mono-{lang}.txt.gz"
    sample_path = DATA_PATH / f"nllb-mono-{lang}.sample.txt"
    output_info_path = DATA_PATH / f"nllb-mono-{lang}.info.json"
    if output_gzip_path.exists():
        print(f"{output_gzip_path} exists")
    else:
        if mono_path.exists():
            print(f"{mono_file} exists")
        else:
            stream_download_to_file(mono_url, mono_path)

        if parallel_path.exists():
            print(f"{parallel_file} exists")
        else:
            stream_download_to_file(parallel_url, parallel_path)

        # zip contents:
        # ├── README
        # ├── LICENSE
        # ├── NLLB.en-sl.en
        # ├── NLLB.en-sl.sl
        # └── NLLB.en-sl.scores
        print("Compute a hash of all the sentences in the parallel data.")
        print(f"{parallel_path}")
        sentence_hashes, sentences_visited = compute_hashes_in_parallel_data(parallel_path, lang)

        print(f"There are {len(sentence_hashes):,} unique sentences out of {sentences_visited:,}")
        print(f'{(sentences_visited - len(sentence_hashes)):,} "{lang}" sentences were duplicated')

        print("Identifying and writing out monolingual data.")
        kept_count, discard_count = filter_and_write_monolingual_data(
            mono_path, output_gzip_path, sentence_hashes
        )

        print(f"Dataset created {output_gzip_path}")
        print(f"{kept_count:,} kept, {discard_count:,} discarded")
        # json.dump writes to the file, so it must be opened for writing.
        with output_info_path.open("w") as file:
            data = {"sentences_kept": kept_count, "sentences_discarded": discard_count}
            json.dump(data, file, indent=2)
    if sample_path.exists():
        print(f"{sample_path} exists")
    else:
        print(f"Building a sample of the data: {sample_path}")
        build_dataset_sample(output_gzip_path, sample_path, f"nllb-mono-{lang}")

    if args.cleanup:
        print(f"Cleaning up {mono_path}")
        mono_path.unlink()
        print(f"Cleaning up {parallel_path}")
        parallel_path.unlink()
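

# Assumed script entry point so that e.g. `python utils/build-mono-nllb.py sl` runs main().
if __name__ == "__main__":
    main()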