in voxpopuli/get_lm_data.py [0:0]
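# Builds language-model training data for one VoxPopuli language: downloads
# the VoxPopuli ASR transcripts (plus Europarl v7 text where available),
# normalizes the sentences in parallel, and writes sentences.txt and
# vocabulary.txt under <root>/lm_data/<lang>.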
import os
import tarfile
from multiprocessing import Pool
from pathlib import Path

import tqdm

# DOWNLOAD_BASE_URL, LANG_TOKENS, download_url, load_from_tsv_gz and
# process_text are assumed to be defined elsewhere in this module.


def main(args):
    out_root = Path(args.root) / "lm_data" / args.lang
    out_root.mkdir(exist_ok=True, parents=True)
    asr_root = Path(args.root) / "transcribed_data" / args.lang
    asr_root.mkdir(exist_ok=True, parents=True)
    # Download the VoxPopuli ASR transcript for the requested language
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{args.lang}.tsv.gz"
    path = asr_root / Path(url).name
    if not path.exists():
        download_url(url, asr_root.as_posix(), Path(url).name)
    text = load_from_tsv_gz(path)
    # Download Europarl v7 text (v7 does not include Croatian, so skip "hr")
    if args.lang != "hr":
        for filename in ["europarl.tgz", "tools.tgz"]:
            url = f"https://www.statmt.org/europarl/v7/{filename}"
            if not (out_root / filename).exists():
                download_url(url, out_root.as_posix(), filename)
        with tarfile.open(out_root / "europarl.tgz", "r:gz") as f:
            # Extract only the current language's files, skipping any that
            # are already on disk
            members = [
                i for i in f.getmembers()
                if i.name.startswith(f"txt/{args.lang}")
                and not (out_root / i.name).exists()
            ]
            f.extractall(out_root, members=members)
        with tarfile.open(out_root / "tools.tgz", "r:gz") as f:
            f.extractall(out_root)
        cur_text = set()
        paths = list((out_root / "txt" / args.lang).glob("*.txt"))
        for p in tqdm.tqdm(paths):
            cur_out_path = p.with_suffix(".out")
            # Split paragraphs into sentences with the splitter script
            # shipped in tools.tgz
            script_path = out_root / "tools" / "split-sentences.perl"
            os.system(
                f"perl {script_path.as_posix()} -l {args.lang} -q "
                f"< {p.as_posix()} > {cur_out_path.as_posix()}"
            )
            with open(cur_out_path) as f_o:
                # Drop markup lines (e.g. paragraph markers) emitted by
                # the splitter
                cur_text.update(r.strip() for r in f_o if not r.startswith("<"))
        text.extend(cur_text)
    assert len(text) > 0, "Cannot load any text. Aborting."
    # Normalize all sentences in parallel and collect the vocabulary
    tokens = LANG_TOKENS[args.lang]
    out_text = []
    vocab = set()
    with Pool(args.n_proc) as p:
        for norm_text, uniq_vocab in tqdm.tqdm(
            p.starmap(process_text, [(t, args.lang, tokens) for t in text])
        ):
            out_text.append(norm_text)
            if tokens:
                vocab |= uniq_vocab
    out_path = out_root / "sentences.txt"
    with open(out_path, "w") as o:
        for line in out_text:
            o.write(line + "\n")
    vocab_path = out_root / "vocabulary.txt"
    vocab = sorted(vocab)
    with open(vocab_path, "w") as o:
        o.write(" ".join(vocab))