def main()

in voxpopuli/get_lm_data.py
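
Builds the language-model training text for one VoxPopuli language: it downloads the VoxPopuli ASR transcripts, adds the Europarl v7 corpus (for every language except Croatian), splits the Europarl documents into sentences, normalizes all lines in parallel with process_text, and writes the results to lm_data/<lang>/sentences.txt and lm_data/<lang>/vocabulary.txt.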


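# Relies on module-level names defined or imported elsewhere in
# voxpopuli/get_lm_data.py: DOWNLOAD_BASE_URL, download_url, load_from_tsv_gz,
# process_text and LANG_TOKENS, plus os, tarfile, tqdm, Path (pathlib)
# and Pool (multiprocessing).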
def main(args):
    out_root = Path(args.root) / "lm_data" / args.lang
    out_root.mkdir(exist_ok=True, parents=True)
    asr_root = Path(args.root) / "transcribed_data" / args.lang
    asr_root.mkdir(exist_ok=True, parents=True)

    # Download and load the VoxPopuli ASR transcripts for the target language
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{args.lang}.tsv.gz"
    path = asr_root / Path(url).name
    if not path.exists():
        download_url(url, asr_root.as_posix(), Path(url).name)
    text = load_from_tsv_gz(path)
    # Get Europarl data
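    # Europarl v7 does not cover Croatian ("hr"), so that language uses the VoxPopuli transcripts only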
    if args.lang != "hr":
        for filename in ["europarl.tgz", "tools.tgz"]:
            url = f"https://www.statmt.org/europarl/v7/{filename}"
            if not (out_root / filename).exists():
                download_url(url, out_root.as_posix(), filename)
        with tarfile.open(out_root / "europarl.tgz", "r:gz") as f:
            members = [
                i for i in f.getmembers()
                if i.name.startswith(f"txt/{args.lang}")
                   and not (out_root / i.name).exists()
            ]
            f.extractall(out_root, members=members)
        with tarfile.open(out_root / "tools.tgz", "r:gz") as f:
            f.extractall(out_root)
        cur_text = set()
        paths = list((out_root / "txt" / args.lang).glob("*.txt"))
        for p in tqdm.tqdm(paths):
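            # Split each Europarl text file into one sentence per line with the split-sentences.perl script shipped in tools.tgz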
            cur_out_path = p.with_suffix('.out')
            script_path = out_root / "tools" / "split-sentences.perl"
            os.system(
                f"perl {script_path.as_posix()} -l {args.lang} -q "
                f"< {p.as_posix()} > {cur_out_path.as_posix()}"
            )
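            # Keep the sentences, skipping Europarl markup lines (those starting with "<", e.g. <CHAPTER ...>, <SPEAKER ...>)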
            with open(cur_out_path) as f_o:
                cur_text.update(r.strip() for r in f_o if not r.startswith("<"))
        text.extend(cur_text)
    assert len(text) > 0, "Cannot load any text. Aborting."

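    # Normalize every line in parallel; process_text returns the normalized text
    # together with its unique vocabulary, which is accumulated only when the
    # language has a predefined token set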
    tokens = LANG_TOKENS[args.lang]

    out_text = []
    vocab = set()
    with Pool(args.n_proc) as p:
        for norm_text, uniq_vocab in tqdm.tqdm(
            p.starmap(process_text, [(t, args.lang, tokens) for t in text])
        ):
            out_text.append(norm_text)
            if tokens:
                vocab |= uniq_vocab

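    # Write one normalized sentence per line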
    out_path = out_root / "sentences.txt"
    with open(out_path, "w") as o:
        for line in out_text:
            o.write(line + "\n")

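    # Write the sorted vocabulary as a single space-separated line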
    vocab_path = out_root / "vocabulary.txt"
    vocab = sorted(vocab)
    with open(vocab_path, "w") as o:
        o.write(" ".join(vocab))