def get()

in voxpopuli/get_asr_data.py [0:0]


def get(args):
    in_root = Path(args.root) / "raw_audios" / "original"
    out_root = Path(args.root) / "transcribed_data" / args.lang
    out_root.mkdir(exist_ok=True, parents=True)
    # Get metadata TSV
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{args.lang}.tsv.gz"
    tsv_path = out_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), Path(url).name)
    with gzip.open(tsv_path, "rt") as f:
        metadata = [x for x in csv.DictReader(f, delimiter="|")]
    # Get segment into list
    items = defaultdict(dict)
    manifest = []
    for r in tqdm(metadata):
        split = r["split"]
        if split not in SPLITS:
            continue
        event_id = r["session_id"]
        year = event_id[:4]
        in_path = in_root / year / f"{event_id}_original.ogg"
        cur_out_root = out_root / year
        cur_out_root.mkdir(exist_ok=True, parents=True)
        out_path = cur_out_root / "{}-{}.ogg".format(event_id, r["id_"])
        timestamps = [(t[0], t[1]) for t in literal_eval(r["vad"])]
        items[in_path.as_posix()][out_path.as_posix()] = timestamps
        manifest.append(
            (out_path.stem, r["original_text"], r["normed_text"],
             r["speaker_id"], split, r["gender"])
        )
    items = list(items.items())
    # Segment
    multiprocess_run(items, cut_session)
    # Output per-split manifest
    header = ["id", "raw_text", "normalized_text", "speaker_id", "split", "gender"]
    for split in SPLITS:
        with open(out_root / f"asr_{split}.tsv", "w") as f_o:
            f_o.write("\t".join(header) + "\n")
            for cols in manifest:
                if cols[4] == split:
                    f_o.write("\t".join(cols) + "\n")