def get()

in voxpopuli/get_s2s_data.py [0:0]


def get(args):
    src_lang, tgt_lang = args.source_lang, args.target_lang
    if args.use_annotated_target:
        assert tgt_lang in S2S_TGT_LANGUAGES_WITH_HUMAN_TRANSCRIPTION
    in_root = Path(args.root) / "raw_audios" / tgt_lang
    asr_root = Path(args.root) / "transcribed_data" / src_lang
    out_root = asr_root / tgt_lang
    out_root.mkdir(exist_ok=True, parents=True)
    # Get metadata TSV
    url = f"{DOWNLOAD_BASE_URL}/annotations/asr/asr_{src_lang}.tsv.gz"
    tsv_path = asr_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, asr_root.as_posix(), Path(url).name)
    with gzip.open(tsv_path, "rt") as f:
        src_metadata = [x for x in csv.DictReader(f, delimiter="|")]
    src_metadata = {
        "{}-{}".format(r["session_id"], r["id_"]): (
            r["original_text"], r["speaker_id"]
        )
        for r in src_metadata
    }
    ref_sfx = "_ref" if args.use_annotated_target else ""
    url = f"{DOWNLOAD_BASE_URL}/annotations/s2s/s2s_{tgt_lang}{ref_sfx}.tsv.gz"
    tsv_path = out_root / Path(url).name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), Path(url).name)
    with gzip.open(tsv_path, "rt") as f:
        tgt_metadata = [x for x in csv.DictReader(f, delimiter="\t")]
    # Get segment into list
    items = defaultdict(list)
    manifest = []
    print("Loading manifest...")
    for r in tqdm(tgt_metadata):
        src_id = r["id"]
        event_id, _src_lang, utt_id = parse_src_id(src_id)
        if _src_lang != src_lang:
            continue
        year = event_id[:4]
        in_path = in_root / year / f"{event_id}_{tgt_lang}.ogg"
        cur_out_root = out_root / year
        cur_out_root.mkdir(exist_ok=True, parents=True)
        tgt_id = f"{event_id}-{tgt_lang}_{utt_id}"
        out_path = cur_out_root / f"{tgt_id}.ogg"
        items[in_path.as_posix()].append(
            (out_path.as_posix(), float(r["start_time"]), float(r["end_time"]))
        )
        src_text, src_speaker_id = src_metadata[src_id]
        tgt_text = r["tgt_text"] if args.use_annotated_target else ""
        manifest.append((src_id, src_text, src_speaker_id, tgt_id, tgt_text))
    items = list(items.items())
    # Segment
    print(f"Segmenting {len(items):,} files...")
    multiprocess_run(items, _segment)
    # Output per-data-split list
    header = ["src_id", "src_text", "src_speaker_id", "tgt_id", "tgt_text"]
    with open(out_root / f"s2s{ref_sfx}.tsv", "w") as f_o:
        f_o.write("\t".join(header) + "\n")
        for cols in manifest:
            f_o.write("\t".join(cols) + "\n")