def get()

in voxpopuli/get_unlabelled_data.py [0:0]


def get(args):
    """Group manifest segments by source audio file and dispatch segmentation.

    Output is rooted at ``<args.root>/unlabelled_data``; a ``<lang>/<year>``
    sub-directory is created there for every language/year pair that appears
    in the manifest. Audio paths are built as
    ``<args.root>/raw_audios/<lang>/<year>/<event_id>.ogg``. The grouped work
    items are then passed to ``multiprocess_run`` together with ``_segment``.

    Args:
        args: Object exposing ``root`` (base data directory) and ``subset``
            (forwarded unchanged to ``get_metadata``).
    """
    audio_root = Path(args.root) / "raw_audios"
    out_root = Path(args.root) / "unlabelled_data"
    out_root.mkdir(exist_ok=True, parents=True)

    print("Loading manifest...")
    manifest = get_metadata(out_root, args.subset)

    items = defaultdict(list)
    # The output directory depends only on (lang, year), so remember which
    # pairs were already handled instead of issuing a mkdir for every row.
    created = set()
    for event_id, seg_no, start, end in tqdm(manifest):
        # lang is the text after the last "_"; year is the first 4 characters.
        lang, year = event_id.rsplit("_", 1)[1], event_id[:4]
        if (lang, year) not in created:
            (out_root / lang / year).mkdir(exist_ok=True, parents=True)
            created.add((lang, year))
        path = audio_root / lang / year / f"{event_id}.ogg"
        items[path.as_posix()].append((seg_no, float(start), float(end)))

    # One work item per audio file: (audio path, its segments, output root).
    items = [(k, v, out_root.as_posix()) for k, v in items.items()]
    print(f"Segmenting {len(items):,} files...")
    multiprocess_run(items, _segment)