in voxpopuli/get_unlabelled_data.py [0:0]
def get(args):
    """Group manifest segments by source audio file and run segmentation.

    Args:
        args: Namespace-like object. ``args.root`` is the dataset root
            directory; raw audio is read from ``<root>/raw_audios`` and output
            goes under ``<root>/unlabelled_data``. ``args.subset`` is passed
            through to ``get_metadata``.

    Side effects:
        Creates ``<root>/unlabelled_data`` and one ``<lang>/<year>``
        sub-directory beneath it for every language/year pair found in the
        manifest, then hands the per-file work items to ``multiprocess_run``
        with ``_segment`` as the worker.
    """
    audio_root = Path(args.root) / "raw_audios"
    out_root = Path(args.root) / "unlabelled_data"
    out_root.mkdir(exist_ok=True, parents=True)

    print("Loading manifest...")
    manifest = get_metadata(out_root, args.subset)

    # Maps the POSIX path of each source .ogg file to its list of
    # (seg_no, start, end) tuples.
    segments_by_audio = defaultdict(list)
    # (lang, year) pairs whose output directory has already been created, so
    # mkdir is issued once per directory rather than once per manifest row.
    created_dirs = set()
    for event_id, seg_no, start, end in tqdm(manifest):
        # Language is the text after the last underscore of the event id;
        # year is its first four characters.
        lang, year = event_id.rsplit("_", 1)[1], event_id[:4]
        lang_year = (lang, year)
        if lang_year not in created_dirs:
            (out_root / lang / year).mkdir(exist_ok=True, parents=True)
            created_dirs.add(lang_year)
        path = audio_root / lang / year / f"{event_id}.ogg"
        segments_by_audio[path.as_posix()].append(
            (seg_no, float(start), float(end))
        )

    # One work item per source audio file: (audio path, segments, output root).
    # Paths are passed as plain strings.
    out_root_posix = out_root.as_posix()
    items = [
        (audio_path, segments, out_root_posix)
        for audio_path, segments in segments_by_audio.items()
    ]
    print(f"Segmenting {len(items):,} files...")
    multiprocess_run(items, _segment)