in voxpopuli/download_audios.py [0:0]
def download(args):
    """Download and unpack the raw audio archives for ``args.subset``.

    The subset decides which ``{language}_{year}.tar`` archives under
    ``{DOWNLOAD_BASE_URL}/audios/`` are fetched:

    * a member of ``LANGUAGES_V2``: the part of the subset name before the
      first ``"_"`` is the language; every year in ``YEARS`` plus its
      ``"<year>_2"`` variant is fetched;
    * a member of ``LANGUAGES``: that language, every year in ``YEARS``;
    * ``"400k"``, ``"100k"``, ``"10k"`` or ``"asr"``: a fixed preset
      (see the table in the body).

    Each archive is downloaded into ``<args.root>/raw_audios``, extracted,
    and the tar file is then deleted.

    Args:
        args: Object exposing ``subset`` (subset name) and ``root``
            (output root directory).

    Raises:
        ValueError: If ``args.subset`` matches none of the cases above.
    """
    if args.subset in LANGUAGES_V2:
        # Only the prefix before the first "_" is used as the archive
        # language (presumably V2 names look like "<lang>_v2" -- confirm
        # against the LANGUAGES_V2 definition).
        languages = [args.subset.split("_")[0]]
        years = YEARS + [f"{y}_2" for y in YEARS]
    elif args.subset in LANGUAGES:
        languages = [args.subset]
        years = YEARS
    else:
        # One table keeps each preset's languages and years side by side.
        presets = {
            "400k": (LANGUAGES, YEARS + [f"{y}_2" for y in YEARS]),
            "100k": (LANGUAGES, YEARS),
            "10k": (LANGUAGES, [2019, 2020]),
            "asr": (["original"], YEARS),
        }
        if args.subset not in presets:
            # Fail with a clear message instead of a TypeError from
            # iterating over None further down.
            raise ValueError(
                f"Unsupported subset {args.subset!r}: expected a language "
                f"code or one of {sorted(presets)}"
            )
        languages, years = presets[args.subset]

    # Language-major order, same as the original nested loops.
    url_list = [
        f"{DOWNLOAD_BASE_URL}/audios/{lang}_{year}.tar"
        for lang in languages
        for year in years
    ]

    out_root = Path(args.root) / "raw_audios"
    out_root.mkdir(exist_ok=True, parents=True)
    print(f"{len(url_list)} files to download...")
    for url in tqdm(url_list):
        tar_name = Path(url).name
        tar_path = out_root / tar_name
        download_url(url, out_root.as_posix(), tar_name)
        extract_archive(tar_path.as_posix())
        # The archive is no longer needed once extracted.
        os.remove(tar_path)