in voxpopuli/get_unlabelled_data.py [0:0]
def get_metadata(out_root, subset):
    """Load the unlabelled-segment annotations that belong to ``subset``.

    The gzipped annotation table is looked up under ``out_root`` and is
    fetched with ``download_url`` only when it is not already there.

    Args:
        out_root: Directory (path-like object supporting ``/`` and
            ``as_posix()``) that holds, or will receive, the annotation file.
        subset: Subset name. ``"10k_sd"`` reads the ``unlabelled_sd`` table
            (``|``-delimited); every other value reads ``unlabelled_v2``
            (tab-delimited).

    Returns:
        A list of 4-tuples of strings, one per retained row: the
        event/session id, the segment id, the start and the end, taken
        verbatim from the table.
    """

    def keep(event_id):
        # Decide whether a row with this event/session id is in the subset.
        plenary = "PLENARY" in event_id
        if subset in {"10k", "10k_sd"}:
            # The first eight characters are compared as an integer
            # (presumably a YYYYMMDD date -- confirm against the data).
            return plenary and 20190101 <= int(event_id[:8]) < 20200801
        if subset == "100k":
            return plenary
        if subset in LANGUAGES:
            return plenary and event_id.endswith(subset)
        if subset in LANGUAGES_V2:
            # Only the part of the subset name before the first "_" is
            # matched against the id suffix.
            return event_id.endswith(subset.split("_")[0])
        # Any other subset name keeps every row.
        return True

    is_sd = subset == '10k_sd'
    stem = "unlabelled_sd" if is_sd else "unlabelled_v2"
    url = f"{DOWNLOAD_BASE_URL}/annotations/{stem}.tsv.gz"
    archive_name = Path(url).name
    tsv_path = out_root / archive_name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), archive_name)

    # The two tables differ only in delimiter and column names; the first
    # column is the id that the filter is applied to.
    if is_sd:
        delimiter = "|"
        columns = ("session_id", "id_", "start_time", "end_time")
    else:
        delimiter = "\t"
        columns = ("event_id", "segment_no", "start", "end")
    id_column = columns[0]

    rows = []
    with gzip.open(tsv_path, mode="rt") as f:
        for record in csv.DictReader(f, delimiter=delimiter):
            if keep(record[id_column]):
                rows.append(tuple(record[name] for name in columns))
    return rows