def get_metadata()

in voxpopuli/get_unlabelled_data.py [0:0]


def get_metadata(out_root, subset):
    """Collect the unlabelled-segment rows that belong to ``subset``.

    The gzipped annotation table is downloaded into ``out_root`` unless a
    file with the same name is already present there. It is then read row
    by row and filtered on each row's session/event id.

    Args:
        out_root: Directory used to store the annotation file; it must
            support the ``/`` operator and ``as_posix()`` (e.g. a
            ``pathlib.Path``).
        subset: Name of the requested subset. ``"10k"``, ``"10k_sd"`` and
            ``"100k"`` are handled explicitly, as are members of
            ``LANGUAGES`` and ``LANGUAGES_V2``; any other value keeps
            every row.

    Returns:
        A list of 4-tuples ``(id, segment, start, end)`` holding the
        values exactly as ``csv.DictReader`` yields them.
    """

    def keep(event_id):
        # Decide whether a row with this session/event id is part of `subset`.
        plenary = "PLENARY" in event_id
        if subset in {"10k", "10k_sd"}:
            if not plenary:
                # Short-circuit so the numeric prefix is only parsed for
                # plenary ids.
                return plenary
            # The first eight characters are compared as an integer;
            # presumably a YYYYMMDD date prefix -- TODO confirm.
            return 20190101 <= int(event_id[:8]) < 20200801
        if subset in {"100k"}:
            return plenary
        if subset in LANGUAGES:
            return plenary and event_id.endswith(subset)
        if subset in LANGUAGES_V2:
            # Only the part of the subset name before the first "_" is
            # matched against the id suffix.
            return event_id.endswith(subset.split("_")[0])
        return True

    is_sd = subset == "10k_sd"
    stem = "unlabelled_sd" if is_sd else "unlabelled_v2"
    url = f"{DOWNLOAD_BASE_URL}/annotations/{stem}.tsv.gz"
    archive_name = Path(url).name
    tsv_path = out_root / archive_name
    if not tsv_path.exists():
        download_url(url, out_root.as_posix(), archive_name)

    # The two annotation files differ in delimiter and column names.
    if is_sd:
        delimiter = "|"
        key_column = "session_id"
        columns = ("session_id", "id_", "start_time", "end_time")
    else:
        delimiter = "\t"
        key_column = "event_id"
        columns = ("event_id", "segment_no", "start", "end")

    rows = []
    with gzip.open(tsv_path, mode="rt") as f:
        for record in csv.DictReader(f, delimiter=delimiter):
            if keep(record[key_column]):
                rows.append(tuple(record[name] for name in columns))
    return rows