openvid/openvid.py (25 lines of code) (raw):

"""Download the high-aesthetic subset of the OpenVid-1M video archives.

Reads the metadata table at ``PARQUET_PATH``, keeps the rows whose
``"aesthetic score"`` is at least 7 and whose ``part_id`` is not listed in
``MULTI_PART``, then, for every remaining part, opens the remote zip archive
and downloads the archive entries whose base name appears in the selected
``"video"`` column.  Downloads for one part run on 8 worker threads.

All work happens at import/run time; the module defines no functions.
"""

from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from tqdm import tqdm
from zipstream import ZipStream

PARQUET_PATH = "openvid.parquet"
BASE_PATH = "H:/openvid"
# skip these for now
# NOTE(review): presumably the parts that are split across several zip
# files on the hub -- confirm before removing ids from this set.
MULTI_PART = {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185}
URL = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true"

df = pd.read_parquet(PARQUET_PATH)

aesthetic = df.loc[df["aesthetic score"] >= 7]
# Build the exclusion mask from `aesthetic` itself rather than from `df`, so
# the filter does not depend on pandas re-aligning a mask that is indexed
# like the unfiltered frame.
aesthetic = aesthetic.loc[~aesthetic["part_id"].isin(MULTI_PART)]

part_ids = list(aesthetic["part_id"].unique())
# Base names of every selected video, across all parts.
filenames = set(aesthetic["video"])

for part_id in part_ids:
    stream = ZipStream(URL.format(part=part_id))

    # Pair each archive entry with its base name (the text after the last
    # "/") once, then keep only the entries that were selected above.
    named_entries = (
        (entry.filename.split("/")[-1], entry) for entry in stream.files
    )
    files = [(name, entry) for name, entry in named_entries if name in filenames]

    with ThreadPoolExecutor(max_workers=8) as executor:
        # The context manager closes the bar when the part is finished (or
        # fails), so bars from earlier parts are not left open.
        with tqdm(desc="download", total=len(files)) as pbar:
            # NOTE(review): `download` is presumably called as
            # (target file name, destination directory) -- confirm against
            # the zipstream module.
            futures = [
                executor.submit(entry.download, name, BASE_PATH)
                for name, entry in files
            ]
            for future in as_completed(futures):
                # result() re-raises any exception raised by the download.
                future.result()
                pbar.update()