openvid/openvid_part_id_parquet.py (46 lines of code) (raw):

import pandas as pd from zipstream import ZipStream import tqdm df = pd.read_csv("OpenVid-1M.csv") part_ids = list(range(0, 183)) for multi_part in {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118}: part_ids.remove(multi_part) url = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true" filename_part = [] for part_id in tqdm.tqdm(part_ids): stream = ZipStream(url.format(part=part_id)) filename_part.extend( [ { "video": file.filename.split("/")[-1], "part_id": part_id, "file_offset": file.file_offset, "file_size": file.file_size, } for file in stream.files ] ) # for split parts we get 1 byte of part a to find the size # for part b the central directory offset is - size of part a url_multipart_a = ( "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partaa?download=true" ) url_multipart = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partab?download=true" for part_id in tqdm.tqdm({73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185}): offset = ZipStream.size(url_multipart_a.format(part=part_id)) stream = ZipStream(url_multipart.format(part=part_id), offset=offset) filename_part.extend( [ { "video": file.filename.split("/")[-1], "part_id": part_id, "file_offset": file.file_offset, "file_size": file.file_size, } for file in stream.files ] ) data = pd.DataFrame(filename_part) df = df.merge(data, how="left") df["part_id"] = df["part_id"].astype(pd.Int64Dtype()) df["file_offset"] = df["file_offset"].astype(pd.Int64Dtype()) df["file_size"] = df["file_size"].astype(pd.Int64Dtype()) df.to_parquet("openvid.parquet")