video_processing/folder_to_parquet.py (19 lines of code) (raw):

"""Index the video files of a folder into a Parquet file.

Usage:
    python folder_to_parquet.py --path <video folder> --out-path <file.parquet>

The output holds a single column, ``file``, with one row per video file name
(name only, no directory part) found directly inside ``--path``.
"""

import pathlib
from argparse import ArgumentParser

import pandas as pd

# File extensions (without the leading dot) that count as videos.
EXTENSIONS = {"avi", "mkv", "mp4"}


def find_videos(folder: pathlib.Path, extensions=EXTENSIONS) -> list:
    """Return the video paths directly inside *folder*, sorted by file name.

    Args:
        folder: Directory to scan; sub-directories are not descended into.
        extensions: Iterable of extensions (no leading dot) to match.

    Returns:
        A list of ``pathlib.Path`` objects ordered by file name.
    """
    matches = [
        candidate
        for extension in extensions
        for candidate in folder.glob(f"*.{extension}")
    ]
    # Neither set iteration nor glob guarantees an order; sorting makes the
    # resulting table reproducible from one run to the next.
    return sorted(matches, key=lambda candidate: candidate.name)


def build_frame(videos) -> pd.DataFrame:
    """Build the one-column ``file`` table from an iterable of video paths.

    The column is declared explicitly so that an empty input still yields a
    frame with a ``file`` column instead of a frame with no columns at all.
    """
    rows = [{"file": video.name} for video in videos]
    return pd.DataFrame(rows, columns=["file"])


def main(argv=None) -> None:
    """Parse the command line, print the table and write it as Parquet.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = ArgumentParser()
    parser.add_argument("--path", type=str, required=True)
    parser.add_argument("--out-path", type=str, required=True)
    args = parser.parse_args(argv)

    path = pathlib.Path(args.path)
    out_path = pathlib.Path(args.out_path)

    df = build_frame(find_videos(path))
    print(df)
    df.to_parquet(out_path, compression="snappy")


if __name__ == "__main__":
    main()