build_and_upload.py
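"""Build a folder-sharded video dataset and upload it to the Hugging Face Hub.

Pairs each JSON metadata file with its same-named .mp4 video, copies the
pairs into zero-padded numbered folders (each with a metadata.jsonl
manifest), and pushes the result via `huggingface-cli upload-large-folder`.
"""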
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple


def read_and_validate_data(metadata_dir: str, video_dir: str) -> List[Tuple[str, dict, str]]:
"""
Read metadata and video files, matching them by filename.
Returns list of tuples: (filename_base, metadata_content, video_path)
"""
paired_data = []
missing_pairs = []
# Get all json files
json_files = list(Path(metadata_dir).glob('*.json'))
print(f"Found {len(json_files)} JSON files in metadata directory")
for json_path in json_files:
base_name = json_path.stem
video_path = Path(video_dir) / f"{base_name}.mp4"
# Check if corresponding video exists
if not video_path.exists():
missing_pairs.append(base_name)
continue
        try:
            with open(json_path, "r") as f:
                metadata = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            print(f"Error reading metadata file {json_path}: {e}")
            continue
paired_data.append((base_name, metadata, str(video_path)))
    # Report statistics
    print("\nDataset Statistics:")
    print(f"Total paired files: {len(paired_data)}")
    if missing_pairs:
        print(f"Missing video files for: {', '.join(missing_pairs)}")
    return paired_data


def create_video_dataset(
source_video_dir: str,
source_metadata_dir: str,
output_base_dir: str,
examples_per_folder: int = 9500,
max_total_examples: Optional[int] = None,
) -> int:
"""
Create a video dataset organized in folders with accompanying metadata.
Args:
source_video_dir: Directory containing source video files
source_metadata_dir: Directory containing metadata JSON files
output_base_dir: Base directory for the organized dataset
examples_per_folder: Maximum number of examples per folder
max_total_examples: Maximum total examples to process (None for all)
Returns:
int: Total processed examples
"""
# Create base output directory
Path(output_base_dir).mkdir(parents=True, exist_ok=True)
# Read and validate data
paired_data = read_and_validate_data(source_metadata_dir, source_video_dir)
if not paired_data:
raise ValueError("No valid paired files found")
current_folder = 0
processed_examples = 0
current_folder_examples = 0
# Create the first folder
current_folder_path = os.path.join(output_base_dir, f"{current_folder:04d}")
Path(current_folder_path).mkdir(exist_ok=True)
# Initialize metadata file for the current folder
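    # A metadata.jsonl manifest with a "file_name" column is the convention
    # the Hub's folder-based loaders (e.g. the datasets videofolder builder) expect.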
metadata_file = open(os.path.join(current_folder_path, "metadata.jsonl"), "w")
for base_name, metadata, video_path in paired_data:
# Break if we've reached the maximum examples
        if max_total_examples is not None and processed_examples >= max_total_examples:
break
# If we've reached the examples per folder limit, create a new folder
if current_folder_examples >= examples_per_folder:
metadata_file.close()
current_folder += 1
current_folder_examples = 0
current_folder_path = os.path.join(output_base_dir, f"{current_folder:04d}")
Path(current_folder_path).mkdir(exist_ok=True)
metadata_file = open(os.path.join(current_folder_path, "metadata.jsonl"), "w")
# Copy video to new location
video_filename = f"{base_name}.mp4"
destination_path = os.path.join(current_folder_path, video_filename)
shutil.copy2(video_path, destination_path)
# Create metadata entry (including original filename and all metadata)
metadata_entry = {
"file_name": video_filename,
**metadata # Include all fields from original metadata
}
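        # Each manifest line is one JSON object; field names other than
        # "file_name" depend entirely on the source metadata, e.g.:
        # {"file_name": "clip_0001.mp4", "caption": "..."}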
# Write metadata entry
metadata_file.write(json.dumps(metadata_entry) + "\n")
current_folder_examples += 1
processed_examples += 1
if processed_examples % 100 == 0: # Progress update every 100 examples
print(f"Processed {processed_examples} examples")
# Close the last metadata file
metadata_file.close()
print(f"Dataset creation complete. Processed {processed_examples} examples across {current_folder + 1} folders")
    return processed_examples


def upload_to_huggingface(dataset_path: str, hf_dataset_name: str) -> None:
"""Upload the dataset to Hugging Face"""
try:
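        # upload-large-folder splits the upload into multiple commits and can
        # resume after interruption, which suits large video datasets.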
cmd = [
"huggingface-cli",
"upload-large-folder",
hf_dataset_name,
dataset_path,
"--repo-type=dataset"
]
subprocess.run(cmd, check=True)
print(f"Successfully uploaded dataset to {hf_dataset_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error uploading to Hugging Face: {e}")
        sys.exit(1)


def main():
parser = argparse.ArgumentParser(description="Create and upload a video dataset to Hugging Face")
parser.add_argument("--video-dir", required=True, help="Directory containing source video files")
parser.add_argument("--metadata-dir", required=True, help="Directory containing metadata JSON files")
parser.add_argument("--hf-dataset-name", required=True, help="Hugging Face dataset name (e.g., 'username/dataset-name')")
parser.add_argument("--examples-per-folder", type=int, default=9500,
help="Maximum examples per folder (max 10000)")
parser.add_argument("--max-examples", type=int, help="Maximum total examples to process")
parser.add_argument("--temp-dir", help="Temporary directory for dataset creation (default: system temp directory)")
args = parser.parse_args()
# Validate examples_per_folder
if args.examples_per_folder > 10000:
print("Error: examples-per-folder cannot exceed 10000")
sys.exit(1)
elif args.examples_per_folder <= 0:
print("Error: examples-per-folder must be greater than 0")
sys.exit(1)
# Use provided temp directory or create one
temp_base_dir = args.temp_dir or tempfile.mkdtemp()
print(f"Using temporary directory: {temp_base_dir}")
try:
# Create dataset
processed_examples = create_video_dataset(
source_video_dir=args.video_dir,
source_metadata_dir=args.metadata_dir,
output_base_dir=temp_base_dir,
examples_per_folder=args.examples_per_folder,
max_total_examples=args.max_examples
)
if processed_examples > 0:
# Upload to Hugging Face
upload_to_huggingface(temp_base_dir, args.hf_dataset_name)
else:
print("No examples were processed. Aborting upload.")
sys.exit(1)
finally:
if not args.temp_dir: # Only remove if we created the temp directory
            shutil.rmtree(temp_base_dir, ignore_errors=True)


if __name__ == "__main__":
main()
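
# A minimal invocation sketch (paths and repo id below are placeholders):
#
#   python build_and_upload.py \
#       --video-dir ./videos \
#       --metadata-dir ./metadata \
#       --hf-dataset-name username/my-video-dataset \
#       --examples-per-folder 9500
#
# Uploading assumes a configured Hugging Face token, e.g. via
# `huggingface-cli login`.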