in build_and_upload.py [0:0]
def read_and_validate_data(metadata_dir: str, video_dir: str) -> List[Tuple[str, dict, str]]:
    """
    Pair each metadata JSON file with the video file sharing its base name.

    Every ``*.json`` file directly inside ``metadata_dir`` is matched against
    ``<video_dir>/<stem>.mp4``. Entries are processed in sorted path order so
    the result is deterministic regardless of filesystem listing order.

    Args:
        metadata_dir: Directory containing the ``*.json`` metadata files.
        video_dir: Directory containing the ``*.mp4`` video files.

    Returns:
        A list of ``(filename_base, metadata_content, video_path)`` tuples,
        where ``metadata_content`` is whatever ``json.load`` produced for the
        file (its shape is not validated here) and ``video_path`` is a string.

    Notes:
        JSON files without a matching regular ``.mp4`` file are skipped and
        reported in the printed summary. JSON files that cannot be read or
        parsed are skipped with a printed error; no exception is raised for
        them.
    """
    paired_data = []
    missing_pairs = []

    # Sort the matches: glob() yields entries in arbitrary filesystem order,
    # which would otherwise make the dataset order differ between runs.
    json_files = sorted(Path(metadata_dir).glob('*.json'))
    print(f"Found {len(json_files)} JSON files in metadata directory")

    video_root = Path(video_dir)
    for json_path in json_files:
        base_name = json_path.stem
        video_path = video_root / f"{base_name}.mp4"

        # Require a regular file: a directory that happens to be named
        # "<base>.mp4" is not a usable video.
        if not video_path.is_file():
            missing_pairs.append(base_name)
            continue

        try:
            # JSON is UTF-8 by specification; do not rely on the platform's
            # default locale encoding.
            with open(json_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        except Exception as e:
            # Deliberate best-effort: one unreadable or malformed metadata
            # file must not abort the whole scan.
            print(f"Error reading metadata file {json_path}: {e}")
            continue

        paired_data.append((base_name, metadata, str(video_path)))

    # Report statistics
    print("\nDataset Statistics:")
    print(f"Total paired files: {len(paired_data)}")
    if missing_pairs:
        print(f"Missing video files for: {', '.join(missing_pairs)}")
    return paired_data