def read_and_validate_data()

in build_and_upload.py [0:0]


def read_and_validate_data(metadata_dir: str, video_dir: str) -> List[Tuple[str, dict, str]]:
    """
    Pair each metadata JSON file with the video file sharing its base name.

    Every ``*.json`` file directly inside ``metadata_dir`` is matched against
    ``<video_dir>/<stem>.mp4``. Files are processed in sorted order so the
    result (and the printed report) is deterministic across platforms.

    Args:
        metadata_dir: Directory containing the ``*.json`` metadata files.
        video_dir: Directory containing the ``*.mp4`` video files.

    Returns:
        A list of ``(filename_base, metadata_content, video_path)`` tuples,
        one per JSON file that has a matching video and parses successfully.
        ``metadata_content`` is whatever ``json.load`` produced for the file
        and ``video_path`` is the video location as a string.

    Side effects:
        Prints the number of JSON files found, an error line for each
        metadata file that could not be read or parsed, and a final summary
        including the base names that have no matching video.
    """
    video_root = Path(video_dir)
    paired_data = []
    missing_pairs = []

    # glob() yields entries in arbitrary filesystem order; sort for a
    # reproducible pairing order and report.
    json_files = sorted(Path(metadata_dir).glob('*.json'))

    print(f"Found {len(json_files)} JSON files in metadata directory")

    for json_path in json_files:
        base_name = json_path.stem
        video_path = video_root / f"{base_name}.mp4"

        # Require a regular file: a directory that happens to be named
        # "<base>.mp4" is not a usable video.
        if not video_path.is_file():
            missing_pairs.append(base_name)
            continue

        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(json_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: unreadable file. ValueError: covers both
            # json.JSONDecodeError and UnicodeDecodeError. Skipping the file
            # is deliberate best-effort behaviour.
            print(f"Error reading metadata file {json_path}: {e}")
            continue

        paired_data.append((base_name, metadata, str(video_path)))

    # Report statistics
    print("\nDataset Statistics:")
    print(f"Total paired files: {len(paired_data)}")
    if missing_pairs:
        print(f"Missing video files for: {', '.join(missing_pairs)}")

    return paired_data