def collect_test_environment_stats()

in tools/url-checker/create_test_files.py [0:0]


def collect_test_environment_stats() -> dict:
    """Collect statistics about the test environment created under TEST_ROOT.

    Walks the directory tree rooted at ``TEST_ROOT`` and returns a dict with
    the following keys:

    - ``directory_count``: number of directories below the root (the root
      itself is not counted).
    - ``file_count_by_type``: maps a lower-cased file extension (without the
      leading dot) to the number of files carrying it.
    - ``total_files``: number of files that have an extension; files without
      an extension are not counted.
    - ``max_depth``: deepest directory level below the root (a direct child
      is depth 1). Reported as 1 when only root-level files were found.
    - ``special_dirs``: paths, relative to ``TEST_ROOT``, of directories
      whose name contains a space, ``&``, ``!``, ``.`` or ``-``.
    - ``directory_sizes``: number of files (with or without an extension) in
      each non-root directory, in walk order.
    - ``url_stats``: placeholder estimates of the generated URL kinds,
      derived from ``total_files`` by fixed ratios.
    """
    special_chars = (' ', '&', '!', '.', '-')

    stats = {
        "directory_count": 0,
        "file_count_by_type": {},
        "total_files": 0,
        "max_depth": 0,
        "special_dirs": [],
        "directory_sizes": []
    }

    # Walk the directory structure
    for root, dirs, files in os.walk(TEST_ROOT):
        # Count directories
        stats["directory_count"] += len(dirs)

        # Track special directories
        for d in dirs:
            if any(c in d for c in special_chars):
                # relpath strips only the leading root. A plain str.replace
                # would also remove later occurrences of the root string and
                # would miss the prefix when TEST_ROOT ends with a separator.
                stats["special_dirs"].append(
                    os.path.relpath(os.path.join(root, d), TEST_ROOT)
                )

        # Calculate depth (the root itself is depth 0 and is skipped)
        rel_path = os.path.relpath(root, TEST_ROOT)
        if rel_path != '.':
            depth = len(rel_path.split(os.sep))
            stats["max_depth"] = max(stats["max_depth"], depth)
            stats["directory_sizes"].append(len(files))

        # Count files by type; only files with an extension are tallied
        for file in files:
            ext = os.path.splitext(file)[1].lower()
            if ext:
                ext = ext[1:]  # Remove the leading dot
                stats["file_count_by_type"][ext] = stats["file_count_by_type"].get(ext, 0) + 1
                stats["total_files"] += 1

    # If any files were found at the root level, adjust the depth count
    if not stats["max_depth"] and stats["total_files"] > 0:
        stats["max_depth"] = 1

    # Collect URL stats (types of relative URLs generated).
    # We can't accurately count these after generation, so we use placeholder
    # ratios. In a real implementation, we'd track these during URL generation.
    total_files = stats["total_files"]
    stats["url_stats"] = {
        "direct_paths": int(total_files * 0.4),
        "dot_prefixed": int(total_files * 0.2),
        "parent_traversal": int(total_files * 0.2),
        "directory_paths": int(total_files * 0.1),
        "invalid_paths": int(total_files * 0.1)
    }

    return stats