def main()

in scripts/python/url-checker/url-checker.py [0:0]


def _file_by_marker(log_entry, ok_marker, ok_list, broken_list):
    """Append *log_entry* to *ok_list* if it contains *ok_marker*, else to *broken_list*."""
    if ok_marker in log_entry:
        ok_list.append(log_entry)
    else:
        broken_list.append(log_entry)


def _write_log_section(log, title, entries, empty_message):
    """Write one titled result section (header plus entries or a placeholder) to *log*."""
    log.write(f"=== {title} ({len(entries)} links found) ===\n\n")
    if entries:
        log.write("\n".join(strip_ansi_escape_codes(entry) for entry in entries) + "\n\n")
    else:
        log.write(empty_message + "\n\n")


def _group_link_categories(categories):
    """Sort link categories into "no links", "no broken links" and "broken" groups.

    Args:
        categories: iterable of ``(label, ok_count, broken_parts)`` where
            ``broken_parts`` is a list of ``(broken_label, broken_count)``
            pairs (one pair for most categories, two for relative URLs).

    Returns:
        A ``(no_links_types, zero_broken_types, broken_types)`` tuple; each
        element is a list of ``(label, count)`` pairs.
    """
    no_links_types = []     # Categories with no links at all (neither broken nor OK)
    zero_broken_types = []  # Categories with OK links but no broken links
    broken_types = []       # Categories with broken links

    for label, ok_count, broken_parts in categories:
        broken_count = sum(count for _, count in broken_parts)
        if broken_count == 0 and ok_count == 0:
            no_links_types.append((label, 0))
        elif broken_count == 0:
            zero_broken_types.append((label, ok_count))
        else:
            # Only report the sub-categories that actually have broken links
            broken_types.extend((name, count) for name, count in broken_parts if count > 0)

    return no_links_types, zero_broken_types, broken_types


def _build_summary_lines(total_broken, total_ok, no_links_types, zero_broken_types, broken_types):
    """Build the validation summary as ``(console_color, text)`` pairs.

    The same lines are written uncoloured to the log file and coloured to the
    console, so they are produced once here to keep both outputs in sync.
    """
    lines = [(Colors.INFO, f"Link Validation Summary ({total_broken + total_ok} links checked):")]

    # The broken-links headline is red only when something is actually broken
    headline_color = Colors.FAIL if total_broken > 0 else Colors.INFO
    lines.append((headline_color, f"- Broken links: {total_broken}"))
    lines.extend((Colors.FAIL, f"  - {category}: {count}") for category, count in broken_types)

    # Categories with no links found
    if no_links_types:
        lines.append((Colors.NEUTRAL, f"- No links found: {len(no_links_types)} categories"))
        lines.extend((Colors.NEUTRAL, f"  - {category}") for category, _ in no_links_types)

    # Categories with no broken links (but with OK links)
    if zero_broken_types:
        lines.append((Colors.INFO, f"- Categories with no broken links: {len(zero_broken_types)}"))
        lines.extend(
            (Colors.INFO, f"  - {category}: {count} OK links")
            for category, count in zero_broken_types
        )

    lines.append((Colors.OKGREEN, f"- OK links: {total_ok}"))

    # Final conclusion with emoji
    if total_broken > 0:
        lines.append((Colors.FAIL, "❌ Broken links were found. Check the logs for details."))
    else:
        lines.append((Colors.OKGREEN, "✅ All links are valid!"))
    return lines


def main():
    """Check every URL found in the markdown files and report the results.

    For each file returned by ``find_markdown_files()`` the URLs returned by
    ``extract_urls()`` are checked (email, ``http://localhost`` and IP-based
    URLs are skipped). Each result line is appended to a timestamped log file
    while the check runs; once all URLs are processed the same file is
    rewritten as an organized report (broken sections first, then OK sections,
    then a summary). The report is also printed to the console.

    Exits the process via ``sys.exit``: status 1 when any broken link was
    recorded, status 0 otherwise.
    """
    # Lists to track results
    broken_absolute_urls = []
    ok_absolute_urls = []
    broken_relative_urls_with_anchor = []
    broken_relative_urls_without_anchor = []
    ok_relative_urls = []
    broken_image_urls = []
    ok_image_urls = []
    broken_svg_urls = []
    ok_svg_urls = []
    broken_header_urls = []
    ok_header_urls = []
    broken_root_relative_urls = []
    ok_root_relative_urls = []

    http_schemes = ('http', 'https')

    # Get all markdown files
    markdown_files = find_markdown_files()

    # Create log file with timestamp
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_file_with_timestamp = os.path.join(LOG_DIR, f'broken_urls_{timestamp}.log')
    # Make sure the log directory exists so that opening the log cannot fail on it
    if LOG_DIR:
        os.makedirs(LOG_DIR, exist_ok=True)

    print("Starting URL check...")
    start_time = datetime.now()

    # Process all files and URLs - write to log in real-time for monitoring
    with open(log_file_with_timestamp, 'w', encoding='utf-8') as log:
        log.write("URL Checker Results\n\n")
        log.write(f"Log generated on: {timestamp}\n")
        log.write("Processing URLs in real-time...\n\n")
        log.flush()

        for md_file in markdown_files:
            print(f"Processing file: {md_file}")

            for url in extract_urls(md_file):
                # Skip email links
                if EMAIL_REGEX.match(url):
                    print(f"Skipping email URL: {url}")
                    continue

                # Skip localhost and IP-based URLs
                if url.startswith("http://localhost") or is_ip_based_url(url):
                    print(f"Skipping localhost or IP-based URL: {url}")
                    continue

                # Decide whether this is an absolute URL, either as written or
                # after stripping surrounding quotes (avoids false positives).
                url_clean = url.strip('"\'')
                if urlparse(url).scheme in http_schemes:
                    absolute_url = url
                elif urlparse(url_clean).scheme in http_schemes:
                    absolute_url = url_clean
                else:
                    absolute_url = None

                if absolute_url is not None:
                    # Pass the file path along to track the source of the link
                    log_entry = check_absolute_url(absolute_url, md_file)
                    _file_by_marker(log_entry, "[OK ABSOLUTE]", ok_absolute_urls, broken_absolute_urls)
                else:
                    # It's a relative URL, image, SVG, root-relative, or header link.
                    # NOTE(review): the un-stripped `url` (not `url_clean`) is passed
                    # here, as before - confirm check_relative_url copes with quotes.
                    log_entry, is_image, is_svg, is_root_relative, has_anchor = check_relative_url(url, md_file)

                    if "[BROKEN HEADER]" in log_entry:
                        broken_header_urls.append(log_entry)
                    elif "[OK HEADER]" in log_entry:
                        ok_header_urls.append(log_entry)
                    # Image/SVG type takes priority over root-relative
                    elif is_svg:
                        _file_by_marker(log_entry, "[OK SVG]", ok_svg_urls, broken_svg_urls)
                    elif is_image:
                        _file_by_marker(log_entry, "[OK IMAGE]", ok_image_urls, broken_image_urls)
                    elif is_root_relative:
                        _file_by_marker(log_entry, "[OK ROOT-RELATIVE]", ok_root_relative_urls, broken_root_relative_urls)
                    elif "[OK RELATIVE]" in log_entry:
                        ok_relative_urls.append(log_entry)
                    elif "[BROKEN RELATIVE WITH ANCHOR]" in log_entry:
                        broken_relative_urls_with_anchor.append(log_entry)
                    elif "[BROKEN RELATIVE WITHOUT ANCHOR]" in log_entry:
                        broken_relative_urls_without_anchor.append(log_entry)
                    # An entry without a recognised marker used to be dropped from
                    # the report and the exit code; fall back to the has_anchor flag
                    # so that it is still counted as a broken relative URL.
                    elif has_anchor:
                        broken_relative_urls_with_anchor.append(log_entry)
                    else:
                        broken_relative_urls_without_anchor.append(log_entry)

                # Write to log file (real-time monitoring)
                log.write(strip_ansi_escape_codes(log_entry) + "\n")
                log.flush()

    # Calculate runtime
    runtime_duration = datetime.now() - start_time

    # (title, entries, placeholder written to the log when there are no entries),
    # in report order: broken sections first (most important), then OK sections.
    broken_sections = [
        ("Broken Absolute URLs", broken_absolute_urls, "No broken absolute URLs found."),
        ("Broken Relative URLs Without Anchors", broken_relative_urls_without_anchor,
         "No broken relative URLs without anchors found."),
        ("Broken Relative URLs With Anchors", broken_relative_urls_with_anchor,
         "No broken relative URLs with anchors found."),
        ("Broken Root-Relative URLs", broken_root_relative_urls, "No broken root-relative URLs found."),
        ("Broken Image URLs", broken_image_urls, "No broken image URLs found."),
        ("Broken SVG URLs", broken_svg_urls, "No broken SVG URLs found."),
        ("Broken Header Links", broken_header_urls, "No broken header links found."),
    ]
    ok_sections = [
        ("OK Absolute URLs", ok_absolute_urls, "No absolute URLs found."),
        ("OK Relative URLs", ok_relative_urls, "No relative URLs found."),
        ("OK Root-Relative URLs", ok_root_relative_urls, "No root-relative URLs found."),
        ("OK Image URLs", ok_image_urls, "No image URLs found."),
        ("OK SVG URLs", ok_svg_urls, "No SVG URLs found."),
        ("OK Header Links", ok_header_urls, "No header links found."),
    ]

    total_broken = sum(len(entries) for _, entries, _ in broken_sections)
    total_ok = sum(len(entries) for _, entries, _ in ok_sections)

    # (label, OK count, [(broken label, broken count), ...]) per link category;
    # broken relative URLs are reported separately with and without anchors.
    no_links_types, zero_broken_types, broken_types = _group_link_categories([
        ("Absolute URLs", len(ok_absolute_urls),
         [("Absolute URLs", len(broken_absolute_urls))]),
        ("Relative URLs", len(ok_relative_urls),
         [("Relative URLs without anchors", len(broken_relative_urls_without_anchor)),
          ("Relative URLs with anchors", len(broken_relative_urls_with_anchor))]),
        ("Root-relative URLs", len(ok_root_relative_urls),
         [("Root-relative URLs", len(broken_root_relative_urls))]),
        ("Image URLs", len(ok_image_urls),
         [("Image URLs", len(broken_image_urls))]),
        ("SVG URLs", len(ok_svg_urls),
         [("SVG URLs", len(broken_svg_urls))]),
        ("Header links", len(ok_header_urls),
         [("Header links", len(broken_header_urls))]),
    ])
    summary_lines = _build_summary_lines(
        total_broken, total_ok, no_links_types, zero_broken_types, broken_types
    )

    # Rewrite the log file with organized results (replaces the real-time log)
    with open(log_file_with_timestamp, 'w', encoding='utf-8') as log:
        log.write("URL Checker Results\n\n")
        log.write(f"Log generated on: {timestamp}\n")
        log.write(f"Runtime duration: {runtime_duration}\n\n")

        for title, entries, empty_message in broken_sections + ok_sections:
            _write_log_section(log, title, entries, empty_message)

        for _, text in summary_lines:
            log.write(text + "\n")

    # Print results to console
    print(f"Check complete. See {log_file_with_timestamp} for details.")

    print(f"\nLog generated on: {timestamp}")
    print(f"Runtime duration: {runtime_duration}")
    print(f"Total broken absolute URLs: {len(broken_absolute_urls)}")
    print(f"Total broken relative URLs (without anchors): {len(broken_relative_urls_without_anchor)}")
    print(f"Total broken relative URLs (with anchors): {len(broken_relative_urls_with_anchor)}")
    print(f"Total OK absolute URLs: {len(ok_absolute_urls)}")
    print(f"Total OK relative URLs: {len(ok_relative_urls)}")
    print(f"Total broken root-relative URLs: {len(broken_root_relative_urls)}")
    print(f"Total OK root-relative URLs: {len(ok_root_relative_urls)}")
    print(f"Total broken image URLs: {len(broken_image_urls)}")
    print(f"Total OK image URLs: {len(ok_image_urls)}")
    print(f"Total broken SVG URLs: {len(broken_svg_urls)}")
    print(f"Total OK SVG URLs: {len(ok_svg_urls)}")
    print(f"Total broken header links: {len(broken_header_urls)}")
    print(f"Total OK header links: {len(ok_header_urls)}")

    # Broken entries are printed in red, OK entries in green
    for color, sections in ((Colors.FAIL, broken_sections), (Colors.OKGREEN, ok_sections)):
        for title, entries, _ in sections:
            print(f"\n=== {title} ===")
            for entry in entries:
                print(f"{color}{strip_ansi_escape_codes(entry)}{Colors.ENDC}")

    # Print the colour-coded summary, separated from the listings by a blank line
    print()
    for color, text in summary_lines:
        print(f"{color}{text}{Colors.ENDC}")

    # Exit code 1 signals that broken links were found, 0 that all links are valid
    sys.exit(1 if total_broken > 0 else 0)