in tools/url-checker/url_checker.py [0:0]
def _new_results():
    """Return an empty result store: one list of log entries per link category."""
    return {
        "broken_absolute": [],
        "ok_absolute": [],
        "broken_relative_with_anchor": [],
        "broken_relative_without_anchor": [],
        "ok_relative": [],
        "broken_image": [],
        "ok_image": [],
        "broken_svg": [],
        "ok_svg": [],
        "broken_header": [],
        "ok_header": [],
        "broken_root_relative": [],
        "ok_root_relative": [],
    }


def _record_absolute(results, log_entry):
    """File an absolute-URL log entry as OK or broken based on its tag."""
    key = "ok_absolute" if "[OK ABSOLUTE]" in log_entry else "broken_absolute"
    results[key].append(log_entry)


def _record_relative(results, log_entry, is_image, is_svg, is_root_relative, has_anchor):
    """File a non-absolute log entry under the matching category list.

    Header tags take precedence over the SVG / image / root-relative flags,
    which in turn take precedence over the plain relative tags.
    """
    if "[BROKEN HEADER]" in log_entry:
        key = "broken_header"
    elif "[OK HEADER]" in log_entry:
        key = "ok_header"
    elif is_svg:
        key = "ok_svg" if "[OK SVG]" in log_entry else "broken_svg"
    elif is_image:
        key = "ok_image" if "[OK IMAGE]" in log_entry else "broken_image"
    elif is_root_relative:
        key = "ok_root_relative" if "[OK ROOT-RELATIVE]" in log_entry else "broken_root_relative"
    elif "[OK RELATIVE]" in log_entry:
        key = "ok_relative"
    elif "[BROKEN RELATIVE WITH ANCHOR]" in log_entry:
        key = "broken_relative_with_anchor"
    elif "[BROKEN RELATIVE WITHOUT ANCHOR]" in log_entry:
        key = "broken_relative_without_anchor"
    else:
        # A non-OK entry without a recognised tag used to be dropped, which hid
        # it from the report, the totals and the exit code. Treat it as broken,
        # like every other category does, and use the anchor flag to pick the
        # bucket.
        key = "broken_relative_with_anchor" if has_anchor else "broken_relative_without_anchor"
    results[key].append(log_entry)


def _malformed_entry(url, detail, file_path):
    """Build the coloured log entry used for URLs that cannot be parsed."""
    return f"{Colors.FAIL}[MALFORMED URL] {url} - {detail} (in file: {file_path}){Colors.ENDC}"


def _check_url(url, file_path, results):
    """Check one URL, record the outcome in *results* and return its log entry.

    Returns None when the URL only turns out to be absolute after stripping
    surrounding quotes and is then recognised as a false positive; such URLs
    are skipped without a log entry.
    """
    try:
        if urlparse(url).scheme in ('http', 'https'):
            # Absolute URL - pass the file path to track the source.
            log_entry = check_absolute_url(url, file_path)
            _record_absolute(results, log_entry)
            return log_entry

        # Strip quotes before further processing to avoid false positives.
        url_clean = url.strip('"\'')
        try:
            if urlparse(url_clean).scheme in ('http', 'https'):
                # It is an absolute URL after all once the quotes are gone.
                if is_false_positive(url_clean):
                    return None
                log_entry = check_absolute_url(url_clean, file_path)
                _record_absolute(results, log_entry)
            else:
                # Relative URL, image, SVG, root-relative or header link.
                # NOTE(review): the unstripped `url` is passed here, not
                # `url_clean` - kept as in the original; confirm it is intended.
                log_entry, is_image, is_svg, is_root_relative, has_anchor = check_relative_url(url, file_path)
                _record_relative(results, log_entry, is_image, is_svg, is_root_relative, has_anchor)
        except ValueError as e:
            # Parsing/checking the cleaned URL failed.
            log_entry = _malformed_entry(url_clean, f"Error: {e}", file_path)
            print(log_entry)
            results["broken_absolute"].append(log_entry)
        return log_entry
    except ValueError as e:
        # Parsing/checking the raw URL failed.
        error_message = str(e)
        if "Invalid IPv6 URL" in error_message:
            detail = "Invalid IPv6 URL format"
        else:
            detail = f"Error: {error_message}"
        log_entry = _malformed_entry(url, detail, file_path)
        print(log_entry)
        results["broken_absolute"].append(log_entry)
        return log_entry


def _format_runtime(runtime_duration):
    """Render a timedelta as seconds, minutes or hours, whichever reads best."""
    runtime_seconds = runtime_duration.total_seconds()
    if runtime_seconds < 60:
        return f"{runtime_seconds:.2f} seconds"
    if runtime_seconds < 3600:
        return f"{runtime_seconds/60:.2f} minutes ({runtime_duration})"
    return f"{runtime_seconds/3600:.2f} hours ({runtime_duration})"


def _report_sections(results):
    """Return (title, empty_message, entries, is_broken) tuples in report order.

    Broken sections come first because they are the most important.
    """
    return [
        ("Broken Absolute URLs", "No broken absolute URLs found.", results["broken_absolute"], True),
        ("Broken Relative URLs Without Anchors", "No broken relative URLs without anchors found.", results["broken_relative_without_anchor"], True),
        ("Broken Relative URLs With Anchors", "No broken relative URLs with anchors found.", results["broken_relative_with_anchor"], True),
        ("Broken Root-Relative URLs", "No broken root-relative URLs found.", results["broken_root_relative"], True),
        ("Broken Image URLs", "No broken image URLs found.", results["broken_image"], True),
        ("Broken SVG URLs", "No broken SVG URLs found.", results["broken_svg"], True),
        ("Broken Header Links", "No broken header links found.", results["broken_header"], True),
        ("OK Absolute URLs", "No absolute URLs found.", results["ok_absolute"], False),
        ("OK Relative URLs", "No relative URLs found.", results["ok_relative"], False),
        ("OK Root-Relative URLs", "No root-relative URLs found.", results["ok_root_relative"], False),
        ("OK Image URLs", "No image URLs found.", results["ok_image"], False),
        ("OK SVG URLs", "No SVG URLs found.", results["ok_svg"], False),
        ("OK Header Links", "No header links found.", results["ok_header"], False),
    ]


def _summarize(results):
    """Compute totals and the per-category breakdown shown in the summary.

    Returns a dict with the keys ``total_broken``, ``total_ok``,
    ``total_links`` and three lists of ``(category, count)`` tuples:
    ``broken_types`` (categories with broken links), ``zero_broken_types``
    (categories with OK links only) and ``no_links_types`` (categories with no
    links at all).
    """
    def count(key):
        return len(results[key])

    total_broken = sum(count(key) for key in (
        "broken_absolute",
        "broken_relative_with_anchor",
        "broken_relative_without_anchor",
        "broken_root_relative",
        "broken_image",
        "broken_svg",
        "broken_header",
    ))
    total_ok = sum(count(key) for key in (
        "ok_absolute",
        "ok_relative",
        "ok_root_relative",
        "ok_image",
        "ok_svg",
        "ok_header",
    ))

    no_links_types = []
    zero_broken_types = []
    broken_types = []

    def classify(label, broken, ok):
        if broken == 0 and ok == 0:
            no_links_types.append((label, 0))
        elif broken == 0:
            zero_broken_types.append((label, ok))
        else:
            broken_types.append((label, broken))

    classify("Absolute URLs", count("broken_absolute"), count("ok_absolute"))

    # Relative URLs share one "OK" list but report their broken links in two
    # separate lines (without / with anchors).
    without_anchor = count("broken_relative_without_anchor")
    with_anchor = count("broken_relative_with_anchor")
    if without_anchor == 0 and with_anchor == 0:
        classify("Relative URLs", 0, count("ok_relative"))
    else:
        if without_anchor > 0:
            broken_types.append(("Relative URLs without anchors", without_anchor))
        if with_anchor > 0:
            broken_types.append(("Relative URLs with anchors", with_anchor))

    classify("Root-relative URLs", count("broken_root_relative"), count("ok_root_relative"))
    classify("Image URLs", count("broken_image"), count("ok_image"))
    classify("SVG URLs", count("broken_svg"), count("ok_svg"))
    classify("Header links", count("broken_header"), count("ok_header"))

    return {
        "total_broken": total_broken,
        "total_ok": total_ok,
        "total_links": total_broken + total_ok,
        "broken_types": broken_types,
        "zero_broken_types": zero_broken_types,
        "no_links_types": no_links_types,
    }


def _write_report(log_path, results, summary, timestamp, runtime_str, runtime_duration):
    """Overwrite the log file at *log_path* with the organized final report."""
    with open(log_path, 'w', encoding='utf-8') as log:
        log.write("URL Checker Results\n\n")
        log.write(f"Log generated on: {timestamp}\n")
        log.write(f"Runtime: {runtime_str}\n")
        log.write(f"Runtime duration: {runtime_duration}\n\n")

        for title, empty_message, entries, _ in _report_sections(results):
            log.write(f"=== {title} ({len(entries)} links found) ===\n\n")
            if entries:
                log.write("\n".join(strip_ansi_escape_codes(entry) for entry in entries) + "\n\n")
            else:
                log.write(empty_message + "\n\n")

        # Summary block
        log.write("\n" + "═" * 80 + "\n")
        log.write(f"📊 LINK VALIDATION SUMMARY ({summary['total_links']} links checked)\n")
        log.write("═" * 80 + "\n\n")

        if summary["total_broken"] > 0:
            log.write(f"❌ BROKEN LINKS: {summary['total_broken']}\n")
            # Only categories that actually have broken links are listed.
            for category, count in summary["broken_types"]:
                log.write(f" • {category}: {count}\n")
            log.write("\n")
        else:
            log.write("✅ BROKEN LINKS: 0 (All links are valid!)\n\n")

        if summary["no_links_types"]:
            log.write(f"📭 NO LINKS FOUND: {len(summary['no_links_types'])}\n")
            for category, _ in summary["no_links_types"]:
                log.write(f" • {category}\n")
            log.write("\n")

        if summary["zero_broken_types"]:
            log.write(f"🔍 CATEGORIES WITH NO BROKEN LINKS: {len(summary['zero_broken_types'])}\n")
            for category, count in summary["zero_broken_types"]:
                log.write(f" • {category}: {count} OK links\n")
            log.write("\n")

        log.write(f"✅ OK LINKS: {summary['total_ok']}\n\n")
        log.write(f"⏱️ RUNTIME: {runtime_str}\n\n")

        if summary["total_broken"] > 0:
            log.write("❌ Broken links were found. Check the logs for details.\n")
        else:
            log.write("✅ All links are valid!\n")


def _print_report(log_path, results, summary, timestamp, runtime_str, runtime_duration):
    """Print the per-category counts, every section and the summary to stdout."""
    print(f"Check complete. See {log_path} for details.")
    print(f"\nLog generated on: {timestamp}")
    print(f"{Colors.INFO}Runtime: {runtime_str}{Colors.ENDC}")
    print(f"Runtime duration: {runtime_duration}")

    count_lines = [
        ("Total broken absolute URLs", "broken_absolute"),
        ("Total broken relative URLs (without anchors)", "broken_relative_without_anchor"),
        ("Total broken relative URLs (with anchors)", "broken_relative_with_anchor"),
        ("Total OK absolute URLs", "ok_absolute"),
        ("Total OK relative URLs", "ok_relative"),
        ("Total broken root-relative URLs", "broken_root_relative"),
        ("Total OK root-relative URLs", "ok_root_relative"),
        ("Total broken image URLs", "broken_image"),
        ("Total OK image URLs", "ok_image"),
        ("Total broken SVG URLs", "broken_svg"),
        ("Total OK SVG URLs", "ok_svg"),
        ("Total broken header links", "broken_header"),
        ("Total OK header links", "ok_header"),
    ]
    for label, key in count_lines:
        print(f"{label}: {len(results[key])}")

    # Same sections, in the same order, as the log file.
    for title, empty_message, entries, is_broken in _report_sections(results):
        print(f"\n=== {title} ({len(entries)} links found) ===")
        if entries:
            color = Colors.FAIL if is_broken else Colors.OKGREEN
            for entry in entries:
                print(f"{color}{strip_ansi_escape_codes(entry)}{Colors.ENDC}")
        else:
            print(empty_message)

    # Summary block
    rule = "═════════════════════════════════════════════════════════"
    print(f"\n{Colors.INFO}{rule}{Colors.ENDC}")
    print(f"{Colors.INFO}📊 LINK VALIDATION SUMMARY ({summary['total_links']} links checked){Colors.ENDC}")
    print(f"{Colors.INFO}{rule}{Colors.ENDC}")
    print()

    if summary["total_broken"] > 0:
        print(f"{Colors.FAIL}❌ BROKEN LINKS: {summary['total_broken']}{Colors.ENDC}")
        # Only categories that actually have broken links are listed.
        for category, count in summary["broken_types"]:
            print(f"{Colors.FAIL} • {category}: {count}{Colors.ENDC}")
        print()
    else:
        print(f"{Colors.OKGREEN}✅ BROKEN LINKS: 0 (All links are valid!){Colors.ENDC}")
        print()

    if summary["no_links_types"]:
        print(f"{Colors.NEUTRAL}📭 NO LINKS FOUND: {len(summary['no_links_types'])}{Colors.ENDC}")
        for category, _ in summary["no_links_types"]:
            print(f"{Colors.NEUTRAL} • {category}{Colors.ENDC}")
        print()

    if summary["zero_broken_types"]:
        print(f"{Colors.SPECIAL}🔍 CATEGORIES WITH NO BROKEN LINKS: {len(summary['zero_broken_types'])}{Colors.ENDC}")
        for category, count in summary["zero_broken_types"]:
            print(f"{Colors.SPECIAL} • {category}: {count} OK links{Colors.ENDC}")
        print()

    print(f"{Colors.OKGREEN}✅ OK LINKS: {summary['total_ok']}{Colors.ENDC}")
    print()
    print(f"{Colors.INFO}⏱️ RUNTIME: {runtime_str}{Colors.ENDC}")
    print()
    print(f"{Colors.INFO}📄 FULL LOGS: {log_path}{Colors.ENDC}")
    print()


def main():
    """Run the URL checker end to end.

    Parses the command-line arguments, collects the files to scan, checks
    every extracted URL while streaming results to a timestamped log file,
    then rewrites that log as an organized report and prints the same report
    to the console.

    Exits the process with status 1 when at least one broken link was
    recorded and with status 0 otherwise.
    """
    args = parse_arguments()

    # Override the module-wide timeout if one was provided.
    global TIMEOUT
    if args.timeout:
        TIMEOUT = args.timeout
        print(f"Using custom timeout: {TIMEOUT} seconds")

    results = _new_results()

    # If a specific directory is provided, only check files there.
    if args.dir:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        test_dir = os.path.join(script_dir, args.dir)
        print(f"Only checking files in test directory: {test_dir}")
        files_to_check = find_files_in_directory(test_dir, args.exclude)
    else:
        files_to_check = find_files_to_check(args.exclude)

    # Create the log file name from the current timestamp.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_file_with_timestamp = os.path.join(LOG_DIR, f'broken_urls_{timestamp}.log')
    if LOG_DIR:
        # No-op when the directory already exists; avoids failing on open().
        os.makedirs(LOG_DIR, exist_ok=True)

    print(f"Starting URL check on {len(files_to_check)} files...")
    start_time = datetime.now()

    # First pass: write entries as they are produced so the run can be
    # monitored live. The file is replaced by the organized report afterwards.
    with open(log_file_with_timestamp, 'w', encoding='utf-8') as log:
        log.write("URL Checker Results\n\n")
        log.write(f"Log generated on: {timestamp}\n")
        log.write("Processing URLs in real-time...\n\n")
        log.flush()

        for file_path in files_to_check:
            file_ext = os.path.splitext(file_path)[1].lower()
            file_type = SUPPORTED_FILE_TYPES.get(file_ext, 'Unknown')
            print(f"Processing {file_type} file: {file_path}")

            for url in extract_urls(file_path):
                # Skip email links
                if EMAIL_REGEX.match(url):
                    print(f"Skipping email URL: {url}")
                    continue
                # Skip localhost and IP-based URLs
                if url.startswith("http://localhost") or is_ip_based_url(url):
                    print(f"Skipping localhost or IP-based URL: {url}")
                    continue
                # Skip false positive URLs
                if is_false_positive(url):
                    continue

                log_entry = _check_url(url, file_path, results)
                if log_entry is None:
                    continue

                log.write(strip_ansi_escape_codes(log_entry) + "\n")
                log.flush()

    runtime_duration = datetime.now() - start_time
    runtime_str = _format_runtime(runtime_duration)
    summary = _summarize(results)

    _write_report(log_file_with_timestamp, results, summary, timestamp, runtime_str, runtime_duration)
    _print_report(log_file_with_timestamp, results, summary, timestamp, runtime_str, runtime_duration)

    # Exit with the appropriate code and final conclusion.
    if summary["total_broken"] > 0:
        print(f"{Colors.FAIL}❌ Broken links were found. Check the logs for details.{Colors.ENDC}")
        sys.exit(1)  # Exit code 1 signals that broken links were found
    else:
        print(f"{Colors.OKGREEN}✅ All links are valid!{Colors.ENDC}")
        sys.exit(0)  # Exit code 0 signals that all links are valid