def is_false_positive()

in tools/url-checker/url_checker.py [0:0]


def _compile_patterns(*sources, flags=0):
    """Compile each regex source once, at import time, into a tuple of patterns."""
    return tuple(re.compile(source, flags) for source in sources)


# Literal substrings: a scheme immediately followed by a backslash. Longer
# variants (two or three backslashes) necessarily contain one of these, so
# listing them separately would add nothing.
_BACKSLASH_URL_MARKERS = (
    "http://\\",
    "http:\\",
    "https://\\",
)

# Ordered (log message, compiled patterns) rules. The first rule with any
# pattern that `search`es successfully decides the outcome; order only affects
# which message is printed. Patterns that could never fire because an earlier
# rule (or a broader pattern in the same rule) already matches are left out.
_FALSE_POSITIVE_RULES = (
    # HTTP verbs that appear as bare tokens in PowerShell scripts.
    (
        "Skipping HTTP verb",
        _compile_patterns(
            r'^(?:Get|Post|GET|POST|PUT|PATCH|DELETE|OPTIONS|HEAD|CONNECT|TRACE)$',
        ),
    ),
    # Placeholder hostnames used in configs/templates (matched case-insensitively).
    (
        "Skipping placeholder hostname URL",
        _compile_patterns(r'https?://influxPlaceholder', flags=re.IGNORECASE),
    ),
    # Raw GitHub URLs under the repo's main branch that are treated as placeholders.
    (
        "Skipping GitHub raw placeholder URL",
        _compile_patterns(
            r'https://raw\.githubusercontent\.com/microsoft/azure_arc/main/.+/',
        ),
    ),
    # Template base URL variables.
    (
        "Skipping false positive template URL",
        _compile_patterns(
            r'\(\$templateBaseUrl',
            r'\(\$env:templateBaseUrl',
            r'\(\$Using:templateBaseUrl',
            r'\$Env:templateBaseUrl',  # PowerShell environment variable syntax
        ),
    ),
    # Storage account and GitHub placeholders.
    (
        "Skipping false positive placeholder URL",
        _compile_patterns(
            r'https://\{STORAGEACCOUNT\}\.blob\.core\.windows\.net/',
            r'https://\$githubPat@github\.com/\$githubUser/\$appsRepo\.git',
            r'http://\$URL:\$PORT',
            r'https://\$\(\$HCIBoxConfig\.WACVMName\)\.',
            r'https://\$stagingStorageAccountName\.blob\.core\.windows\.net/\$containerName/config',
        ),
    ),
    # PowerShell variable names that look like URLs or paths but are not.
    (
        "Skipping PowerShell variable URL",
        _compile_patterns(
            r'\$websiteUrls',                     # also covers indexing / member access
            r'\$mqttExplorerReleasesUrl',
            r'\$mqttExplorerReleaseDownloadUrl',
            r'\$terminalDownloadUri',
            r'\$uri',
            r'\$url',
            r'\$downloadUrl',
            r'\$aksEEReleasesUrl',
            r'\$AKSEEReleaseDownloadUrl',
            r'\$localPathStorageUrl',
            r'\$acsadeployYamlUrl',
            r'\$aksEEk3sUrl',
            r'\$githubApiUrl',
            r'\$fabricHeaders',
            r'\$_',                               # current pipeline object
        ),
    ),
    # Script files that appear to be URLs but are not.
    (
        "Skipping script file or command URL",
        _compile_patterns(r'get_helm\.sh'),
    ),
    # Backslash forms not already caught by _BACKSLASH_URL_MARKERS.
    (
        "Skipping escaped backslash URL pattern",
        _compile_patterns(
            r'https:\\',   # https: followed directly by a backslash
            r'http:/\\',   # single forward slash, then a backslash
        ),
    ),
    # Template variables: ${var} and $var, as host or as first path segment.
    # A host match already covers any trailing port or path.
    (
        "Skipping template variable URL",
        _compile_patterns(
            r'http://\${[^}]+}',
            r'https://\${[^}]+}',
            r'http://\$[a-zA-Z0-9_]+',
            r'https://\$[a-zA-Z0-9_]+',
            r'https://[^/]+/\$[a-zA-Z0-9_]+',
            r'https://[^/]+/\${[^}]+}',
        ),
    ),
    # Variables used as a query-string value.
    (
        "Skipping query variable URL",
        _compile_patterns(
            r'https://[^?]+\?[^=]+=\$[a-zA-Z0-9_]+',  # https://example.com?param=$variable
            r'https://[^?]+\?[^=]+=\${[^}]+}',        # https://example.com?param=${variable}
        ),
    ),
    # Local script invocations such as ./script.sh.
    (
        "Skipping local script file",
        _compile_patterns(
            r'^\./[a-zA-Z0-9_-]+\.(?:sh|ps1|bat|cmd)$',
            r'\./akri\.sh$',  # not anchored at the start: any prefix is accepted
        ),
    ),
    # GitHub API URLs built from variables.
    (
        "Skipping GitHub API URL variable",
        _compile_patterns(
            r'\$gitHubAPIBaseUri/repos/[^/]+/[^/]+',
            r'api\.github\.com/repos/\$[a-zA-Z0-9_]+/',
        ),
    ),
    # Management API domains that are valid but often give auth errors.
    (
        "Skipping management API domain",
        _compile_patterns(r'management\.core\.windows\.net'),
    ),
)

# XML namespace identifiers that are not meant to be fetched.
_XML_NAMESPACE_PREFIXES = (
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1999/xlink',
)

# Exact URLs known to be problematic to check. URLs already matched by a rule
# in _FALSE_POSITIVE_RULES are not repeated here.
_HARDCODED_URLS_TO_SKIP = frozenset({
    "https://api.fabric.microsoft.com",
    "https://api.powerbi.com",
    "https://dashboards.kusto.windows.net",
    "https://api.kusto.windows.net",
    "https://analysis.windows.net",
    "https://wabi-us-central-b-primary-redirect.analysis.windows.net",
})


def is_false_positive(url):
    """Check if a URL is a known false positive pattern that should be skipped.

    Checks run in a fixed order and stop at the first hit:

    1. the URL's network location is listed in ``KNOWN_VALID_DOMAINS``;
    2. the URL contains a scheme followed by a backslash;
    3. the URL matches one of the regex rules in ``_FALSE_POSITIVE_RULES``;
    4. the URL starts with an XML namespace identifier;
    5. the URL is one of the exact URLs in ``_HARDCODED_URLS_TO_SKIP``.

    Args:
        url: The candidate URL string extracted by the checker.

    Returns:
        True if the URL should be skipped, False otherwise. Whenever True is
        returned, one line naming the matching rule is printed to stdout.
    """
    # Trusted-domain lookup first: it is the cheapest check.
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6
        # brackets); fall through to the pattern checks in that case.
        pass
    else:
        if netloc in KNOWN_VALID_DOMAINS:
            print(f"Skipping trusted domain URL: {url}")
            return True

    if any(marker in url for marker in _BACKSLASH_URL_MARKERS):
        print(f"Skipping URL with backslashes: {url}")
        return True

    for message, patterns in _FALSE_POSITIVE_RULES:
        if any(pattern.search(url) for pattern in patterns):
            print(f"{message}: {url}")
            return True

    if url.startswith(_XML_NAMESPACE_PREFIXES):
        print(f"Skipping XML namespace URL: {url}")
        return True

    if url in _HARDCODED_URLS_TO_SKIP:
        print(f"Skipping hardcoded URL: {url}")
        return True

    return False