in tools/url-checker/url_checker.py [0:0]
def _compile_all(patterns, flags=0):
    """Compile each regex source string once so callers only pay for matching."""
    return tuple(re.compile(pattern, flags) for pattern in patterns)


# Literal substrings (not regexes) marking "URLs" that are really escaped
# backslash sequences. The entries overlap (any string containing
# ``http://\\`` also contains ``http://\``); they are kept as written.
_BACKSLASH_FRAGMENTS = (
    "http://\\",
    "http:\\",
    "http://\\\\",
    "http:\\\\\\",
    "https://\\",
    "https://\\\\",
)

# Special placeholder hostnames (typically used in configs/templates).
_PLACEHOLDER_HOSTNAMES = (
    r'influxPlaceholder',
)

# Ordered (label, compiled patterns) rules. The first rule with any pattern
# found anywhere in the URL wins, and its label is used in the printed
# "Skipping <label>: <url>" message, so the order here is significant.
_REGEX_SKIP_RULES = (
    # HTTP verbs that are commonly used in PowerShell scripts and not actual URLs
    ("HTTP verb", _compile_all([
        r'^Get$',
        r'^POST$',
        r'^GET$',
        r'^PUT$',
        r'^PATCH$',
        r'^DELETE$',
        r'^OPTIONS$',
        r'^HEAD$',
        r'^CONNECT$',
        r'^TRACE$',
        r'^Post$',
    ])),
    # Placeholder hostnames, matched case-insensitively with optional port
    ("placeholder hostname URL", _compile_all(
        [rf'https?://{hostname}(?::[0-9]+)?/?' for hostname in _PLACEHOLDER_HOSTNAMES],
        re.IGNORECASE,
    )),
    # Patterns for specific GitHub raw URLs that are placeholders
    ("GitHub raw placeholder URL", _compile_all([
        r'https://raw\.githubusercontent\.com/microsoft/azure_arc/main/azure_jumpstart_ag/',
        r'https://raw\.githubusercontent\.com/microsoft/azure_arc/main/.+/',
    ])),
    # Template Base URL patterns
    ("false positive template URL", _compile_all([
        r'\(\$templateBaseUrl',
        r'\(\$env:templateBaseUrl',
        r'\(\$Using:templateBaseUrl',
        r'\$Env:templateBaseUrl',  # PowerShell environment variable syntax
    ])),
    # Storage account and GitHub patterns
    ("false positive placeholder URL", _compile_all([
        r'https://\{STORAGEACCOUNT\}\.blob\.core\.windows\.net/',
        r'https://\$githubPat@github\.com/\$githubUser/\$appsRepo\.git',
        r'http://\$URL:\$PORT',
        r'https://\$\(\$HCIBoxConfig\.WACVMName\)\.',
        r'https://\$stagingStorageAccountName\.blob\.core\.windows\.net/\$containerName/config',
    ])),
    # PowerShell variable names that look like URLs or paths but aren't actual URLs
    ("PowerShell variable URL", _compile_all([
        r'\$websiteUrls',  # Variable holding website URLs
        r'\$websiteUrls\[',  # With array indexing
        r'\$websiteUrls\.',  # With property/method access
        r'\$mqttExplorerReleasesUrl',  # MQTT Explorer releases URL variable
        r'\$mqttExplorerReleaseDownloadUrl',  # MQTT Explorer download URL variable
        r'\$terminalDownloadUri',  # Terminal download URI variable
        r'\$uri',  # Generic URI variable
        r'\$url',  # Generic URL variable
        r'\$downloadUrl',  # Download URL variable
        r'\$aksEEReleasesUrl',  # AKS EE releases URL variable
        r'\$AKSEEReleaseDownloadUrl',  # AKS EE download URL variable
        r'\$localPathStorageUrl',  # Local path storage URL variable
        r'\$acsadeployYamlUrl',  # ACSA deploy YAML URL variable
        r'\$aksEEk3sUrl',  # AKS EE K3s URL variable
        r'\$githubApiUrl',  # GitHub API URL variable
        r'\$fabricHeaders',  # Fabric API headers variable
        r'\$_',  # PowerShell automatic variable for current pipeline object
    ])),
    # Script files and commands that appear to be URLs but aren't
    ("script file or command URL", _compile_all([
        r'get_helm\.sh',  # Helm installation script
        r'http://\\',  # Escaped backslash in URL (not a real URL)
        r'http://\\\\',  # Multiple escaped backslashes in URL
        r'http://\\\\\S*',  # Multiple escaped backslashes with any additional characters
        r'https://\\\\\S*',  # Multiple escaped backslashes with HTTPS
    ])),
    # Escaped backslashes in URLs or JSON path patterns - expanded patterns
    ("escaped backslash URL pattern", _compile_all([
        r'http://\\+',  # One or more backslashes after http://
        r'https://\\+',  # One or more backslashes after https://
        r'http:\\+',  # Backslashes without forward slashes
        r'https:\\+',  # Backslashes without forward slashes
        r'http://\\\\\S*',  # Multiple escaped backslashes with any additional characters
        r'http://\\',  # Single backslash
        r'http:/\\',  # Malformed backslash
        r'https://\\',  # HTTPS with backslash
    ])),
    # Template variable patterns (JavaScript-style ${var} and shell-style $var)
    ("template variable URL", _compile_all([
        r'http://\${[^}]+}',  # ${variable} format
        r'https://\${[^}]+}',
        r'http://\${[^}]+}:[0-9]+',  # With port
        r'https://\${[^}]+}:[0-9]+',
        r'http://\${[^}]+}:[0-9]+/\w+',  # With path after port
        r'https://\${[^}]+}:[0-9]+/\w+',
        r'http://\$[a-zA-Z0-9_]+',  # $variable format (without braces)
        r'https://\$[a-zA-Z0-9_]+',
        r'http://\$[a-zA-Z0-9_]+:[0-9]+',  # With port
        r'https://\$[a-zA-Z0-9_]+:[0-9]+',
        r'https://\$[a-zA-Z0-9_]+/\w+',  # With path (no port)
        r'https://\${[^}]+}/\w+',  # With path (no port) for braced variables
        r'https://[^/]+/\$[a-zA-Z0-9_]+',  # Variable in path
        r'https://[^/]+/\${[^}]+}',  # Braced variable in path
        r'https://\$Env:[a-zA-Z0-9_]+',  # PowerShell Env variables in URLs
        r'https://\$env:[a-zA-Z0-9_]+',  # PowerShell env variables in URLs (lowercase)
    ])),
    # Query string variable patterns
    ("query variable URL", _compile_all([
        r'https://[^?]+\?[^=]+=\$[a-zA-Z0-9_]+',  # https://example.com?param=$variable
        r'https://[^?]+\?[^=]+=\${[^}]+}',  # https://example.com?param=${variable}
    ])),
    # Local script file patterns that shouldn't be checked as URLs
    ("local script file", _compile_all([
        r'^\.\/[a-zA-Z0-9_-]+\.sh$',  # ./script.sh
        r'^\.\/[a-zA-Z0-9_-]+\.ps1$',  # ./script.ps1
        r'^\.\/[a-zA-Z0-9_-]+\.bat$',  # ./script.bat
        r'^\.\/[a-zA-Z0-9_-]+\.cmd$',  # ./script.cmd
        r'\.\/akri\.sh$',  # ./akri.sh specifically
    ])),
    # GitHub API URL patterns with variables
    ("GitHub API URL variable", _compile_all([
        r'\$gitHubAPIBaseUri\/repos\/\$githubUser\/\$appsRepo',
        r'\$gitHubAPIBaseUri\/repos\/[^\/]+\/[^\/]+',
        r'\$githubApiUrl',
        r'api\.github\.com\/repos\/\$[a-zA-Z0-9_]+\/',
    ])),
    # Additional Management API domains that are valid but often give auth errors
    ("management API domain", _compile_all([
        r'management\.core\.windows\.net',
    ])),
)

# XML namespace URLs that aren't meant to be accessed directly; any URL that
# starts with one of these prefixes is skipped.
_XML_NAMESPACE_PREFIXES = (
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1999/xlink',
)

# Specific URLs known to be problematic; compared by exact string equality.
_HARDCODED_URLS_TO_SKIP = frozenset([
    "https://api.fabric.microsoft.com",
    "https://api.powerbi.com",
    "https://dashboards.kusto.windows.net",
    "https://api.kusto.windows.net",
    "https://analysis.windows.net",
    "https://wabi-us-central-b-primary-redirect.analysis.windows.net",
    "https://raw.githubusercontent.com/microsoft/azure_arc/main/azure_jumpstart_ag/",
    "http://influxPlaceholder:8086",
    "https://management.core.windows.net/",  # Azure Management API
])


def is_false_positive(url):
    """Check if a URL is a known false positive pattern that should be skipped.

    The checks run in a fixed order and stop at the first hit: trusted
    domain (netloc listed in the module-level ``KNOWN_VALID_DOMAINS``),
    literal backslash fragments, the ordered regex rules, XML namespace
    prefixes and finally an exact-match list of hard-coded URLs. The reason
    for skipping is printed to stdout before returning.

    Args:
        url: The candidate URL string extracted by the checker.

    Returns:
        True if the URL matches any skip rule, otherwise False.
    """
    # Skip direct domain matches first (most efficient check). Only the
    # parse itself is guarded: urlparse raises ValueError on malformed input
    # (e.g. an unbalanced IPv6 bracket), in which case the remaining checks
    # still run. Other exceptions are deliberately not swallowed.
    try:
        netloc = urlparse(url).netloc
    except ValueError:
        pass
    else:
        if netloc in KNOWN_VALID_DOMAINS:
            print(f"Skipping trusted domain URL: {url}")
            return True

    # Simple string patterns that should be skipped
    if any(fragment in url for fragment in _BACKSLASH_FRAGMENTS):
        print(f"Skipping URL with backslashes: {url}")
        return True

    # Regex-based rules, evaluated in declaration order
    for label, patterns in _REGEX_SKIP_RULES:
        if any(pattern.search(url) for pattern in patterns):
            print(f"Skipping {label}: {url}")
            return True

    # XML namespace identifiers are not meant to be fetched
    if url.startswith(_XML_NAMESPACE_PREFIXES):
        print(f"Skipping XML namespace URL: {url}")
        return True

    # Additional check for specific URLs that we know are problematic
    if url in _HARDCODED_URLS_TO_SKIP:
        print(f"Skipping hardcoded URL: {url}")
        return True

    return False