in tools/url-checker/url_checker.py [0:0]
# Compiled once at import time: fallback matcher for file types that have no
# dedicated regex (bare http/https URLs up to whitespace or a quote/bracket).
_GENERIC_URL_REGEX = re.compile(r'(?:https?://[^\s\'">]+)')


def _flatten_findall_matches(matches):
    """Flatten ``re.findall`` results that may contain group tuples.

    Regexes with multiple capture groups make ``findall`` return tuples
    (one entry per group, empty string for non-matching groups); flatten
    those and drop empty strings so callers get a plain list of URLs.
    """
    flat = []
    for match in matches:
        if isinstance(match, tuple):
            flat.extend(m for m in match if m)
        elif match:
            flat.append(match)
    return flat


def extract_urls_by_file_type(file_path):
    """Extract URLs from a file based on its extension.

    Args:
        file_path: Path of the file to scan.

    Returns:
        List of URL strings found in the file (duplicates preserved);
        an empty list if the file cannot be read or parsed.
    """
    file_ext = os.path.splitext(file_path)[1].lower()
    file_type = SUPPORTED_FILE_TYPES.get(file_ext, 'Unknown')
    urls = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Markdown files
        if file_ext == '.md':
            # Scan line by line so MD_URL_REGEX sees single lines, reusing
            # the content already read instead of reopening the file.
            for line in content.splitlines():
                # Strip quotes from URLs
                urls.extend(match.strip('"\'') for match in MD_URL_REGEX.findall(line))
        # HTML files
        elif file_ext in ('.html', '.htm'):
            urls.extend(url for url in HTML_HREF_REGEX.findall(content) if url)
            urls.extend(url for url in HTML_SRC_REGEX.findall(content) if url)
            urls.extend(url for url in HTML_LINK_HREF_REGEX.findall(content) if url)
            # Meta "content" attributes hold arbitrary text; keep only URL-ish
            # values (absolute http(s), root-relative, or relative paths).
            urls.extend(
                url for url in HTML_META_CONTENT_REGEX.findall(content)
                if url and url.startswith(('http', '/', '.'))
            )
        # JS/TS, shell, and PowerShell regexes use alternation groups, so
        # their findall results need tuple flattening.
        elif file_ext in ('.js', '.jsx', '.ts', '.tsx'):
            urls.extend(_flatten_findall_matches(JS_URL_REGEX.findall(content)))
        elif file_ext in ('.sh', '.bash', '.zsh', '.ksh'):
            urls.extend(_flatten_findall_matches(SHELL_URL_REGEX.findall(content)))
        elif file_ext in ('.ps1', '.psm1', '.psd1'):
            urls.extend(_flatten_findall_matches(PS_URL_REGEX.findall(content)))
        else:
            # Remaining file types share one code path: pick the regex,
            # then do a single findall keeping non-empty matches.
            if file_ext in ('.css', '.scss'):
                regex = CSS_URL_REGEX
            elif file_ext == '.py':
                # Python imports are special - we don't check these as URLs
                # but could in the future.
                regex = PY_URL_REGEX
            elif file_ext in ('.json', '.yaml', '.yml'):
                regex = JSON_URL_REGEX
            elif file_ext == '.xml':
                regex = XML_URL_REGEX
            elif file_ext in ('.bat', '.cmd'):
                regex = BATCH_URL_REGEX
            elif file_ext in ('.pl', '.pm', '.rb', '.php', '.lua', '.tcl', '.groovy', '.awk', '.r'):
                # file_ext is lowercased above, so '.r' also covers '.R'
                # (the original listed both; the '.R' entry could never match).
                regex = SCRIPT_URL_REGEX
            elif file_ext in ('.ini', '.conf', '.cfg', '.toml', '.env'):
                regex = CONFIG_URL_REGEX
            else:
                # Unknown file type: generic http(s) URL matcher.
                regex = _GENERIC_URL_REGEX
            urls.extend(url for url in regex.findall(content) if url)

        print(f"Found {len(urls)} URLs in {file_type} file: {file_path}")
    except Exception as e:
        # Best-effort contract: report the failure and return whatever was
        # collected (usually an empty list) rather than raising.
        print(f"Error processing file {file_path}: {str(e)}")
    return urls