# extract_urls_by_file_type()
#
# From tools/url-checker/url_checker.py

# Fallback pattern for file types without a dedicated extractor; compiled once
# at import time instead of on every call.
_GENERIC_URL_REGEX = re.compile(r'(?:https?://[^\s\'">]+)')


def _flatten_regex_matches(matches):
    """Flatten re.findall() results from patterns that contain capture groups.

    findall() yields tuples when a pattern has multiple groups; keep every
    non-empty group value, and every non-empty plain-string match.
    """
    flat = []
    for match in matches:
        if isinstance(match, tuple):
            flat.extend(m for m in match if m)
        elif match:
            flat.append(match)
    return flat


def extract_urls_by_file_type(file_path):
    """Extract URLs from a file based on its extension.

    Args:
        file_path: Path to the file to scan.

    Returns:
        A list of URL strings found in the file (duplicates are kept).
        On any read/processing error, reports to stdout and returns whatever
        was collected so far — best-effort, never raises.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    urls = []
    file_type = SUPPORTED_FILE_TYPES.get(file_ext, 'Unknown')

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Markdown files: scan line-by-line over the already-read content.
        # (The previous implementation re-opened and re-read the file here.)
        if file_ext == '.md':
            for line in content.splitlines():
                # Strip quotes the regex may have captured around the URL.
                urls.extend(m.strip('"\'') for m in MD_URL_REGEX.findall(line))

        # HTML files
        elif file_ext in ('.html', '.htm'):
            urls.extend(url for url in HTML_HREF_REGEX.findall(content) if url)
            urls.extend(url for url in HTML_SRC_REGEX.findall(content) if url)
            urls.extend(url for url in HTML_LINK_HREF_REGEX.findall(content) if url)
            # Meta "content" attributes hold arbitrary text; keep only values
            # that look like absolute or relative URLs.
            urls.extend(url for url in HTML_META_CONTENT_REGEX.findall(content)
                        if url and url.startswith(('http', '/', '.')))

        # CSS files
        elif file_ext in ('.css', '.scss'):
            urls.extend(url for url in CSS_URL_REGEX.findall(content) if url)

        # JavaScript/TypeScript files (pattern has groups -> tuples from findall)
        elif file_ext in ('.js', '.jsx', '.ts', '.tsx'):
            urls.extend(_flatten_regex_matches(JS_URL_REGEX.findall(content)))

        # Python files
        elif file_ext == '.py':
            urls.extend(url for url in PY_URL_REGEX.findall(content) if url)
            # NOTE: Python imports are not checked as URLs (possible future work).

        # JSON/YAML files
        elif file_ext in ('.json', '.yaml', '.yml'):
            urls.extend(url for url in JSON_URL_REGEX.findall(content) if url)

        # XML files
        elif file_ext == '.xml':
            urls.extend(url for url in XML_URL_REGEX.findall(content) if url)

        # Shell/Bash scripts (pattern has groups -> tuples from findall)
        elif file_ext in ('.sh', '.bash', '.zsh', '.ksh'):
            urls.extend(_flatten_regex_matches(SHELL_URL_REGEX.findall(content)))

        # PowerShell scripts (pattern has groups -> tuples from findall)
        elif file_ext in ('.ps1', '.psm1', '.psd1'):
            urls.extend(_flatten_regex_matches(PS_URL_REGEX.findall(content)))

        # Batch/CMD scripts
        elif file_ext in ('.bat', '.cmd'):
            urls.extend(url for url in BATCH_URL_REGEX.findall(content) if url)

        # Perl/Ruby/other scripting languages
        elif file_ext in ('.pl', '.pm', '.rb', '.php', '.lua', '.tcl',
                          '.groovy', '.awk', '.r', '.R'):
            urls.extend(url for url in SCRIPT_URL_REGEX.findall(content) if url)

        # Configuration files
        elif file_ext in ('.ini', '.conf', '.cfg', '.toml', '.env'):
            urls.extend(url for url in CONFIG_URL_REGEX.findall(content) if url)

        # Any other file type: generic http(s) URL scan.
        else:
            urls.extend(url for url in _GENERIC_URL_REGEX.findall(content) if url)

        print(f"Found {len(urls)} URLs in {file_type} file: {file_path}")

    except Exception as e:
        # Best-effort: report and fall through so one unreadable file does
        # not abort the whole scan.
        print(f"Error processing file {file_path}: {str(e)}")

    return urls