in tools/url-checker/url_checker.py [0:0]
def extract_headers(md_file):
    """Extract header slugs from a markdown file for anchor-link validation.

    Every ATX header line (a line whose first non-blank character is ``#``)
    is converted to a slug: lowercased, ``*``/``_``/backtick formatting
    characters removed, every remaining character that is not a word
    character, hyphen or space dropped, and whitespace runs replaced by a
    single hyphen.

    Lines inside fenced code blocks (``` or ~~~) are skipped, so shell or
    Python comments in code samples are not mistaken for headers.

    Args:
        md_file: Path to the file to scan. Must end in ``.md``
            (case-insensitive); any other path yields a warning and an
            empty list.

    Returns:
        A list of slug strings in document order. Duplicates are kept. If
        the file cannot be read, a warning is printed and the slugs
        collected so far (possibly none) are returned.
    """
    headers = []
    # Only attempt to extract headers from markdown files
    if not md_file.lower().endswith('.md'):
        print(f"Warning: Attempted to extract headers from non-markdown file: {md_file}")
        return headers

    # Marker ('```' or '~~~') of the fenced code block we are inside, if any.
    open_fence = None
    try:
        with open(md_file, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()

                if open_fence is not None:
                    # A fence is closed by a line made up solely of the
                    # same fence character; everything else is code.
                    if stripped.startswith(open_fence) and not stripped.strip(open_fence[0]):
                        open_fence = None
                    continue

                if stripped.startswith(('```', '~~~')):
                    fence_char = stripped[0]
                    info_string = stripped.lstrip(fence_char)
                    # A backtick fence may not carry backticks in its info
                    # string; such a line is inline code, not a fence.
                    if fence_char == '~' or '`' not in info_string:
                        open_fence = stripped[:3]
                    continue

                if not stripped.startswith('#'):
                    continue

                # Remove the '#' markers from the whitespace-stripped line so
                # that indented headers are handled like unindented ones.
                header_text = stripped.lstrip('#').strip()
                # Drop an optional closing sequence such as "## Title ##".
                header_text = re.sub(r'\s+#+$', '', header_text)
                # Remove markdown formatting (bold, italic, code)
                header_text_clean = re.sub(r'[*_`]', '', header_text.lower())
                # Create slug: keep only word chars, hyphens and spaces,
                # then turn each whitespace run into a single hyphen.
                header_slug = re.sub(r'[^\w\- ]', '', header_text_clean)
                header_slug = re.sub(r'\s+', '-', header_slug)

                headers.append(header_slug)
                print(f"Found header: '{header_text}' -> slug: '{header_slug}'")
    except (OSError, UnicodeError) as e:
        # Best effort: an unreadable or undecodable file must not abort the
        # whole link check, so warn and return whatever was collected.
        print(f"Warning: Could not extract headers from {md_file}: {str(e)}")
    return headers