utilities/tools/platform/Parse-WAF-Security-Recommendations.py (118 lines of code) (raw):

"""Extract Microsoft Defender for Cloud security recommendations to CSV.

Fetches the Defender for Cloud docs TOC, follows every
``recommendations-reference*`` page, extracts each recommendation's
name, description, severity, type, and related-policy URL, and writes
the results to ``recommendations.csv``.
"""

import csv
import re
import sys
import time

import markdown
import requests
import yaml
from bs4 import BeautifulSoup

# Seconds to wait for each HTTP request before giving up (the original
# code had no timeout and could hang indefinitely on a stalled socket).
REQUEST_TIMEOUT = 30

# Raw-content base for the Defender for Cloud docs repo; TOC and every
# recommendation page hang off this prefix.
DOCS_BASE_URL = (
    "https://raw.githubusercontent.com/MicrosoftDocs/azure-security-docs/"
    "refs/heads/main/articles/defender-for-cloud/"
)

# Pattern for the "Related policy" markdown link pointing at the Azure
# Policy definition blade; group 1 captures the portal URL.
POLICY_URL_RE = re.compile(
    r'Related policy: \[.+?\]'
    r'\((https://portal\.azure\.com/#blade/Microsoft_Azure_Policy/'
    r'PolicyDetailBlade/definitionId/[^)]+)\)'
)


def parse_toc(url):
    """Fetch the docs TOC.yml and return its recommendation-reference entries.

    Args:
        url: Raw URL of the TOC.yml file.

    Returns:
        List of TOC item dicts whose ``href`` starts with
        ``recommendations-reference``; an empty list on any failure
        (network error, bad YAML, ...), after printing the error.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        toc = yaml.safe_load(response.text)

        def find_recommendations(items):
            # Recursively walk the nested TOC tree; leaves carry 'href',
            # branches carry 'items'.
            results = []
            for item in items:
                if 'items' in item:
                    results.extend(find_recommendations(item['items']))
                elif 'href' in item and item['href'].startswith('recommendations-reference'):
                    results.append(item)
            return results

        recommendations = find_recommendations(toc)
        print(f"Found {len(recommendations)} recommendation items in TOC")
        return recommendations
    except Exception as e:
        print(f"Error parsing TOC: {str(e)}")
        return []


def parse_recommendation_page(url):
    """Scrape one recommendations-reference markdown page.

    The page is rendered from Markdown to HTML, then each ``h2``
    containing "Azure" is treated as a category and each following
    ``h3`` (until the next ``h2``) as a single recommendation.

    Args:
        url: Raw URL of the markdown page.

    Returns:
        List of dicts with keys ``name``, ``description``, ``policy_url``,
        ``severity``, and ``type``; ``None`` on failure or when no
        matching ``h2`` is found.
    """
    try:
        print(f"Fetching page: {url}")
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # Convert Markdown to HTML so we can walk headings as a DOM.
        html_content = markdown.markdown(response.text)
        soup = BeautifulSoup(html_content, 'html.parser')

        # Category headings: every h2 mentioning "Azure".
        azure_h2_tags = [h2 for h2 in soup.find_all('h2') if 'Azure' in h2.text]
        if not azure_h2_tags:
            print("No h2 tag with 'Azure' found")
            return None

        recommendations = []
        for h2 in azure_h2_tags:
            name = h2.text.strip()
            print(f"Found name: {name}")

            # Collect all h3 siblings that follow this h2 until the next h2.
            h3_tags = []
            next_element = h2.next_sibling
            while next_element and next_element.name != 'h2':
                if next_element.name == 'h3':
                    h3_tags.append(next_element)
                next_element = next_element.next_sibling

            for h3 in h3_tags:
                description = h3.text.strip()
                print(f"Found description: {description}")

                # Gather the text between this h3 and the next h3/h2; each
                # fragment is kept as its own entry so line-ish boundaries
                # survive into the regex stage below.
                next_element = h3.next_sibling
                policy_url = ''
                severity = ''
                type_info = ''
                additional_info = []
                while next_element and next_element.name not in ['h3', 'h2']:
                    # NavigableString is a str subclass, so test str first.
                    if isinstance(next_element, str):
                        text = next_element.strip()
                        if text:
                            additional_info.append(text)
                    elif next_element.name in ['p', 'li']:
                        text = next_element.text.strip()
                        if text:
                            additional_info.append(text)
                    next_element = next_element.next_sibling

                # Join with newlines (NOT spaces): the Type regex below
                # stops at end-of-line, so a space-join would make it
                # swallow everything to the end of the section.
                additional_text = '\n'.join(additional_info)

                severity_match = re.search(r'Severity:\s*(\w+)', additional_text)
                if severity_match:
                    severity = severity_match.group(1)
                    print(f"Found severity: {severity}")

                type_match = re.search(r'Type:\s*(.+?)(?=\n|$)', additional_text)
                if type_match:
                    type_info = type_match.group(1).strip()
                    print(f"Found type: {type_info}")

                policy_match = POLICY_URL_RE.search(additional_text)
                if policy_match:
                    policy_url = policy_match.group(1)
                    print(f"Found policy URL: {policy_url}")

                recommendations.append({
                    'name': name,
                    'description': description,
                    'policy_url': policy_url,
                    'severity': severity,
                    'type': type_info
                })
        return recommendations
    except Exception as e:
        print(f"Error parsing page {url}: {str(e)}")
        return None


def main():
    """Drive the scrape: parse the TOC, visit each page, write the CSV."""
    recommendation_items = parse_toc(DOCS_BASE_URL + "TOC.yml")
    if not recommendation_items:
        print("No recommendation items found. Exiting.")
        # Non-zero exit: finding nothing is a failure, not success.
        sys.exit(1)

    csv_filename = 'recommendations.csv'
    csv_headers = ['Name', 'Description', 'Policy URL', 'Severity', 'Type']
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        for item in recommendation_items:
            page_url = DOCS_BASE_URL + item['href']
            page_data = parse_recommendation_page(page_url)
            if page_data:
                for recommendation in page_data:
                    writer.writerow({
                        'Name': recommendation['name'],
                        'Description': recommendation['description'],
                        'Policy URL': recommendation['policy_url'],
                        'Severity': recommendation['severity'],
                        'Type': recommendation['type']
                    })
                    print(f"Processed and wrote to CSV: {recommendation['description']}")
            else:
                # TOC entries are not guaranteed to carry 'name'; fall
                # back to the href rather than raising KeyError.
                print(f"Failed to process: {item.get('name', item['href'])}")
            # Be polite to the raw.githubusercontent.com endpoint.
            time.sleep(1)

    print(f"CSV file '{csv_filename}' has been created with the extracted information.")


if __name__ == "__main__":
    main()