dev-tools/scripts/refguide/refguide-download-js-css.py (108 lines of code) (raw):

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script processes all static html files for Solr's reference guide and
downloads external JS and CSS files to local folders js/ and css/ for each
version. It also updates the HTML files to reference the local files.
Context is that ASF policy for web sites changed to not allow external
references to JS and CSS files, and these sites were generated long ago.
"""

import argparse
import os
import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# A version number that forms a whole path segment, e.g. ".../3.6.0/...".
_VERSION_IN_URL_RE = re.compile(r'/(\d+\.\d+(\.\d+)?)/')
# A version number anywhere inside a file name, e.g. "jquery-3.6.0.min.js".
_VERSION_IN_NAME_RE = re.compile(r'\d+\.\d+(\.\d+)?')

# Any reference whose URL mentions MathJax is downloaded from this URL instead.
MATHJAX_URL = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"

# Stylesheets that are never downloaded by this script.
SKIP_CSS_FILES = ("font-awesome.min.css",)


def extract_version_from_url(url):
    """Extract version number from URL if present.

    Returns the first ``N.M`` or ``N.M.P`` path segment of *url*, or ``None``
    when the URL has no such segment.
    """
    match = _VERSION_IN_URL_RE.search(url)
    return match.group(1) if match else None


def is_external_url(url):
    """Check if a URL is external (starts with http/https or //).

    URLs whose host is ``apache.org`` or one of its subdomains are never
    considered external. Only the host is inspected, so "apache.org"
    appearing in a path or query string of a third-party URL does not
    exempt it.
    """
    if not url.startswith(("http://", "https://", "//")):
        return False
    host = urlparse(url).hostname or ""
    return not (host == "apache.org" or host.endswith(".apache.org"))


def download_file(url, dest_path):
    """Download a file from a URL to a local path.

    Nothing is done when *dest_path* already exists. Failures are reported
    on stdout and otherwise ignored so that one unreachable resource does
    not abort the whole run.
    """
    if os.path.exists(dest_path):
        # Already fetched by an earlier page or an earlier run.
        return
    if url.startswith("//"):
        url = "https:" + url  # Default to HTTPS for protocol-relative URLs
    if url.startswith("https://oss.maxcdn.com/"):
        # Fetch from cdnjs instead of the oss.maxcdn.com host.
        url = url.replace("https://oss.maxcdn.com/", "https://cdnjs.cloudflare.com/ajax/")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(dest_path, "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download {url}: {e}")
    else:
        print(f"Downloaded {url} to {dest_path}")


def add_version_to_filename(filename, version):
    """Add version number to filename.

    The version is inserted before the extension, treating ``.min.js`` and
    ``.min.css`` as a single extension. A name without any extension simply
    gets ``-<version>`` appended.

    Example:
        jquery.js -> jquery-3.6.0.js
        jquery.min.js -> jquery-3.6.0.min.js
    """
    for suffix in (".min.js", ".min.css"):
        if filename.endswith(suffix):
            return f"{filename[:-len(suffix)]}-{version}{suffix}"
    stem, dot, extension = filename.rpartition(".")
    if not dot:
        return f"{filename}-{version}"
    return f"{stem}-{version}.{extension}"


def _with_version(filename, url):
    """Return *filename* carrying the version found in *url*, unless it already has one."""
    version = extract_version_from_url(url)
    if version and not _VERSION_IN_NAME_RE.search(filename):
        return add_version_to_filename(filename, version)
    return filename


def _localize_line(line, js_dir, css_dir, skip_files):
    """Return the replacement for *line*, or ``None`` when it must stay as it is.

    A replacement is produced when the line holds a ``<script src=...>`` or a
    ``<link rel="stylesheet" href=...>`` pointing at an external URL. The
    replacement consists of the rewritten tag only, followed by a newline.
    """
    lowered = line.lower()
    if "<script" not in lowered and "<link" not in lowered:
        # Cheap pre-filter: no candidate tag, so skip the HTML parser.
        return None

    soup = BeautifulSoup(line, "html.parser")
    script = soup.find("script", src=True)
    link = soup.find("link", rel="stylesheet", href=True)

    if script is not None and is_external_url(script["src"]):
        src = script["src"]
        if "MathJax" in src:
            src = MATHJAX_URL
        filename = os.path.basename(urlparse(src).path)
        if not filename:
            # URL path ends in "/": there is no file name to store it under.
            return None
        filename = _with_version(filename, src)
        download_file(src, os.path.join(js_dir, filename))
        script["src"] = f"js/{filename}"  # Relative path to js/ folder
        return str(script) + "\n"

    if link is not None and is_external_url(link["href"]):
        href = link["href"]
        if "MathJax" in href:
            href = MATHJAX_URL
        filename = os.path.basename(urlparse(href).path)
        if not filename:
            return None
        # NOTE(review): skipped files are still re-pointed at css/ but are not
        # downloaded or renamed; this assumes they are provided locally by
        # other means -- confirm this is the intended meaning of skip_files.
        if filename not in skip_files:
            filename = _with_version(filename, href)
            download_file(href, os.path.join(css_dir, filename))
        link["href"] = f"css/{filename}"  # Relative path to css/ folder
        return str(link) + "\n"

    return None


def process_html_file(html_file_path, js_dir, css_dir, skip_files=None):
    """Process an HTML file to localize external JS and CSS references.

    The file is handled line by line. Each line holding an external script
    or stylesheet reference is replaced by the rewritten tag; the referenced
    file is downloaded into *js_dir* or *css_dir*. The HTML file is only
    written back when at least one line changed.

    *skip_files* is an optional collection of stylesheet file names that must
    not be downloaded; ``None`` means no file is skipped.
    """
    skip_files = frozenset(skip_files or ())

    with open(html_file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    modified = False
    new_lines = []
    for line in lines:
        replacement = _localize_line(line, js_dir, css_dir, skip_files)
        if replacement is None:
            new_lines.append(line)
        else:
            new_lines.append(replacement)
            modified = True

    if modified:
        with open(html_file_path, "w", encoding="utf-8") as f:
            f.writelines(new_lines)
        print(f"Updated {html_file_path}")


def main():
    """Localize JS/CSS references in every ``N_M`` version folder of the given checkout."""
    parser = argparse.ArgumentParser(description='Process HTML files to localize external JS and CSS references.')
    parser.add_argument('folder', help='Folder of svn checkout (https://svn.apache.org/repos/infra/sites/solr/guide/)')
    args = parser.parse_args()

    base_dir = args.folder

    # Iterate over the folder structure (sorted for a reproducible run order)
    folders = sorted(name for name in os.listdir(base_dir) if re.match(r'\d+_\d+', name))
    if not folders:
        print(f"No versioned directories 'N_M' found in {base_dir}, exiting.")
        return

    for root_dir in folders:
        print(f"\nProcessing directory {root_dir}")
        print("=================================")

        full_path = os.path.join(base_dir, root_dir)
        # A plain file whose name looks like a version must not be treated
        # as a version folder.
        if not os.path.isdir(full_path):
            print(f"Directory {full_path} not found, skipping.")
            continue

        js_dir = os.path.join(full_path, "js")
        css_dir = os.path.join(full_path, "css")
        os.makedirs(js_dir, exist_ok=True)
        os.makedirs(css_dir, exist_ok=True)

        # Process each HTML file in the directory
        for filename in sorted(os.listdir(full_path)):
            if filename.endswith(".html"):
                html_file_path = os.path.join(full_path, filename)
                process_html_file(html_file_path, js_dir, css_dir, SKIP_CSS_FILES)


if __name__ == "__main__":
    main()