scripts/validate-urls.py:

"""Copyright 2018-2022 The Kubeflow Authors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. This script finds .md files under a directory and its subdirectories, extracts http/https URLs from .md files and validates them. This script can be run periodically on kubeflow/website source repository to find outdated URLs, which indicate possible outdated document sections. To run this script, type the following on the command line: python3.8 validate-urls.py -d /path/to/kubeflow/website/content/docs Input: The path of a directory that contains .md files as `-d` command line flag. Output: STDOUT logs in the format of `<file>: <URL> , <status>` and a summary of all invalid URLs at the end. Dependency: You may need to install the `requests` Python package via command line: python3.8 -m pip install requests """ import argparse import os import re import requests parser = argparse.ArgumentParser( description='Validate all URLs in the Kubeflow on Google Cloud website' ) parser.add_argument( '-d', '--dir', dest='input_dir', nargs='?', default='../content/en', help= 'Path to the doc content folder. (Default: %(default)s)', ) # http/https URLs HTTP_PATTERN = re.compile( 'http[s]?://[a-zA-Z\-_?/*\.#\$][a-zA-Z0-9\-_?/*\.#%=\$]+') # Patterns in this white list are considered valid. WHITE_LIST = [ re.compile('http[s]?://localhost'), re.compile('http[s]?://\.\.'), # https://...... re.compile('https://path/to/component.yaml'), re.compile('https://github.com/kubeflow/kfctl/releases/tag') ] def should_skip(url): for p in WHITE_LIST: if p.match(url): return True return False def main(): args = parser.parse_args() # find all md files under INPUT_DIR. files = [] for (dirpath, dirname, filenames) in os.walk(args.input_dir): for f in filenames: if f.endswith(".md"): files.append(os.path.join(dirpath, f)) print(f'Found {len(files)} MD files') urls = {} for file in files: with open(file, "r") as f: u = HTTP_PATTERN.findall(f.read()) if u: urls[file[len(args.input_dir):]] = u print(f'Found {len(urls)} URLS') problematic_urls = [] for file, urls in urls.items(): for url in urls: if should_skip(url): print(f"skipping {url} ") continue print(f"{file}: URL {url}",end='') try: r = requests.head(url) print(f" , Status {r.status_code}") if r.status_code >= 400 and r.status_code < 500: problematic_urls.append((file, url, r.status_code)) except Exception as e: print(e) problematic_urls.append((file, url, "FAIL")) print("\nSummary:\n") for u in problematic_urls: print(f"|{u[0]} | {u[1]} | {u[2]}|") if __name__ == "__main__": main()