check_all_deadlink.py (50 lines of code) (raw):

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import re
import os
from urllib.parse import urlparse

# Inline markdown link: captures the target between the parentheses.
_LINK_PATTERN = re.compile(r"\[.*?\]\((.*?)\)")
# A line that starts with a code fence (opening or closing).
_CODE_FENCE_PATTERN = re.compile(r"^```.*$")
# File suffixes that are accepted as-is as link targets.
_MD_SUFFIXES = (".md", ".mdx")


def process_md_file(file_path):
    """Print an error for every relative link in *file_path* whose target file is missing.

    The file is scanned line by line. Lines inside fenced code blocks are
    ignored. For each inline markdown link, the target is skipped when it is
    empty, a same-page anchor (``#...``), has a URL scheme, or is an absolute
    path. Every other target is resolved relative to the directory of
    *file_path*; a target without a ``.md``/``.mdx`` suffix is treated as
    ``<target>.md``, and a ``.md`` target is also accepted when the sibling
    ``.mdx`` file exists.

    Args:
        file_path: Path of the markdown file to check.

    Returns:
        None. Problems are reported on standard output, one line per broken link.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.read().splitlines()

    base_dir = os.path.dirname(file_path)
    in_code_block = False

    for line_number, line in enumerate(lines, start=1):
        # A fence line toggles the code-block state and is never scanned itself.
        if _CODE_FENCE_PATTERN.match(line):
            in_code_block = not in_code_block
            continue
        if in_code_block:
            continue

        for link in _LINK_PATTERN.findall(line):
            # An empty target such as "[text]()" has no path to resolve.
            # Checking for it explicitly (rather than indexing link[0])
            # avoids an IndexError that would abort the whole scan.
            if not link or link.startswith("#"):
                continue
            # Skip URLs and absolute paths; only relative file links are checked.
            if urlparse(link).scheme or os.path.isabs(link):
                continue

            full_path = os.path.normpath(os.path.join(base_dir, link))

            # Drop the section anchor, keeping only the file part.
            # NOTE(review): the split runs on the joined path, so a "#" in a
            # directory name of file_path would also truncate it.
            if "#" in full_path:
                full_path = full_path.split("#", 1)[0]

            if not full_path.endswith(_MD_SUFFIXES):
                full_path += ".md"

            md_exists = os.path.exists(full_path)
            # A link to "x.md" is also satisfied by an existing "x.mdx".
            mdx_exists = full_path.endswith(".md") and os.path.exists(
                full_path[:-3] + ".mdx"
            )

            if not md_exists and not mdx_exists:
                print(
                    f"Error: File not found for link '{link}' in file '{file_path}:{line_number}'"
                )


def travel(root_path: str):
    """Walk *root_path* recursively and check every ``.md``/``.mdx`` file found.

    Args:
        root_path: Directory to walk with ``os.walk``.
    """
    for root, _dirs, files in os.walk(root_path):
        for file in files:
            if file.endswith(_MD_SUFFIXES):
                process_md_file(os.path.join(root, file))


if __name__ == "__main__":
    # check docs directories
    travel("docs")
    travel("i18n")
    travel("versioned_docs")