check_move.py (89 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import argparse
import subprocess
import re
import os
import sys
from typing import AnyStr, List
from urllib.parse import urlparse
# Module-level state shared by the functions in this script.
# Set to True by process_md_file() when a link to a moved/deleted file is found.
change_detected = False
# Paths deleted by the commit; rebound by extract_file_changes().
deletes = []
# (old path, new path) rename pairs; rebound by extract_file_changes().
move_pairs = []
def is_same_file(path1, path2):
    """Return True when both paths are equal after ``os.path.normpath``.

    The comparison is purely textual: redundant separators and ``.``/``..``
    segments are collapsed, but the filesystem is never consulted.
    """
    normalized_first = os.path.normpath(path1)
    normalized_second = os.path.normpath(path2)
    return normalized_first == normalized_second
def remove_suffix(text: str, suffix: str):
    """Return *text* without a trailing *suffix*, or unchanged if absent.

    Only a suffix at the very end of *text* is stripped. An occurrence in the
    middle (e.g. ``".md"`` inside ``"a.md.txt"``) is left alone, and an empty
    *suffix* is a no-op instead of an error.
    """
    # The explicit truthiness check avoids text[:-0], which would return "".
    if suffix and text.endswith(suffix):
        return text[: -len(suffix)]
    return text
def process_md_file(file_path):
    """Check one Markdown file for links to files moved or deleted by the commit.

    Every inline link ``[text](target)`` whose target has no URL scheme and is
    not an absolute path is resolved relative to the directory of *file_path*.
    If the resolved path equals the old path of a rename in ``move_pairs``,
    the link is reported and rewritten in the file to point at the new
    location. If it equals a path in ``deletes``, it is reported only.
    Either case sets the module-level ``change_detected`` flag.

    A ``#fragment`` on the target is ignored while matching and re-attached
    to the rewritten link. Empty and pure-anchor targets are skipped.
    """
    global change_detected

    link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
    doc_suffixes = (".md", ".mdx")
    # In md, the link relative path starts from the directory where the
    # document is located, not the document itself.
    base_dir = os.path.dirname(file_path)

    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # The set of relevant renames and their replacement link text depend only
    # on file_path, so compute them once instead of once per link.
    rename_targets = []
    for from_path, to_path in move_pairs:
        from_base, from_ext = os.path.splitext(from_path)
        to_base, to_ext = os.path.splitext(to_path)
        # Skip change of suffix: the extension-less link stays valid.
        if (
            from_ext in [".md", ".mdx", ""] or to_ext in [".md", ".mdx", ""]
        ) and from_base == to_base:
            continue
        relative_to_path = os.path.relpath(to_path, base_dir)
        relative_to_path = remove_suffix(relative_to_path, ".md")
        relative_to_path = remove_suffix(relative_to_path, ".mdx")
        rename_targets.append((from_path, relative_to_path))

    new_content = content
    # dict.fromkeys de-duplicates while keeping first-seen order, so a link
    # used several times in the file is reported once; str.replace below
    # rewrites every occurrence anyway.
    for link in dict.fromkeys(link_pattern.findall(content)):
        if urlparse(link).scheme or os.path.isabs(link):
            continue

        # Separate "path#fragment" so the fragment does not defeat the
        # suffix check and path comparison below.
        target, hash_sign, fragment = link.partition("#")
        if not target:
            # Empty or in-page anchor link: it does not refer to another file.
            continue

        full_path = os.path.normpath(os.path.join(base_dir, target))
        if not full_path.endswith(doc_suffixes):
            full_path += ".md"

        for from_path, relative_to_path in rename_targets:
            if is_same_file(full_path, from_path):
                new_link = relative_to_path + hash_sign + fragment
                print(
                    f"{file_path} has a link moved by this commit: from {link} to {new_link}"
                )
                change_detected = True
                # Replace the old link with the new one
                new_content = new_content.replace(f"({link})", f"({new_link})")

        for deleted_path in deletes:
            if is_same_file(full_path, deleted_path):
                print(f"{file_path} has a link removed by this commit: {link}")
                change_detected = True

    # Write the updated content back to the file
    if new_content != content:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(new_content)
def extract_file_changes(git_show_output: List[AnyStr]):
    """Parse ``git show`` output and record the renamed and deleted files.

    Args:
        git_show_output: Lines of ``git show`` output, each still carrying its
            trailing newline. Lines may be ``bytes`` (decoded as UTF-8 with
            undecodable bytes replaced) or ``str``.

    Side effects:
        Rebinds the module-level ``move_pairs`` to a list of
        ``(old_path, new_path)`` tuples and ``deletes`` to a list of deleted
        paths, and prints summary counts.

    Note:
        The delete pattern captures the path with ``\\S+``, so paths that
        contain whitespace are not detected.
    """
    global move_pairs
    global deletes

    print(f"commit lines: {len(git_show_output)}")
    content = "".join(
        line.decode("utf-8", errors="replace") if isinstance(line, bytes) else line
        for line in git_show_output
    )

    # re.DOTALL is deliberately NOT used: with it, the trailing ".+" of the
    # delete pattern would swallow the rest of the output and findall() would
    # report only the first deleted file. The "^" anchors (re.MULTILINE) keep
    # indented commit-message text and "+"/"-"-prefixed diff bodies from
    # matching.
    move_pattern = r"^rename from (.+)\nrename to (.+)$"
    move_matches = re.findall(move_pattern, content, re.MULTILINE)
    print(f"moved files: {len(move_matches)}")

    delete_pattern = r"^diff --git a/(\S+) b/\1\ndeleted file mode \d+\nindex .+"
    delete_matches = re.findall(delete_pattern, content, re.MULTILINE)
    print(f"deleted files: {len(delete_matches)}")

    move_pairs = move_matches
    deletes = delete_matches
def travel(root_path: str):
    """Recursively visit *root_path* and check every ``.md``/``.mdx`` file.

    Each matching file found by ``os.walk`` is passed to ``process_md_file``
    as a path joined from its containing directory and file name.
    """
    markdown_suffixes = (".md", ".mdx")
    for current_dir, _subdirs, filenames in os.walk(root_path):
        markdown_names = [
            name for name in filenames if name.endswith(markdown_suffixes)
        ]
        for name in markdown_names:
            process_md_file(os.path.join(current_dir, name))
def main():
    """Check the docs trees for links broken by the commit given on the CLI.

    Runs ``git show <commit_id>``, records the renamed and deleted files via
    ``extract_file_changes``, then scans ``docs``, ``i18n`` and
    ``versioned_docs``. Exits with status 1 when any affected link was found,
    or with git's own exit status when ``git show`` fails.
    """
    parser = argparse.ArgumentParser(description="Add commit id as arguments to check")
    parser.add_argument("commit_id", type=str, help="id of the commit to check")
    args = parser.parse_args()

    # extract all move/delete files
    # An argv list (no shell) keeps the commit id from being interpreted by a
    # shell; stderr is captured separately so git diagnostics are never parsed
    # as diff content.
    result = subprocess.run(
        ["git", "show", args.commit_id],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        # Without usable git output the check cannot be trusted; fail loudly
        # instead of silently reporting that nothing moved.
        sys.stderr.write(result.stderr.decode("utf-8", errors="replace"))
        print(f"git show failed with exit code {result.returncode}")
        sys.exit(result.returncode)

    extract_file_changes(result.stdout.splitlines(keepends=True))

    # check docs directories
    for docs_root in ("docs", "i18n", "versioned_docs"):
        travel(docs_root)

    if change_detected:
        print("Failed!")
        sys.exit(1)


if __name__ == "__main__":
    main()