check_docs_version_diff.py (88 lines of code) (raw):

"""Report documents that are missing or differ between doc versions.

The script reads document ids from ``./sidebars.json`` and, for both the
zh-CN and the English doc trees, checks that ``<id>.md`` exists in the 2.1,
3.0 and current (dev) trees and that the 2.1 and 3.0 copies are
byte-identical to the dev copy. Findings are printed to stdout.
"""

import filecmp
import json
import os

# Document ids containing any of these substrings are skipped.
exclude_keywords = [
    "sql-manual",
    "releasenotes",
    "ecosystem",
    "admin-manual",
    "faq",
    "data-operate",
    "query-data",
    "table-design",
    "gettingStarted",
    "query-acceleration",
    "lakehouse",
    "compute-storage-decoupled",
    "benchmark",
    "db-connect",
    "deploy-on-kubernetes",
]

# Root directories of each doc version, relative to the working directory.
# NOTE: the 2.1 zh-CN prefix previously read "./ii18n/..." (typo), which made
# every zh-CN 2.1 document look missing.
version_21_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/"
version_30_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/"
version_dev_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/current/"
version_21_prefix_en = "./versioned_docs/version-2.1/"
version_30_prefix_en = "./versioned_docs/version-3.0/"
version_dev_prefix_en = "./docs/"


def _collect_items(node, result):
    """Append to *result* every non-excluded string found under an 'items' key.

    Walks lists and dicts recursively; any other value is ignored. A string
    is kept only when it contains none of ``exclude_keywords``.
    """
    if isinstance(node, list):
        for element in node:
            _collect_items(element, result)
    elif isinstance(node, dict):
        if "items" in node:
            result.extend(
                item
                for item in node["items"]
                if isinstance(item, str)
                and not any(keyword in item for keyword in exclude_keywords)
            )
        # Descend into every value (including "items" itself) so that nested
        # dicts with their own "items" are picked up as well.
        for value in node.values():
            _collect_items(value, result)


def extract_items_from_file(file_path: str) -> list:
    """Return the filtered document ids listed in a JSON sidebar file.

    Args:
        file_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The string entries of every ``"items"`` list found anywhere in the
        JSON structure, in traversal order, minus those containing an
        excluded keyword. An empty list is returned (and a message printed)
        when the file cannot be read or parsed.
    """
    result = []
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        _collect_items(data, result)
        return result
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error: {e}")
        return []
    except Exception as e:
        # Best-effort script: report anything else and carry on with no items.
        print(f"An unexpected error occurred: {e}")
        return []


def _diff_doc(directories, prefix_v21, prefix_v30, prefix_dev):
    """Print missing files and content mismatches for one doc tree.

    For each id in *directories*, ``<id>.md`` is looked up under the three
    prefixes. A missing file is reported per version; when both a versioned
    copy and the dev copy exist, they are compared byte-for-byte.
    """
    for directory in directories:
        file_name = directory + ".md"
        path_v21 = os.path.join(prefix_v21, file_name)
        path_v30 = os.path.join(prefix_v30, file_name)
        path_dev = os.path.join(prefix_dev, file_name)

        # Probe the filesystem once per path and reuse the answers below.
        has_v21 = os.path.exists(path_v21)
        has_v30 = os.path.exists(path_v30)
        has_dev = os.path.exists(path_dev)

        if not has_v21:
            print(f"Missing in version 2.1: {path_v21}")
        if not has_v30:
            print(f"Missing in version 3.0: {path_v30}")
        if not has_dev:
            print(f"Missing in current (dev) version: {path_dev}")

        if not has_dev:
            # Nothing to compare against.
            continue

        for label, path, present in (
            ("v21", path_v21, has_v21),
            ("v30", path_v30, has_v30),
        ):
            # shallow=False forces a content comparison rather than trusting
            # os.stat() signatures.
            if present and not filecmp.cmp(path, path_dev, shallow=False):
                print(f"File mismatch between {label} and dev for: {directory}")
                print("path_dev: " + path_dev)
                print(f"path_{label}: " + path)
                print("-" * 50)


def diff_doc_cn(directories):
    """Check the zh-CN doc trees for missing or mismatched documents.

    Args:
        directories: Iterable of document ids (paths without the ``.md``
            suffix, relative to each version's root).
    """
    _diff_doc(
        directories,
        version_21_prefix_cn,
        version_30_prefix_cn,
        version_dev_prefix_cn,
    )


def diff_doc_en(directories):
    """Check the English doc trees for missing or mismatched documents.

    Args:
        directories: Iterable of document ids (paths without the ``.md``
            suffix, relative to each version's root).
    """
    _diff_doc(
        directories,
        version_21_prefix_en,
        version_30_prefix_en,
        version_dev_prefix_en,
    )


def main():
    """Read ./sidebars.json and run the CN and EN checks."""
    file_path = "./sidebars.json"
    file_list = extract_items_from_file(file_path)

    print("Checking CN Doc >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    diff_doc_cn(file_list)
    print("Checking EN Doc >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    diff_doc_en(file_list)


if __name__ == "__main__":
    main()