# check_docs_version_diff.py (88 lines of code) (raw):
import json
import os
import filecmp
# Substrings used to filter sidebar entries: any entry containing one of these
# keywords is left out of the version comparison.
exclude_keywords = [
    "sql-manual", "releasenotes", "ecosystem", "admin-manual", "faq",
    "data-operate", "query-data", "table-design", "gettingStarted",
    "query-acceleration", "lakehouse", "compute-storage-decoupled",
    "benchmark", "db-connect", "deploy-on-kubernetes",
]

# Roots of the Chinese (zh-CN) docs per version. All paths are relative to the
# current working directory, so all three must share the same "./i18n/" root.
version_21_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/"
version_30_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/"
version_dev_prefix_cn = "./i18n/zh-CN/docusaurus-plugin-content-docs/current/"

# Roots of the English docs per version, relative to the current working directory.
version_21_prefix_en = "./versioned_docs/version-2.1/"
version_30_prefix_en = "./versioned_docs/version-3.0/"
version_dev_prefix_en = "./docs/"
def extract_items_from_file(file_path):
    """Collect the string entries of every 'items' list found in a JSON file.

    The parsed document is walked depth-first. Each time a dict carries an
    "items" key, the string entries of that value are kept unless they contain
    one of the module-level ``exclude_keywords``. Entries are returned in the
    order they are encountered; duplicates are not removed.

    On any failure the error is printed and an empty list is returned.
    """
    collected = []  # Filled in place by walk()

    def is_wanted(entry):
        """Return True for a string entry that contains no excluded keyword."""
        if not isinstance(entry, str):
            return False
        for keyword in exclude_keywords:
            if keyword in entry:
                return False
        return True

    def walk(node):
        """Visit *node* and everything nested inside it, appending to ``collected``."""
        if isinstance(node, dict):
            if "items" in node:
                # Only direct string entries are taken here; nested dicts are
                # reached by the recursion over the values below.
                collected.extend(filter(is_wanted, node["items"]))
            for child in node.values():
                walk(child)
        elif isinstance(node, list):
            for child in node:
                walk(child)
        # Scalars (str, int, None, ...) carry nothing to descend into.

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        walk(parsed)
        return collected
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []
def _diff_docs(directories, prefix_v21, prefix_v30, prefix_dev):
    """Report missing and differing docs for one language across versions.

    Shared implementation behind diff_doc_cn() and diff_doc_en().

    For each entry in *directories*, ``<entry>.md`` is looked up under the
    three given roots. A line is printed for every root where the file is
    missing; when both a versioned file and the dev file exist, their contents
    are compared byte-by-byte and a mismatch report is printed if they differ.
    """
    for directory in directories:
        # Construct the path of this doc in each version
        path_v21 = os.path.join(prefix_v21, directory + ".md")
        path_v30 = os.path.join(prefix_v30, directory + ".md")
        path_dev = os.path.join(prefix_dev, directory + ".md")

        # Check each path once and reuse the answer below
        has_v21 = os.path.exists(path_v21)
        has_v30 = os.path.exists(path_v30)
        has_dev = os.path.exists(path_dev)

        if not has_v21:
            print(f"Missing in version 2.1: {path_v21}")
        if not has_v30:
            print(f"Missing in version 3.0: {path_v30}")
        if not has_dev:
            print(f"Missing in current (dev) version: {path_dev}")

        # Compare each released version against dev, only when both files exist
        for tag, path_old, has_old in (
            ("v21", path_v21, has_v21),
            ("v30", path_v30, has_v30),
        ):
            if not (has_old and has_dev):
                continue
            # shallow=False forces a content comparison instead of trusting os.stat()
            if not filecmp.cmp(path_old, path_dev, shallow=False):
                print(f"File mismatch between {tag} and dev for: {directory}")
                print("path_dev: " + path_dev)
                print(f"path_{tag}: " + path_old)
                print("-" * 50)


def diff_doc_cn(directories):
    """Check the Chinese docs: report files missing from, or differing between, 2.1 / 3.0 and dev.

    *directories* is an iterable of doc paths without the ``.md`` extension,
    relative to each version root.
    """
    _diff_docs(
        directories,
        version_21_prefix_cn,
        version_30_prefix_cn,
        version_dev_prefix_cn,
    )


def diff_doc_en(directories):
    """Check the English docs: report files missing from, or differing between, 2.1 / 3.0 and dev.

    *directories* is an iterable of doc paths without the ``.md`` extension,
    relative to each version root.
    """
    _diff_docs(
        directories,
        version_21_prefix_en,
        version_30_prefix_en,
        version_dev_prefix_en,
    )
if __name__ == "__main__":
    # The sidebar definition is always read from the current working directory.
    sidebar_path = "./sidebars.json"
    doc_ids = extract_items_from_file(sidebar_path)

    # Run the Chinese check first, then the English one, each under its own banner.
    checks = (
        ("Checking CN Doc >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", diff_doc_cn),
        ("Checking EN Doc >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>", diff_doc_en),
    )
    for banner, run_check in checks:
        print(banner)
        run_check(doc_ids)