readme_generator.py (123 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
import re
import logging
from collections import defaultdict
# Setup logging
logging.basicConfig(filename='readme_generation.log', filemode='w', level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s')
# Function to escape special characters for Markdown
def escape_markdown(text):
escape_chars = {
'*': '\\*',
'_': '\\_',
'[': '\\[',
']': '\\]',
'(': '\\(',
')': '\\)',
'#': '\\#',
'+': '\\+',
'-': '\\-',
'!': '\\!',
'~': '\\~',
'|': '\\|',
'<': '\\<',
'>': '\\>',
'`': '\\`',
}
for char, escaped in escape_chars.items():
text = text.replace(char, escaped)
return text
# Function to parse a single SQLX file and extract relevant details
def parse_sqlx(file_content: str, filename: str) -> dict:
logging.info(f"Parsing file: {filename}")
function_pattern = re.compile(
r"CREATE OR REPLACE (?:AGGREGATE )?FUNCTION\s+\$\{self\(\)\}\((.*?)\)\s+RETURNS\s+([^;]+?)(?=\s+(?:LANGUAGE|OPTIONS|$))",
re.DOTALL
)
description_pattern = re.compile(r"description\s*=\s*['\"]{3}(.*?)['\"]{3}", re.DOTALL)
# Extract function signature and return type
function_match = function_pattern.search(file_content)
if function_match:
function_signature = function_match.group(1).strip()
return_type = function_match.group(2).strip()
logging.debug(f"Function signature: {function_signature}")
logging.debug(f"Return type: {return_type}")
else:
function_signature = ""
return_type = "UNKNOWN"
logging.warning(f"No function signature or return type found in {filename}")
# Extract description
description_match = description_pattern.search(file_content)
description = description_match.group(1).strip() if description_match else "No description available"
description = re.compile(r'\n*For more info.*', re.M | re.S).sub('', description) # remove repetitive links
description = escape_markdown(description)
description = description.replace('\nParam', '\n* Param')
description = description.replace('\nDefault', '\n* Default')
description = description.replace('\nReturn', '\n* Return')
# Extract function arguments and their types
arg_list = []
for arg in re.split(r",\s*(?![^<>]*>)", function_signature): # Split by comma only if not within "<>"
arg_parts = arg.strip().split()
if len(arg_parts) >= 2: # Allow more than two parts for complex arguments
arg_list.append((arg_parts[0], " ".join(arg_parts[1:]))) # (arg_name, arg_type)
elif arg.strip(): # Ignore empty arguments
logging.warning(f"Unexpected argument format in {filename}: {arg}")
# Determine function type
function_type = "AGGREGATE" if "AGGREGATE FUNCTION" in file_content else "SCALAR"
return {
"name": filename[:-5], # Remove file extension .sqlx
"params": f"({', '.join([f'{arg[0]} {arg[1]}' for arg in arg_list])})",
"returns": return_type,
"description": description,
"type": function_type,
}
# Function to walk through directories, parse SQLX files, and collect data for README
def process_folder(input_folder: str, sketch_type: str) -> dict:
function_index = defaultdict(list)
for root, dirs, files in os.walk(input_folder):
for file in files:
if file.endswith(".sqlx"):
sqlx_path = os.path.join(root, file)
logging.info(f"Processing file: {sqlx_path}")
with open(sqlx_path, 'r') as f:
content = f.read()
# Parse the SQLX content
data = parse_sqlx(content, file)
logging.info(f"Parsed data for {file}: {data}")
data['path'] = sqlx_path
function_index[sketch_type].append(data)
return function_index
# Function to generate README content based on the template
def generate_readme(template_path: str, function_index: dict, examples_path: str) -> str:
# Read the template file
with open(template_path, 'r') as template_file:
output_lines = template_file.readlines()
output_lines.append("\n## Aggregate Functions\n")
# Sort functions by function type (AGGREGATE first, then SCALAR) and then by number of arguments
sorted_functions = sorted(function_index, key=lambda x: (x['type'], len(x['params'].split(','))), reverse=False)
is_aggregate = True
for function in sorted_functions:
if is_aggregate and function['type'] == 'SCALAR':
output_lines.append("\n## Scalar Functions\n")
is_aggregate = False
function_link = f"[{function['name']}{function['params']}](../{function['path']})"
output_lines.append(f"\n### {function_link}\n{function['description']}\n")
# Add examples section
example_files = [f for f in os.listdir(examples_path) if f.endswith(".sql")]
if example_files:
output_lines.append("\n## Examples\n")
for example_file in example_files:
full_name = os.path.join(examples_path, example_file)
output_lines.append(f"\n### [test/{example_file}](../{full_name})\n")
with open(full_name, 'r') as f:
sql_code = f.read()
# Remove license header from examples
sql_code_lines = sql_code.splitlines()
start_index = 0
for i, line in enumerate(sql_code_lines):
if not line.startswith("/*") and not line.startswith(" *") and not line.startswith(" */"):
start_index = i
break
sql_code_without_license = "\n".join(sql_code_lines[start_index:])
# add project and dataset available in BQ
sql_code_without_license = sql_code_without_license.replace('`$BQ_DATASET`', 'bqutil.datasketches')
# Add the SQL code in a code block
output_lines.append(f"```sql\n{sql_code_without_license}\n```\n")
output_content = "".join(output_lines)
return output_content
if __name__ == "__main__":
sketch_types = ["cpc", "fi", "hll", "kll", "tdigest", "theta", "tuple", "req"]
template_name = "README_template.md"
readme_name = "README.md"
for sketch_type in sketch_types:
logging.info("processing sketch type " + sketch_type)
function_index = process_folder(sketch_type, sketch_type)
sketch_type_readme_name = os.path.join(sketch_type, readme_name)
logging.info("generating " + sketch_type_readme_name)
readme_content = generate_readme(os.path.join(sketch_type, template_name), function_index[sketch_type], os.path.join(sketch_type, "test"))
with open(sketch_type_readme_name, "w") as readme_file:
readme_file.write(readme_content)
logging.info(sketch_type_readme_name + " generated successfully")