readme_generator.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Generate a README.md per sketch type from its SQLX function definitions.

For every sketch-type folder the script walks the tree, extracts the
signature, return type and description of each ``.sqlx`` function, and
appends an index of those functions plus the example SQL files found in the
folder's ``test`` directory to the folder's ``README_template.md``.
"""

import os
import re
import logging
from collections import defaultdict

# Setup logging
logging.basicConfig(
    filename='readme_generation.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s [%(levelname)s] %(message)s',
)

_SQLX_SUFFIX = ".sqlx"

# Characters that carry meaning in Markdown, each mapped to its
# backslash-escaped form.  Built once so escaping is a single C-level pass.
_MARKDOWN_ESCAPES = str.maketrans({
    '*': '\\*',
    '_': '\\_',
    '[': '\\[',
    ']': '\\]',
    '(': '\\(',
    ')': '\\)',
    '#': '\\#',
    '+': '\\+',
    '-': '\\-',
    '!': '\\!',
    '~': '\\~',
    '|': '\\|',
    '<': '\\<',
    '>': '\\>',
    '`': '\\`',
})

# Group 1: raw argument list between the parentheses that follow the
# self-reference placeholder; group 2: return type, which ends right before
# LANGUAGE / OPTIONS or at the end of the text.  Both ``${self()}`` and
# ``${self}`` spellings of the placeholder are accepted.
_FUNCTION_RE = re.compile(
    r"CREATE OR REPLACE (?:AGGREGATE )?FUNCTION\s+\$\{self(?:\(\))?\}"
    r"\((.*?)\)\s+RETURNS\s+([^;]+?)(?=\s+(?:LANGUAGE|OPTIONS|$))",
    re.DOTALL,
)
# Triple-quoted (single or double quotes) value assigned to ``description``.
_DESCRIPTION_RE = re.compile(
    r"description\s*=\s*['\"]{3}(.*?)['\"]{3}", re.DOTALL
)
# Trailing "For more info ..." text is identical across functions, so it is
# dropped from each per-function description.
_MORE_INFO_RE = re.compile(r'\n*For more info.*', re.M | re.S)
# Split on commas only when they are not inside "<...>" (e.g. STRUCT<a, b>).
_ARG_SPLIT_RE = re.compile(r",\s*(?![^<>]*>)")


def escape_markdown(text):
    """Return *text* with Markdown special characters backslash-escaped."""
    return text.translate(_MARKDOWN_ESCAPES)


def parse_sqlx(file_content: str, filename: str) -> dict:
    """Extract the README-relevant details from one SQLX file.

    Args:
        file_content: Full text of the SQLX file.
        filename: Base name of the file; the ``.sqlx`` suffix is removed to
            form the function name.

    Returns:
        A dict with the keys ``name``, ``params`` (parenthesised
        ``"name type"`` pairs joined by ``", "``), ``returns`` (``"UNKNOWN"``
        when no signature is found), ``description`` (Markdown-escaped, with
        Param/Default/Return lines turned into bullets) and ``type``
        (``"AGGREGATE"`` or ``"SCALAR"``).
    """
    logging.info("Parsing file: %s", filename)

    # Extract function signature and return type
    function_match = _FUNCTION_RE.search(file_content)
    if function_match:
        function_signature = function_match.group(1).strip()
        return_type = function_match.group(2).strip()
        logging.debug("Function signature: %s", function_signature)
        logging.debug("Return type: %s", return_type)
    else:
        function_signature = ""
        return_type = "UNKNOWN"
        logging.warning(
            "No function signature or return type found in %s", filename
        )

    # Extract description
    description_match = _DESCRIPTION_RE.search(file_content)
    if description_match:
        description = description_match.group(1).strip()
    else:
        description = "No description available"
    description = _MORE_INFO_RE.sub('', description)  # remove repetitive links
    # Escape first: the "* " bullets added below must stay real Markdown.
    description = escape_markdown(description)
    for keyword in ("Param", "Default", "Return"):
        description = description.replace(f"\n{keyword}", f"\n* {keyword}")

    # Extract function arguments and their types
    arg_list = []
    for arg in _ARG_SPLIT_RE.split(function_signature):
        arg_parts = arg.strip().split()
        if len(arg_parts) >= 2:
            # Everything after the name is the type, which may span several
            # words (e.g. "BYTEINT NOT AGGREGATE").
            arg_list.append((arg_parts[0], " ".join(arg_parts[1:])))
        elif arg.strip():  # Ignore empty arguments
            logging.warning(
                "Unexpected argument format in %s: %s", filename, arg
            )

    # Determine function type
    if "AGGREGATE FUNCTION" in file_content:
        function_type = "AGGREGATE"
    else:
        function_type = "SCALAR"

    # Strip the extension only when it is actually present.
    if filename.endswith(_SQLX_SUFFIX):
        name = filename[:-len(_SQLX_SUFFIX)]
    else:
        name = filename

    return {
        "name": name,
        "params": "(" + ", ".join(f"{n} {t}" for n, t in arg_list) + ")",
        "returns": return_type,
        "description": description,
        "type": function_type,
    }


def process_folder(input_folder: str, sketch_type: str) -> dict:
    """Parse every ``.sqlx`` file below *input_folder*.

    Args:
        input_folder: Root directory to walk recursively.
        sketch_type: Key under which the parsed entries are collected.

    Returns:
        A ``defaultdict(list)`` mapping *sketch_type* to the list of dicts
        produced by :func:`parse_sqlx`, each extended with a ``path`` entry
        holding the file's path.
    """
    function_index = defaultdict(list)

    for root, dirs, files in os.walk(input_folder):
        # os.walk order is filesystem-dependent; sorting in place makes the
        # traversal (and therefore the generated README) reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(_SQLX_SUFFIX):
                continue
            sqlx_path = os.path.join(root, file)
            logging.info("Processing file: %s", sqlx_path)
            with open(sqlx_path, 'r', encoding="utf-8") as f:
                content = f.read()

            # Parse the SQLX content
            data = parse_sqlx(content, file)
            logging.info("Parsed data for %s: %s", file, data)
            data['path'] = sqlx_path
            function_index[sketch_type].append(data)

    return function_index


def _strip_license_header(sql_code: str) -> str:
    """Drop the leading block-comment lines from an example SQL file."""
    lines = sql_code.splitlines()
    # First line that is not part of a "/* ... */" header; when every line
    # looks like header text nothing is removed.  (" */" is covered by the
    # " *" prefix.)
    start_index = next(
        (i for i, line in enumerate(lines)
         if not line.startswith(("/*", " *"))),
        0,
    )
    return "\n".join(lines[start_index:])


def generate_readme(template_path: str, function_index: dict,
                    examples_path: str) -> str:
    """Build the README text for one sketch type.

    Args:
        template_path: Path of the template whose content opens the README.
        function_index: Iterable of function dicts as produced by
            :func:`process_folder` (keys ``name``, ``params``, ``type``,
            ``description``, ``path``).
        examples_path: Directory whose ``.sql`` files are appended as
            examples.

    Returns:
        The complete README content as a single string.
    """
    # Read the template file
    with open(template_path, 'r', encoding="utf-8") as template_file:
        output_lines = template_file.readlines()

    output_lines.append("\n## Aggregate Functions\n")

    # "AGGREGATE" sorts before "SCALAR"; within a type, order by the number
    # of comma-separated pieces in the parameter string.
    sorted_functions = sorted(
        function_index,
        key=lambda x: (x['type'], len(x['params'].split(','))),
    )

    is_aggregate = True
    for function in sorted_functions:
        # Emit the scalar heading once, at the first scalar function.
        if is_aggregate and function['type'] == 'SCALAR':
            output_lines.append("\n## Scalar Functions\n")
            is_aggregate = False
        function_link = (
            f"[{function['name']}{function['params']}](../{function['path']})"
        )
        output_lines.append(
            f"\n### {function_link}\n{function['description']}\n"
        )

    # Add examples section; sorted because os.listdir order is arbitrary.
    example_files = sorted(
        f for f in os.listdir(examples_path) if f.endswith(".sql")
    )
    if example_files:
        output_lines.append("\n## Examples\n")
        for example_file in example_files:
            full_name = os.path.join(examples_path, example_file)
            output_lines.append(
                f"\n### [test/{example_file}](../{full_name})\n"
            )
            with open(full_name, 'r', encoding="utf-8") as f:
                sql_code = f.read()

            # Remove license header from examples
            sql_code_without_license = _strip_license_header(sql_code)
            # add project and dataset available in BQ
            sql_code_without_license = sql_code_without_license.replace(
                '`$BQ_DATASET`', 'bqutil.datasketches'
            )
            # Add the SQL code in a code block
            output_lines.append(f"```sql\n{sql_code_without_license}\n```\n")

    return "".join(output_lines)


def main() -> None:
    """Regenerate README.md inside every sketch-type folder."""
    sketch_types = ["cpc", "fi", "hll", "kll", "tdigest", "theta", "tuple", "req"]
    template_name = "README_template.md"
    readme_name = "README.md"

    for sketch_type in sketch_types:
        logging.info("processing sketch type " + sketch_type)
        function_index = process_folder(sketch_type, sketch_type)
        sketch_type_readme_name = os.path.join(sketch_type, readme_name)
        logging.info("generating " + sketch_type_readme_name)
        readme_content = generate_readme(
            os.path.join(sketch_type, template_name),
            function_index[sketch_type],
            os.path.join(sketch_type, "test"),
        )
        with open(sketch_type_readme_name, "w", encoding="utf-8") as readme_file:
            readme_file.write(readme_content)
        logging.info(sketch_type_readme_name + " generated successfully")


if __name__ == "__main__":
    main()

readme_generator.py (123 lines of code) (raw):