sources/grammar_docs/tree_sitter.py (253 lines of code) (raw):
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# ###################################################################################
# tree_sitter.py:
# ---------------
#
# Generate tree-sitter grammar for the CQL language. It's used by CQL VSCODE extension
# to provide syntax hightlighting.
#
# CQL parser (xplat/vscode/modules/tree-sitter-cql/*):
# ----------------------------------------------------
#
# It's a nuclide module that contains the parser for cql language. The parser is backed
# by the parser generator tool tree-sitter (http://tree-sitter.github.io/tree-sitter/).
# It generate a tree-sitter binary (tree-sitter-cql.wasm) from grammar.js. The binary is
# used in the CQL VSCode extension module to parse CQL files.
#
# #####################################################################################
import datetime
import re
NULL_PATTERN = re.compile(r"/\*\s*nil\s*\*/")
SEQUENCE_PATTERN = re.compile(r"\"[^\"]+\"|'.?'|[\w\-\_@]+")
WORD_PATTERN = re.compile(r"[\w\-\_@]+")
STRING_PATTERN = re.compile(r"\"[^\"]+\"")
RULE_PATTERN = re.compile(r"(.*)\s*::=\s*(.*)")
CHOICE_PATTERN = re.compile(r"\s+\|\s+")
SPACE_PATTERN = re.compile(r"\s+")
QUOTE_WORD_PATTERN = re.compile(r"'[^']+'")
PREC_LEFT = "prec.left({})"
REPEAT_1 = "repeat1({})"
# Some of the rules have conflicts therefore we need to define the precedent priority.
APPLY_FUNC_LIST = {
"fk_target_options": PREC_LEFT,
"math_expr": PREC_LEFT,
"expr": PREC_LEFT,
"join_target": PREC_LEFT,
"elseif_list": PREC_LEFT,
"stmt_list": REPEAT_1,
}
# these are rules in cql_grammar.txt that are not defined. We need to manually define
# them for tree sitter parser.
REPLACEMENT_RULE_NAMES = {
"integer-literal": ["INT_LIT", "INT_LIT: $ => choice(/[0-9]+/, /0x[0-9a-fA-F]+/)"],
"long-literal": ["LONG_LIT", "LONG_LIT: $ => choice(/[0-9]+L/, /0x[0-9a-fA-F]+L/)"],
"real-literal": [
"REAL_LIT",
"REAL_LIT: $ => /([0-9]+\.[0-9]*|\.[0-9]+)((E|e)(\+|\-)?[0-9]+)?/",
],
"sql-blob-literal": [
"BLOB_LIT",
"BLOB_LIT: $ => /[xX]'([0-9a-fA-F][0-9a-fA-F])*'/",
],
"c-string-literal": [
"C_STR_LIT",
'C_STR_LIT: $ => /\\"(\\\\.|[^"\\n])*\\"/',
],
"sql-string-literal": ["STR_LIT", "STR_LIT: $ => /'(\\\\.|''|[^'\\n])*'/"],
"ID": ["ID", "ID: $ => /[_A-Za-z][A-Za-z0-9_]*/"],
}
# These are not part of cql_grammar.txt but are supported in cql grammar. We need to manually
# define them for tree sitter parser.
DEFAULT_RULES = [
"comment: $ => token(choice(seq('--', /(\\\\(.|\\r?\\n)|[^\\\\\\n])*/), seq('/*', /[^*]*\*+([^/*][^*]*\*+)*/, '/')))",
"line_directive: $ => seq(/#(line)?[ \\t]*/, $.INT_LIT, $.C_STR_LIT, /[^\\n]*/, /\\n/)",
"macro: $ => choice($.preproc_include, $.preproc_def, $.preproc_function_def, $.preproc_call)",
"preproc_def: $ => seq(preprocessor('define'),field('name', $.ID),field('value', optional($.preproc_arg)),'\\n')",
"preproc_call: $ => seq(field('directive', $.preproc_directive),field('argument', optional($.preproc_arg)),'\\n')",
"...preprocIf('', $ => $.stmt)",
"...preprocIf('_in_field_declaration_list', $ => $.field_declaration_list_item)",
"field_declaration_list_item: $ => choice($.declare_stmt,$.preproc_def,$.preproc_function_def,$.preproc_call,alias($.preproc_if_in_field_declaration_list, $.preproc_if),alias($.preproc_ifdef_in_field_declaration_list, $.preproc_ifdef),)",
"preproc_include: $ => seq(preprocessor('include'),field('path', choice($.string_literal,$.system_lib_string,$.ID,alias($.preproc_call_expression, $.call_expression),)),'\\n')",
"preproc_function_def: $ => seq(preprocessor('define'),field('name', $.ID),field('parameters', $.preproc_params),field('value', optional($.preproc_arg)),'\\n')",
"preproc_directive: $ => /#[ \\t]*[a-zA-Z]\\w*/, preproc_arg: $ => token(prec(-1, repeat1(/.|\\\\\\r?\\n/)))",
"preproc_expression: $ => choice($.ID,$.expr)",
"call_expression: $ => prec(1, seq(field('function', $.expression),field('arguments', $.argument_list)))",
"preproc_call_expression: $ => prec(1, seq(field('function', $.ID),field('arguments', alias($.preproc_argument_list, $.argument_list))))",
"preproc_argument_list: $ => seq('(',commaSep($.preproc_expression),')')",
"argument_list: $ => seq('(', commaSep($.expression), ')')",
"preproc_expression: $ => $.expression",
"expression: $ => $.expr",
"string_literal: $ => seq(choice('L\"', 'u\"', 'U\"', 'u8\"', '\"'),repeat(choice(token.immediate(prec(1, /[^\\\\\"\\n]+/)),$.escape_sequence)),'\"',)",
"escape_sequence: $ => token(prec(1, seq('\\\\',choice(/[^xuU]/,/\d{2,3}/,/x[0-9a-fA-F]{2,}/,/u[0-9a-fA-F]{4}/,/U[0-9a-fA-F]{8}/))))",
"system_lib_string: $ => token(seq('<',repeat(choice(/[^>\\n]/, '\\\\>')),'>'))",
"preproc_params: $ => seq(token.immediate('('), commaSep(choice($.ID, '...')), ')')",
]
cql_grammar = "cql_grammar.txt"
ts_grammar = {}
ts_rule_names = []
TOKEN_GRAMMAR = {}
rule_defs = {}
sorted_rule_names = []
optional_rules = set()
rules_name_visited = set()
def add_ts_rule(name, ts_rule):
ts_grammar[name] = ts_rule
ts_rule_names.append(name)
def get_rule_ref(token):
if token in REPLACEMENT_RULE_NAMES:
return "$.{}".format(REPLACEMENT_RULE_NAMES[token][0])
if QUOTE_WORD_PATTERN.match(token):
return token
if STRING_PATTERN.match(token):
tk = token.strip('"')
if WORD_PATTERN.match(tk):
if tk in REPLACEMENT_RULE_NAMES:
return "$.{}".format(REPLACEMENT_RULE_NAMES[tk][0])
name = tk.replace("@", "AT_")
if name not in TOKEN_GRAMMAR:
TOKEN_GRAMMAR[name] = "{}: $ => CI('{}')".format(name, tk.lower())
return "$.{}".format(name)
else:
return token
return (
"optional($.{})".format(token)
if token in optional_rules
else "$.{}".format(token)
)
def add_sub_sequence(tokens):
name = "_".join(tokens)
if name not in rules_name_visited:
values = ["CI('{}')".format(item.lower()) for item in tokens]
ts_rule = "$ => prec.left(1, seq({}))".format(", ".join(values))
add_ts_rule(name, ts_rule)
rules_name_visited.add(name)
return name
# Process a subquence within a sequence. they are a group of words within a string
# e.g: "ELSE IF"
def get_sub_sequence(seq):
tokens = SPACE_PATTERN.split(seq.strip('"'))
name = add_sub_sequence(tokens)
return get_rule_ref(name)
# Process a sequence in a rule.
# e.g: else_if: "else" "if"
def get_sequence(sequence):
tokens_list = []
for tk in sequence:
tk = tk.strip()
if len(tk) > 0:
if SPACE_PATTERN.search(tk):
tokens_list.append(get_sub_sequence(tk))
elif STRING_PATTERN.match(tk):
tokens_list.append(get_rule_ref(tk))
else:
tokens_list.append(get_rule_ref(tk))
return tokens_list
with open(cql_grammar) as fp:
for line in RULE_PATTERN.finditer(fp.read()):
assert line.lastindex == 2
name = line.group(1).strip()
rule = line.group(2)
choices = []
for choice in CHOICE_PATTERN.split(rule):
seq = []
if NULL_PATTERN.match(choice):
optional_rules.add(name)
else:
seq = [r.strip() for r in re.findall(SEQUENCE_PATTERN, choice)]
if len(seq) > 0:
# the rule is not optional
choices.append(seq)
rule_defs[name] = choices
sorted_rule_names.append(name)
# tree_sitter.py generator is base on cql_grammar.txt which does not contains the rules for
# comment, line_directive and macro. Therefore I need to add them manually into stmt_list rule.
rule_defs["stmt_list"] = [["stmt", "';'"], ["comment"], ["line_directive"], ["macro"]]
for name in sorted_rule_names:
if name in rules_name_visited:
continue
rules_name_visited.add(name)
choices = []
for rule in rule_defs[name]:
seq = get_sequence(rule)
size = len(seq)
if size == 0:
# An empty sequence in the rule indicates that the rule is optional.
# We dont need to do anything here, we just move on. later it's
# used to add the "optional()" function to optional rule's definition.
continue
elif size == 1:
choices.append(seq[0])
else:
choices.append("seq({})".format(", ".join(seq)))
if len(choices) == 1:
rule_str = choices[0]
else:
rule_str = "choice({})".format(", ".join(choices))
if name in APPLY_FUNC_LIST:
rule_str = APPLY_FUNC_LIST[name].format(rule_str)
else:
rule_str = rule_str
add_ts_rule(name, "$ => {}".format(rule_str))
# redefine the if_stmt rule because if not we're going to have parsing issues with "opt_elseif_list" and "opt_else" rule.
# I tried to fix it by providing a priority to the conflict but it didn't work.
ts_grammar[
"if_stmt"
] = "$ => seq($.IF, $.expr, $.THEN, optional($.opt_stmt_list), optional(repeat1($.elseif_item)), optional($.opt_else), $.END, $.IF)"
grammar = ",\n ".join(
["{}: {}".format(ts, ts_grammar[ts]) for ts in ts_rule_names]
+ DEFAULT_RULES
+ list(TOKEN_GRAMMAR.values())
+ [r[1] for r in REPLACEMENT_RULE_NAMES.values()]
)
print(
"/**\n"
" * Copyright (c) Meta Platforms, Inc. and affiliates.\n"
" *\n"
" * This source code is licensed under the MIT license found in the\n"
" * LICENSE file in the root directory of this source tree.\n"
" */\n\n"
)
print("// Snapshot as of {}\n\n".format(datetime.datetime.now().strftime("%c")))
print(
"const PREC = {\n"
"};\n\n"
"module.exports = grammar({\n"
" name: 'cql',\n"
" extras: $ => [\n"
" /\\s|\\\\\\r?\\n/,\n"
" $.comment\n"
" ],\n"
" conflicts: $ => [\n"
" [$.fk_options],\n"
" ],\n"
" word: $ => $.ID,\n"
" rules: {"
)
print(" {}".format(grammar))
print(
" }\n"
"});\n\n"
"// make string case insensitive\n"
"function CI (keyword) {\n"
" return new RegExp(keyword\n"
" .split('')\n"
" .map(letter => `[${letter}${letter.toUpperCase()}]`)\n"
" .join('')\n"
" )\n"
"}\n"
"// generic rule for macro directives where the directive\n"
"// is going to be case insensitive.\n"
"function preprocessor (command) {\n"
" return alias(new RegExp('#[ \\t]*' + command), '#' + command)\n"
"}\n\n"
"// generic rule making optional commaSep1.\n"
"function commaSep (rule) {\n"
" return optional(commaSep1(rule))\n"
"}\n\n"
'// generic rule for a list of ID separated by ","\n'
"function commaSep1 (rule) {\n"
" return seq(rule, repeat(seq(',', rule)))\n"
"}\n\n"
"function preprocIf (suffix, content) {\n"
" function elseBlock ($) {\n"
" return choice(\n"
" suffix ? alias($['preproc_else' + suffix], $.preproc_else) : $.preproc_else,\n"
" suffix ? alias($['preproc_elif' + suffix], $.preproc_elif) : $.preproc_elif,\n"
" );\n"
" }\n"
" return {\n"
" ['preproc_if' + suffix]: $ => seq(\n"
" preprocessor('if'),\n"
" field('condition', $.preproc_expression),\n"
" '\\n',\n"
" repeat(content($)),\n"
" field('alternative', optional(elseBlock($))),\n"
" preprocessor('endif')\n"
" ),\n"
" ['preproc_ifdef' + suffix]: $ => seq(\n"
" choice(preprocessor('ifdef'), preprocessor('ifndef')),\n"
" field('name', $.ID),\n"
" repeat(content($)),\n"
" field('alternative', optional(elseBlock($))),\n"
" preprocessor('endif')\n"
" ),\n"
" ['preproc_else' + suffix]: $ => seq(\n"
" preprocessor('else'),\n"
" repeat(content($))\n"
" ),\n"
" ['preproc_elif' + suffix]: $ => seq(\n"
" preprocessor('elif'),\n"
" field('condition', $.preproc_expression),\n"
" '\\n',\n"
" repeat(content($)),\n"
" field('alternative', optional(elseBlock($))),\n"
" )\n"
" }\n"
"}\n"
)