core/lib/sqlparse/create.py (830 lines of code) (raw):
"""
Copyright (c) 2017-present, Facebook, Inc.
All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from typing import List, Set, Union
from pyparsing import (
alphanums,
CaselessLiteral,
Combine,
delimitedList,
Group,
Literal,
nestedExpr,
nums,
OneOrMore,
Optional,
ParseException,
ParseResults,
QuotedString,
Regex,
replaceWith,
SkipTo,
StringEnd,
upcaseTokens,
White,
Word,
ZeroOrMore,
)
from . import models
log = logging.getLogger(__name__)
__all__ = ["parse_create", "ParseError", "PartitionParseError"]
class ParseError(Exception):
def __init__(self, msg, line=0, column=0):
self._msg = msg
self._line = line
self._column = column
def __str__(self):
return "Line: {}, Column: {}\n {}".format(self._line, self._column, self._msg)
class PartitionParseError(Exception):
pass
class CreateParser(object):
"""
This class can take a plain "CREATE TABLE" SQL as input and parse it into
a Table object, so that we have more insight on the detail of this SQL.
Example:
sql = 'create table foo ( bar int primary key )'
parser = CreateParser(sql)
try:
tbl_obj = parser.parse()
except ParseError:
log.error("Failed to parse SQL")
This set of BNF rules are basically translated from the MySQL manual:
http://dev.mysql.com/doc/refman/5.6/en/create-table.html
If you don't know how to change the rule or fix the bug,
<Getting Started with Pyparsing> is probably the best book to start with.
Also this wiki has all supported functions listed:
https://pyparsing.wikispaces.com/HowToUsePyparsing
If you want have more information how these characters are
matching, add .setDebug(True) after the specific token you want to debug
"""
_parser = None
_partitions_parser = None
# Basic token
WORD_CREATE = CaselessLiteral("CREATE").suppress()
WORD_TABLE = CaselessLiteral("TABLE").suppress()
COMMA = Literal(",").suppress()
DOT = Literal(".")
LEFT_PARENTHESES = Literal("(").suppress()
RIGHT_PARENTHESES = Literal(")").suppress()
QUOTE = Literal("'") | Literal('"')
BACK_QUOTE = Optional(Literal("`")).suppress()
LENGTH = Word(nums)
DECIMAL = Combine(Word(nums) + DOT + Word(nums))
OBJECT_NAME = Word(alphanums + "_" + "-" + "<" + ">" + ":")
QUOTED_STRING_WITH_QUOTE = QuotedString(
quoteChar="'", escQuote="''", escChar="\\", multiline=True, unquoteResults=False
) | QuotedString(
quoteChar='"', escQuote='""', escChar="\\", multiline=True, unquoteResults=False
)
QUOTED_STRING = QuotedString(
quoteChar="'", escQuote="''", escChar="\\", multiline=True
) | QuotedString(quoteChar='"', escQuote='""', escChar="\\", multiline=True)
# Start of a create table statement
# Sample: this part of rule will match following section
# `table_name` IF NOT EXISTS
IF_NOT_EXIST = Optional(
CaselessLiteral("IF") + CaselessLiteral("NOT") + CaselessLiteral("EXISTS")
).suppress()
TABLE_NAME = (
QuotedString(quoteChar="`", escQuote="``", escChar="\\", unquoteResults=True)
| OBJECT_NAME
)("table_name")
# Column definition
# Sample: this part of rule will match following section
# `id` bigint(20) unsigned NOT NULL DEFAULT '0',
COLUMN_NAME = (
QuotedString(quoteChar="`", escQuote="``", escChar="\\", unquoteResults=True)
| OBJECT_NAME
)("column_name")
COLUMN_NAME_WITH_QUOTE = (
QuotedString(quoteChar="`", escQuote="``", escChar="\\", unquoteResults=False)
| OBJECT_NAME
)("column_name")
UNSIGNED = Optional(CaselessLiteral("UNSIGNED"))("unsigned")
ZEROFILL = Optional(CaselessLiteral("ZEROFILL"))("zerofill")
COL_LEN = Combine(LEFT_PARENTHESES + LENGTH + RIGHT_PARENTHESES, adjacent=False)(
"length"
)
INT_TYPE = (
CaselessLiteral("TINYINT")
| CaselessLiteral("SMALLINT")
| CaselessLiteral("MEDIUMINT")
| CaselessLiteral("INT")
| CaselessLiteral("INTEGER")
| CaselessLiteral("BIGINT")
| CaselessLiteral("BINARY")
| CaselessLiteral("BIT")
)
INT_DEF = INT_TYPE("column_type") + Optional(COL_LEN) + UNSIGNED + ZEROFILL
VARBINARY_DEF = CaselessLiteral("VARBINARY")("column_type") + COL_LEN
FLOAT_TYPE = (
CaselessLiteral("REAL")
| CaselessLiteral("DOUBLE")
| CaselessLiteral("FLOAT")
| CaselessLiteral("DECIMAL")
| CaselessLiteral("NUMERIC")
)
FLOAT_LEN = Combine(
LEFT_PARENTHESES + LENGTH + Optional(COMMA + LENGTH) + RIGHT_PARENTHESES,
adjacent=False,
joinString=", ",
)("length")
FLOAT_DEF = FLOAT_TYPE("column_type") + Optional(FLOAT_LEN) + UNSIGNED + ZEROFILL
# time type definition. They contain type_name and an optional FSP section
# Sample: DATETIME[(fsp)]
FSP = COL_LEN
DT_DEF = (
Combine(CaselessLiteral("TIME") + Optional(CaselessLiteral("STAMP")))
| CaselessLiteral("DATETIME")
)("column_type") + Optional(FSP)
SIMPLE_DEF = (
CaselessLiteral("DATE")
| CaselessLiteral("YEAR")
| CaselessLiteral("TINYBLOB")
| CaselessLiteral("BLOB")
| CaselessLiteral("MEDIUMBLOB")
| CaselessLiteral("LONGBLOB")
| CaselessLiteral("BOOLEAN")
| CaselessLiteral("BOOL")
| CaselessLiteral("JSON")
)("column_type")
OPTIONAL_COL_LEN = Optional(COL_LEN)
BINARY = Optional(CaselessLiteral("BINARY"))("binary")
CHARSET_NAME = (
Optional(QUOTE).suppress()
+ Word(alphanums + "_")("charset")
+ Optional(QUOTE).suppress()
)
COLLATION_NAME = (
Optional(QUOTE).suppress()
+ Word(alphanums + "_")("collate")
+ Optional(QUOTE).suppress()
)
CHARSET_DEF = CaselessLiteral("CHARACTER SET").suppress() + CHARSET_NAME
COLLATE_DEF = CaselessLiteral("COLLATE").suppress() + COLLATION_NAME
CHAR_DEF = CaselessLiteral("CHAR")("column_type") + OPTIONAL_COL_LEN + BINARY
VARCHAR_DEF = CaselessLiteral("VARCHAR")("column_type") + COL_LEN + BINARY
TEXT_TYPE = (
CaselessLiteral("TINYTEXT")
| CaselessLiteral("TEXT")
| CaselessLiteral("MEDIUMTEXT")
| CaselessLiteral("LONGTEXT")
| CaselessLiteral("DOCUMENT")
)
TEXT_DEF = TEXT_TYPE("column_type") + BINARY
ENUM_VALUE_LIST = Group(
QUOTED_STRING_WITH_QUOTE + ZeroOrMore(COMMA + QUOTED_STRING_WITH_QUOTE)
)("enum_value_list")
ENUM_DEF = (
CaselessLiteral("ENUM")("column_type")
+ LEFT_PARENTHESES
+ ENUM_VALUE_LIST
+ RIGHT_PARENTHESES
)
SET_VALUE_LIST = Group(
QUOTED_STRING_WITH_QUOTE + ZeroOrMore(COMMA + QUOTED_STRING_WITH_QUOTE)
)("set_value_list")
SET_DEF = (
CaselessLiteral("SET")("column_type")
+ LEFT_PARENTHESES
+ SET_VALUE_LIST
+ RIGHT_PARENTHESES
)
DATA_TYPE = (
INT_DEF
| FLOAT_DEF
| DT_DEF
| SIMPLE_DEF
| TEXT_DEF
| CHAR_DEF
| VARCHAR_DEF
| ENUM_DEF
| SET_DEF
| VARBINARY_DEF
)
# Column attributes come after column type and length
NULLABLE = CaselessLiteral("NULL") | CaselessLiteral("NOT NULL")
DEFAULT_VALUE = CaselessLiteral("DEFAULT").suppress() + (
Optional(Literal("b"))("is_bit") + QUOTED_STRING_WITH_QUOTE("default")
| Combine(
CaselessLiteral("CURRENT_TIMESTAMP")("default")
+ Optional(COL_LEN)("ts_len")
)
| DECIMAL("default")
| Word(alphanums + "_" + "-" + "+")("default")
)
ON_UPDATE = (
CaselessLiteral("ON")
+ CaselessLiteral("UPDATE")
+ (
CaselessLiteral("CURRENT_TIMESTAMP")("on_update")
+ Optional(COL_LEN)("on_update_ts_len")
)
)
AUTO_INCRE = CaselessLiteral("AUTO_INCREMENT")
UNIQ_KEY = CaselessLiteral("UNIQUE") + Optional(CaselessLiteral("KEY")).suppress()
PRIMARY_KEY = (
CaselessLiteral("PRIMARY") + Optional(CaselessLiteral("KEY")).suppress()
)
COMMENT = Combine(
CaselessLiteral("COMMENT").suppress() + QUOTED_STRING_WITH_QUOTE, adjacent=False
)
COLUMN_DEF = Group(
COLUMN_NAME
+ DATA_TYPE
+ ZeroOrMore(
NULLABLE("nullable")
| DEFAULT_VALUE
| ON_UPDATE
| AUTO_INCRE("auto_increment")
| UNIQ_KEY("uniq_key")
| PRIMARY_KEY("primary")
| COMMENT("comment")
| CHARSET_DEF
| COLLATE_DEF
)
)
COLUMN_LIST = Group(COLUMN_DEF + ZeroOrMore(COMMA + COLUMN_DEF))("column_list")
DOCUMENT_PATH = Combine(
COLUMN_NAME_WITH_QUOTE + ZeroOrMore(DOT + COLUMN_NAME_WITH_QUOTE)
)
INDEX_ORDER = (CaselessLiteral("ASC") | CaselessLiteral("DESC"))("index_order")
IDX_COL = (
Group(
DOCUMENT_PATH
+ CaselessLiteral("AS")
+ (CaselessLiteral("INT") | CaselessLiteral("STRING"))
+ Optional(COL_LEN, default="")
)
) | (
Group(
COLUMN_NAME
+ Optional(COL_LEN, default="")
+ Optional(INDEX_ORDER, default="ASC")
)
)
# Primary key section
COL_NAME_LIST = Group(IDX_COL + ZeroOrMore(COMMA + IDX_COL))
IDX_COLS = LEFT_PARENTHESES + COL_NAME_LIST + RIGHT_PARENTHESES
WORD_PRI_KEY = (
CaselessLiteral("PRIMARY").suppress() + CaselessLiteral("KEY").suppress()
)
KEY_BLOCK_SIZE = (
CaselessLiteral("KEY_BLOCK_SIZE").suppress()
+ Optional(Literal("="))
+ Word(nums)("idx_key_block_size")
)
INDEX_USING = CaselessLiteral("USING").suppress() + (
CaselessLiteral("BTREE") | CaselessLiteral("HASH")
)("idx_using")
INDEX_OPTION = ZeroOrMore(KEY_BLOCK_SIZE | COMMENT("idx_comment") | INDEX_USING)
PRI_KEY_DEF = COMMA + WORD_PRI_KEY + IDX_COLS("pri_list") + INDEX_OPTION
# Index section
KEY_TYPE = (CaselessLiteral("FULLTEXT") | CaselessLiteral("SPATIAL"))("key_type")
WORD_UNIQUE = CaselessLiteral("UNIQUE")("unique")
WORD_KEY = CaselessLiteral("INDEX").suppress() | CaselessLiteral("KEY").suppress()
IDX_NAME = Optional(COLUMN_NAME)
IDX_DEF = (
ZeroOrMore(
Group(
COMMA
+ Optional(WORD_UNIQUE | KEY_TYPE)
+ WORD_KEY
+ IDX_NAME("index_name")
+ IDX_COLS("index_col_list")
+ INDEX_OPTION
)
)
)("index_section")
# Constraint section as this is not a recommended way of using MySQL
# we'll treat the whole section as a string
CONSTRAINT = Combine(
ZeroOrMore(
COMMA
+ Optional(CaselessLiteral("CONSTRAINT"))
+
# foreign key name except the key word 'FOREIGN'
Optional((~CaselessLiteral("FOREIGN") + COLUMN_NAME))
+ CaselessLiteral("FOREIGN")
+ CaselessLiteral("KEY")
+ LEFT_PARENTHESES
+ COL_NAME_LIST
+ RIGHT_PARENTHESES
+ CaselessLiteral("REFERENCES")
+ COLUMN_NAME
+ LEFT_PARENTHESES
+ COL_NAME_LIST
+ RIGHT_PARENTHESES
+ ZeroOrMore(Word(alphanums))
),
adjacent=False,
joinString=" ",
)("constraint")
# Table option section
ENGINE = (
CaselessLiteral("ENGINE").suppress()
+ Optional(Literal("=")).suppress()
+ COLUMN_NAME("engine").setParseAction(upcaseTokens)
)
DEFAULT_CHARSET = (
Optional(CaselessLiteral("DEFAULT")).suppress()
+ (
(
CaselessLiteral("CHARACTER").suppress()
+ CaselessLiteral("SET").suppress()
)
| (CaselessLiteral("CHARSET").suppress())
)
+ Optional(Literal("=")).suppress()
+ Word(alphanums + "_")("charset")
)
TABLE_COLLATE = (
Optional(CaselessLiteral("DEFAULT")).suppress()
+ CaselessLiteral("COLLATE").suppress()
+ Optional(Literal("=")).suppress()
+ COLLATION_NAME
)
ROW_FORMAT = (
CaselessLiteral("ROW_FORMAT").suppress()
+ Optional(Literal("=")).suppress()
+ Word(alphanums + "_")("row_format").setParseAction(upcaseTokens)
)
TABLE_KEY_BLOCK_SIZE = (
CaselessLiteral("KEY_BLOCK_SIZE").suppress()
+ Optional(Literal("=")).suppress()
+ Word(nums)("key_block_size").setParseAction(lambda s, l, t: [int(t[0])])
)
COMPRESSION = (
CaselessLiteral("COMPRESSION").suppress()
+ Optional(Literal("=")).suppress()
+ Word(alphanums + "_")("compression").setParseAction(upcaseTokens)
)
# Parse and make sure auto_increment is an integer
# parseAction function is defined as fn( s, loc, toks ), where:
# s is the original parse string
# loc is the location in the string where matching started
# toks is the list of the matched tokens, packaged as a ParseResults_
# object
TABLE_AUTO_INCRE = (
CaselessLiteral("AUTO_INCREMENT").suppress()
+ Optional(Literal("=")).suppress()
+ Word(nums)("auto_increment").setParseAction(lambda s, l, t: [int(t[0])])
)
TABLE_COMMENT = (
CaselessLiteral("COMMENT").suppress()
+ Optional(Literal("=")).suppress()
+ QUOTED_STRING_WITH_QUOTE("comment")
)
TABLE_OPTION = ZeroOrMore(
(
ENGINE
| DEFAULT_CHARSET
| TABLE_COLLATE
| ROW_FORMAT
| TABLE_KEY_BLOCK_SIZE
| COMPRESSION
| TABLE_AUTO_INCRE
| TABLE_COMMENT
)
# Table attributes could be comma separated too.
+ Optional(COMMA).suppress()
)
# Partition section
PARTITION = Optional(
Combine(
Combine(Optional(Literal("/*!") + Word(nums)))
+ CaselessLiteral("PARTITION")
+ CaselessLiteral("BY")
+ SkipTo(StringEnd()),
adjacent=False,
joinString=" ",
)("partition")
)
# Parse partitions in detail
# From https://dev.mysql.com/doc/refman/8.0/en/create-table.html
PART_FIELD_NAME = (
QuotedString(quoteChar="`", escQuote="``", escChar="\\", unquoteResults=True)
| OBJECT_NAME
)
PART_FIELD_LIST = delimitedList(PART_FIELD_NAME)("field_list")
# e.g 1, 2, 3
# and 'a', 'b', 'c'
# and `NULL`
# and _binary 0x123aBc
HEX_VALUE = Literal("0x") + OneOrMore(Regex("[0-9a-fA-F]"))
day = Word(nums)
month = Word(nums)
year = Word(nums)
dateday = Combine(year + "-" + month + "-" + day)
to_days = Combine("to_days('" + dateday + "')")
PART_VALUE_LIST = Group(
LEFT_PARENTHESES
+ (
delimitedList(
Word(nums) # e.g. (1, 2, 3)
| QUOTED_STRING_WITH_QUOTE # e.g. ('a', 'b')
| CaselessLiteral("NULL").setParseAction(upcaseTokens) # e.g. (NULL)
| to_days # e.g. to_days('2010-11-07')
)
| (
LEFT_PARENTHESES
+ (
delimitedList(
QUOTED_STRING_WITH_QUOTE
| CaselessLiteral("NULL").setParseAction(upcaseTokens)
)
)
+ RIGHT_PARENTHESES
)(
"is_tuple"
) # e.g. (("a", "b")), See test_parts_list_in_tuple15
# e.g. `_binary 0xdeadbeef123`, See test_parts_list_by_cols_with_binary17
# turns to: `_BINARY 0xdeadbeef123`
| Combine(
CaselessLiteral("_binary").setParseAction(upcaseTokens)
+ White(" ").setParseAction(replaceWith(" "))
+ HEX_VALUE
)
)
+ RIGHT_PARENTHESES
)
PART_VALUES_IN = (CaselessLiteral("IN").suppress() + PART_VALUE_LIST)("p_values_in")
# Note: No expr support although full syntax (allowed by mysql8) is
# LESS THAN {(expr | value_list) | MAXVALUE}
PART_VALUES_LESSTHAN = (
CaselessLiteral("LESS").suppress()
+ CaselessLiteral("THAN").suppress()
+ (CaselessLiteral("MAXVALUE").setParseAction(upcaseTokens) | PART_VALUE_LIST)
)("p_values_less_than")
PART_NAME = (
QuotedString(quoteChar="`", escQuote="``", escChar="\\", unquoteResults=True)
| OBJECT_NAME
)("part_name")
# Options for partition definitions - engine/comments only for now.
# DO NOT re-use QUOTED_STRING_WITH_QUOTE for these -
# *seems* to trigger a pyparsing bug?
P_ENGINE = (
QuotedString(quoteChar="'", escQuote="''", escChar="\\", unquoteResults=True)
| QuotedString(
quoteChar='"',
escQuote='""',
escChar="\\",
multiline=False,
unquoteResults=True,
)
| CaselessLiteral("innodb")
| CaselessLiteral("ndb")
| CaselessLiteral("rocksdb")
)
P_COMMENT = QuotedString(
quoteChar="'", escQuote="''", escChar="\\", multiline=True, unquoteResults=False
) | QuotedString(
quoteChar='"', escQuote='""', escChar="\\", multiline=True, unquoteResults=False
)
P_OPT_ENGINE = (
Optional(CaselessLiteral("STORAGE")).suppress()
+ CaselessLiteral("ENGINE").suppress()
+ Optional(Literal("=")).suppress()
+ P_ENGINE.setParseAction(upcaseTokens)("pdef_engine")
)
P_OPT_COMMENT = (
CaselessLiteral("COMMENT").suppress()
+ Optional(Literal("=")).suppress()
+ P_COMMENT("pdef_comment")
)
PDEF_OPTIONS = ZeroOrMore((P_OPT_ENGINE | P_OPT_COMMENT))
# e.g. PARTITION p99 VALUES (LESS THAN|IN) ...
PART_DEFS = delimitedList(
Group(
CaselessLiteral("PARTITION").suppress()
+ PART_NAME
+ CaselessLiteral("VALUES").suppress()
+ (PART_VALUES_LESSTHAN | PART_VALUES_IN)
+ PDEF_OPTIONS
)
)
# No fancy expressions yet, just a list of cols OR something nested in ()
PART_EXPR = (
(
LEFT_PARENTHESES
+ delimitedList(
QuotedString(
quoteChar="`", escQuote="``", escChar="\\", unquoteResults=True
)
| OBJECT_NAME
)
+ RIGHT_PARENTHESES
)("via_list")
# `RANGE expr` support (test_parts_range_with_expr)
| nestedExpr()("via_nested_expr")
)("p_expr")
SUBTYPE_LINEAR = (Optional(CaselessLiteral("LINEAR")).setParseAction(upcaseTokens))(
"p_subtype"
)
# Match: [LINEAR] HASH (expr)
PTYPE_HASH = (
SUBTYPE_LINEAR
+ (CaselessLiteral("HASH").setParseAction(upcaseTokens))("part_type")
+ nestedExpr()("p_hash_expr") # Lousy approximation, needs post processing
)
# Match: [LINEAR] KEY [ALGORITHM=1|2] (column_list)
PART_ALGO = (
CaselessLiteral("ALGORITHM").suppress()
+ Literal("=").suppress()
+ Word(alphanums)
)("p_algo")
PTYPE_KEY = (
SUBTYPE_LINEAR
+ (CaselessLiteral("KEY").setParseAction(upcaseTokens))("part_type")
+ Optional(PART_ALGO)
+ Literal("(") # don't suppress here
+ Optional(PART_FIELD_LIST) # e.g. `PARTITION BY KEY() PARTITIONS 2` is valid
+ Literal(")")
)
PART_COL_LIST = (
(CaselessLiteral("COLUMNS").setParseAction(upcaseTokens))("p_subtype")
+ LEFT_PARENTHESES
+ PART_FIELD_LIST
+ RIGHT_PARENTHESES
)
PTYPE_RANGE = (CaselessLiteral("RANGE").setParseAction(upcaseTokens))(
"part_type"
) + (PART_COL_LIST | PART_EXPR)
PTYPE_LIST = (CaselessLiteral("LIST").setParseAction(upcaseTokens))("part_type") + (
PART_COL_LIST | PART_EXPR
)
@classmethod
def generate_rule(cls):
# The final rule for the whole statement match
return (
cls.WORD_CREATE
+ cls.WORD_TABLE
+ cls.IF_NOT_EXIST
+ cls.TABLE_NAME
+ cls.LEFT_PARENTHESES
+ cls.COLUMN_LIST
+ Optional(cls.PRI_KEY_DEF)
+ cls.IDX_DEF
+ cls.CONSTRAINT
+ cls.RIGHT_PARENTHESES
+ cls.TABLE_OPTION("table_options")
+ cls.PARTITION
)
@classmethod
def get_parser(cls):
if not cls._parser:
cls._parser = cls.generate_rule()
return cls._parser
@classmethod
def gen_partitions_parser(cls):
# Init full parts matcher only on demand
# invalid_partition_prefix - used to detect any invalid prefix
# attached to the number of partitions. The prefix is used
# later on to flag invalid schemas.
return (
Combine(Optional(Literal("/*!") + Word(nums))).suppress()
+ CaselessLiteral("PARTITION")
+ CaselessLiteral("BY")
+ (cls.PTYPE_HASH | cls.PTYPE_KEY | cls.PTYPE_RANGE | cls.PTYPE_LIST)
+ Optional(
CaselessLiteral("PARTITIONS")
+ Optional(Combine(Regex("[^0-9]")))("invalid_partition_prefix")
+ Word(nums)("num_partitions")
)
+ Optional(
cls.LEFT_PARENTHESES
+ cls.PART_DEFS("part_defs")
+ cls.RIGHT_PARENTHESES
)
)
@classmethod
def get_partitions_parser(cls):
if not cls._partitions_parser:
cls._partitions_parser = cls.gen_partitions_parser()
return cls._partitions_parser
@classmethod
def parse_partitions(cls, parts) -> ParseResults:
try:
return cls.get_partitions_parser().parseString(parts)
except ParseException as e:
raise ParseError(f"Error parsing partitions: {e.line}, {e.column}")
@classmethod
def parse(cls, sql):
try:
if not isinstance(sql, str):
sql = sql.decode("utf-8")
result = cls.get_parser().parseString(sql)
except ParseException as e:
raise ParseError(
"Failed to parse SQL, unsupported syntax: {}".format(e),
e.line,
e.column,
)
inline_pri_exists = False
table = models.Table()
table.name = result.table_name
table_options = [
"engine",
"charset",
"collate",
"row_format",
"key_block_size",
"compression",
"auto_increment",
"comment",
]
for table_option in table_options:
if table_option in result:
setattr(table, table_option, result.get(table_option))
if "partition" in result:
# pyparsing will convert newline into two after parsing. So we
# need to dedup here
table.partition = result.partition.replace("\n\n", "\n")
try:
presult = cls.parse_partitions(table.partition)
table.partition_config = cls.partition_to_model(presult)
except ParseException as e:
raise ParseError(
f"Failed to parse partitions config, unsupported syntax {e},"
f" line: {e.line} col {e.column}"
)
except PartitionParseError as mpe:
raise ParseError(
f"Failed to init model from partitions config: {mpe}, "
f"ParseResult: {presult.dump()}\nRaw: {table.partition}"
)
if "constraint" in result:
table.constraint = result.constraint
for column_def in result.column_list:
if column_def.column_type == "ENUM":
column = models.EnumColumn()
for enum_value in column_def.enum_value_list:
column.enum_list.append(enum_value)
elif column_def.column_type == "SET":
column = models.SetColumn()
for set_value in column_def.set_value_list:
column.set_list.append(set_value)
elif column_def.column_type in ("TIMESTAMP", "DATETIME"):
column = models.TimestampColumn()
if "on_update" in column_def:
if "on_update_ts_len" in column_def:
column.on_update_current_timestamp = "{}({})".format(
column_def.on_update, column_def.on_update_ts_len
)
else:
column.on_update_current_timestamp = column_def.on_update
else:
column = models.Column()
column.name = column_def.column_name
column.column_type = column_def.column_type
if column.column_type == "JSON":
table.has_80_features = True
# We need to check whether each column property exist in the
# create table string, because not specifying a "COMMENT" is
# different from specifying "COMMENT" equals to empty string.
# The former one will ends up being
# column=None
# and the later one being
# column=''
if "comment" in column_def:
column.comment = column_def.comment
if "nullable" in column_def:
if column_def.nullable == "NULL":
column.nullable = True
elif column_def.nullable == "NOT NULL":
column.nullable = False
if "unsigned" in column_def:
if column_def.unsigned == "UNSIGNED":
column.unsigned = True
if "default" in column_def:
if "ts_len" in column_def:
column.default = "{}({})".format(
column_def.default, column_def.ts_len
)
else:
column.default = column_def.default
if "is_bit" in column_def:
column.is_default_bit = True
if "charset" in column_def:
column.charset = column_def.charset
if "length" in column_def:
column.length = column_def.length
if "collate" in column_def:
column.collate = column_def.collate
if "auto_increment" in column_def:
column.auto_increment = True
if "primary" in column_def:
idx_col = models.IndexColumn()
idx_col.name = column_def.column_name
table.primary_key.column_list.append(idx_col)
inline_pri_exists = True
table.column_list.append(column)
if "pri_list" in result:
if inline_pri_exists:
raise ParseError("Multiple primary keys defined")
table.primary_key.name = "PRIMARY"
for col in result.pri_list:
for name, length, order in col:
idx_col = models.IndexColumn()
idx_col.name = name
idx_col.order = order
if order.upper() == "DESC":
table.has_80_features = True
if length:
idx_col.length = length
table.primary_key.column_list.append(idx_col)
if "idx_key_block_size" in result:
table.primary_key.key_block_size = result.pri_key_block_size
if "idx_comment" in result:
table.primary_key.comment = result.idx_comment
if "index_section" in result:
for idx_def in result.index_section:
idx = models.TableIndex()
idx.name = idx_def.index_name
if "idx_key_block_size" in idx_def:
idx.key_block_size = idx_def.idx_key_block_size
if "idx_comment" in idx_def:
idx.comment = idx_def.idx_comment
if "idx_using" in idx_def:
idx.using = idx_def.idx_using
if "key_type" in idx_def:
idx.key_type = idx_def.key_type
if "unique" in idx_def:
idx.is_unique = True
for col in idx_def.index_col_list:
for col_def in col:
if len(col_def) == 4 and col_def[1].upper() == "AS":
(document_path, word_as, key_type, length) = col_def
idx_col = models.DocStoreIndexColumn()
idx_col.document_path = document_path
idx_col.key_type = key_type
if length:
idx_col.length = length
idx.column_list.append(idx_col)
else:
(name, length, order) = col_def
idx_col = models.IndexColumn()
idx_col.name = name
idx_col.order = order
if order.upper() == "DESC":
table.has_80_features = True
if length:
idx_col.length = length
idx.column_list.append(idx_col)
table.indexes.append(idx)
return table
@classmethod
def partition_to_model(cls, presult: ParseResults) -> models.PartitionConfig:
# Convert ParseResults from parsing a partitions config into a
# model. This can throw a PartitionParseError
mytype = presult.get("part_type", None)
mysubtype = presult.get("p_subtype", None)
if (
(not mytype and not mysubtype)
or mytype not in models.PartitionConfig.KNOWN_PARTITION_TYPES
or (
mysubtype is not None
and mysubtype not in models.PartitionConfig.KNOWN_PARTITION_SUBTYPES
)
):
raise PartitionParseError(
"partition_to_model Cannot init mode.PartitionConfig: "
f"type {mytype} subtype {mysubtype}"
)
pc = models.PartitionConfig()
pc.part_type = mytype
pc.p_subtype = mysubtype
def _strip_ticks(fields: Union[str, List[str]]) -> Union[str, List[str]]:
if isinstance(fields, str):
return fields.replace("`", "")
return [_strip_ticks(f) for f in fields]
if presult.get("invalid_partition_prefix"):
raise PartitionParseError(
f"Partition type {pc.part_type} cannot "
"have invalid partition number prefix defined"
)
# set fields_or_expr, full_type
if (
pc.part_type == models.PartitionConfig.PTYPE_LIST
or pc.part_type == models.PartitionConfig.PTYPE_RANGE
):
pc.num_partitions = len(presult.get("part_defs", []))
if pc.num_partitions == 0:
raise PartitionParseError(
f"Partition type {pc.part_type} MUST have partitions defined"
)
pc.part_defs = _process_partition_definitions(presult.part_defs)
if not pc.p_subtype:
pc.full_type = pc.part_type
pc.via_nested_expr = (
"via_nested_expr" in presult and "via_list" not in presult
)
pc.fields_or_expr = presult.p_expr.asList()
if pc.via_nested_expr:
# strip backticks e.g. to_days(`date`) -> [to_days, [date]]
pc.fields_or_expr = _strip_ticks(pc.fields_or_expr)
else:
pc.full_type = f"{pc.part_type} {pc.p_subtype}"
pc.fields_or_expr = presult.field_list.asList()
elif pc.part_type == models.PartitionConfig.PTYPE_KEY:
pc.full_type = (
pc.part_type if not pc.p_subtype else f"{pc.p_subtype} {pc.part_type}"
)
pc.num_partitions = int(presult.get("num_partitions", 1))
fl = presult.get("field_list", None)
pc.fields_or_expr = fl.asList() if fl else []
# This is the only place p_algo is valid. algorithm_for_key
algo_result = presult.get("p_algo")
if algo_result and len(algo_result.asList()) > 0:
pc.algorithm_for_key = int(algo_result.asList()[0])
elif pc.part_type == models.PartitionConfig.PTYPE_HASH:
pc.full_type = (
pc.part_type if not pc.p_subtype else f"{pc.p_subtype} {pc.part_type}"
)
pc.num_partitions = int(presult.get("num_partitions", 1))
hexpr = presult.get("p_hash_expr", None)
if not hexpr:
raise PartitionParseError(
f"Partition type {pc.part_type} MUST have p_hash_expr defined"
)
pc.fields_or_expr = _strip_ticks(hexpr.asList())
else:
# unreachable since we checked for all part_types earlier.
raise PartitionParseError(f"Unknown partition type {pc.part_type}")
# We avoid escaping fields/expr in partitions with backticks since
# its tricky to distinguish between a list of columns and an expression
# e.g. unix_timestamp(ts) - ts could be escaped but unix_ts cannot.
# Our parser will strip out backticks wherever possible. For nestedExpr
# usecases, this is done via _strip_ticks instead.
def _has_backticks(fields: Union[str, List[str]]) -> bool:
if isinstance(fields, list):
return any(_has_backticks(f) for f in fields)
return "`" in fields if isinstance(fields, str) else False
if _has_backticks(pc.fields_or_expr):
raise PartitionParseError(
f"field_or_expr cannot have backticks {pc.fields_or_expr}"
)
if len(pc.part_defs) > 0 and any(
pd.pdef_name.upper() == "NULL" for pd in pc.part_defs
):
# We will disallow this even if raw sql passed in as e.g.
# PARTITION `null` VALUES IN ...
raise PartitionParseError("Partition names may not be literal `null`")
return pc
def parse_create(sql):
return CreateParser.parse(sql)
def _process_partition_definitions(
partdefs: List[ParseResults],
) -> List[models.PartitionDefinitionEntry]:
# Populates partition definitions, applicable only for RANGE/LIST types
res: List[models.PartitionDefinitionEntry] = []
unique_attrs: Set[str] = set()
unique_engines: Set[str] = set()
for item in partdefs:
name = item.get("part_name", None)
if not name:
raise PartitionParseError(f"Missing `part_name` in {item}")
for attrname in models.PartitionConfig.PDEF_TYPE_ATTRIBS:
val_tmp = item.get(attrname, None)
if val_tmp:
unique_attrs.add(attrname)
val_as_list = val_tmp.asList()
is_tuple = "is_tuple" in val_tmp[0]
if (
isinstance(val_as_list, list)
and len(val_as_list) > 0
and not is_tuple
):
# MAXVALUE would show up as ['MAXVALUE'] and (1,2,3) as
# [['1', '2', '3']] so normalize to ['1', '2', '3'] and MAXVALUE
# But not if a tuple of values e.g. ((1, 2, 3))
val_as_list = val_as_list[0]
entry = models.PartitionDefinitionEntry(
pdef_name=name,
pdef_type=attrname,
pdef_value_list=val_as_list,
pdef_comment=item.get("pdef_comment"),
pdef_engine=item.get("pdef_engine") or "INNODB",
is_tuple=is_tuple,
)
unique_engines.add(entry.pdef_engine)
res.append(entry)
break
else:
# did not explicitly break? implies we had neither in/lessthan attribs
raise PartitionParseError(f"Missing `part_name` in {item}")
if len(res) == 0:
raise PartitionParseError("Empty partition definitions")
if len(unique_attrs) > 1:
# Partition defs MUST be all LESS THAN <OR> all IN. Not a mix.
raise PartitionParseError("Partitions cannot be a mix of LESS THAN / IN types")
if len(unique_engines) > 1:
# All partitions must use same engine.
# Its upto linter (not us) to verify that the engine used by partitions and
# the table match.
raise PartitionParseError(
f"Partitions cannot use a mix of ENGINE values {unique_engines}"
)
return res