cdk/parser/expr_parser.cc (1,154 lines of code) (raw):
/*
* Copyright (c) 2015, 2024, Oracle and/or its affiliates.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License, version 2.0, as
* published by the Free Software Foundation.
*
* This program is designed to work with certain software (including
* but not limited to OpenSSL) that is licensed under separate terms, as
* designated in a particular file or component or in included license
* documentation. The authors of MySQL hereby grant you an additional
* permission to link the program and your derivative works with the
* separately licensed software that they have either included with
* the program or referenced in the documentation.
*
* Without limiting anything contained in the foregoing, this file,
* which is part of Connector/C++, is also subject to the
* Universal FOSS Exception, version 1.0, a copy of which can be found at
* https://oss.oracle.com/licenses/universal-foss-exception.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License, version 2.0, for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "expr_parser.h"
PUSH_SYS_WARNINGS_CDK
#include <stdlib.h>
POP_SYS_WARNINGS_CDK
using namespace parser;
using cdk::Expression;
typedef cdk::Expression::Processor Processor;
typedef Processor::List_prc List_prc;
using cdk::Safe_prc;
using cdk::safe_prc;
/*
Set up keyword and operator maps.
*/
// Out-of-class definitions of the static data members of Keyword and Op:
// the keyword map, the unary/binary operator maps keyed by token and by
// keyword, and one static `init` instance of each class.
// NOTE(review): the `init` objects presumably fill the maps from their
// constructors during static initialization -- confirm in expr_parser.h.
Keyword::map_t Keyword::kw_map;
Keyword Keyword::init;
Op::tok_map_t Op::unary_tok_map, Op::binary_tok_map;
Op::kw_map_t Op::unary_kw_map, Op::binary_kw_map;
Op Op::init;
// -------------------------------------------------------------------------
/*
Variant of std::auto_ptr such that after smart_ptr.release() the
pointed object can still be accessed via smart_ptr->xxx() (even
though it is no longer owned by this smart_ptr instance).
*/
template <typename T>
class smart_ptr
  : public cdk::foundation::nocopy
{
  T    *m_ptr;   // the pointed object (may be NULL)
  bool  m_owns;  // true while this instance is responsible for deleting m_ptr

public:

  // Take ownership of ptr (which may be NULL).
  smart_ptr(T *ptr = NULL)
    : m_ptr(ptr), m_owns(true)
  {}

  // Delete the pointed object, but only if it is still owned.
  ~smart_ptr()
  {
    if (m_owns)
      delete m_ptr;
  }

  // Same as reset(ptr), without returning the pointer.
  void operator=(T *ptr)
  {
    reset(ptr);
  }

  /*
    Replace the pointed object with ptr and take ownership of it. The
    previously pointed object is deleted if it was still owned. Returns ptr.
  */
  T* reset(T *ptr)
  {
    if (m_owns)
      delete m_ptr;

    m_owns = true;
    m_ptr = ptr;
    return m_ptr;
  }

  /*
    Give up ownership without forgetting the pointer: the object is returned
    to the caller and stays reachable through operator->().
  */
  T* release()
  {
    m_owns = false;
    return m_ptr;
  }

  // Access the pointed object, whether it is owned or not.
  T* operator->()
  {
    return m_ptr;
  }
};
/*
Sink expression processor that ignores the expression reported
to it.
It is used below in situations where we want to ignore results
of parsing without storing them anywhere.
*/
struct Sink : public Expression::Processor
{
  // Every accessor returns no sub-processor, so nothing reported to a Sink
  // is forwarded anywhere.
  Scalar_prc* scalar() { return nullptr; }
  List_prc*   arr()    { return nullptr; }
  Doc_prc*    doc()    { return nullptr; }
};
/*
  Return prc unchanged when it is not NULL; otherwise return a pointer to
  a shared, function-local Sink processor.
*/
Expression::Processor* ignore_if(Expression::Processor *prc)
{
  static Sink sink;

  if (prc)
    return prc;

  return &sink;
}
bool Expr_parser_base::do_parse(Processor *prc)
{
  /*
    Passing NULL to parse() would make it store the parsed expression. Here
    a NULL prc means "ignore the expression", so it is replaced with the sink
    processor first. Whatever parse() still returns is deleted for safety.
  */
  Expression *leftover = parse(FULL, ignore_if(prc));
  delete leftover;

  return true;
}
// -------------------------------------------------------------------------
/**
castOp ::= CAST LPAREN expr AS castType RPAREN
*/
bool Expr_parser_base::parse_cast(Scalar_prc *prc)
{
  // Not a CAST expression: nothing is consumed and nothing is reported.
  if (!consume_token(Op::CAST))
    return false;

  // CAST is reported as an operator taking two arguments.
  Safe_prc<List_prc> args = safe_prc(prc)->op(Op::name(Op::CAST));

  consume_token_throw(Token::LPAREN, "Expected '(' after CAST");

  args->list_begin();

  // Argument 1: the expression being cast.
  delete parse(FULL, ignore_if(args->list_el()));

  consume_token_throw(
    Keyword::AS,
    "Expected AS after expression inside CAST operator"
  );

  // Argument 2: the textual cast type, reported as a bytes value.
  args->list_el()->scalar()->val()->value(cdk::TYPE_BYTES,
                                          Format_info(),
                                          cdk::bytes(parse_cast_type()));

  args->list_end();

  consume_token_throw(
    Token::RPAREN,
    "Expected ')' closing CAST operator call"
  );

  return true;
}
/**
castType ::=
SIGNED INTEGER?
| UNSIGNED INTEGER?
| CHAR lengthSpec?
| BINARY lengthSpec?
| DECIMAL (lengthSpec | (LPAREN INT COMMA INT RPAREN))?
| TIME
| DATE
| DATETIME
| JSON
lengthSpec ::= LPAREN INT RPAREN
*/
std::string Expr_parser_base::parse_cast_type()
{
std::string type_str;
const Token* token = consume_token();
if (!token)
parse_error("Expected cast type");
Keyword::Type type = Keyword::get(*token);
if (Keyword::NONE == type)
parse_error("Unexpected cast type");
type_str = Keyword::name(type);
switch (type)
{
case Keyword::BINARY:
case Keyword::CHAR:
case Keyword::DECIMAL:
if (cur_token_type_is(Token::LPAREN))
type_str += cast_data_type_dimension(Keyword::DECIMAL == type);
break;
case Keyword::SIGNED:
case Keyword::UNSIGNED:
consume_token(Keyword::Set{ Keyword::INTEGER, Keyword::INT });
type_str += " ";
type_str += Keyword::name(Keyword::INTEGER);
break;
case Keyword::DATE:
case Keyword::DATETIME:
case Keyword::TIME:
case Keyword::INTEGER:
case Keyword::JSON:
break;
default:
parse_error("Unexpected cast type");
}
return type_str;
}
/**
dimension ::= LPAREN INTEGER RPAREN
if double_dimension = true:
LPAREN INT (COMMA INT)? RPAREN
returns textual representation of the parse, like "(N)" or "(N,M)".
*/
std::string Expr_parser_base::cast_data_type_dimension(bool double_dimension)
{
  consume_token_throw(Token::LPAREN, "Expected type dimension specification");

  std::string dims("(");

  // The first dimension is mandatory.
  dims += consume_token_throw(
    Token::INTEGER, "Expected integer type dimension"
  ).get_utf8();

  // A second, comma-separated dimension is accepted only when the caller
  // allows it; the comma itself is optional even then.
  if (double_dimension && consume_token(Token::COMMA))
  {
    dims += ",";
    dims += consume_token_throw(
      Token::INTEGER, "Expected second type dimension after ','"
    ).get_utf8();
  }

  dims += ")";

  consume_token_throw(
    Token::RPAREN, "Expected ')' closing type dimension specification"
  );

  return dims;
}
// -------------------------------------------------------------------------
/*
ident ::=
ID
| QUOTED_ID
*/
/*
  If the current token is a plain or quoted word, consume it, store its
  UTF-8 text in id and return true. Otherwise return false, leaving both
  the token stream and id untouched.
*/
bool Expr_parser_base::get_ident(string &id)
{
  const bool at_ident =
    tokens_available()
    && Token_base::cur_token_type_in({ Token::WORD, Token::QWORD });

  if (!at_ident)
    return false;

  id = consume_token()->get_utf8();
  return true;
}
/*
Assuming that a schema-qualified identifier was just parsed, attempt to
parse a function call if next token starts argument list.
Returns false if this is not the case.
functionCall ::= schemaQualifiedIdent LPAREN argsList? RPAREN
*/
bool
Expr_parser_base::parse_function_call(const cdk::api::Table_ref &func, Scalar_prc *prc)
{
  // Without an opening parenthesis this is not a function call.
  if (!consume_token(Token::LPAREN))
    return false;

  const bool qualified = (NULL != func.schema());

  // Unqualified position(.. IN ..) is reported as locate(...,...)
  const bool parse_position =
    !qualified && Keyword::equal(func.name(), "position");

  List_prc *aprc = NULL;

  if (parse_position)
  {
    Table_ref locate;
    locate.set("locate");
    aprc = safe_prc(prc)->call(locate);
  }
  else
  {
    aprc = safe_prc(prc)->call(func);
  }

  if (aprc)
    aprc->list_begin();

  // Non-empty argument list.
  if (!cur_token_type_is(Token::RPAREN))
  {
    const bool is_trim = !qualified && Keyword::equal(func.name(), "trim");

    if (
      is_trim
      && cur_token_type_in({
           Keyword::BOTH, Keyword::LEADING, Keyword::TRAILING
         })
    )
      unsupported("LEADING, TRAILING or BOTH clause inside function TRIM()");

    // The first argument of POSITION() is parsed at the COMP level, which
    // leaves the IN keyword for parse_special_args().
    delete parse(parse_position ? COMP : FULL, aprc ? aprc->list_el() : NULL);

    // A comma starts a regular argument list; anything else may be one of
    // the special per-function syntaxes.
    if (consume_token(Token::COMMA))
      parse_argslist(aprc);
    else
      parse_special_args(func, aprc);
  }

  if (aprc)
    aprc->list_end();

  consume_token_throw(
    Token::RPAREN,
    "Expected ')' to close function argument list"
  );

  return true;
}
/*
  Handle the non-standard argument syntax of a few unqualified functions
  after their first argument has been parsed:

  - CHAR(... USING ...)   : reported as unsupported
  - TRIM(... FROM ...)    : reported as unsupported
  - POSITION(... IN expr) : IN is required, then expr is parsed and reported
                            as the next argument.

  Schema-qualified functions and all other names are left alone.
*/
void
Expr_parser_base::parse_special_args(
  const cdk::api::Table_ref &func,
  Expression::List::Processor *aprc
)
{
  if (func.schema())
    return;

  if (Keyword::equal(func.name(), "char"))
  {
    if (cur_token_type_is(Keyword::USING))
      unsupported("USING clause inside function CHAR()");
  }
  else if (Keyword::equal(func.name(), "trim"))
  {
    if (cur_token_type_is(Keyword::FROM))
      unsupported("FROM clause inside function TRIM()");
  }
  else if (Keyword::equal(func.name(), "position"))
  {
    if (!consume_token(Keyword::IN))
      parse_error("Expected IN inside POSITION(... IN ...)");

    delete parse(FULL, aprc ? aprc->list_el() : NULL);
  }
}
/*
Original grammar:
// [[schema.]table.]ident
columnIdent ::= (ident '.' (ident '.')?)? ident
('->' (('$' documentPath) | ("'$" documentPath "'")) )?
is rewritten as:
columnIdent ::= schemaQualifiedIdent columnIdent1
columnIdent1 ::= ('.' ident)? ('->' ( columnIdentDocPath
| "'" columnIdentDocPath "'" ))?
columnIdentDocPath ::= documentField // but require DOLLAR prefix
*/
/*
Parse a schema-qualified identifier and store it as table/schema
name of m_col_ref member. Schema name is optional.
If types is not NULL then types of the consumed tokens are stored in this
array.
*/
bool Expr_parser_base::parse_schema_ident(Token::Type (*types)[2])
{
  // Start with both recorded token types cleared.
  if (types)
  {
    for (Token::Type &slot : *types)
      slot = Token::Type(0);
  }

  if (!tokens_available())
    return false;

  // Record the type of the first token before trying to consume it.
  if (types)
    (*types)[0] = peek_token()->get_type();

  string first;

  if (!get_ident(first))
    return false;

  // A single identifier is stored as a table name without schema.
  m_col_ref.m_table_ref.set(first);

  if (!consume_token(Token::DOT))
    return true;

  // "first." was seen: the second identifier is required. Note that on
  // failure m_col_ref keeps the name stored above.
  if (!tokens_available())
    return false;

  if (types)
    (*types)[1] = peek_token()->get_type();

  string second;

  if (!get_ident(second))
    return false;

  // Re-interpret as <schema>.<table>
  m_col_ref.m_table_ref.set(second, first);
  return true;
}
/*
  Parse a complete column identifier: a schema-qualified identifier followed
  by the rest of the column reference (see parse_column_ident1()), which is
  reported to prc.
*/
void Expr_parser_base::parse_column_ident(Processor *prc)
{
  const bool have_ident = parse_schema_ident();

  if (!have_ident)
    parse_error("Expected a column identifier");

  parse_column_ident1(prc);
}
/*
  Complete parsing of a column identifier and report it to prc.

  Assumes that a (possibly schema qualified) identifier has already been
  seen and is stored in m_col_ref.table(). An optional ".<ident>" and an
  optional "->" / "->>" document path follow.
*/
void Expr_parser_base::parse_column_ident1(Processor *prc)
{
  if (consume_token(Token::DOT))
  {
    string column;

    if (!get_ident(column))
      parse_error("Expected identifier after '.'");

    // The table part was already initialized in parse_schema_ident().
    m_col_ref.set_name(column);
  }
  else
  {
    // No third component: what parse_schema_ident() stored as
    // [<schema>.]<table> is really [<table>.]<column>.
    auto table = m_col_ref.table();
    assert(table);

    if (!table->schema())
      m_col_ref.set(table->name());
    else
      m_col_ref.set(table->name(), table->schema()->name());
  }

  Safe_prc<Processor> sprc(prc);
  auto next = peek_token();

  const bool has_path =
    next
    && (next->get_type() == Token::ARROW || next->get_type() == Token::ARROW2);

  // Plain column reference without a document path.
  if (!has_path)
  {
    sprc->scalar()->ref(m_col_ref,nullptr);
    return;
  }

  Safe_prc<cdk::Expr_processor::Args_prc> args = nullptr;

  if (next->get_type() == Token::ARROW2)
  {
    // "->>" is reported as JSON_UNQUOTE(<column reference with path>)
    Table_ref json_unquote;
    json_unquote.set("JSON_UNQUOTE");
    args = sprc->scalar()->call(json_unquote);
    args->list_begin();

    // From now on the reference is reported as the function's argument.
    sprc = args->list_el();
  }

  // Consume the arrow token.
  consume_token();

  cdk::Doc_path_storage path;

  if (!Token_base::cur_token_type_in({ Token::QSTRING, Token::QQSTRING }))
  {
    parse_document_field(&path, true);
  }
  else
  {
    // The path was given as a quoted string: tokenize the string contents
    // and parse them with a separate parser instance.
    Tokenizer toks(consume_token()->get_bytes());

    It first = toks.begin();
    It last = toks.end();

    Expr_parser_base path_parser(first, last, m_parser_mode);

    // TODO: Translate parse errors
    path_parser.parse_document_field(&path, true);

    if (first != last)
      parse_error("Unexpected characters in a quoted path component");
  }

  sprc->scalar()->ref(m_col_ref,&path);

  // args is only set for "->>"; it is a Safe_prc, so this call is still
  // made when it wraps no processor.
  args->list_end();
}
// -------------------------------------------------------------------------
/**
The original grammar was:
documentField ::= fieldId [documentPath] | "$" [ documentPath ]
Which makes "*", "**.foo" or "*.foo" not valid field specifications
while "$[3]" is a valid specification.
We modify the grammar so that "$[..]" is not valid while "*.." or "**.."
are valid:
documentField ::=
| DOLLAR documentPathLeadingDot?
| documentPath
The grammar of documentPath was adjusted so that the first
path item can not be an array item ("[n]" or "[*]") and we can request
a leading DOT before member items (see parse_document_path()).
If prefix is true, only the first form starting with DOLLAR prefix is
accepted.
*/
void Expr_parser_base::parse_document_field(Path_prc *prc, bool prefix)
{
  if (!consume_token(Token::DOLLAR))
  {
    // Without the leading '$' only the second form (plain documentPath) is
    // possible, and only if the caller did not insist on the prefix.
    if (prefix)
      parse_error("Expected '$' to start a document path");

    if (!parse_document_path(prc, false))
      parse_error("Expected a document path");

    return;
  }

  // After '$' a path is optional, but it must start with a dot. A bare "$"
  // denotes the whole document.
  const bool has_path = parse_document_path(prc, true);

  if (!has_path)
    prc->whole_document();
}
/*
Parse a document field path with a given initial member segment.
*/
void Expr_parser_base::parse_document_field(const string &first, Path_prc *prc)
{
Safe_prc<Path_prc> sprc = prc;
sprc->list_begin();
sprc->list_el()->member(first);
parse_document_path1(prc);
sprc->list_end();
}
/*
Parse a document field path with two given initial member segments.
*/
void Expr_parser_base::parse_document_field(const string &first,
const string &second,
Path_prc *prc)
{
Safe_prc<Path_prc> sprc = prc;
sprc->list_begin();
sprc->list_el()->member(first);
sprc->list_el()->member(second);
parse_document_path1(prc);
sprc->list_end();
}
/**
Original Grammar:
documentPath ::= documentPathItem* documentPathLastItem
documentPathItem ::=
documentPathLastItem
| DOUBLESTAR
documentPathLastItem ::=
ARRAYSTAR
| LSQBRACKET INT RSQBRACKET
| DOTSTAR
| DOT documentPathMember
documentPathMember ::=
ID
| STRING1
This grammar has few flaws:
1. It allows a document path to start with array location, which is not
correct - array locations should be possible only after a path to some
array member.
2. It always requires a DOT before a member element, but in some contexts
we want a document path like "foo.bar.baz" to start without a dot.
To deal with this the grammar has been changed and require_dot parameter
has been added. Modified grammar:
documentPath ::= documentPathFirstItem documentPathItem*
documentPathFirstItem ::=
| DOT? documentPathMember
| DOUBLESTAR
documentPathItem ::=
| DOT documentPathMember
| DOUBLESTAR
| documentPathArray
documentPathMember ::=
| MUL
| ID
| STRING1
documentPathArray ::= LSQBRACKET documentPathArrayLoc RSQBRACKET
documentPathArrayLoc ::=
| MUL
| INT
Parameter require_dot tells if the initial dot is required or not.
A check that DOUBLESTAR is not last element of a path is done separately.
Returns true if a valid document path was parsed and reported, false if the
current token did not start a valid document path.
Note: If false is returned then nothing is reported to the processor (not
even an empty list).
*/
bool Expr_parser_base::parse_document_path(Path_prc *prc, bool require_dot)
{
  /*
    The helpers called below (parse_docpath_member() and friends) report to
    a path element processor, while prc is a list processor: before the
    first element is reported, prc->list_begin() and prc->list_el() must be
    called. But whether there is anything to report only becomes known
    inside those helpers.

    Path_el_reporter wraps the path processor and postpones list_begin() and
    list_el() until a path element is actually reported. If nothing is
    reported, neither is called; list_end() is forwarded only if list_begin()
    was forwarded before.
  */

  struct Path_el_reporter
    : public Path_prc
    , public Path_prc::Element_prc
  {
    using Element_prc::string;
    using Element_prc::index_t;

    Safe_prc<Path_prc>  m_prc;
    bool                m_started;

    Path_el_reporter(Path_prc *prc)
      : m_prc(prc), m_started(false)
    {}

    // Path_prc: list callbacks are deferred, and this object acts as the
    // element processor of its own list.

    void list_begin()
    {
      if (!m_started)
        m_prc->list_begin();
      m_started = true;
    }

    void list_end()
    {
      if (m_started)
        m_prc->list_end();
    }

    Element_prc* list_el()
    {
      return this;
    }

    void whole_document()
    {
      m_prc->whole_document();
    }

    // Element_prc: each callback first makes sure the list was started and
    // then forwards to a fresh element processor of the wrapped list.

    void member(const string &name)
    {
      list_begin();
      m_prc->list_el()->member(name);
    }

    void any_member()
    {
      list_begin();
      m_prc->list_el()->any_member();
    }

    void index(index_t ind)
    {
      list_begin();
      m_prc->list_el()->index(ind);
    }

    void any_index()
    {
      list_begin();
      m_prc->list_el()->any_index();
    }

    void any_path()
    {
      list_begin();
      m_prc->list_el()->any_path();
    }
  }
  el_reporter(prc);

  // documentPathFirstItem

  const bool double_star = (NULL != consume_token(Token::DOUBLESTAR));

  if (double_star)
  {
    el_reporter.any_path();
  }
  else if (!parse_docpath_member_dot(&el_reporter))
  {
    // No "**" and no ".member": the only remaining option is a member
    // without a leading dot, which is acceptable only if the dot is not
    // required.
    if (require_dot || !parse_docpath_member(&el_reporter))
      return false;
  }

  // the rest of the path (if any)

  const bool has_more = parse_document_path1(&el_reporter);

  if (double_star && !has_more)
    parse_error("Document path ending in '**'");

  el_reporter.list_end();

  return true;
}
/*
Parse the remainder of a document path after the first item, that is, a
possibly empty sequence of documentPathItem strings.
The items are reported to the given Path_prc without calling list_begin() or
list_end() (which is assumed to be done by the caller).
Returns true if at least one path item component was parsed.
*/
bool Expr_parser_base::parse_document_path1(Path_prc *prc)
{
  Safe_prc<Path_prc> sprc = prc;

  // Whether the most recently parsed item was "**", and whether any item
  // was parsed at all.
  bool last_double_star = false;
  bool has_item = false;

  // Every path item starts with one of these three tokens.
  while (cur_token_type_in({ Token::DOUBLESTAR, Token::DOT, Token::LSQBRACKET }))
  {
    bool double_star = false;

    if (consume_token(Token::DOUBLESTAR))
    {
      sprc->list_el()->any_path();
      double_star = true;
    }
    else if (!parse_docpath_member_dot(sprc) && !parse_docpath_array(sprc))
    {
      // Nothing was parsed in this round: leave the flags as they were.
      break;
    }

    last_double_star = double_star;
    has_item = true;
  }

  if (last_double_star)
    parse_error("Document path ending in '**'");

  return has_item;
}
/**
documentPathMember ::=
| MUL
| ID
| STRING1
TODO: Does STRING1 differ from plain STRING in any way?
*/
bool Expr_parser_base::parse_docpath_member(Path_prc *prc)
{
  const Token *tok = peek_token();

  if (!tok)
    return false;

  const Token::Type kind = tok->get_type();

  if (Token::STAR == kind)
  {
    safe_prc(prc)->list_el()->any_member();
  }
  else if (
    Token::WORD == kind || Token::QQSTRING == kind || Token::QSTRING == kind
  )
  {
    safe_prc(prc)->list_el()->member(tok->get_text());
  }
  else
  {
    // Not a member item: nothing is consumed and nothing is reported.
    return false;
  }

  // The token is consumed only after it has been reported, because tok
  // refers to the current token.
  consume_token();
  return true;
}
/*
  Parse DOT documentPathMember. Returns false if the current token is not
  a DOT. Once the DOT is consumed, a member item is mandatory.
*/
bool Expr_parser_base::parse_docpath_member_dot(Path_prc *prc)
{
  if (!consume_token(Token::DOT))
    return false;

  const bool has_member = parse_docpath_member(prc);

  if (!has_member)
    parse_error("Expected member name or '*' after '.' in a document path");

  return true;
}
/**
documentPathArray ::= LSQBRACKET documentPathArrayLoc RSQBRACKET
documentPathArrayLoc ::=
| MUL
| INT
*/
bool Expr_parser_base::parse_docpath_array(Path_prc *prc)
{
  typedef Path_prc::Element_prc::index_t  array_index_t;

  if (!consume_token(Token::LSQBRACKET))
    return false;

  if (!consume_token(Token::STAR))
  {
    // Not "[*]", so an integer index is required.
    if (!cur_token_type_is(Token::INTEGER))
      parse_error("Expected '*' or integer index after '[' in a document path");

    uint64_t raw;

    try {
      raw = strtoui(consume_token()->get_utf8());
    }
    catch (const Numeric_conversion_error &e)
    {
      parse_error(e.msg());
      throw;  // quiet compile warnings
    }

    // The index is parsed as a 64-bit value and must fit the index type of
    // the processor.
    if (raw > std::numeric_limits<array_index_t>::max())
      parse_error("Array index too large");

    safe_prc(prc)->list_el()->index(array_index_t(raw));
  }
  else
  {
    safe_prc(prc)->list_el()->any_index();
  }

  consume_token_throw(
    Token::RSQBRACKET,
    "Expected ']' to close a document path array component"
  );

  return true;
}
// -------------------------------------------------------------------------
/*
  Try to interpret a document path as a column reference.

  A path made only of 1, 2 or 3 named members is stored in column, each
  new member becoming the column name and shifting the earlier ones to the
  table and schema positions. Returns false if the path contains any other
  kind of element or more than 3 members; column may have been partially
  modified in that case.
*/
bool column_ref_from_path(cdk::Doc_path &path, parser::Column_ref &column)
{
  struct Path_prc
    : public cdk::Doc_path::Processor
    , public cdk::Doc_path::Processor::Element_prc
  {
    unsigned m_len;             // number of members seen so far
    parser::Column_ref &m_col;  // the column reference being built
    bool m_ret;                 // false once the path cannot be a column

    Path_prc(parser::Column_ref &col)
      : m_len(0), m_col(col), m_ret(true)
    {}

    // This object processes the elements of its own list.
    Element_prc* list_el()
    {
      return this;
    }

    void member(const Element_prc::string &name)
    {
      const unsigned pos = m_len++;

      if (0 == pos)
      {
        // <column>
        m_col.set(name);
      }
      else if (1 == pos)
      {
        // <table>.<column>
        m_col.set(name, m_col.name());
      }
      else if (2 == pos)
      {
        // <schema>.<table>.<column>
        assert(m_col.table());
        m_col.m_table_ref.set(m_col.name(), m_col.table()->name());
        m_col.set_name(name);
      }
      else
      {
        // Too many path elements
        m_ret = false;
      }
    }

    // Any element other than a named member rules out a column reference.

    void index(uint32_t)  { m_ret = false; }
    void any_member()     { m_ret = false; }
    void any_index()      { m_ret = false; }
    void any_path()       { m_ret = false; }
    void whole_document() { m_ret = false; }
  }
  prc(column);

  path.process(prc);

  return prc.m_ret;
}
/**
atomicExpr ::=
placeholder
| columnIdent // TABLE mode
| documentField // DOCUMENT mode
| functionCall
| groupedExpr
| unaryOp
| castOp
| literal
| jsonDoc
| array
placeholder ::= COLON ID
groupedExpr ::= LPAREN expr RPAREN
unaryOp ::=
BANG atomicExpr
| NEG atomicExpr
| PLUS atomicExpr
| MINUS atomicExpr
literal ::=
INT
| FLOAT
| STRING1
| STRING2
| NULL
| FALSE
| TRUE
We extend this grammar with nullary operators:
nullaryOp ::= MUL
TODO: "default" operator
*/
/*
  Parse an atomic expression (see the grammar above).

  If prc is not NULL, the expression is reported to it and NULL is returned.
  If prc is NULL, the expression is stored and the stored object is returned
  (the caller takes ownership). Syntax errors are reported with
  parse_error().
*/
Expression* Expr_parser_base::parse_atomic(Processor *prc)
{
  if (!tokens_available())
    parse_error("Expected an expression");

  Token::Type type = peek_token()->get_type();

  switch (type)
  {
  // jsonDOC
  case Token::LCURLY:
    return parse(DOC, prc);

  // array
  case Token::LSQBRACKET:
    return parse(ARR, prc);

  // groupedExpr
  case Token::LPAREN:
    {
      consume_token();
      smart_ptr<Expression> res(parse(FULL, prc));
      consume_token_throw(
        Token::RPAREN,
        "Expected ')' to close parenthesized sub-expression"
      );
      return res.release();
    }

  default: break;
  }

  /*
    If prc is NULL, we are supposed to store and return the result
    of parsing. In that case initialize stored variable with appropriate
    storage object and set prc to point at it so that expression will
    be reported to the storage object.

    Note: if prc is not NULL then stored remains empty and stored.release()
    would produce NULL as required in this case.
  */

  smart_ptr<Stored_expr> stored;

  if (!prc)
    prc = stored.reset(new Stored_any());

  Safe_prc<Processor> sprc(prc);

  // parameters, nullary operators, CAST

  if (consume_token(Token::COLON))
  {
    sprc->scalar()->param(consume_token_throw(
      Token::WORD,
      "Expected parameter name after ':'"
    ).get_text());
    return stored.release();
  }

  if (consume_token(Op::STAR))
  {
    sprc->scalar()->op(Op::name(Op::STAR));
    // NOTE: arguments processor is ignored as there are no arguments
    return stored.release();
  }

  if (parse_cast(prc->scalar()))
  {
    return stored.release();
  }

  // Unary operator.

  List_prc *argsp = NULL;
  bool neg = false;

  Op::Type op = Op::get_unary(*peek_token());

  switch (op)
  {
  case Op::PLUS:
  case Op::MINUS:
    {
      consume_token();
      if (Token_base::cur_token_type_in({ Token::NUMBER, Token::INTEGER }))
      {
        // treat as numeric literal with possibly negated value
        neg = (Op::MINUS == op);
        break;
      }
      // otherwise report as unary operator
      argsp = sprc->scalar()->op(Op::name(op));
      break;
    }

  case Op::NEG:
    consume_token();
    argsp = sprc->scalar()->op(Op::name(Op::NEG));
    break;

  case Op::NOT:
    consume_token();
    argsp = sprc->scalar()->op(Op::name(Op::NOT));
    break;

  case Op::BITNEG:
    consume_token();
    argsp = sprc->scalar()->op(Op::name(Op::BITNEG));
    break;

  default:
    break;  // will continue with literal parsing
  }

  // Report the single argument of the unary operator

  if (argsp)
  {
    argsp->list_begin();
    delete parse(ATOMIC, argsp->list_el());
    argsp->list_end();
    return stored.release();
  }

  /*
    A unary operator may have been consumed above without its operand being
    parsed (when the processor returned no argument list). If the operator
    was the last token, there is nothing left to look at: report a syntax
    error instead of relying on an assertion, which is compiled out in
    release builds and would be followed by a NULL token dereference below.
  */

  if (!tokens_available())
    parse_error("Expected an expression");

  // Literal value

  Keyword::Type kw = Keyword::get(*peek_token());

  switch (kw)
  {
  case Keyword::L_NULL:
    sprc->scalar()->val()->null();
    consume_token();
    return stored.release();

  case Keyword::L_TRUE:
  case Keyword::L_FALSE:
    sprc->scalar()->val()->yesno(Keyword::L_TRUE == kw);
    consume_token();
    return stored.release();

  default:
    // continue looking for other literals
    break;
  }

  try {

    switch (peek_token()->get_type())
    {
    case Token::QQSTRING:
    case Token::QSTRING:
      if (m_strings_as_blobs)
      {
        sprc->scalar()->val()->value(
          cdk::TYPE_BYTES, Format_info(), consume_token()->get_bytes()
        );
      }
      else
        sprc->scalar()->val()->str(consume_token()->get_text());
      return stored.release();

    case Token::NUMBER:
      {
        double val = strtod(consume_token()->get_utf8());
        sprc->scalar()->val()->num(neg ? -val : val);
        return stored.release();
      }

    case Token::INTEGER:
      if (neg)
      {
        int64_t val = strtoi(consume_token()->get_utf8());
        sprc->scalar()->val()->num(-val);
      }
      else
      {
        uint64_t val = strtoui(consume_token()->get_utf8());
        sprc->scalar()->val()->num(val);
      }
      return stored.release();

    case Token::HEX:
      if (neg)
      {
        int64_t val = strtoi(consume_token()->get_utf8(), 16);
        sprc->scalar()->val()->num(-val);
      }
      else
      {
        uint64_t val = strtoui(consume_token()->get_utf8(), 16);
        sprc->scalar()->val()->num(val);
      }
      return stored.release();

    default:
      // will continue with functionCall | columnIdent | documentField parsing
      break;
    }

  }
  catch (const Numeric_conversion_error &e)
  {
    // Report numeric conversion failures as parse errors.
    parse_error(e.msg());
  }

  /*
    functionCall | columnIdent | documentField

    It is not possible to tell which of these 3 alternatives we have by
    looking at the current token. Either functionCall or columnIdent or
    documentField can start with something which looks like a schema-qualified
    name: "A" or "A.B".

    For that reason we start with a call to parse_schema_ident() which would
    parse such a schema-qualified name and store it as table/schema name of
    m_col_ref member.

    After this we try to parse a function call and if it fails we try
    columnIdent or documentField, depending on the parsing mode.
  */

  Token::Type types[2];
  bool schema_ident = false;

  m_col_ref.clear();

  /*
    Try to parse schema-qualified identifier, storing the types of the tokens
    that have been consumed - this information is needed in case parsing
    schema identifier fails.

    Note: it is important that parse_schema_ident() stores consumed tokens
    in m_col_ref even if it fails in the end.
  */

  schema_ident = parse_schema_ident(&types);

  /*
    If parse_schema_ident() succeeded, and we have the result in
    m_col_ref.table(), we see if it is not a beginning of a function call.
    If parse_function_call() succeeds then we are done.
  */

  if (schema_ident)
  {
    assert(m_col_ref.table());

    if (parse_function_call(*m_col_ref.table(), sprc.scalar()))
      return stored.release();
  }

  /*
    Otherwise we must have either a document path (in DOCUMENT mode) or
    a column identifier, possibly followed by a path (in TABLE mode).
  */

  if (Parser_mode::TABLE == m_parser_mode)
  {
    /*
      If we are in the TABLE mode, and parse_schema_ident() failed above, then
      we do not have a valid column identifier which is an error.
    */

    if (!schema_ident)
      parse_error("Expected atomic expression");

    /*
      Otherwise we complete parsing the column identifier and report it to
      the processor.
    */

    parse_column_ident1(prc);
    return stored.release();
  }

  /*
    Here we know that we are in DOCUMENT mode and we are expecting a document
    path. If parse_schema_ident() called above consumed some tokens, we check
    if they were not quoted identifiers. Such identifiers are allowed when
    referring to tables or columns but are invalid in a document path.
  */

  if (Token::QWORD == types[0] || Token::QWORD == types[1])
    parse_error("Expected atomic expression");

  /*
    Now we treat the identifiers "A.B" parsed by parse_schema_ident() and
    stored as table/schema name in m_col_ref (if any), as an initial segment
    of a document field reference and complete parsing the whole document
    field.
  */

  cdk::Doc_path_storage path;

  if (m_col_ref.table() && m_col_ref.table()->schema())
  {
    parse_document_field(
      m_col_ref.table()->schema()->name(),
      m_col_ref.table()->name(),
      &path
    );
  }
  else if (m_col_ref.table())
  {
    parse_document_field(m_col_ref.table()->name(), &path);
  }
  else
  {
    parse_document_field(&path, true);
  }

  sprc->scalar()->ref(path);

  return stored.release();
}
// -------------------------------------------------------------------------
/*
  Parse "<lhs> [<op> <rhs>]" where <op> is one of the binary operators in
  ops, and lhs/rhs name the grammar levels used to parse the operands.

  If prc is not NULL, the expression is reported to it and NULL is returned.
  If prc is NULL, the expression is stored and the stored object is returned
  (the caller takes ownership).
*/
Expression*
Expr_parser_base::left_assoc_binary_op(const Op::Set &ops,
                                       Start lhs, Start rhs,
                                       Processor *prc)
{
  // Store LHS of the expression

  smart_ptr<Expression> stored_lhs(parse(lhs, NULL));

  const Token *t = consume_token(ops);

  if (!t)
  {
    /*
      There is no RHS, so LHS is the whole expression.
      If prc is NULL then we return already stored LHS. Otherwise
      we report stored LHS to the processor.
    */

    if (!prc)
      return stored_lhs.release();

    stored_lhs->process(*prc);
    return NULL;
  }

  Op::Type op = Op::get_binary(*t);

  /*
    If storing operator call expression (prc is NULL), use specialized
    Stored_op class that can re-use already stored LHS expression.
  */

  smart_ptr<Stored_expr> stored;

  if (!prc)
    // Note: Stored_op takes ownership of the stored LHS expr. It remains
    // accessible via stored_lhs-> after release().
    prc = stored.reset(new Stored_op(stored_lhs.release()));

  // pass lhs and rhs as operator arguments

  List_prc *aprc = safe_prc(prc)->scalar()->op(Op::name(op));

  if (aprc)
  {
    aprc->list_begin();

    // Report stored LHS as the 1st argument.

    stored_lhs->process_if(aprc->list_el());
  }

  /*
    Parse the RHS, passing it as the 2nd argument. This is done even if the
    processor is not interested in the arguments (aprc is NULL): the
    operator token has already been consumed, so its operand must be
    consumed too, otherwise it would be left behind in the token stream.
  */

  delete parse(rhs, aprc ? aprc->list_el() : NULL);

  if (aprc)
    aprc->list_end();

  return stored.release();
}
/*
  Parse an expression built with the MUL, DIV and MOD operators: the left
  operand is parsed at the ATOMIC level, the right one at the MUL level.
*/
Expression* Expr_parser_base::parse_mul(Processor *prc)
{
  return left_assoc_binary_op(
    { Op::MUL, Op::DIV, Op::MOD }, ATOMIC, MUL, prc
  );
}
/*
  Parse an expression built with the ADD and SUB operators: the left
  operand is parsed at the MUL level, the right one at the ADD level.
*/
Expression* Expr_parser_base::parse_add(Processor *prc)
{
  return left_assoc_binary_op({ Op::ADD, Op::SUB }, MUL, ADD, prc);
}
/*
  Parse an expression built with the LSHIFT and RSHIFT operators: the left
  operand is parsed at the ADD level, the right one at the SHIFT level.
*/
Expression* Expr_parser_base::parse_shift(Processor *prc)
{
  return left_assoc_binary_op({ Op::LSHIFT, Op::RSHIFT }, ADD, SHIFT, prc);
}
/*
  Parse a bit-level expression: either a BITNEG operator applied to an
  atomic expression, or an expression built with the BITAND, BITOR and
  BITXOR operators (left operand at the SHIFT level, right one at the BIT
  level).
*/
Expression* Expr_parser_base::parse_bit(Processor *prc)
{
  if (!consume_token(Op::BITNEG))
  {
    return left_assoc_binary_op(
      { Op::BITAND, Op::BITOR, Op::BITXOR }, SHIFT, BIT, prc
    );
  }

  // Leading bit negation. If prc is NULL, the expression is stored.

  smart_ptr<Stored_expr> stored;

  if (!prc)
    prc = stored.reset(new Stored_any());

  Safe_prc<Processor::Scalar_prc> sprc(prc->scalar());

  List_prc *argsp = sprc->op(Op::name(Op::BITNEG));

  // No argument processor was returned for the operator: continue parsing
  // at the bit level with the same processor.
  if (!argsp)
    return parse_bit(prc);

  // Report the single argument of the operator.
  argsp->list_begin();
  delete parse(ATOMIC, argsp->list_el());
  argsp->list_end();

  return stored.release();
}
/*
  Parse an expression built with the comparison operators GE, GT, LE, LT,
  EQ and NE: the left operand is parsed at the BIT level, the right one at
  the COMP level.
*/
Expression* Expr_parser_base::parse_comp(Processor *prc)
{
  return left_assoc_binary_op(
    { Op::GE, Op::GT, Op::LE, Op::LT, Op::EQ, Op::NE },
    BIT, COMP, prc
  );
}
/*
  Parse an expression built with the AND operator: the left operand is
  parsed at the ILRI level, the right one at the AND level.
*/
Expression* Expr_parser_base::parse_and(Processor *prc)
{
  Op::Set ops;
  ops.insert(Op::AND);

  return left_assoc_binary_op(ops, ILRI, AND, prc);
}
/*
  Parse an expression built with the OR operator: the left operand is
  parsed at the AND level, the right one at the OR level.
*/
Expression* Expr_parser_base::parse_or(Processor *prc)
{
  Op::Set ops;
  ops.insert(Op::OR);

  return left_assoc_binary_op(ops, AND, OR, prc);
}
// -------------------------------------------------------------------------
/**
Expression Parser EBNF:
note; No repetition, must be connected by logical operators
ilriExpr ::=
compExpr IS NOT? (NULL|TRUE|FALSE)
| compExpr NOT? IN LPAREN argsList? RPAREN
| compExpr NOT? "IN" compExpr
// TODO: we don't know how to report ESCAPE on protocol level
| compExpr NOT? LIKE compExpr //(ESCAPE compExpr)?
| compExpr NOT? RLIKE compExpr //(ESCAPE compExpr)?
| compExpr NOT? BETWEEN compExpr AND compExpr
| compExpr NOT? REGEXP compExpr
| compExpr
*/
/*
  Parse an ilriExpr: a compExpr optionally followed by one of the operators
  IS, IN, LIKE, RLIKE, BETWEEN, REGEXP or OVERLAPS (optionally negated with
  NOT) together with the remaining operand(s) of that operator.

  @param prc  Processor to which the parsed expression is reported. If NULL,
              the expression is stored instead of being reported.

  @return  If prc is NULL, a pointer to the stored expression (released from
           the local smart pointer, so the caller is expected to take care
           of it). Otherwise NULL.

  Syntax errors are reported with parse_error(). SOUNDS LIKE and the ESCAPE
  clause of (R)LIKE are reported with unsupported().
*/
Expression* Expr_parser_base::parse_ilri(Processor *prc)
{
  // Store the first expression.

  smart_ptr<Expression> first(parse(COMP, NULL));

  // Record negation, if present.

  bool neg = (NULL != consume_token(Op::NOT));

  /*
    Look for the main operator.
  */

  Op::Set next;
  next.insert(Op::IS);
  next.insert(Op::IN);
  next.insert(Op::LIKE);
  next.insert(Op::RLIKE);
  next.insert(Op::BETWEEN);
  next.insert(Op::REGEXP);
  next.insert(Op::SOUNDS_LIKE);
  next.insert(Op::OVERLAPS);

  const Token *t = consume_token(next);

  /*
    If we don't see any of the operators and there was no negation
    then we report the first expression as complete ilriExpr.
  */

  if (!t)
  {
    if (neg)
      parse_error("Expected IN, (R)LIKE, BETWEEN, OVERLAPS or REGEXP after NOT");

    // If prc is NULL return already stored expression.

    if (!prc)
      return first.release();

    // Otherwise report stored expression to the processor.

    first->process(*prc);
    return NULL;
  }

  // We have an ilri expression with operator and 2 arguments.

  Op::Type op = Op::get_binary(*t);

  // Handle IS NOT case.

  if (neg && Op::IS == op)
    parse_error("Operator NOT before IS, should be IS NOT");

  // Note: consume_token() replaces contents of *t...

  if (Op::IS == op && consume_token(Op::NOT))
    neg = true;

  /*
    Detect unsupported operators before handling parameters and map
    the operator to its negated / "contains" variant where needed.
  */

  switch (op)
  {
    case Op::SOUNDS_LIKE:
      /*
        The operator token was consumed above; it must be followed by LIKE
        (presumably the consumed token is the SOUNDS keyword). The complete
        operator is not supported. An incomplete one is a syntax error -
        without this check the code would fall into the assert(false)
        branch of the operand switch below.
      */
      if (cur_token_type_is(Keyword::LIKE))
        unsupported("Operator SOUNDS LIKE");
      else
        parse_error("Expected LIKE after SOUNDS");
      break;

    case Op::IS:
      if (neg)
        op = Op::IS_NOT;
      break;

    case Op::IN:
      // IN not followed by '(' is the "contained in" form of the operator.
      if (!cur_token_type_is(Token::LPAREN))
      {
        if (neg)
          op = Op::NOT_CONT_IN;
        else
          op = Op::CONT_IN;
      }
      else
      {
        if (neg)
          op = Op::NOT_IN;
      }
      break;

    case Op::LIKE:
      if (neg)
        op = Op::NOT_LIKE;
      break;

    case Op::RLIKE:
      if (neg)
        op = Op::NOT_RLIKE;
      break;

    case Op::BETWEEN:
      if (neg)
        op = Op::NOT_BETWEEN;
      break;

    case Op::REGEXP:
      if (neg)
        op = Op::NOT_REGEXP;
      break;

    case Op::OVERLAPS:
      if (neg)
        op = Op::NOT_OVERLAPS;
      break;

    default: break;
  }

  /*
    If prc is NULL and we are supposed to store parsed expression, use
    specialized Stored_ilri class that can re-use the already stored first
    part of the expression.

    Note: in that case ownership of the first expression is passed to
    the Stored_ilri object and `first` becomes empty, so it must not be
    dereferenced afterwards.
  */

  const bool first_is_stored = (NULL == prc);
  smart_ptr<Stored_ilri> stored;

  if (first_is_stored)
  {
    prc = stored.reset(new Stored_ilri(first.release()));
  }

  // report the main operator

  Safe_prc<List_prc> aprc = safe_prc(prc)->scalar()->op(Op::name(op));
  aprc->list_begin();

  // 1st argument

  if (first_is_stored)
  {
    /*
      The first operand was handed to the Stored_ilri object when it was
      constructed. We still request the list element, as the original code
      did, so that the processor sees the same sequence of calls.
      NOTE(review): presumably Stored_ilri expects nothing to be reported
      for this element - confirm against the Stored_ilri definition.
    */
    aprc->list_el();
  }
  else
  {
    first->process_if(aprc->list_el());
  }

  // other arguments

  switch (op)
  {
    case Op::IS:
    case Op::IS_NOT:
    {
      t = consume_token();

      if (t)
      {
        switch (Keyword::get(*t))
        {
          case Keyword::L_TRUE:  aprc->list_el()->scalar()->val()->yesno(true); break;
          case Keyword::L_FALSE: aprc->list_el()->scalar()->val()->yesno(false); break;
          case Keyword::L_NULL:  aprc->list_el()->scalar()->val()->null(); break;
          default:
            t = NULL;  // this indicates error
        }
      }

      if (!t)
        parse_error("expected TRUE, FALSE or NULL after IS");

      break;
    }

    case Op::IN:
    case Op::CONT_IN:
    case Op::NOT_IN:
    case Op::NOT_CONT_IN:
    {
      if (consume_token(Token::LPAREN))
      {
        // Note: true flag means that strings will be reported as blobs.
        parse_argslist(aprc, true);
        consume_token_throw(
          Token::RPAREN,
          "Expected ')' to close IN(... expression"
        );
      }
      else
      {
        delete parse(COMP, aprc->list_el());
      }

      break;
    }

    case Op::LIKE:
    case Op::NOT_LIKE:
    case Op::RLIKE:
    case Op::NOT_RLIKE:
    {
      delete parse(COMP, aprc->list_el());

      if (cur_token_type_is(Keyword::ESCAPE))
      {
        unsupported("ESCAPE clause for (R)LIKE operator");
      }

      break;
    }

    case Op::REGEXP:
    case Op::NOT_REGEXP:
      delete parse(COMP, aprc->list_el());
      break;

    case Op::OVERLAPS:
    case Op::NOT_OVERLAPS:
      delete parse(COMP, aprc->list_el());
      break;

    case Op::BETWEEN:
    case Op::NOT_BETWEEN:
      delete parse(COMP, aprc->list_el());
      consume_token_throw(
        Keyword::AND,
        "Expected AND in BETWEEN ... expression"
      );
      delete parse(COMP, aprc->list_el());
      break;

    default: assert(false);
  }

  // close argument list

  aprc->list_end();

  // Empty (NULL) unless the expression was stored because prc was NULL.

  return stored.release();
}
// -------------------------------------------------------------------------
/*
Below we want to use Expr_parser_base with parser templates such
as Doc_parser<> or List_parser<>. These templates assume that the
base parser can be constructed with a constructor which accepts only
2 parameters defining the range of tokens to be parsed.
But the Expr_parser_base constructor also needs the parser mode parameter
and the flag which tells if strings should be reported as blobs.
To fix this, we define the Base_parser<> template, parametrized with the
parser mode and the strings-as-blobs flag, which constructs the required
flavor of the parser from the token range alone.
*/
/*
  Expr_parser_base variant whose parser mode and strings-as-blobs flag are
  fixed at compile time, so that it can be constructed from a token range
  only.
*/
template <Parser_mode::value Mode, bool strings_as_blobs = false>
struct Base_parser
  : public Expr_parser_base
{
  // Forward the token range together with the compile-time settings.
  Base_parser(It &tokens_begin, const It &tokens_end)
    : Expr_parser_base(tokens_begin, tokens_end, Mode, strings_as_blobs)
  {}
};
/*
  Parse a list of arguments from the token range [first, last) using
  a List_parser<> built on Base_parser<Mode, strings_as_blobs>, and report
  it to prc via process_if().
*/
template <Parser_mode::value Mode, bool strings_as_blobs>
void parse_args(Processor::List_prc *prc, It &first, const It &last)
{
  typedef List_parser< Base_parser<Mode, strings_as_blobs> > Args_list_parser;

  Args_list_parser list_parser(first, last);
  list_parser.process_if(prc);
}
/*
  Run-time dispatch on the parser mode: select the parse_args<> instance
  for DOCUMENT mode if mode equals Parser_mode::DOCUMENT and the one for
  TABLE mode in every other case.
*/
template <bool strings_as_blobs>
void parse_args(Parser_mode::value mode, Processor::List_prc *prc,
                It &first, const It &last)
{
  switch (mode)
  {
  case Parser_mode::DOCUMENT:
    parse_args<Parser_mode::DOCUMENT, strings_as_blobs>(prc, first, last);
    break;

  default:
    parse_args<Parser_mode::TABLE, strings_as_blobs>(prc, first, last);
    break;
  }
}
/*
  Parse a list of arguments starting at the current position:

    argsList ::= expr (COMMA expr)*

  The run-time strings_as_blobs flag selects the matching compile-time
  instance of parse_args<>; the parser mode of this parser is passed on.
*/
void
Expr_parser_base::parse_argslist(Processor::List_prc *prc,
                                 bool strings_as_blobs)
{
  if (!strings_as_blobs)
  {
    parse_args<false>(m_parser_mode, prc, cur_pos(), end_pos());
    return;
  }

  parse_args<true>(m_parser_mode, prc, cur_pos(), end_pos());
}
/*
  Parse an array starting at the current position and report it to prc
  via process_if(). An Arr_parser<> built on the Base_parser<> flavor that
  matches this parser's mode is used: DOCUMENT if m_parser_mode equals
  Parser_mode::DOCUMENT, TABLE otherwise.
*/
void Expr_parser_base::parse_arr(Processor::List_prc *prc)
{
  typedef Expression::Scalar::Processor Scalar_processor_t;
  typedef Arr_parser<Base_parser<Parser_mode::DOCUMENT>, Scalar_processor_t>
          Doc_mode_arr_parser;
  typedef Arr_parser<Base_parser<Parser_mode::TABLE>, Scalar_processor_t>
          Tbl_mode_arr_parser;

  if (Parser_mode::DOCUMENT != m_parser_mode)
  {
    Tbl_mode_arr_parser tbl_arr(cur_pos(), end_pos());
    tbl_arr.process_if(prc);
    return;
  }

  Doc_mode_arr_parser doc_arr(cur_pos(), end_pos());
  doc_arr.process_if(prc);
}
/*
  Parse a document starting at the current position and report it to prc
  via process_if(). A Doc_parser<> built on the Base_parser<> flavor that
  matches this parser's mode is used: DOCUMENT if m_parser_mode equals
  Parser_mode::DOCUMENT, TABLE otherwise.
*/
void Expr_parser_base::parse_doc(Processor::Doc_prc *prc)
{
  typedef Expression::Scalar::Processor Scalar_processor_t;
  typedef Doc_parser<Base_parser<Parser_mode::DOCUMENT>, Scalar_processor_t>
          Doc_mode_doc_parser;
  typedef Doc_parser<Base_parser<Parser_mode::TABLE>, Scalar_processor_t>
          Tbl_mode_doc_parser;

  if (Parser_mode::DOCUMENT != m_parser_mode)
  {
    Tbl_mode_doc_parser tbl_doc(cur_pos(), end_pos());
    tbl_doc.process_if(prc);
    return;
  }

  Doc_mode_doc_parser doc_doc(cur_pos(), end_pos());
  doc_doc.process_if(prc);
}
/*
  Parse a sorting order specification: an expression optionally followed
  by ASC or DESC (ASC being the default), with nothing after it.

  The expression is stored first and reported to prc.sort_key(direction)
  only after the whole specification has been examined. Problems are
  reported with parse_error().
*/
void Order_parser::parse(Processor& prc)
{
  It tok_begin = m_tokenizer.begin();
  set_tokens(tok_begin, m_tokenizer.end());

  if (!tokens_available())
    parse_error("Expected sorting order specification");

  // Parse the sort key expression and keep it for later.

  Stored_any sort_expr;
  Expr_parser_base expr_parser(cur_pos(), end_pos(), m_mode);
  expr_parser.process(sort_expr);

  // Sorting direction: ascending unless DESC is given.

  cdk::api::Sort_direction::value direction = cdk::api::Sort_direction::ASC;

  if (tokens_available())
  {
    switch (Keyword::get(*peek_token()))
    {
    case Keyword::DESC:
      direction = cdk::api::Sort_direction::DESC;
      consume_token();
      break;

    case Keyword::ASC:
      // Explicit ASC: direction already has that value.
      consume_token();
      break;

    default:
      parse_error("Expected sorting direction ASC or DESC");
    }
  }

  // Nothing may follow the (optional) direction keyword.

  if (tokens_available())
    parse_error("Unexpected characters after sorting order specification");

  sort_expr.process_if(prc.sort_key(direction));
}
/*
  Parse a projection specification of the form

    <expression> [AS <name>]

  The expression is reported to prc.expr() and the alias, if present, to
  prc.alias(). Problems are reported with parse_error().
*/
void Projection_parser::parse_tbl_mode(Projection_processor& prc)
{
  It tok_begin = m_tokenizer.begin();
  set_tokens(tok_begin, m_tokenizer.end());

  if (!tokens_available())
    parse_error("Expected projection specification");

  Expr_parser_base expr_parser(cur_pos(), end_pos(), m_mode);
  expr_parser.process_if(prc.expr());

  // Without further tokens there is no alias and we are done.

  if (!tokens_available())
    return;

  // Anything after the expression must be AS <name>.

  if (!consume_token(Keyword::AS))
    parse_error("Invalid characters in projection specification,"
                " only AS <name> allowed after the projection expression");

  if (!Token_base::cur_token_type_in({ Token::WORD, Token::QWORD }))
    parse_error("Expected identifier after AS");

  prc.alias(consume_token()->get_text());

  // Nothing may follow the alias.

  if (tokens_available())
    parse_error("Unexpected characters after projection specification");
}
/*
  Parse a projection specification of the form

    <expression> AS <name>

  where, unlike in parse_tbl_mode(), the AS clause is required. The
  expression is stored first and, once the whole specification has been
  examined, reported to prc.key_val(<name>). Problems are reported with
  parse_error().
*/
void Projection_parser::parse_doc_mode(Document_processor& prc)
{
  It tok_begin = m_tokenizer.begin();
  set_tokens(tok_begin, m_tokenizer.end());

  if (!tokens_available())
    parse_error("Expected projection specification");

  /*
    note: passing m_toks.end() directly as constructor argument results
    in "incompatible iterators" exception when comparing iterators (at
    least on win, vs2010). problem with passing temporary object?
  */

  Stored_any proj_expr;
  Expr_parser_base expr_parser(cur_pos(), end_pos(), m_mode);
  expr_parser.process(proj_expr);

  // AS is mandatory on Collections

  if (!consume_token(Keyword::AS))
    parse_error("Expected AS in projection specification");

  if (!Token_base::cur_token_type_in({ Token::WORD, Token::QWORD }))
    parse_error("Expected identifier after AS");

  const string &key_name = consume_token()->get_text();

  // Nothing may follow the name.

  if (tokens_available())
    parse_error("Invalid characters after projection specification");

  proj_expr.process_if(prc.key_val(key_name));
}