lib/elastic_apm/sql/tokenizer.rb (186 lines of code) (raw):
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# frozen_string_literal: true
require 'strscan'
require 'elastic_apm/sql/tokens'
module ElasticAPM
module Sql
# @api private
class Tokenizer
include Tokens
ALPHA = /[[:alpha:]]/.freeze
DIGIT = /[[:digit:]]/.freeze
SPACE = /[[:space:]]+/.freeze
def initialize(input)
@input = input
@scanner = StringScanner.new(input)
@byte_start = 0
end
attr_reader :input, :scanner, :token
def text
@input.byteslice(@byte_start, @byte_end - @byte_start)
end
def scan
scanner.skip(SPACE)
@byte_start = scanner.pos
char = next_char
return false unless char
@token = next_token(char)
true
end
private
# rubocop:disable Metrics/CyclomaticComplexity
def next_token(char)
case char
when '_' then scan_keyword_or_identifier(possible_keyword: false)
when '.' then PERIOD
when '$' then scan_dollar_sign
when '`' then scan_quoted_indentifier('`')
when '"' then scan_quoted_indentifier('"')
when '[' then scan_quoted_indentifier(']')
when '(' then LPAREN
when ')' then RPAREN
when '/' then scan_bracketed_or_cql_comment
when '-' then scan_simple_comment
when "'" then scan_string_literal
when ALPHA then scan_keyword_or_identifier(possible_keyword: true)
when DIGIT then scan_numeric_literal
else OTHER
end
end
# rubocop:enable Metrics/CyclomaticComplexity
def next_char
char = @scanner.getch
@byte_end = @scanner.pos
char
end
# StringScanner#peek returns next byte which could be an incomplete utf
# multi-byte character
def peek_char(length = 1)
# The maximum byte count of utf chars is 4:
# > In UTF-8, characters from the U+0000..U+10FFFF range (the UTF-16
# accessible range) are encoded using sequences of 1 to 4 octets.
# # https://tools.ietf.org/html/rfc3629
return nil if length > 4
char = @scanner.peek(length)
return nil if char.empty?
return char if char.valid_encoding?
peek_char(length + 1)
end
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
def scan_keyword_or_identifier(possible_keyword:)
while (peek = peek_char)
if peek == '_' || peek == '$' || peek =~ DIGIT
possible_keyword = false
next next_char
end
next next_char if ALPHA.match?(peek)
break
end
return IDENT unless possible_keyword
snap = text
if snap.length < KEYWORD_MIN_LENGTH || snap.length > KEYWORD_MAX_LENGTH
return IDENT
end
keyword = KEYWORDS[snap.length].find { |kw| snap.upcase == kw.to_s }
return keyword if keyword
IDENT
end
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
# rubocop:disable Metrics/CyclomaticComplexity
def scan_dollar_sign
while (peek = peek_char)
case peek
when DIGIT
next_char while peek_char =~ DIGIT
when '$', '_', ALPHA, SPACE
# PostgreSQL supports dollar-quoted string literal syntax,
# like $foo$...$foo$. The tag (foo in this case) is optional,
# and if present follows identifier rules.
while (char = next_char)
case char
when '$'
# This marks the end of the initial $foo$.
snap = text
slice = input.slice(scanner.pos, input.length)
index = slice.index(snap)
next unless index && index >= 0
delta = index + snap.bytesize
@byte_end += delta
scanner.pos += delta
return STRING
when SPACE
# Unknown token starting with $, consume chars until space.
@byte_end -= char.bytesize
return OTHER
end
end
else break
end
end
OTHER
end
# rubocop:enable Metrics/CyclomaticComplexity
def scan_quoted_indentifier(delimiter)
while (char = next_char)
next unless char == delimiter
if delimiter == '"' && peek_char == delimiter
next next_char
end
break
end
# Remove quotes from identifier
@byte_start += char.bytesize
@byte_end -= char.bytesize
IDENT
end
def scan_bracketed_or_cql_comment
case peek_char
when '*' then scan_bracketed_comment
when '/' then scan_cql_comment
else OTHER
end
end
# rubocop:disable Metrics/CyclomaticComplexity
def scan_bracketed_comment
nesting = 1
while (char = next_char)
case char
when '/'
next unless peek_char == '*'
next_char
nesting += 1
when '*'
next unless peek_char == '/'
next_char
nesting -= 1
return COMMENT if nesting == 0
end
end
end
# rubocop:enable Metrics/CyclomaticComplexity
def scan_cql_comment
return OTHER unless peek_char == '/'
while (char = next_char)
break if char == "\n"
end
COMMENT
end
def scan_simple_comment
return OTHER unless peek_char == '-'
while (char = next_char)
break if char == "\n"
end
COMMENT
end
def scan_string_literal
delimiter = "'"
while (char = next_char)
if char == '\\'
# Skip escaped character, e.g. 'what\'s up?'
next_char
next
end
next unless char == delimiter
return STRING unless peek_char
return STRING if peek_char != delimiter
next_char
end
end
# rubocop:disable Metrics/CyclomaticComplexity
def scan_numeric_literal
period = false
exponent = false
while (peek = peek_char)
case peek
when DIGIT then next_char
when '.'
return NUMBER if period
next_char
period = true
when 'e', 'E'
return NUMBER if exponent
next_char
next_char if /[+-]/.match?(peek_char)
else break
end
end
NUMBER
end
# rubocop:enable Metrics/CyclomaticComplexity
end
end
end