loader/lib/Scanner.js (419 lines of code) (raw):
/*
Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights
reserved.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; version 2 of
the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
02110-1301 USA
*/
"use strict";
// This file contains:
// SQL Tokenizer for parsing control statement
// Data File Scanner
// Line Scanner
// Text Field Scanner
/* SQL Tokenizer
tokenize(source, operators)
IMMEDIATE
source: a source string
operators: a string consisting of all legal one-character operators
Returns an array of tokens.
Token types:
name = alphabetic { alpha + numeric + underscore } (SQL REGULAR IDENTIFIER)
number = [ '-' ] digit { digit | '.' } . (CONVERTED TO A JAVASCRIPT NUMBER)
variable = '@' { alpha + numeric + underscore } . (MYSQL "@" VARIABLE)
operator = valid single-character operator
string = text quoted in single, double, or backtick quotes
Ignores C-style block comments and SQL-style comments from "--" TO EOL
Stops scanning and returns token stream if it reaches the special token
BEGINDATA
*/
var assert = require("assert"),
udebug = unified_debug.getLogger("Scanner.js");
/* tokenize(source, operators)
   IMMEDIATE
   Splits a control-statement string into an array of Token objects.

   source:    the string to tokenize
   operators: optional string of legal one-character operators;
              defaults to "(),;:." when omitted or not a string

   Each delivered token has: type ('name', 'number', 'variable', 'operator',
   'string' or 'begindata'), value, line and column.

   Returns the token array.  Scanning stops early, with the BEGINDATA token
   as the last element, as soon as the name BEGINDATA has been read.

   Throws an Error for an unterminated comment or an unrecognized character,
   and an Error carrying a .token property for a bad number or bad string.
*/
function tokenize (source, operators) {
  var result = [];            // An array to hold the results
  var c;                      // The current character ('' at end of input)
  var i;                      // The index of the current character
  var v;                      // Intermediate value
  var tok;                    // Current token
  var q;                      // Quote character
  var line = 1, col = 1;      // Current line and column of input

  if(typeof operators !== 'string') {
    operators = "(),;:.";     // Default legal one-character operators
  }

  function peek() {           // Look ahead one character
    return source.charAt(i+1);
  }

  function advance(n) {       // Advance n characters (default 1)
    var amt = n || 1;
    if(i + amt >= source.length) {
      i = source.length;
      c = '';
    }
    else {
      i += amt;
      c = source.charAt(i);
    }
    /* A newline is reported as column 0 of the line that follows it,
       so the first character after it lands on column 1. */
    if(c === '\n') { line += 1; col = 0; }
    else { col += amt; }
  }

  function begin() {          // Begin tokenizing
    i = 0;
    c = source.charAt(i);
    /* Same convention as advance(): a leading newline opens line 2 */
    if(c === '\n') { line = 2; col = 0; }
  }

  function isAlpha() {
    return ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'));
  }

  function isAlpha_() {
    return (c === '_' || isAlpha());
  }

  function isNumeric() {
    return (c >= '0' && c <= '9');
  }

  function isInitialNumeric() {   // digit, or '-' directly followed by digit
    var p = peek();
    return (isNumeric() || (c === '-' && p >= '0' && p <= '9'));
  }

  function isNonInitialNumeric() {
    return (isNumeric() || (c === '.'));
  }

  function isAlphanumeric() {
    return (isAlpha_() || isNumeric());
  }

  /* Tokens */
  /* Creating a Token records the current position and then steps past
     the current character. */
  function Token(type, initialValue) {
    this.type = type;
    this.line = line;
    this.column = col;
    this.str = initialValue;
    advance();
  }

  Token.prototype.consume = function() {
    this.str += c;
    advance();
  };

  Token.prototype.deliver = function(value) {
    /* Test for undefined rather than falsiness, so that the number 0
       is delivered as a number and not replaced by the string "0". */
    this.value = (value === undefined) ? this.str : value;
    udebug.log("Token deliver", this.type, this.value);
    delete this.str;
    result.push(this);
  };

  Token.prototype.error = function(message) {
    var err = new Error(message);
    err.token = this;
    throw err;
  };

  /* Examine the text one character at a time. */
  begin();
  while (c) {
    tok = null;
    if (c <= ' ') {                                     /* IGNORE WHITESPACE */
      advance();
    }
    else if(isAlpha()) {                                            /* NAME */
      tok = new Token('name', c);
      while (isAlphanumeric()) {
        tok.consume();
      }
      tok.deliver();
      if(tok.value === "BEGINDATA") {
        tok.type = 'begindata';
        return result;
      }
    }
    else if (c === '@') {                                      /* @VARIABLE */
      tok = new Token('variable', '');
      while (isAlphanumeric()) {
        tok.consume();
      }
      tok.deliver();
    }
    else if (isInitialNumeric()) {                                /* NUMBER */
      tok = new Token('number', c);
      while(isNonInitialNumeric()) {
        tok.consume();
      }
      v = Number(tok.str);     // NaN for malformed text such as "1.2.3"
      if(isFinite(v)) { tok.deliver(v); }
      else { tok.error("bad number"); }
    }
    else if (c === '\'' || c === '"' || c === '`') {       /* QUOTED STRING */
      q = c;
      tok = new Token('string', '');
      while (c !== q) {                             /* until closing quote */
        /* Special cases: unterminated string, control character, escapes */
        if (c === '\n' || c === '\r' || c === '') {
          tok.error("Unterminated string.");
        }
        else if (c < ' ') {
          tok.error("Control character in string.");
        }
        else if (c === '\\') {                         /* escape sequence */
          advance();
          switch (c) {
            case '':
              tok.error("Unterminated string");
              break;
            case 'b':
              c = '\b'; break;
            case 'f':
              c = '\f'; break;
            case 'n':
              c = '\n'; break;
            case 'r':
              c = '\r'; break;
            case 't':
              c = '\t'; break;
            case 'u':
              v = source.substr(i + 1, 4);
              if (! /^[0-9a-fA-F]{4}$/.test(v)) {
                tok.error("Bad Unicode character sequence");
              }
              /* Move onto the last hex digit FIRST, then substitute the
                 decoded character; advance() overwrites c, so the other
                 order would lose the decoded character. */
              advance(4);
              c = String.fromCharCode(parseInt(v, 16));
              break;
          }
        }
        tok.consume();
      }
      advance();                            /* advance past closing quote */
      tok.deliver();
    }
    else if (c === '-' && peek() === '-') {     /* COMMENTS FROM -- TO EOL */
      advance(2);
      while(c !== '\n' && c !== '\r' && c !== '') {
        advance();
      }
    }
    else if (c === '/' && peek() === '*') {       // COMMENTS FROM /* TO */
      advance(2);
      /* Stop only on the two-character sequence that closes the comment */
      while(c && !(c === '*' && peek() === '/')) {
        advance();
      }
      if(c === '') { throw new Error("Unterminated comment"); }
      advance(2);
    }
    else if(operators.indexOf(c) >= 0) {     /* SINGLE-CHARACTER OPERATOR */
      tok = new Token('operator', c);
      tok.deliver();
    }
    else {
      v = "scanner error";
      if(result.length) { v += " after " + result[result.length - 1].value; }
      v += " at position " + i;
      throw new Error(v);
    }
  } /* end of while loop */
  return result;
}
////////////////////////////////////////////////////////////////
//
// Data File Scanners
//
/* From http://dev.mysql.com/doc/refman/5.5/en/load-data.html
The FIELD and LINE terminators can be strings
The escape sequences are:
\0 ASCII NULL, \b backspace, \n newline, \r return,
\t tab, \Z ASCII 26, \N SQL NULL
TODO: We currently only support a single character field separator.
*/
/* Scanner(source, start, options)
   A cursor over the string source, positioned at index start.
   options supplies the field and line conventions; only
   options.lineEndString is read here, the rest is kept as this.opt.
*/
function Scanner(source, start, options) {
  var firstChar, eol;

  this.source = source;
  this.i = start;
  firstChar = source.charAt(start);     // '' when start is past the end
  this.c = firstChar;
  this.opt = options;
  eol = options.lineEndString;
  this.EOL = eol;
  this.lineEndExtra = (eol.length > 1); // true for a multi-character EOL
  this.lineCount = 0;                   // newlines passed by advance()
}
/* advance(n): move the cursor forward n characters (n must be > 0).
   Every newline that is stepped off increments lineCount.
   On running off the end of the source, the index is parked on the last
   character, the current character becomes '', and the call returns.
*/
Scanner.prototype.advance = function(n) {
  assert(n > 0);
  var text = this.source;
  var lastIndex = text.length - 1;
  var remaining;

  for(remaining = n ; remaining > 0 ; remaining -= 1) {
    if(this.c == '\n') {
      this.lineCount += 1;
    }
    if(this.i >= lastIndex) {          /* the next step would leave the source */
      this.i = lastIndex;
      this.c = '';
      return;
    }
    this.i += 1;
    this.c = text.charAt(this.i);
  }
};
/* peek(chars): return the character chars positions ahead of the cursor
   without moving it; chars defaults to 1.  peek(0) is the character at
   the cursor itself.  Returns undefined when the position is outside
   the source.
*/
Scanner.prototype.peek = function(chars) {
  /* Only a missing argument selects the default: an explicit 0 is a
     valid offset and must not be turned into 1. */
  var n = (chars === undefined || chars === null) ? 1 : chars;
  return this.source[this.i + n];
};
/* isQuote(character): true if character is either the opening or the
   closing field quote.  When character is omitted (or falsy) the current
   character is tested instead.
*/
Scanner.prototype.isQuote = function(character) {
  var candidate = character ? character : this.c;
  var settings = this.opt;
  if(candidate === settings.fieldQuoteStart) {
    return true;
  }
  return (candidate === settings.fieldQuoteEnd);
};
/* isEsc(): true if the current character is the configured escape
   character (options.fieldQuoteEsc).
*/
Scanner.prototype.isEsc = function() {
  var escapeChar = this.opt.fieldQuoteEsc;
  return (escapeChar === this.c);
};
/* isWhitespace(): true if the current character may be skipped as
   whitespace.  Newline and carriage return always qualify.  Any other
   character at or below ' ' qualifies only when whitespace is not acting
   as the field separator (options.fieldSepOnWhitespace) and the
   character is not options.fieldSep itself.
*/
Scanner.prototype.isWhitespace = function() {
  var ch = this.c;
  switch(ch) {
    case '\n':
    case '\r':
      return true;          // line breaks are always skippable
  }
  return (! this.opt.fieldSepOnWhitespace) &&
         (ch <= ' ') &&
         (ch !== this.opt.fieldSep);
};
/* skipWhitespace(): step forward until the current character is not
   skippable whitespace, or the end of the source is reached.
*/
Scanner.prototype.skipWhitespace = function() {
  for(;;) {
    if(! this.c || ! this.isWhitespace()) {
      return;
    }
    this.advance(1);
  }
};
/* isStartQuote(): true if the current character is the opening field
   quote (options.fieldQuoteStart).
*/
Scanner.prototype.isStartQuote = function() {
  var openingQuote = this.opt.fieldQuoteStart;
  return (openingQuote === this.c);
};
/* isFieldSeparator(): true if the current character separates fields.
   That is the case when it equals options.fieldSep, or when
   options.fieldSepOnWhitespace is set and the character is at or below
   ' ' (and is not the empty end-of-input marker).
*/
Scanner.prototype.isFieldSeparator = function() {
  var ch = this.c;
  if(ch === this.opt.fieldSep) {
    return true;
  }
  /* The flag lives on the options object, not on the scanner.  The test is
     done on the character directly rather than through isWhitespace(),
     because isWhitespace() deliberately answers false for these characters
     whenever fieldSepOnWhitespace is set. */
  return Boolean(this.opt.fieldSepOnWhitespace) && ch !== '' && ch <= ' ';
};
/* handleQuotedString(doEval)
   Scan a quoted field.  The scanner must be positioned on the opening
   quote (asserted).  On return it is positioned ON the closing quote, or
   at the end of input if the string was never closed; the caller steps
   past the closing quote.

   doEval: if true, build and return the unquoted value; if false only the
           position is advanced and undefined is returned.  Evaluation is
           forced on when udebug.is_debug is set.

   Inside the quotes:
     - a doubled quote character stands for one quote character
     - the escape character followed by 0, b, n, r, t or Z yields the
       corresponding control character (as listed for the data file
       scanners); followed by anything else it yields that character
       itself.  \N is not interpreted as SQL NULL here.
*/
Scanner.prototype.handleQuotedString = function(doEval) {
  if(udebug.is_debug) { doEval = true; }
  var escapes = { '0': '\0', 'b': '\b', 'n': '\n', 'r': '\r', 't': '\t',
                  'Z': '\x1A' };
  var inquote, value, consume;

  if(doEval) {
    value = "";
    consume = function(ch) { value += ch; };
  } else {
    consume = function() {};
  }

  assert(this.c === this.opt.fieldQuoteStart);
  inquote = true;
  do {
    this.advance(1);
    if(this.isQuote() && (this.c === this.peek())) {  /**** Doubled quote */
      consume(this.c);                                /* CONSUME A QUOTE */
      /* Step onto the second quote only; the advance at the top of the
         loop then moves past it without skipping the next character. */
      this.advance(1);
    }
    else if(this.c === this.opt.fieldQuoteEnd) {      /**** Closing quote */
      inquote = false;                                /* TERMINATE LOOP */
    }
    else if(this.isEsc()) {                           /** Escape Sequence */
      this.advance(1);                                /* SKIP PAST ESCAPE CHAR */
      if(this.c) {
        /* Leave the cursor on the escaped character: the advance at the
           top of the loop moves past it. */
        consume(Object.prototype.hasOwnProperty.call(escapes, this.c) ?
                escapes[this.c] : this.c);
      }
    }
    else {                                            /* Normal character */
      consume(this.c);
    }
  } while(inquote && this.c);
  return value;
};
/* skip_C_comment(): if the scanner is positioned on the start of a C-style
   block comment, move it to the first character after the end of that
   comment.  Does nothing when not positioned on a comment start.
   Throws Error("Unterminated comment") if the input ends before the
   comment is closed.
*/
Scanner.prototype.skip_C_comment = function() {
  if(this.c !== '/' || this.peek() !== '*') {
    return;
  }
  this.advance(2);
  /* Stop only on the two-character closing sequence; a lone '*' or a lone
     '/' inside the comment must not end it. */
  while(this.c && !(this.c === '*' && this.peek() === '/')) {
    this.advance(1);
  }
  /* Check before stepping over the terminator, so that a comment which
     closes exactly at the end of the input is not reported as unterminated. */
  if(this.c === '') {
    throw new Error("Unterminated comment");
  }
  this.advance(2);
};
/* isEndOfLine(): true if the line terminator (this.EOL) begins at the
   scanner's current index.
*/
Scanner.prototype.isEndOfLine = function() {
  /* Compare a slice of the source as long as the terminator itself.  A
     multi-character terminator can never equal the single current
     character, so the slice comparison is used for every terminator length. */
  return (this.source.substr(this.i, this.EOL.length) === this.EOL);
};
/* skipToEndOfLine(): move forward to just past the next line terminator.
   If the input ends first, the scanner is left at the end of input.
*/
Scanner.prototype.skipToEndOfLine = function() {
  for( ; this.c ; this.advance(1)) {
    if(this.isEndOfLine()) {
      this.advance(this.EOL.length);     /* step over the terminator itself */
      return;
    }
  }
};
/* isInlineComment(): true if the comment marker (options.commentStart)
   begins at the scanner's current position.  Always false when no marker
   is configured or when the scanner is at the end of input.
*/
Scanner.prototype.isInlineComment = function() {
  var marker = this.opt.commentStart;
  if(! marker) {
    return false;
  }
  /* At end of input the index is parked on the last character, which has
     already been consumed; it must not be matched again. */
  if(! this.c) {
    return false;
  }
  /* Compare starting AT the current index.  (Going through peek(0) does
     not work: peek() treats an offset of 0 as "not given" and looks one
     character ahead.) */
  return (this.source.substr(this.i, marker.length) === marker);
};
/* skipLinePrefix(): if a line prefix (options.lineStartString) is
   configured and occurs at or after the current position, move the
   scanner to just past it and then skip any whitespace that follows.
   The scanner is left untouched when no prefix is configured or the
   prefix is not found.
*/
Scanner.prototype.skipLinePrefix = function() {
  var prefix = this.opt.lineStartString;
  var idx;
  if(prefix && prefix.length) {
    idx = this.source.indexOf(prefix, this.i);
    /* indexOf reports "not found" as -1; index 0 is a genuine match
       (a prefix at the very start of the source). */
    if(idx >= 0) {
      this.i = idx + prefix.length;
      this.c = this.source.charAt(this.i);
      this.skipWhitespace();
    }
  }
};
/* isAtEnd(): true once the scanner's index is on or beyond the last
   character of the source.
*/
Scanner.prototype.isAtEnd = function() {
  var lastIndex = this.source.length - 1;
  return (this.i >= lastIndex);
};
/* getValueForDelimitedField(): read one field starting at the current
   position and return its text.
   A field that starts with the opening quote is read by
   handleQuotedString(); the scanner is then moved past the closing quote.
   Any other field runs up to (not including) the next field separator,
   line terminator, or the end of input.
*/
Scanner.prototype.getValueForDelimitedField = function() {
  var value;
  if(this.isStartQuote()) {
    value = this.handleQuotedString(true);
    /* handleQuotedString() stops ON the closing quote.  Step over it, or
       the caller would see that quote as the start of another field. */
    if(this.c && this.c === this.opt.fieldQuoteEnd) {
      this.advance(1);
    }
  } else {
    value = "";
    /* this.c is '' at end of input; without that test a final line with
       no terminator would keep this loop running forever. */
    while(this.c && ! ( this.isFieldSeparator() || this.isEndOfLine())) {
      /* TODO: Handle \N for null; also handle other escape sequences? */
      value += this.c;
      this.advance(1);
    }
  }
  return value;
};
/* getValueForFixedWidthField(column): return the text of the source
   between column.startPos (inclusive) and column.endPos (exclusive).
   The scanner position is neither used nor changed.
*/
Scanner.prototype.getValueForFixedWidthField = function(column) {
  var from = column.startPos;
  var to = column.endPos;
  return this.source.substring(from, to);
};
// Line Scanner
/* LineScanner(options)
   Keeps the options object that skipPhysicalLines() and scan() hand to
   every Scanner they create.
*/
function LineScanner(options) {
  var self = this;
  self.options = options;
}
/*
Skip a fixed number of physical lines
*/
/* skipPhysicalLines(spec, n)
   Starting at spec.lineStart in spec.source, skip up to n line
   terminators, stopping early at the end of the source.
   Sets spec.lineEnd to the resulting position and spec.atEnd to whether
   the end of the source was reached.
*/
LineScanner.prototype.skipPhysicalLines = function(spec, n) {
  var cursor = new Scanner(spec.source, spec.lineStart, this.options);
  var remaining;
  for(remaining = n ; remaining > 0 && ! cursor.isAtEnd() ; remaining--) {
    cursor.skipToEndOfLine();
  }
  spec.atEnd = cursor.isAtEnd();
  spec.lineEnd = cursor.i;
};
/* scan():
Start in string spec.source at position spec.lineStart.
Skip over whitespace and comments.
If the buffer contains the end of the record, set spec.lineEnd.
If the whole buffer has been read, set spec.atEnd.
Return the number of newline characters scanned.
*/
/* scan(spec)
   Locate one logical line in spec.source, beginning at spec.lineStart.

   Phase 1 skips whitespace and inline comments, then any line prefix, and
   stores the resulting position back into spec.lineStart.
   Phase 2 walks forward to the line terminator, stepping over quoted
   strings as a unit, and sets spec.lineHasFields when a field separator
   is seen.

   On return:
     spec.lineEnd is set only if a line terminator was reached
     spec.atEnd tells whether the end of the source was reached
   Returns the number of newline characters the scanner stepped over.
*/
LineScanner.prototype.scan = function(spec) {
  var cursor = new Scanner(spec.source, spec.lineStart, this.options);
  var before, reachedEOL;

  /* Phase 1: find the start of the line.  Repeat for as long as a pass
     moves the cursor. */
  do {
    before = cursor.i;
    cursor.skipWhitespace();
    if(cursor.isInlineComment()) {
      cursor.skipToEndOfLine();
    }
  } while(cursor.i > before);

  cursor.skipLinePrefix();     // brings us to the end of prefix (if any)
  spec.lineStart = cursor.i;
  spec.lineHasFields = false;

  /* Phase 2: find the end of the line */
  for(;;) {
    if(cursor.isFieldSeparator()) {
      if(udebug.is_debug && (! spec.lineHasFields)) {
        udebug.log("lineHasFields - true at pos:", cursor.i);
      }
      spec.lineHasFields = true;
    } else if(cursor.isStartQuote()) {
      /* leaves the cursor on the closing quote; the advance below
         steps past it */
      cursor.handleQuotedString(false);
    }
    cursor.advance(1);
    reachedEOL = cursor.isEndOfLine();
    if(! cursor.c || reachedEOL) {
      break;
    }
  }

  spec.atEnd = cursor.isAtEnd();
  if(reachedEOL) {
    spec.lineEnd = cursor.i;
  }
  udebug.log("LineScanner.scan returning", cursor.lineCount);
  return cursor.lineCount;
};
// Text Field Scanner
/* TextFieldScanner(options)
   Keeps the options object that scan() hands to the Scanner it creates.
*/
function TextFieldScanner(options) {
  var self = this;
  self.options = options;
}
/* scan(spec)
   Split one line of delimited text into field values.
   Reading starts at spec.lineStart in spec.source and stops at the line
   terminator or at the end of the source, whichever comes first.
   Field separators are stepped over; everything else is read with
   getValueForDelimitedField().
   Returns an array of the field values, in order.
*/
TextFieldScanner.prototype.scan = function(spec) {
  var scanner = new Scanner(spec.source, spec.lineStart, this.options);
  var result = [];

  /* scanner.c is '' at end of input.  Testing it stops the loop for a
     final line that has no terminator. */
  while(scanner.c && ! scanner.isEndOfLine()) {
    if(scanner.isFieldSeparator()) {
      scanner.advance(1);
    } else {
      result.push(scanner.getValueForDelimitedField());
    }
  }
  udebug.log("TextFieldScanner.scan:", result.length, "fields");
  return result;
};
/* testFileType investigates a JSON file.
We allow C and C++ style comments.
If it starts with "[ [" or "[ {" it's a JSON array.
If it starts with "{" it's line-delimited JSON.
RETURNS: 2 for JSON Array. 1 for line-delimited JSON. 0 for Not JSON.
*/
/* testFileType(source)
   Classify the start of a (possibly) JSON text.  Whitespace and C / C++
   style comments before and between the leading brackets are ignored.

   source: the text to examine
   RETURNS: 2 if the first two significant characters are "[" followed by
              "[" or "{"  (JSON array)
            1 if the text otherwise starts with "[" or "{"
              (line-delimited JSON)
            0 if it starts with anything else, or is empty  (not JSON)
*/
function testFileType(source) {
  var tokens = [];
  var i = 0;
  var c;

  /* Move i past any run of whitespace, block comments and line comments.
     An unterminated comment runs to the end of the text. */
  function skipBlanksAndComments() {
    var moved, end;
    do {
      moved = false;
      while(i < source.length && source.charAt(i) <= ' ') {
        i++;
        moved = true;
      }
      if(source.charAt(i) === '/' && source.charAt(i + 1) === '*') {
        end = source.indexOf('*/', i + 2);
        i = (end === -1) ? source.length : end + 2;
        moved = true;
      } else if(source.charAt(i) === '/' && source.charAt(i + 1) === '/') {
        end = source.indexOf('\n', i + 2);
        i = (end === -1) ? source.length : end + 1;
        moved = true;
      }
    } while(moved);
  }

  /* Collect at most two leading bracket characters */
  while(tokens.length < 2) {
    skipBlanksAndComments();
    c = source.charAt(i);
    if(c !== '[' && c !== '{') {
      break;
    }
    tokens.push(c);
    i++;
  }

  if(tokens[0] === "[" && tokens.length === 2) { return 2; }  // JSON Array
  if(tokens.length > 0) { return 1; }                         // JSON
  return 0;                                                   // Not JSON
}
// Names made available to other modules.  Scanner and testFileType are
// not among the names exported here.
exports.tokenize = tokenize;
exports.LineScanner = LineScanner;
exports.TextFieldScanner = TextFieldScanner;