in jena-arq/src/main/java/org/apache/jena/riot/tokens/TokenizerText.java [171:522]
/**
 * Reads one token from {@code reader}, starting at the next unread character.
 * <p>
 * A fresh {@link Token}, positioned at the current line and column, is stored
 * in the {@code token} field and then filled in. The first unread character
 * selects what is read:
 * <ul>
 * <li>less-than sign: an IRI, or the two/three character openers
 *     ({@code LT2}, {@code L_TRIPLE});</li>
 * <li>single or double quote: a string, optionally followed by a language tag
 *     or a datatype, giving {@code STRING}, {@code LITERAL_LANG} or
 *     {@code LITERAL_DT};</li>
 * <li>underscore: a blank node label when followed by a colon, otherwise
 *     {@code UNDERSCORE};</li>
 * <li>at-sign: a directive; question mark: a variable;</li>
 * <li>punctuation characters: the matching symbol token;</li>
 * <li>sign or digit: a number (a sign not followed by a digit or a dot is
 *     {@code PLUS} / {@code MINUS});</li>
 * <li>newline characters: a single {@code NL} token for the whole run;</li>
 * <li>anything else: a prefixed name or keyword.</li>
 * </ul>
 * Symbol tokens carry only a type; no image is set for them in this method.
 * <p>
 * The method calls itself once to read the datatype that follows {@code ^^}.
 *
 * @return the token that was read; for {@code LITERAL_LANG} and
 *         {@code LITERAL_DT} this is a new token wrapping the string token
 *         as sub-token 1
 * @throws InternalError if a leading digit did not produce a number
 */
private Token parseToken() {
    token = new Token(getLine(), getColumn());
    int ch = reader.peekChar();

    // ---- IRI, unless it's << or <<(
    // Every '<' is consumed and returned from this branch, so a lone '<' is
    // always treated as the start of an IRI; TokenType.LT is never produced
    // by this method.
    if ( ch == CH_LT ) {
        reader.readChar();
        int chPeek2 = reader.peekChar();
        if ( chPeek2 != '<' ) {
            // '<' not '<<'
            token.setImage(readIRI());
            token.setType(TokenType.IRI);
            if ( Checking )
                checkURI(token.getImage());
            return token;
        }
        reader.readChar();
        // '<<' so far - maybe '<<('
        int chPeek3 = reader.peekChar();
        if ( chPeek3 != '(' ) {
            token.setType(TokenType.LT2);
            return token;
        }
        // It is '<<('
        reader.readChar();
        token.setType(TokenType.L_TRIPLE);
        return token;
    }

    // ---- Literal
    if ( ch == CH_QUOTE1 || ch == CH_QUOTE2 ) {
        // The token type is STRING.
        // It is wrapped in a LITERAL_LANG or LITERAL_DT token below if a
        // language tag or datatype follows.
        token.setType(TokenType.STRING);
        reader.readChar();
        int ch2 = reader.peekChar();
        if ( ch2 == ch ) {
            reader.readChar(); // Read potential second quote.
            int ch3 = reader.peekChar();
            if ( ch3 == ch ) {
                // Three quotes: long string.
                reader.readChar();
                token.setImage(readStringQuote3(ch, false));
                StringType st = (ch == CH_QUOTE1) ? StringType.LONG_STRING1 : StringType.LONG_STRING2;
                token.setStringType(st);
            } else {
                // Two quotes then a non-quote: must be '' or "".
                // Nothing needs to be pushed back because the lexical form
                // is known to be the empty string.
                token.setImage("");
                StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2;
                token.setStringType(st);
            }
        } else {
            // One quote character.
            token.setImage(readStringQuote1(ch, ch));
            // Record exactly what form of STRING was seen.
            StringType st = (ch == CH_QUOTE1) ? StringType.STRING1 : StringType.STRING2;
            token.setStringType(st);
        }

        // White space after lexical part of a literal.
        skip();

        // Literal. Is it @ or ^^
        if ( reader.peekChar() == CH_AT ) {
            reader.readChar();
            // White space is not legal here.
            // The spec terminal is "LANGTAG" which includes the '@'.
            Token mainToken = new Token(token);
            mainToken.setType(TokenType.LITERAL_LANG);
            mainToken.setSubToken1(token);
            mainToken.setImage2(langTag());
            token = mainToken;
            if ( Checking )
                checkLiteralLang(token.getImage(), token.getImage2());
        } else if ( reader.peekChar() == '^' ) {
            expect("^^");
            // White space is legal after a ^^.
            // It's not a good idea, but it is legal.
            skip();

            // Stash the string token in locals: the recursive parseToken()
            // call below replaces the 'token' field.
            Token mainToken = new Token(token);
            mainToken.setSubToken1(token);
            mainToken.setImage(token.getImage());

            Token subToken = parseToken();
            if ( !subToken.isIRI() )
                fatal("Datatype URI required after ^^ - URI or prefixed name expected");

            mainToken.setSubToken2(subToken);
            mainToken.setType(TokenType.LITERAL_DT);

            token = mainToken;
            if ( Checking )
                checkLiteralDT(token.getImage(), subToken);
        } else {
            // Was a simple string.
            if ( Checking )
                checkString(token.getImage());
        }
        return token;
    }

    // ---- Blank node label "_:..." or a bare underscore.
    if ( ch == CH_UNDERSCORE ) {
        reader.readChar();
        int ch2 = reader.peekChar();
        if ( ch2 == CH_COLON ) {
            reader.readChar();
            token.setImage(readBlankNodeLabel());
            token.setType(TokenType.BNODE);
            if ( Checking )
                checkBlankNode(token.getImage());
            return token;
        }
        token.setType(TokenType.UNDERSCORE);
        return token;
    }

    // ---- A directive (not part of a literal as lang tag)
    if ( ch == CH_AT ) {
        reader.readChar();
        token.setType(TokenType.DIRECTIVE);
        token.setImage(readWord(false));
        if ( Checking )
            checkDirective(token.getImage());
        return token;
    }

    // ---- Variable
    if ( ch == CH_QMARK ) {
        reader.readChar();
        token.setType(TokenType.VAR);
        token.setImage(readVarName());
        if ( Checking )
            checkVariable(token.getImage());
        return token;
    }

    // ---- Symbols. Every case returns; there is no fall-through.
    // '<' (handled first), '_', '@' and '?' (handled above) have no case here.
    // ':' has no case either and drops through to the prefixed-name reader
    // at the end of the method.
    switch(ch)
    {
        // DOT can start a decimal.
        case CH_DOT: {
            reader.readChar();
            int chAfterDot = reader.peekChar();
            if ( range(chAfterDot, '0', '9') ) {
                // DOT DIGIT - it's a number.
                // Reload the DOT so that readNumber sees it.
                reader.pushbackChar(CH_DOT);
                // NOTE(review): CH_ZERO presumably means "no initial
                // character" to readNumber - confirm against its definition.
                boolean charactersConsumed = readNumber(CH_ZERO, false);
                if ( charactersConsumed ) {
                    if ( Checking )
                        checkNumber(token.getImage(), token.getImage2());
                    return token;
                }
                // NOTE(review): reaching here would leave the pushed-back DOT
                // unread; it looks unreachable because a digit follows the
                // DOT - confirm against readNumber.
            }
            // It's DOT.
            token.setType(TokenType.DOT);
            return token;
        }

        // '>' or '>>'
        case CH_GT: {
            reader.readChar();
            int chPeek = reader.peekChar();
            if ( chPeek == CH_GT ) {
                reader.readChar();
                token.setType(TokenType.GT2);
                return token;
            }
            token.setType(TokenType.GT);
            return token;
        }

        case CH_SEMICOLON:  reader.readChar(); token.setType(TokenType.SEMICOLON); return token;
        case CH_COMMA:      reader.readChar(); token.setType(TokenType.COMMA);     return token;

        // '{' or '{|' (RDF-star annotation syntax).
        case CH_LBRACE: {
            reader.readChar();
            int chPeek = reader.peekChar();
            if ( chPeek == CH_VBAR ) {
                reader.readChar();
                token.setType(TokenType.L_ANN);
                return token;
            }
            token.setType(TokenType.LBRACE);
            return token;
        }

        case CH_RBRACE:     reader.readChar(); token.setType(TokenType.RBRACE);    return token;
        case CH_LPAREN:     reader.readChar(); token.setType(TokenType.LPAREN);    return token;

        // Can be ')' or ')>>'
        case CH_RPAREN: {
            // The ')'
            reader.readChar();
            int peek2 = reader.peekChar();
            if ( peek2 != '>' ) {
                // Includes EOF.
                token.setType(TokenType.RPAREN);
                return token;
            }
            reader.readChar();
            int peek3 = reader.peekChar();
            if ( peek3 != '>' ) {
                // ')>' but not ')>>': give the '>' back and emit ')'.
                reader.pushbackChar(peek2);
                token.setType(TokenType.RPAREN);
                return token;
            }
            // It is ')>>'
            reader.readChar();
            token.setType(TokenType.R_TRIPLE);
            return token;
        }

        case CH_LBRACKET:   reader.readChar(); token.setType(TokenType.LBRACKET);  return token;
        case CH_RBRACKET:   reader.readChar(); token.setType(TokenType.RBRACKET);  return token;
        case CH_EQUALS:     reader.readChar(); token.setType(TokenType.EQUALS);    return token;
        case CH_SLASH:      reader.readChar(); token.setType(TokenType.SLASH);     return token;
        case CH_RSLASH:     reader.readChar(); token.setType(TokenType.RSLASH);    return token;

        // '|' or '|}' (RDF-star annotation syntax).
        case CH_VBAR: {
            reader.readChar();
            int chPeek = reader.peekChar();
            if ( chPeek == CH_RBRACE ) {
                reader.readChar();
                token.setType(TokenType.R_ANN);
                return token;
            }
            token.setType(TokenType.VBAR);
            return token;
        }

        case CH_AMPHERSAND: reader.readChar(); token.setType(TokenType.AMPERSAND); return token;
        case CH_STAR:       reader.readChar(); token.setType(TokenType.STAR);      return token;
        case CH_EMARK:      reader.readChar(); token.setType(TokenType.EMARK);     return token;
        case CH_TILDE:      reader.readChar(); token.setType(TokenType.TILDE);     return token;

        // Two-character operators (&&, ||, >=, <=) are not recognised as
        // single tokens here.
    }

    // ---- Numbers.
    // A plain "+" and "-", not followed by an unsigned number are symbols.
    /*
    [16]    integer         ::=     ('-' | '+') ? [0-9]+
    [17]    double          ::=     ('-' | '+') ? ( [0-9]+ '.' [0-9]* exponent | '.' ([0-9])+ exponent | ([0-9])+ exponent )
                                    0.e0, .0e0, 0e0
    [18]    decimal         ::=     ('-' | '+')? ( [0-9]+ '.' [0-9]* | '.' ([0-9])+ | ([0-9])+ )
                                    0.0 .0 0.
    [19]    exponent        ::=     [eE] ('-' | '+')? [0-9]+
    []      hex             ::=     0x / 0X prefix followed by hex digits (see readPossibleHex)
    */
    if ( ch == CH_PLUS || ch == CH_MINUS ) {
        // The symbol to fall back to when no number follows the sign.
        TokenType signType = (ch == CH_PLUS) ? TokenType.PLUS : TokenType.MINUS;
        reader.readChar();
        int ch2 = reader.peekChar();
        if ( !range(ch2, '0', '9') && ch2 != CH_DOT ) {
            // Not a number.
            token.setType(signType);
            return token;
        }
        // ch2 not consumed.
        // NOTE(review): unlike the DOT-led number above, no checkNumber call
        // is made on this path - confirm that is intentional.
        boolean charactersConsumed = readNumber(ch, false);
        if ( !charactersConsumed )
            token.setType(signType);
        return token;
    }

    if ( range(ch, '0', '9') ) {
        reader.readChar();
        if ( ch == '0' ) {
            // Is it "hex" -- 0x/0X ?
            boolean isHex = readPossibleHex();
            if ( isHex )
                return token;
        }
        // Not hex.
        boolean charactersConsumed = readNumber(ch, true);
        if ( !charactersConsumed ) {
            // Impossible.
            throw new InternalError("Seen digit but no number produced");
        }
        return token;
    }

    // ---- Newlines: any number of NL and CR become one "NL" token.
    // The characters are discarded; no image is collected.
    if ( isNewlineChar(ch) ) {
        do {
            reader.readChar();
        } while (isNewlineChar(reader.peekChar()));
        token.setType(TokenType.NL);
        return token;
    }

    // ---- Plain words and prefixes.
    // Can't start with a number due to numeric test above.
    // Can't start with a '_' due to blank node test above.
    // If we see a :, the first time it means a prefixed name else it's a token break.
    readPrefixedNameOrKeyword(token);
    if ( Checking )
        checkKeyword(token.getImage());
    return token;
}