in odps-sqoop/src/java/org/apache/sqoop/lib/RecordParser.java [234:414]
/**
 * Splits one delimited record into its fields.
 *
 * <p>Characters are pulled from {@code input} with relative {@code get()}
 * calls, so the buffer's position advances while the record is scanned.
 * Scanning ends once the characters that were remaining on entry are used
 * up, or earlier when a record delimiter is read outside of an enclosure
 * and outside of an escape sequence.</p>
 *
 * @param input the characters of the record; must not be null.
 * @return the {@code outputs} list, which is cleared at the start of every
 *     call and then filled with the fields of this record.
 * @throws com.cloudera.sqoop.lib.RecordParser.ParseError if {@code input}
 *     is null, if enclosing is required and a field opens with an ordinary
 *     character, if a closing encloser is followed by something other than
 *     a field or record delimiter, or if the parser reaches an unknown
 *     state.
 */
public List<String> parseRecord(CharBuffer input)
    throws com.cloudera.sqoop.lib.RecordParser.ParseError {
  if (input == null) {
    throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
        "null input string");
  }

  /*
   * The record is scanned by a small state machine. The special characters
   * (encloser, escape, field delimiter, record delimiter) are not required
   * to be distinct; within each state the tests are made in the order
   * listed here, and the first match wins.
   *
   * FIELD_START (initial state):
   *   encloser          : go to ENCLOSED_FIELD
   *   escape            : go to UNENCLOSED_ESCAPE
   *   field delimiter   : empty field, stay in FIELD_START
   *   record delimiter  : stop scanning
   *   anything else     : append it, go to UNENCLOSED_FIELD
   *
   * ENCLOSED_FIELD:
   *   escape            : go to ENCLOSED_ESCAPE
   *   encloser          : go to ENCLOSED_EXPECT_DELIMITER
   *   anything else     : append it (field/record delimiters included)
   *
   * ENCLOSED_ESCAPE:
   *   any character     : append it literally, back to ENCLOSED_FIELD
   *
   * ENCLOSED_EXPECT_DELIMITER:
   *   field delimiter   : go to FIELD_START
   *   record delimiter  : stop scanning
   *   anything else     : error
   *
   * UNENCLOSED_FIELD:
   *   escape            : go to UNENCLOSED_ESCAPE
   *   field delimiter   : go to FIELD_START
   *   record delimiter  : stop scanning
   *   anything else     : append it (the encloser included)
   *
   * UNENCLOSED_ESCAPE:
   *   any character     : append it literally, back to UNENCLOSED_FIELD
   */
  char current = com.cloudera.sqoop.lib.DelimiterSet.NULL_CHAR;
  ParseState mode = ParseState.FIELD_START;
  final int available = input.length();
  StringBuilder field = null;
  outputs.clear();

  final char encloser = delimiters.getEnclosedBy();
  final char fieldSep = delimiters.getFieldsTerminatedBy();
  final char recordSep = delimiters.getLinesTerminatedBy();
  final char escape = delimiters.getEscapedBy();
  final boolean mustEnclose = delimiters.isEncloseRequired();

  scan:
  for (int pos = 0; pos < available; pos++) {
    current = input.get();

    switch (mode) {
    case ENCLOSED_ESCAPE:
      // Escaped character inside an enclosure: keep it verbatim.
      field.append(current);
      mode = ParseState.ENCLOSED_FIELD;
      break;

    case UNENCLOSED_ESCAPE:
      // Escaped character outside an enclosure: keep it verbatim.
      field.append(current);
      mode = ParseState.UNENCLOSED_FIELD;
      break;

    case ENCLOSED_FIELD:
      if (current == escape) {
        mode = ParseState.ENCLOSED_ESCAPE;
      } else if (current == encloser) {
        // Closing encloser; only a delimiter may follow it.
        mode = ParseState.ENCLOSED_EXPECT_DELIMITER;
      } else {
        // Ordinary text, or a delimiter protected by the enclosure.
        field.append(current);
      }
      break;

    case ENCLOSED_EXPECT_DELIMITER:
      if (current == fieldSep) {
        // The end of this field is the start of the next one.
        mode = ParseState.FIELD_START;
      } else if (current == recordSep) {
        break scan;
      } else {
        throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
            "Expected delimiter at position " + pos);
      }
      break;

    case UNENCLOSED_FIELD:
      if (current == escape) {
        mode = ParseState.UNENCLOSED_ESCAPE;
      } else if (current == fieldSep) {
        // Field finished; the next character may open another one.
        mode = ParseState.FIELD_START;
      } else if (current == recordSep) {
        break scan;
      } else {
        field.append(current);
      }
      break;

    case FIELD_START:
      // Flush the field completed on an earlier iteration, if any, and
      // begin collecting a new one.
      if (field != null) {
        outputs.add(field.toString());
      }
      field = new StringBuilder();

      if (current == encloser) {
        mode = ParseState.ENCLOSED_FIELD;
      } else if (current == escape) {
        mode = ParseState.UNENCLOSED_ESCAPE;
      } else if (current != fieldSep) {
        // A field delimiter here means a zero-length field: nothing to do,
        // the parser simply stays in FIELD_START. Everything else is
        // handled below, record delimiter first.
        if (current == recordSep) {
          // Zero-length final field; it is flushed after the loop.
          break scan;
        }
        if (mustEnclose) {
          throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
              "Opening field-encloser expected at position " + pos);
        }
        field.append(current);
        mode = ParseState.UNENCLOSED_FIELD;
      }
      break;

    default:
      throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
          "Unexpected parser state: " + mode);
    }
  }

  // Flush whatever field was in progress when scanning stopped. If the last
  // character read was a field delimiter that left the parser in
  // FIELD_START, that delimiter also introduces one more, empty, field
  // which never got its own loop iteration.
  if (field != null) {
    outputs.add(field.toString());
    if (mode == ParseState.FIELD_START && current == fieldSep) {
      outputs.add("");
    }
  }

  return outputs;
}