public List parseRecord()

in odps-sqoop/src/java/org/apache/sqoop/lib/RecordParser.java [234:414]


  /**
   * Splits the characters remaining in {@code input} into the fields of a
   * single record, honoring the configured field delimiter, record
   * delimiter, enclosing character and escape character.
   *
   * @param input the record text; characters are consumed from its current
   *     position.
   * @return the parsed fields. This is the parser's own {@code outputs}
   *     list, which is cleared at the start of every call.
   * @throws com.cloudera.sqoop.lib.RecordParser.ParseError if {@code input}
   *     is null, if enclosing is required and a field begins with an
   *     ordinary character, or if a closing encloser is followed by anything
   *     other than a field or record delimiter.
   */
  public List<String> parseRecord(CharBuffer input)
      throws com.cloudera.sqoop.lib.RecordParser.ParseError {
    if (input == null) {
      throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
          "null input string");
    }

    /*
      Parsing is a character-at-a-time state machine. The special characters
      are not required to be distinct from one another; within a state the
      tests are made in the order listed and the first match wins.

      FIELD_START (initial state, entered at the beginning of each field)
        encloser         : go to ENCLOSED_FIELD
        escape           : go to UNENCLOSED_ESCAPE
        field delimiter  : zero-length field, stay in FIELD_START
        record delimiter : zero-length field, stop scanning
        anything else    : error if enclosing is required; otherwise append
                           and go to UNENCLOSED_FIELD

      ENCLOSED_FIELD
        escape           : go to ENCLOSED_ESCAPE
        encloser         : go to ENCLOSED_EXPECT_DELIMITER
        anything else    : append (field/record delimiters included)

      ENCLOSED_ESCAPE
        any character    : append literally, back to ENCLOSED_FIELD

      ENCLOSED_EXPECT_DELIMITER
        field delimiter  : go to FIELD_START
        record delimiter : stop scanning
        anything else    : error

      UNENCLOSED_FIELD
        escape           : go to UNENCLOSED_ESCAPE
        field delimiter  : go to FIELD_START
        record delimiter : stop scanning
        anything else    : append (the encloser included)

      UNENCLOSED_ESCAPE
        any character    : append literally, back to UNENCLOSED_FIELD
    */

    char lastRead = com.cloudera.sqoop.lib.DelimiterSet.NULL_CHAR;
    ParseState state = ParseState.FIELD_START;
    final int charCount = input.length();
    StringBuilder field = null; // stays null until the first character is seen.

    outputs.clear();

    final char encloser = delimiters.getEnclosedBy();
    final char fieldSep = delimiters.getFieldsTerminatedBy();
    final char recordSep = delimiters.getLinesTerminatedBy();
    final char escape = delimiters.getEscapedBy();
    final boolean mustEnclose = delimiters.isEncloseRequired();

    scan:
    for (int pos = 0; pos < charCount; pos++) {
      lastRead = input.get();

      switch (state) {
      case FIELD_START:
        // A non-null buffer here holds the field that just ended; record it
        // before starting on the next one.
        if (field != null) {
          outputs.add(field.toString());
        }
        field = new StringBuilder();

        if (lastRead == encloser) {
          state = ParseState.ENCLOSED_FIELD;
        } else if (lastRead == escape) {
          state = ParseState.UNENCLOSED_ESCAPE;
        } else if (lastRead == fieldSep) {
          // Zero-length field. Remain in FIELD_START so the empty buffer is
          // recorded when the next character (or end of input) is handled.
          break;
        } else if (lastRead == recordSep) {
          // Zero-length final field; the record is complete.
          break scan;
        } else {
          if (mustEnclose) {
            throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
                "Opening field-encloser expected at position " + pos);
          }
          field.append(lastRead);
          state = ParseState.UNENCLOSED_FIELD;
        }
        break;

      case UNENCLOSED_FIELD:
        if (lastRead == escape) {
          state = ParseState.UNENCLOSED_ESCAPE;
        } else if (lastRead == fieldSep) {
          state = ParseState.FIELD_START;
        } else if (lastRead == recordSep) {
          break scan;
        } else {
          field.append(lastRead);
        }
        break;

      case UNENCLOSED_ESCAPE:
        // Whatever follows an escape is taken literally.
        field.append(lastRead);
        state = ParseState.UNENCLOSED_FIELD;
        break;

      case ENCLOSED_FIELD:
        if (lastRead == escape) {
          state = ParseState.ENCLOSED_ESCAPE;
        } else if (lastRead == encloser) {
          // Closing encloser: only a delimiter may legally follow.
          state = ParseState.ENCLOSED_EXPECT_DELIMITER;
        } else {
          // Ordinary text, or a delimiter protected by the enclosure.
          field.append(lastRead);
        }
        break;

      case ENCLOSED_ESCAPE:
        // Whatever follows an escape is taken literally.
        field.append(lastRead);
        state = ParseState.ENCLOSED_FIELD;
        break;

      case ENCLOSED_EXPECT_DELIMITER:
        if (lastRead == fieldSep) {
          state = ParseState.FIELD_START;
        } else if (lastRead == recordSep) {
          break scan;
        } else {
          throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
              "Expected delimiter at position " + pos);
        }
        break;

      default:
        throw new com.cloudera.sqoop.lib.RecordParser.ParseError(
            "Unexpected parser state: " + state);
      }
    }

    if (field != null) {
      // Flush the field that was in progress when scanning stopped.
      outputs.add(field.toString());

      // If the last character read was a field delimiter that left us in
      // FIELD_START, one more (empty) field follows it; there is no further
      // character to trigger it, so it is added here.
      if (state == ParseState.FIELD_START && lastRead == fieldSep) {
        outputs.add("");
      }
    }

    return outputs;
  }