int tokenize_bytes()

in amplify/backend/function/iamxawswrangler/lib/python/pandas/_libs/src/parser/tokenizer.c [710:1125]


/*
 * Run the tokenizer state machine over the unread part of the input buffer,
 * i.e. bytes self->data[self->datapos .. self->datalen).
 *
 * Each byte is dispatched on self->state; the state persists in `self`
 * between calls, so a record may span several calls.
 *
 * Parameters:
 *   self        - parser state (input buffer, output stream, current state).
 *   line_limit  - when > 0, tokenizing stops once self->lines reaches
 *                 start_lines + line_limit (checked explicitly in the
 *                 EAT_CRNL branch below; NOTE(review): the END_LINE* macros
 *                 presumably perform the same check - confirm against their
 *                 definitions).
 *   start_lines - value of self->lines that line_limit is counted from.
 *
 * Returns 0 on success (including the line-limit exit) and -1 on failure.
 * On the out-of-memory path self->error_msg is set to a freshly malloc'ed
 * message, or to NULL if that allocation fails as well.
 *
 * NOTE(review): the locals `stream`, `slen`, `line_terminator`,
 * `carriage_symbol`, `comment_symbol` and `escape_symbol` are not referenced
 * directly in every branch; they appear to be consumed by the PUSH_CHAR /
 * END_FIELD / END_LINE / IS_* macros defined elsewhere in this file, so they
 * must keep these exact names.
 */
int tokenize_bytes(parser_t *self,
                   size_t line_limit, uint64_t start_lines) {
    int64_t i;
    uint64_t slen;
    int should_skip;
    char c;
    char *stream;
    char *buf = self->data + self->datapos;

    const char line_terminator = (self->lineterminator == '\0') ?
            '\n' : self->lineterminator;

    // 1000 is something that couldn't fit in "char"
    // thus comparing a char to it would always be "false"
    const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
    const int comment_symbol = (self->commentchar != '\0') ?
            self->commentchar : 1000;
    const int escape_symbol = (self->escapechar != '\0') ?
            self->escapechar : 1000;

    if (make_stream_space(self, self->datalen - self->datapos) < 0) {
        const size_t bufsize = 100;
        self->error_msg = malloc(bufsize);
        // We are already out of memory, so this allocation may fail too;
        // writing through a NULL pointer would be undefined behavior.
        if (self->error_msg != NULL) {
            snprintf(self->error_msg, bufsize, "out of memory");
        }
        return -1;
    }

    stream = self->stream + self->stream_len;
    slen = self->stream_len;

    TRACE(("%s\n", buf));

    // Only at the very start of the input (no file lines consumed yet).
    if (self->file_lines == 0) {
        CHECK_FOR_BOM();
    }

    for (i = self->datapos; i < self->datalen; ++i) {
        // next character in file
        c = *buf++;

        TRACE(
            ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
             "state %d\n",
             i, c, self->file_lines + 1, self->line_fields[self->lines],
             self->state));

        switch (self->state) {
            // The *_IN_SKIP_LINE states track field/quote structure of a
            // line that is being skipped; no characters are pushed.
            case START_FIELD_IN_SKIP_LINE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_QUOTE(c)) {
                    self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                } else if (IS_DELIMITER(c)) {
                    // Do nothing, we're starting a new field again.
                } else {
                    self->state = IN_FIELD_IN_SKIP_LINE;
                }
                break;

            case IN_FIELD_IN_SKIP_LINE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_DELIMITER(c)) {
                    self->state = START_FIELD_IN_SKIP_LINE;
                }
                break;

            case IN_QUOTED_FIELD_IN_SKIP_LINE:
                // Terminators inside quotes do not end the skipped line.
                if (IS_QUOTE(c)) {
                    if (self->doublequote) {
                        self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
                    } else {
                        self->state = IN_FIELD_IN_SKIP_LINE;
                    }
                }
                break;

            case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
                if (IS_QUOTE(c)) {
                    self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                } else if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_DELIMITER(c)) {
                    self->state = START_FIELD_IN_SKIP_LINE;
                } else {
                    self->state = IN_FIELD_IN_SKIP_LINE;
                }
                break;

            case WHITESPACE_LINE:
                // Only blanks seen on this line so far. A terminator here
                // bumps file_lines without invoking END_LINE.
                if (IS_TERMINATOR(c)) {
                    self->file_lines++;
                    self->state = START_RECORD;
                    break;
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                    break;
                } else if (!self->delim_whitespace) {
                    // Cast: passing a negative char to <ctype.h> is undefined.
                    if (isblank((unsigned char)c) && c != self->delimiter) {
                    } else {  // backtrack
                        // The line is not blank after all: rewind to the
                        // start of the line (or of this chunk) and re-read
                        // it as regular fields.
                        // use i + 1 because buf has been incremented but not i
                        do {
                            --buf;
                            --i;
                        } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));

                        // reached a newline rather than the beginning
                        if (IS_TERMINATOR(*buf)) {
                            ++buf;  // move pointer to first char after newline
                            ++i;
                        }
                        self->state = START_FIELD;
                    }
                    break;
                }
            // fall through

            case EAT_WHITESPACE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                    self->state = START_RECORD;
                    break;
                } else if (IS_CARRIAGE(c)) {
                    self->state = EAT_CRNL;
                    break;
                } else if (IS_COMMENT_CHAR(c)) {
                    self->state = EAT_COMMENT;
                    break;
                } else if (!isblank((unsigned char)c)) {
                    self->state = START_FIELD;
                    // fall through to subsequent state
                } else {
                    // if whitespace char, keep slurping
                    break;
                }
            // fall through

            case START_RECORD:
                // start of record
                should_skip = skip_this_line(self, self->file_lines);

                if (should_skip == -1) {
                    goto parsingerror;
                } else if (should_skip) {
                    if (IS_QUOTE(c)) {
                        self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                    } else {
                        self->state = IN_FIELD_IN_SKIP_LINE;

                        if (IS_TERMINATOR(c)) {
                            END_LINE();
                        }
                    }
                    break;
                } else if (IS_TERMINATOR(c)) {
                    // \n\r possible?
                    if (self->skip_empty_lines) {
                        self->file_lines++;
                    } else {
                        END_LINE();
                    }
                    break;
                } else if (IS_CARRIAGE(c)) {
                    if (self->skip_empty_lines) {
                        self->file_lines++;
                        self->state = EAT_CRNL_NOP;
                    } else {
                        self->state = EAT_CRNL;
                    }
                    break;
                } else if (IS_COMMENT_CHAR(c)) {
                    self->state = EAT_LINE_COMMENT;
                    break;
                } else if (isblank((unsigned char)c)) {
                    if (self->delim_whitespace) {
                        if (self->skip_empty_lines) {
                            self->state = WHITESPACE_LINE;
                        } else {
                            self->state = EAT_WHITESPACE;
                        }
                        break;
                    } else if (c != self->delimiter && self->skip_empty_lines) {
                        self->state = WHITESPACE_LINE;
                        break;
                    }
                    // fall through
                }

                // normal character - fall through
                // to handle as START_FIELD
                self->state = START_FIELD;
            // fall through

            case START_FIELD:
                // expecting field
                if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else if (IS_QUOTE(c)) {
                    // start quoted field
                    self->state = IN_QUOTED_FIELD;
                } else if (IS_ESCAPE_CHAR(c)) {
                    // possible escaped character
                    self->state = ESCAPED_CHAR;
                } else if (IS_SKIPPABLE_SPACE(c)) {
                    // ignore space at start of field
                } else if (IS_DELIMITER(c)) {
                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        // save empty field
                        END_FIELD();
                    }
                } else if (IS_COMMENT_CHAR(c)) {
                    END_FIELD();
                    self->state = EAT_COMMENT;
                } else {
                    // begin new unquoted field
                    PUSH_CHAR(c);
                    self->state = IN_FIELD;
                }
                break;

            case ESCAPED_CHAR:
                // character after an escape is stored verbatim
                PUSH_CHAR(c);
                self->state = IN_FIELD;
                break;

            case EAT_LINE_COMMENT:
                // whole-line comment: counted in file_lines, END_LINE is
                // not invoked
                if (IS_TERMINATOR(c)) {
                    self->file_lines++;
                    self->state = START_RECORD;
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                }
                break;

            case IN_FIELD:
                // in unquoted field
                if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else if (IS_ESCAPE_CHAR(c)) {
                    // possible escaped character
                    self->state = ESCAPED_CHAR;
                } else if (IS_DELIMITER(c)) {
                    // end of field - end of line not reached yet
                    END_FIELD();

                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        self->state = START_FIELD;
                    }
                } else if (IS_COMMENT_CHAR(c)) {
                    END_FIELD();
                    self->state = EAT_COMMENT;
                } else {
                    // normal character - save in field
                    PUSH_CHAR(c);
                }
                break;

            case IN_QUOTED_FIELD:
                // in quoted field
                if (IS_ESCAPE_CHAR(c)) {
                    // possible escape character
                    self->state = ESCAPE_IN_QUOTED_FIELD;
                } else if (IS_QUOTE(c)) {
                    if (self->doublequote) {
                        // double quote - " represented by ""
                        self->state = QUOTE_IN_QUOTED_FIELD;
                    } else {
                        // end of quote part of field
                        self->state = IN_FIELD;
                    }
                } else {
                    // normal character - save in field
                    PUSH_CHAR(c);
                }
                break;

            case ESCAPE_IN_QUOTED_FIELD:
                // escaped character inside quotes is stored verbatim
                PUSH_CHAR(c);
                self->state = IN_QUOTED_FIELD;
                break;

            case QUOTE_IN_QUOTED_FIELD:
                // double quote - seen a quote in a quoted field
                if (IS_QUOTE(c)) {
                    // save "" as "

                    PUSH_CHAR(c);
                    self->state = IN_QUOTED_FIELD;
                } else if (IS_DELIMITER(c)) {
                    // end of field - end of line not reached yet
                    END_FIELD();

                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        self->state = START_FIELD;
                    }
                } else if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else {
                    // text after the closing quote continues the field
                    PUSH_CHAR(c);
                    self->state = IN_FIELD;
                }
                break;

            case EAT_COMMENT:
                // trailing comment: discard everything up to end of line
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->state = EAT_CRNL;
                }
                break;

            // only occurs with non-custom line terminator,
            // which is why we directly check for '\n'
            case EAT_CRNL:
                if (c == '\n') {
                    END_LINE();
                } else if (IS_DELIMITER(c)) {
                    if (self->delim_whitespace) {
                        END_LINE_STATE(EAT_WHITESPACE);
                    } else {
                        // Handle \r-delimited files
                        END_LINE_AND_FIELD_STATE(START_FIELD);
                    }
                } else {
                    if (self->delim_whitespace) {
                        /* XXX
                        * first character of a new record--need to back up and
                        * reread
                        * to handle properly...
                        */
                        i--;
                        buf--;  // back up one character (HACK!)
                        END_LINE_STATE(START_RECORD);
                    } else {
                        // \r line terminator
                        // UGH. we don't actually want
                        // to consume the token. fix this later
                        self->stream_len = slen;
                        if (end_line(self) < 0) {
                            goto parsingerror;
                        }

                        // end_line() may have changed the stream; resync the
                        // local write cursor with the parser state.
                        stream = self->stream + self->stream_len;
                        slen = self->stream_len;
                        self->state = START_RECORD;

                        --i;
                        buf--;  // let's try this character again (HACK!)
                        if (line_limit > 0 &&
                            self->lines == start_lines + line_limit) {
                            goto linelimit;
                        }
                    }
                }
                break;

            // only occurs with non-custom line terminator,
            // which is why we directly check for '\n'
            case EAT_CRNL_NOP:  // inside an ignored comment line
                self->state = START_RECORD;
                // \r line terminator -- parse this character again
                if (c != '\n' && !IS_DELIMITER(c)) {
                    --i;
                    --buf;
                }
                break;
            default:
                break;
        }
    }

    _TOKEN_CLEANUP();

    TRACE(("Finished tokenizing input\n"))

    return 0;

parsingerror:
    // account for the byte being processed when the error was raised
    i++;
    _TOKEN_CLEANUP();

    return -1;

linelimit:
    // account for the byte being processed when the limit was hit
    i++;
    _TOKEN_CLEANUP();

    return 0;
}