in amplify/backend/function/iamxawswrangler/lib/python/pandas/_libs/src/parser/tokenizer.c [710:1125]
/*
 * Run the tokenizer state machine over the bytes buffered in self->data,
 * from self->datapos up to self->datalen.
 *
 * Each byte is dispatched on self->state. Field and line bookkeeping is done
 * through the PUSH_CHAR / END_FIELD / END_LINE family of macros, which are
 * defined outside this function.
 * NOTE(review): those macros presumably use the locals `stream`, `slen`,
 * `line_limit` and `start_lines`, and may jump to the `parsingerror` /
 * `linelimit` labels below -- confirm against their definitions.
 *
 * Parameters:
 *   self        - parser state; supplies the input buffer and receives tokens.
 *   line_limit  - when > 0, stop once self->lines == start_lines + line_limit.
 *   start_lines - value of self->lines the limit is measured from.
 *
 * Returns 0 when the buffer was consumed or the line limit was reached, and
 * -1 on error (stream allocation failure, skip_this_line() failure, or
 * end_line() failure). On allocation failure self->error_msg is set to
 * "out of memory" when the message buffer itself could be allocated.
 */
int tokenize_bytes(parser_t *self,
                   size_t line_limit, uint64_t start_lines) {
    int64_t i;
    uint64_t slen;
    int should_skip;
    char c;
    char *stream;
    char *buf = self->data + self->datapos;

    // Default to '\n' when no custom line terminator is configured.
    const char line_terminator = (self->lineterminator == '\0') ?
            '\n' : self->lineterminator;

    // 1000 is something that couldn't fit in "char"
    // thus comparing a char to it would always be "false"
    const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
    const int comment_symbol = (self->commentchar != '\0') ?
            self->commentchar : 1000;
    const int escape_symbol = (self->escapechar != '\0') ?
            self->escapechar : 1000;

    if (make_stream_space(self, self->datalen - self->datapos) < 0) {
        int64_t bufsize = 100;
        self->error_msg = malloc(bufsize);
        // We are already out of memory, so this allocation may fail too;
        // never hand a NULL buffer to snprintf.
        if (self->error_msg != NULL) {
            snprintf(self->error_msg, bufsize, "out of memory");
        }
        return -1;
    }

    stream = self->stream + self->stream_len;
    slen = self->stream_len;

    TRACE(("%s\n", buf));

    // Only look for a byte-order mark before any line has been read.
    if (self->file_lines == 0) {
        CHECK_FOR_BOM();
    }

    for (i = self->datapos; i < self->datalen; ++i) {
        // next character in file
        c = *buf++;

        TRACE(
            ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
             "state %d\n",
             i, c, self->file_lines + 1, self->line_fields[self->lines],
             self->state));

        switch (self->state) {
            case START_FIELD_IN_SKIP_LINE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_QUOTE(c)) {
                    self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                } else if (IS_DELIMITER(c)) {
                    // Do nothing, we're starting a new field again.
                } else {
                    self->state = IN_FIELD_IN_SKIP_LINE;
                }
                break;

            case IN_FIELD_IN_SKIP_LINE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_DELIMITER(c)) {
                    self->state = START_FIELD_IN_SKIP_LINE;
                }
                break;

            case IN_QUOTED_FIELD_IN_SKIP_LINE:
                // Terminators inside a quoted field do not end the line.
                if (IS_QUOTE(c)) {
                    if (self->doublequote) {
                        self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
                    } else {
                        self->state = IN_FIELD_IN_SKIP_LINE;
                    }
                }
                break;

            case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
                if (IS_QUOTE(c)) {
                    // doubled quote: still inside the quoted field
                    self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                } else if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                } else if (IS_DELIMITER(c)) {
                    self->state = START_FIELD_IN_SKIP_LINE;
                } else {
                    self->state = IN_FIELD_IN_SKIP_LINE;
                }
                break;

            case WHITESPACE_LINE:
                if (IS_TERMINATOR(c)) {
                    self->file_lines++;
                    self->state = START_RECORD;
                    break;
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                    break;
                } else if (!self->delim_whitespace) {
                    // Cast: <ctype.h> functions require a value
                    // representable as unsigned char (or EOF).
                    if (isblank((unsigned char)c) && c != self->delimiter) {
                        // still a blank, non-delimiter character:
                        // keep scanning the line
                    } else {  // backtrack
                        // use i + 1 because buf has been incremented but not i
                        do {
                            --buf;
                            --i;
                        } while (i + 1 > self->datapos &&
                                 !IS_TERMINATOR(*buf));

                        // reached a newline rather than the beginning
                        if (IS_TERMINATOR(*buf)) {
                            ++buf;  // move pointer to first char after newline
                            ++i;
                        }
                        self->state = START_FIELD;
                    }
                    break;
                }
                // fall through (delim_whitespace is set)

            case EAT_WHITESPACE:
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                    self->state = START_RECORD;
                    break;
                } else if (IS_CARRIAGE(c)) {
                    self->state = EAT_CRNL;
                    break;
                } else if (IS_COMMENT_CHAR(c)) {
                    self->state = EAT_COMMENT;
                    break;
                } else if (!isblank((unsigned char)c)) {
                    self->state = START_FIELD;
                    // fall through to subsequent state
                    // (the next label is START_RECORD, which in turn
                    // falls through to START_FIELD for ordinary characters)
                } else {
                    // if whitespace char, keep slurping
                    break;
                }

            case START_RECORD:
                // start of record
                should_skip = skip_this_line(self, self->file_lines);

                if (should_skip == -1) {
                    goto parsingerror;
                } else if (should_skip) {
                    if (IS_QUOTE(c)) {
                        self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
                    } else {
                        self->state = IN_FIELD_IN_SKIP_LINE;

                        if (IS_TERMINATOR(c)) {
                            END_LINE();
                        }
                    }
                    break;
                } else if (IS_TERMINATOR(c)) {
                    // \n\r possible?
                    if (self->skip_empty_lines) {
                        self->file_lines++;
                    } else {
                        END_LINE();
                    }
                    break;
                } else if (IS_CARRIAGE(c)) {
                    if (self->skip_empty_lines) {
                        self->file_lines++;
                        self->state = EAT_CRNL_NOP;
                    } else {
                        self->state = EAT_CRNL;
                    }
                    break;
                } else if (IS_COMMENT_CHAR(c)) {
                    self->state = EAT_LINE_COMMENT;
                    break;
                } else if (isblank((unsigned char)c)) {
                    if (self->delim_whitespace) {
                        if (self->skip_empty_lines) {
                            self->state = WHITESPACE_LINE;
                        } else {
                            self->state = EAT_WHITESPACE;
                        }
                        break;
                    } else if (c != self->delimiter &&
                               self->skip_empty_lines) {
                        self->state = WHITESPACE_LINE;
                        break;
                    }
                    // fall through
                }

                // normal character - fall through
                // to handle as START_FIELD
                self->state = START_FIELD;

            case START_FIELD:
                // expecting field
                if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else if (IS_QUOTE(c)) {
                    // start quoted field
                    self->state = IN_QUOTED_FIELD;
                } else if (IS_ESCAPE_CHAR(c)) {
                    // possible escaped character
                    self->state = ESCAPED_CHAR;
                } else if (IS_SKIPPABLE_SPACE(c)) {
                    // ignore space at start of field
                } else if (IS_DELIMITER(c)) {
                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        // save empty field
                        END_FIELD();
                    }
                } else if (IS_COMMENT_CHAR(c)) {
                    END_FIELD();
                    self->state = EAT_COMMENT;
                } else {
                    // begin new unquoted field
                    PUSH_CHAR(c);
                    self->state = IN_FIELD;
                }
                break;

            case ESCAPED_CHAR:
                // the character after an escape is stored verbatim
                PUSH_CHAR(c);
                self->state = IN_FIELD;
                break;

            case EAT_LINE_COMMENT:
                // discard everything up to the end of the comment line
                if (IS_TERMINATOR(c)) {
                    self->file_lines++;
                    self->state = START_RECORD;
                } else if (IS_CARRIAGE(c)) {
                    self->file_lines++;
                    self->state = EAT_CRNL_NOP;
                }
                break;

            case IN_FIELD:
                // in unquoted field
                if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else if (IS_ESCAPE_CHAR(c)) {
                    // possible escaped character
                    self->state = ESCAPED_CHAR;
                } else if (IS_DELIMITER(c)) {
                    // end of field - end of line not reached yet
                    END_FIELD();

                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        self->state = START_FIELD;
                    }
                } else if (IS_COMMENT_CHAR(c)) {
                    END_FIELD();
                    self->state = EAT_COMMENT;
                } else {
                    // normal character - save in field
                    PUSH_CHAR(c);
                }
                break;

            case IN_QUOTED_FIELD:
                // in quoted field
                if (IS_ESCAPE_CHAR(c)) {
                    // possible escape character
                    self->state = ESCAPE_IN_QUOTED_FIELD;
                } else if (IS_QUOTE(c)) {
                    if (self->doublequote) {
                        // double quote - " represented by ""
                        self->state = QUOTE_IN_QUOTED_FIELD;
                    } else {
                        // end of quote part of field
                        self->state = IN_FIELD;
                    }
                } else {
                    // normal character - save in field
                    PUSH_CHAR(c);
                }
                break;

            case ESCAPE_IN_QUOTED_FIELD:
                // escaped character inside quotes is stored verbatim
                PUSH_CHAR(c);
                self->state = IN_QUOTED_FIELD;
                break;

            case QUOTE_IN_QUOTED_FIELD:
                // double quote - seen a quote in an quoted field
                if (IS_QUOTE(c)) {
                    // save "" as "
                    PUSH_CHAR(c);
                    self->state = IN_QUOTED_FIELD;
                } else if (IS_DELIMITER(c)) {
                    // end of field - end of line not reached yet
                    END_FIELD();

                    if (self->delim_whitespace) {
                        self->state = EAT_WHITESPACE;
                    } else {
                        self->state = START_FIELD;
                    }
                } else if (IS_TERMINATOR(c)) {
                    END_FIELD();
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    END_FIELD();
                    self->state = EAT_CRNL;
                } else {
                    // the quote closed the quoted part; the field
                    // continues unquoted with this character
                    PUSH_CHAR(c);
                    self->state = IN_FIELD;
                }
                break;

            case EAT_COMMENT:
                // trailing comment: drop characters until the line ends
                if (IS_TERMINATOR(c)) {
                    END_LINE();
                } else if (IS_CARRIAGE(c)) {
                    self->state = EAT_CRNL;
                }
                break;

            // only occurs with non-custom line terminator,
            // which is why we directly check for '\n'
            case EAT_CRNL:
                if (c == '\n') {
                    END_LINE();
                } else if (IS_DELIMITER(c)) {
                    if (self->delim_whitespace) {
                        END_LINE_STATE(EAT_WHITESPACE);
                    } else {
                        // Handle \r-delimited files
                        END_LINE_AND_FIELD_STATE(START_FIELD);
                    }
                } else {
                    if (self->delim_whitespace) {
                        /* XXX
                         * first character of a new record--need to back up
                         * and reread
                         * to handle properly...
                         */
                        i--;
                        buf--;  // back up one character (HACK!)
                        END_LINE_STATE(START_RECORD);
                    } else {
                        // \r line terminator
                        // UGH. we don't actually want
                        // to consume the token. fix this later
                        self->stream_len = slen;
                        if (end_line(self) < 0) {
                            goto parsingerror;
                        }

                        // re-sync the local stream cursor with the parser
                        stream = self->stream + self->stream_len;
                        slen = self->stream_len;
                        self->state = START_RECORD;

                        --i;
                        buf--;  // let's try this character again (HACK!)
                        if (line_limit > 0 &&
                            self->lines == start_lines + line_limit) {
                            goto linelimit;
                        }
                    }
                }
                break;

            // only occurs with non-custom line terminator,
            // which is why we directly check for '\n'
            case EAT_CRNL_NOP:  // inside an ignored comment line
                self->state = START_RECORD;
                // \r line terminator -- parse this character again
                if (c != '\n' && !IS_DELIMITER(c)) {
                    --i;
                    --buf;
                }
                break;

            default:
                break;
        }
    }

    _TOKEN_CLEANUP();

    // Terminated with ';' so this compiles whether TRACE expands to a
    // statement or to nothing.
    TRACE(("Finished tokenizing input\n"));

    return 0;

parsingerror:
    // NOTE(review): i is advanced past the current character before cleanup;
    // presumably _TOKEN_CLEANUP records i as the new data position -- confirm.
    i++;
    _TOKEN_CLEANUP();

    return -1;

linelimit:
    // Same advance as above; reaching the line limit is reported as success.
    i++;
    _TOKEN_CLEANUP();

    return 0;
}