runtime/under-codecs-module.cpp (1,216 lines of code) (raw):
// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
#include "builtins.h"
#include "bytearray-builtins.h"
#include "bytes-builtins.h"
#include "byteslike.h"
#include "formatter-utils.h"
#include "frame.h"
#include "int-builtins.h"
#include "modules.h"
#include "runtime.h"
#include "str-builtins.h"
#include "unicode-db.h"
#include "unicode.h"
#include "utils.h"
namespace py {
// Byte written in place of a code point that cannot be encoded when the
// "replace" error handler is in effect (used by the encoders in this file).
const char kASCIIReplacement = '?';
// Translates the name of a codec error handler into its SymbolId. Any name
// other than the handlers listed in the table below yields SymbolId::kInvalid.
static SymbolId lookupSymbolForErrorHandler(const Str& error) {
  static const struct {
    const char* name;
    SymbolId id;
  } kKnownHandlers[] = {
      {"strict", ID(strict)},
      {"ignore", ID(ignore)},
      {"replace", ID(replace)},
      {"surrogateescape", ID(surrogateescape)},
      {"surrogatepass", ID(surrogatepass)},
  };
  for (const auto& handler : kKnownHandlers) {
    if (error.equalsCStr(handler.name)) {
      return handler.id;
    }
  }
  return SymbolId::kInvalid;
}
static int asciiDecode(Thread* thread, const StrArray& dst,
const Byteslike& src, word start, word end) {
// TODO(T41032331): Implement a fastpass to read longs instead of chars
Runtime* runtime = thread->runtime();
for (word i = start; i < end; i++) {
byte ch = src.byteAt(i);
if (ch > kMaxASCII) {
return i;
}
runtime->strArrayAddASCII(thread, dst, ch);
}
return end;
}
// Decodes the bytes-like args[0] as ASCII into the StrArray args[3], starting
// at byte index args[2] and using the error handler named by args[1].
// Returns (decoded str, input length) when decoding finishes. When a
// non-ASCII byte is found and the handler is not replace, surrogateescape or
// ignore, returns (error start index, error end index) instead.
RawObject FUNC(_codecs, _ascii_decode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object data(&scope, args.get(0));
  Str errors(&scope, strUnderlying(args.get(1)));
  word start = intUnderlying(args.get(2)).asWord();
  StrArray dst(&scope, args.get(3));
  Byteslike bytes(&scope, thread, *data);
  const word length = bytes.length();
  runtime->strArrayEnsureCapacity(thread, dst, length);

  // Consume the leading run of ASCII bytes first; only look up the error
  // handler if something non-ASCII was found.
  word pos = asciiDecode(thread, dst, bytes, start, length);
  if (pos != length) {
    const SymbolId handler = lookupSymbolForErrorHandler(errors);
    while (pos < length) {
      const byte ch = bytes.byteAt(pos);
      if (ch < 128) {
        runtime->strArrayAddASCII(thread, dst, ch);
        pos++;
        continue;
      }
      if (handler == ID(replace)) {
        Str replacement(&scope, SmallStr::fromCodePoint(0xFFFD));
        runtime->strArrayAddStr(thread, dst, replacement);
      } else if (handler == ID(surrogateescape)) {
        // Smuggle the raw byte through as a low surrogate.
        Str escaped(&scope,
                    SmallStr::fromCodePoint(Unicode::kLowSurrogateStart + ch));
        runtime->strArrayAddStr(thread, dst, escaped);
      } else if (handler != ID(ignore)) {
        // Report the offending one-byte range back to the caller.
        Object error_start(&scope, runtime->newIntFromUnsigned(pos));
        Object error_end(&scope, runtime->newIntFromUnsigned(pos + 1));
        return runtime->newTupleWith2(error_start, error_end);
      }
      pos++;
    }
  }
  Object result_str(&scope, runtime->strFromStrArray(dst));
  Object consumed(&scope, runtime->newInt(length));
  return runtime->newTupleWith2(result_str, consumed);
}
// CPython maps non-ASCII latin1 bytes into the low-surrogate range
// ("surrogateescape") and can recover the original byte from such a code
// point. Returns true if `codepoint` is one of those escaped bytes, i.e. lies
// in (kLowSurrogateStart + kMaxASCII, kLowSurrogateStart + kMaxByte].
static bool isEscapedLatin1Surrogate(int32_t codepoint) {
  if (codepoint <= (Unicode::kLowSurrogateStart + kMaxASCII)) {
    return false;
  }
  return codepoint <= (Unicode::kLowSurrogateStart + kMaxByte);
}
// Encodes the str args[0] as ASCII into the bytearray args[3], starting at
// code point index args[2] and using the error handler named by args[1].
// Returns (encoded bytes, index past the last code point) on success. When a
// code point cannot be encoded and the handler does not deal with it, returns
// (error start index, error end index) covering the run of unencodable code
// points.
RawObject FUNC(_codecs, _ascii_encode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object output_obj(&scope, args.get(3));
  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
         "Fourth arg to _ascii_encode must be bytearray");
  Str data(&scope, strUnderlying(args.get(0)));
  Str errors(&scope, strUnderlying(args.get(1)));
  word char_index = intUnderlying(args.get(2)).asWord();
  Bytearray output(&scope, *output_obj);
  const SymbolId handler = lookupSymbolForErrorHandler(errors);
  // TODO(T43252439): Optimize this by first checking whether the entire string
  // is ASCII, and just memcpy into a string if so
  const word num_data_bytes = data.length();
  word offset = thread->strOffset(data, char_index);
  while (offset < num_data_bytes) {
    word char_size;
    const int32_t codepoint = data.codePointAt(offset, &char_size);
    offset += char_size;

    bool handled = true;
    if (codepoint <= kMaxASCII) {
      bytearrayAdd(thread, runtime, output, codepoint);
    } else if (handler == ID(ignore)) {
      // Silently drop the code point.
    } else if (handler == ID(replace)) {
      bytearrayAdd(thread, runtime, output, kASCIIReplacement);
    } else if (handler == ID(surrogateescape) &&
               isEscapedLatin1Surrogate(codepoint)) {
      bytearrayAdd(thread, runtime, output,
                   codepoint - Unicode::kLowSurrogateStart);
    } else {
      handled = false;
    }

    if (!handled) {
      Object error_start(&scope, runtime->newInt(char_index));
      // Extend the reported range over all following unencodable code points.
      while (offset < num_data_bytes &&
             data.codePointAt(offset, &char_size) > kMaxASCII) {
        offset += char_size;
        char_index++;
      }
      Object error_end(&scope, runtime->newInt(char_index + 1));
      return runtime->newTupleWith2(error_start, error_end);
    }
    char_index++;
  }
  Object encoded(&scope, bytearrayAsBytes(thread, output));
  Object end_index(&scope, runtime->newInt(char_index));
  return runtime->newTupleWith2(encoded, end_index);
}
// Decodes a single backslash escape sequence from `bytes`. On entry `*i` is
// the index of the byte that follows the backslash; on return it is the index
// where decoding should continue.
// Returns the decoded value, -1 if no value should be written (a backslash
// followed by a newline), or -2 if an error occurred (a malformed \x escape).
// If the escape character is not recognized, sets `*invalid_escape_index` to
// the index of that character and returns the character itself.
static int32_t decodeEscaped(const Byteslike& bytes, word* i,
word* invalid_escape_index) {
word length = bytes.length();
// Note that `*i` is advanced past the escape character before dispatching.
switch (byte ch = bytes.byteAt((*i)++)) {
// backslash followed by a newline: nothing is written
case '\n':
return -1;
// characters that escape to themselves
case '\\':
case '\'':
case '\"':
return ch;
case 'b':
return '\b';
case 't':
return '\t';
case 'n':
return '\n';
case 'r':
return '\r';
// BEL
case 'a':
return '\x07';
// VT
case 'v':
return '\x0B';
// FF
case 'f':
return '\x0C';
// \OOO (octal) escapes: one to three octal digits
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7': {
word escaped = ch - '0';
word octal_index = *i;
if (octal_index < length) {
word ch2 = bytes.byteAt(octal_index);
if ('0' <= ch2 && ch2 <= '7') {
escaped = (escaped << 3) + ch2 - '0';
if (++octal_index < length) {
word ch3 = bytes.byteAt(octal_index);
if ('0' <= ch3 && ch3 <= '7') {
octal_index++;
escaped = (escaped << 3) + ch3 - '0';
}
}
}
}
// Continue after the last octal digit that was consumed.
*i = octal_index;
return escaped;
}
// hex escapes
// \xXX
case 'x': {
word hex_index = *i;
// Both hex digits must be present; otherwise this is an error and `*i` is
// left pointing just past the 'x'.
if (hex_index + 1 < length) {
int digit1, digit2;
digit1 = _PyLong_DigitValue[bytes.byteAt(hex_index)];
digit2 = _PyLong_DigitValue[bytes.byteAt(hex_index + 1)];
if (digit1 < 16 && digit2 < 16) {
*i += 2;
return (digit1 << 4) + digit2;
}
}
return -2;
}
// unrecognized escape: report its position and hand back the character
default:
*invalid_escape_index = *i - 1;
return ch;
}
}
// Decodes backslash escapes in the bytes-like args[0] using the error handler
// named by args[1]. Returns (decoded bytes, input length, index of the first
// unrecognized escape or -1). On a decoding failure a str containing the
// error message is returned instead of a tuple.
RawObject FUNC(_codecs, _escape_decode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Object bytes_obj(&scope, args.get(0));
  Runtime* runtime = thread->runtime();
  if (runtime->isInstanceOfStr(*bytes_obj)) {
    // TODO(T44739505): Make sure we can decode a str
    UNIMPLEMENTED("_codecs.escape_decode with a str");
  }
  DCHECK(runtime->isInstanceOfStr(args.get(2)),
         "Third arg to _escape_decode must be str");
  Byteslike bytes(&scope, thread, *bytes_obj);
  Str errors(&scope, strUnderlying(args.get(1)));
  Bytearray dst(&scope, runtime->newBytearray());
  const word length = bytes.length();
  runtime->bytearrayEnsureCapacity(thread, dst, length);

  word first_bad_escape = -1;
  word pos = 0;
  while (pos < length) {
    const byte ch = bytes.byteAt(pos);
    pos++;
    if (ch != '\\') {
      // TODO(T45134397): Support the recode_encoding parameter
      if (ch > kMaxASCII) {
        // Non-ASCII bytes are written as the two bytes of their code point.
        Str encoded(&scope, SmallStr::fromCodePoint(ch));
        bytearrayAdd(thread, runtime, dst, encoded.byteAt(0));
        bytearrayAdd(thread, runtime, dst, encoded.byteAt(1));
      } else {
        bytearrayAdd(thread, runtime, dst, ch);
      }
      continue;
    }
    if (pos >= length) {
      return runtime->newStrFromCStr("Trailing \\ in string");
    }

    word bad_escape = -1;
    const int32_t decoded = decodeEscaped(bytes, &pos, &bad_escape);
    if (bad_escape != -1) {
      // Unknown escapes keep their backslash; remember only the first one.
      bytearrayAdd(thread, runtime, dst, '\\');
      if (first_bad_escape == -1) {
        first_bad_escape = bad_escape;
      }
    }
    if (decoded >= 0) {
      bytearrayAdd(thread, runtime, dst, decoded);
      continue;
    }
    if (decoded == -1) {
      // Nothing to write for this escape.
      continue;
    }

    // decodeEscaped reported an error: apply the error handler.
    const SymbolId handler = lookupSymbolForErrorHandler(errors);
    if (handler == ID(strict)) {
      return runtime->newStrFromFmt("invalid \\x escape at position %d",
                                    pos - 2);
    }
    if (handler == ID(replace)) {
      bytearrayAdd(thread, runtime, dst, '?');
    } else if (handler != ID(ignore)) {
      return runtime->newStrFromFmt(
          "decoding error; unknown error handling code: %S", &errors);
    }
    // Skip one hex digit that may follow the malformed escape.
    if (pos < length && Byte::isHexDigit(bytes.byteAt(pos))) {
      pos++;
    }
  }
  Object result_bytes(&scope, bytearrayAsBytes(thread, dst));
  Object consumed(&scope, runtime->newInt(length));
  Object bad_escape_obj(&scope, runtime->newInt(first_bad_escape));
  return runtime->newTupleWith3(result_bytes, consumed, bad_escape_obj);
}
// Decodes the bytes-like args[0] as Latin-1: every byte becomes the code point
// with the same value. Returns (decoded str, input length).
RawObject FUNC(_codecs, _latin_1_decode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object data(&scope, args.get(0));
  StrArray array(&scope, runtime->newStrArray());
  Byteslike bytes(&scope, thread, *data);
  const word length = bytes.length();
  runtime->strArrayEnsureCapacity(thread, array, length);

  // Start with the quick ASCII decoder; whatever it leaves behind (from the
  // first non-ASCII byte onwards) is decoded byte by byte as Latin-1.
  word pos = asciiDecode(thread, array, bytes, 0, length);
  for (; pos < length; ++pos) {
    const byte value = bytes.byteAt(pos);
    if (value > kMaxASCII) {
      runtime->strArrayAddCodePoint(thread, array, value);
    } else {
      runtime->strArrayAddASCII(thread, array, value);
    }
  }
  Object result_str(&scope, runtime->strFromStrArray(array));
  Object consumed(&scope, runtime->newInt(length));
  return runtime->newTupleWith2(result_str, consumed);
}
// Encodes the str args[0] as Latin-1 into the bytearray args[3], starting at
// code point index args[2] and using the error handler named by args[1].
// Returns (encoded bytes, index past the last code point) on success. When a
// code point above kMaxByte is not dealt with by the handler, returns
// (error start index, error end index) covering the run of unencodable code
// points.
RawObject FUNC(_codecs, _latin_1_encode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object output_obj(&scope, args.get(3));
  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
         "Fourth arg to _latin_1_encode must be bytearray");
  Str data(&scope, strUnderlying(args.get(0)));
  Str errors(&scope, strUnderlying(args.get(1)));
  word char_index = intUnderlying(args.get(2)).asWord();
  Bytearray output(&scope, *output_obj);
  const SymbolId handler = lookupSymbolForErrorHandler(errors);
  const word num_data_bytes = data.length();
  word offset = thread->strOffset(data, char_index);
  while (offset < num_data_bytes) {
    word char_size;
    const int32_t codepoint = data.codePointAt(offset, &char_size);
    offset += char_size;

    bool handled = true;
    if (codepoint <= kMaxByte) {
      bytearrayAdd(thread, runtime, output, codepoint);
    } else if (handler == ID(ignore)) {
      // Silently drop the code point.
    } else if (handler == ID(replace)) {
      bytearrayAdd(thread, runtime, output, kASCIIReplacement);
    } else if (handler == ID(surrogateescape) &&
               isEscapedLatin1Surrogate(codepoint)) {
      bytearrayAdd(thread, runtime, output,
                   codepoint - Unicode::kLowSurrogateStart);
    } else {
      handled = false;
    }

    if (!handled) {
      Object error_start(&scope, runtime->newInt(char_index));
      // Extend the reported range over all following unencodable code points.
      while (offset < num_data_bytes &&
             data.codePointAt(offset, &char_size) > kMaxByte) {
        offset += char_size;
        char_index++;
      }
      Object error_end(&scope, runtime->newInt(char_index + 1));
      return runtime->newTupleWith2(error_start, error_end);
    }
    char_index++;
  }
  Object encoded(&scope, bytearrayAsBytes(thread, output));
  Object end_index(&scope, runtime->newInt(char_index));
  return runtime->newTupleWith2(encoded, end_index);
}
// Reads exactly `count` hexadecimal digits from `bytes` beginning at `*start`
// and returns the value they spell. `*start` is updated to the index where
// decoding should continue (the first byte that was not consumed).
// Returns -1 if fewer than `count` hex digits were available, and -2 if the
// value is larger than kMaxUnicode.
static int32_t decodeHexEscaped(const Byteslike& bytes, word* start,
                                word count) {
  DCHECK_BOUND(count, 8);
  const word length = bytes.length();
  word pos = *start;
  word value = 0;
  while (pos < length && count != 0) {
    const byte ch = bytes.byteAt(pos);
    word digit;
    if ('0' <= ch && ch <= '9') {
      digit = ch - '0';
    } else if ('a' <= ch && ch <= 'f') {
      digit = ch - 'a' + 10;
    } else if ('A' <= ch && ch <= 'F') {
      digit = ch - 'A' + 10;
    } else {
      // not a hexadecimal digit, stop reading
      break;
    }
    value = (value << 4) + digit;
    pos++;
    count--;
  }
  *start = pos;
  if (count != 0) {
    // Ran out of input or hit a non-hex byte before reading all digits.
    return -1;
  }
  // With 8 digits the value can exceed the Unicode range.
  if (value > kMaxUnicode) {
    return -2;
  }
  return value;
}
// Decodes a single backslash escape of the unicode_escape codec. On entry `*i`
// is the index of the byte that follows the backslash; on return it is the
// index where decoding should continue.
// Returns the decoded code point, or -1 if no value should be written. -1 is
// returned both for a backslash followed by a newline (`*error_message` is
// left untouched) and for malformed escapes (`*error_message` is set to a
// description of the problem). If the escape character is not recognized,
// sets `*invalid_escape_index` to its index and returns the character itself.
// Note: `*error_message` is also assigned while a \N{name} escape is being
// looked up, so callers should only consult it when -1 is returned.
static int32_t decodeUnicodeEscaped(const Byteslike& bytes, word* i,
                                    word* invalid_escape_index,
                                    const char** error_message) {
  switch (byte ch = bytes.byteAt((*i)++)) {
    // backslash followed by a newline: nothing is written
    case '\n':
      return -1;
    // characters that escape to themselves
    case '\\':
    case '\'':
    case '\"':
      return ch;
    case 'b':
      return '\b';
    case 't':
      return '\t';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    // BEL
    case 'a':
      return '\007';
    // FF
    case 'f':
      return '\014';
    // VT
    case 'v':
      return '\013';
    // \OOO (octal) escapes: one to three octal digits
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7': {
      word escaped = ch - '0';
      word octal_index = *i;
      word length = bytes.length();
      if (octal_index < length) {
        word ch2 = bytes.byteAt(octal_index);
        if ('0' <= ch2 && ch2 <= '7') {
          escaped = (escaped << 3) + ch2 - '0';
          if (++octal_index < length) {
            word ch3 = bytes.byteAt(octal_index);
            if ('0' <= ch3 && ch3 <= '7') {
              octal_index++;
              escaped = (escaped << 3) + ch3 - '0';
            }
          }
        }
      }
      *i = octal_index;
      return escaped;
    }
    // hex escapes
    // \xXX
    case 'x': {
      word escaped;
      if ((escaped = decodeHexEscaped(bytes, i, 2)) < 0) {
        *error_message = (escaped == -1 ? "truncated \\xXX escape"
                                        : "illegal Unicode character");
        return -1;
      }
      return escaped;
    }
    // \uXXXX
    case 'u': {
      word escaped;
      if ((escaped = decodeHexEscaped(bytes, i, 4)) < 0) {
        *error_message = (escaped == -1 ? "truncated \\uXXXX escape"
                                        : "illegal Unicode character");
        return -1;
      }
      return escaped;
    }
    // \UXXXXXXXX
    case 'U': {
      word escaped;
      if ((escaped = decodeHexEscaped(bytes, i, 8)) < 0) {
        // The escape is spelled with an upper-case 'U'; the message must say
        // so (this matches CPython's wording).
        *error_message = (escaped == -1 ? "truncated \\UXXXXXXXX escape"
                                        : "illegal Unicode character");
        return -1;
      }
      return escaped;
    }
    // \N{name}
    case 'N': {
      *error_message = "malformed \\N character escape";
      word length = bytes.length();
      if (*i >= length || bytes.byteAt(*i) != '{') {
        return -1;
      }
      word start = ++(*i);
      while (*i < length && bytes.byteAt(*i) != '}') {
        *i += 1;
      }
      word size = *i - start;
      // Reject an empty name and a name without a closing brace.
      if (size == 0 || *i == length) {
        return -1;
      }
      *i += 1;
      *error_message = "unknown Unicode character name";
      unique_c_ptr<byte> buffer(reinterpret_cast<byte*>(std::malloc(size)));
      bytes.copyToStartAt(buffer.get(), size, start);
      // NOTE(review): presumably codePointFromName yields -1 for an unknown
      // name so that the message set above is reported -- confirm.
      return codePointFromName(buffer.get(), size);
    }
    // unrecognized escape: report its position and hand back the character
    default: {
      *invalid_escape_index = *i - 1;
      return ch;
    }
  }
}
// Decodes the bytes-like args[0] with the unicode_escape codec into the
// StrArray args[3], starting at byte index args[2] and using the error handler
// named by args[1].
// Returns (decoded str, input length, "", first invalid escape index or -1)
// on success. When an escape is malformed and the handler is neither replace
// nor ignore, returns (error start, error end, message, first invalid escape
// index or -1).
RawObject FUNC(_codecs, _unicode_escape_decode)(Thread* thread,
                                                Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object data(&scope, args.get(0));
  Str errors(&scope, strUnderlying(args.get(1)));
  word pos = intUnderlying(args.get(2)).asWord();
  StrArray dst(&scope, args.get(3));
  Byteslike bytes(&scope, thread, *data);
  const word length = bytes.length();
  runtime->strArrayEnsureCapacity(thread, dst, length);

  word first_bad_escape = -1;
  while (pos < length) {
    const word escape_start = pos;
    const byte ch = bytes.byteAt(pos);
    pos++;
    if (ch != '\\') {
      // Plain byte: copy it through as the code point of the same value.
      if (ch <= kMaxASCII) {
        runtime->strArrayAddASCII(thread, dst, ch);
      } else {
        Str plain(&scope, SmallStr::fromCodePoint(ch));
        runtime->strArrayAddStr(thread, dst, plain);
      }
      continue;
    }

    const char* message = nullptr;
    if (pos >= length) {
      message = "\\ at end of string";
    } else {
      word bad_escape = -1;
      const int32_t decoded =
          decodeUnicodeEscaped(bytes, &pos, &bad_escape, &message);
      if (bad_escape != -1) {
        // Unknown escapes keep their backslash; remember only the first one.
        runtime->strArrayAddASCII(thread, dst, '\\');
        if (first_bad_escape == -1) {
          first_bad_escape = bad_escape;
        }
      }
      if (decoded != -1) {
        if (decoded <= kMaxASCII) {
          runtime->strArrayAddASCII(thread, dst, decoded);
        } else {
          Str decoded_str(&scope, SmallStr::fromCodePoint(decoded));
          runtime->strArrayAddStr(thread, dst, decoded_str);
        }
        continue;
      }
    }

    if (message == nullptr) {
      // Nothing to write and nothing went wrong.
      continue;
    }
    const SymbolId handler = lookupSymbolForErrorHandler(errors);
    if (handler == ID(replace)) {
      Str replacement(&scope, SmallStr::fromCodePoint(0xFFFD));
      runtime->strArrayAddStr(thread, dst, replacement);
    } else if (handler != ID(ignore)) {
      Object start_obj(&scope, runtime->newInt(escape_start));
      Object end_obj(&scope, runtime->newInt(pos));
      Object error_obj(&scope, runtime->newStrFromCStr(message));
      Object bad_escape_obj(&scope, runtime->newInt(first_bad_escape));
      return runtime->newTupleWith4(start_obj, end_obj, error_obj,
                                    bad_escape_obj);
    }
  }
  Object result_str(&scope, runtime->strFromStrArray(dst));
  Object consumed(&scope, runtime->newInt(length));
  Object no_message(&scope, runtime->newStrFromCStr(""));
  Object bad_escape_obj(&scope, runtime->newInt(first_bad_escape));
  return runtime->newTupleWith4(result_str, consumed, no_message,
                                bad_escape_obj);
}
// Result of validating one UTF-8 sequence (see isValidUtf8Codepoint).
// Positive values are the number of bytes in a well-formed sequence; zero and
// negative values describe why the sequence was rejected.
enum Utf8DecoderResult {
k1Byte = 1,
k2Byte = 2,
k3Byte = 3,
k4Byte = 4,
// The first byte cannot start a sequence.
kInvalidStart = 0,
// The N-th byte after the start byte is invalid. The value is -N, which
// _utf_8_decode subtracts from the start index to compute the error end.
kInvalidContinuation1 = -1,
kInvalidContinuation2 = -2,
kInvalidContinuation3 = -3,
// The input ended in the middle of a sequence that was valid so far.
kUnexpectedEndOfData = -4,
};
// This functionality is taken mostly from CPython:
// Objects/stringlib/codecs.h::utf8_decode
// This does error checking to ensure well-formedness of the passed in UTF-8
// bytes, and returns the number of bytes of the codepoint at `index` as a
// Utf8DecoderResult enum value.
// Since this is supposed to work as an incremental decoder as well, this
// function returns specific values for errors to determine whether they could
// be caused by incremental decoding, or if they would be an error no matter
// what other bytes might be streamed in later.
// In particular, kUnexpectedEndOfData is only returned when every byte that is
// present is valid so far; a bad byte within a truncated sequence is still
// reported as kInvalidContinuationN.
static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
word index) {
word length = bytes.length();
byte ch = bytes.byteAt(index);
if (ch <= kMaxASCII) {
return k1Byte;
}
if (ch < 0xE0) {
// \xC2\x80-\xDF\xBF -- 0080-07FF
if (ch < 0xC2) {
// invalid sequence
// \x80-\xBF -- continuation byte
// \xC0-\xC1 -- fake 0000-007F
return kInvalidStart;
}
if (index + 1 >= length) {
return kUnexpectedEndOfData;
}
if (!UTF8::isTrailByte(bytes.byteAt(index + 1))) {
return kInvalidContinuation1;
}
return k2Byte;
}
if (ch < 0xF0) {
// \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF
if (index + 2 >= length) {
// Truncated 3-byte sequence: validate whatever bytes are present.
if (index + 1 >= length) {
return kUnexpectedEndOfData;
}
byte ch2 = bytes.byteAt(index + 1);
// Same second-byte restrictions as in the complete case below (overlong
// encodings after \xE0, surrogates after \xED).
if (!UTF8::isTrailByte(ch2) || (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) {
return kInvalidContinuation1;
}
return kUnexpectedEndOfData;
}
byte ch2 = bytes.byteAt(index + 1);
if (!UTF8::isTrailByte(ch2)) {
return kInvalidContinuation1;
}
if (ch == 0xE0) {
if (ch2 < 0xA0) {
// invalid sequence
// \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800
return kInvalidContinuation1;
}
} else if (ch == 0xED && ch2 >= 0xA0) {
// Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
// will result in surrogates in range D800-DFFF. Surrogates are
// not valid UTF-8 so they are rejected.
// See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
// (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
return kInvalidContinuation1;
}
if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
return kInvalidContinuation2;
}
return k3Byte;
}
if (ch < 0xF5) {
// \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF
if (index + 3 >= length) {
// Truncated 4-byte sequence: validate whatever bytes are present.
if (index + 1 >= length) {
return kUnexpectedEndOfData;
}
byte ch2 = bytes.byteAt(index + 1);
// Same second-byte restrictions as in the complete case below (overlong
// encodings after \xF0, values above 10FFFF after \xF4).
if (!UTF8::isTrailByte(ch2) || (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) {
return kInvalidContinuation1;
}
if (index + 2 >= length) {
return kUnexpectedEndOfData;
}
if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
return kInvalidContinuation2;
}
return kUnexpectedEndOfData;
}
byte ch2 = bytes.byteAt(index + 1);
if (!UTF8::isTrailByte(ch2)) {
return kInvalidContinuation1;
}
if (ch == 0xF0) {
if (ch2 < 0x90) {
// invalid sequence
// \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
return kInvalidContinuation1;
}
} else if (ch == 0xF4 && ch2 >= 0x90) {
// invalid sequence
// \xF4\x90\x80\x80- -- 110000- overflow
return kInvalidContinuation1;
}
if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
return kInvalidContinuation2;
}
if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) {
return kInvalidContinuation3;
}
return k4Byte;
}
// \xF5-\xFF can never start a sequence.
return kInvalidStart;
}
// Decodes the bytes-like args[0] as UTF-8 into the StrArray args[3], starting
// at byte index args[2] and using the error handler named by args[1]. args[4]
// is a bool telling whether this is the final chunk of input; when it is
// false, decoding stops quietly in front of a sequence that is incomplete or
// has a bad continuation byte.
// Returns (decoded str, index where decoding stopped, "") on success. When an
// invalid sequence is not dealt with by the handler, returns
// (error start, error end, message).
RawObject FUNC(_codecs, _utf_8_decode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object final_obj(&scope, args.get(4));
  DCHECK(final_obj.isBool(), "Fifth arg to _utf_8_decode must be bool");
  Object data(&scope, args.get(0));
  Str errors(&scope, strUnderlying(args.get(1)));
  word start = intUnderlying(args.get(2)).asWord();
  StrArray dst(&scope, args.get(3));
  Byteslike bytes(&scope, thread, *data);
  const word length = bytes.length();
  runtime->strArrayEnsureCapacity(thread, dst, length);

  // Fast path: the whole remaining input is ASCII.
  word pos = asciiDecode(thread, dst, bytes, start, length);
  if (pos == length) {
    Object ascii_str(&scope, runtime->strFromStrArray(dst));
    Object consumed(&scope, runtime->newInt(length));
    Object no_message(&scope, runtime->newStrFromCStr(""));
    return runtime->newTupleWith3(ascii_str, consumed, no_message);
  }

  const SymbolId handler = lookupSymbolForErrorHandler(errors);
  const bool is_final = Bool::cast(*final_obj).value();
  while (pos < length) {
    // TODO(T41032331): Scan for non-ASCII characters by words instead of chars
    const Utf8DecoderResult result = isValidUtf8Codepoint(bytes, pos);
    if (result >= k1Byte) {
      // Well-formed sequence: copy its bytes over unchanged.
      const word num_bytes = result;
      byte utf8[4] = {0};
      for (word k = 0; k < num_bytes; k++) {
        utf8[k] = bytes.byteAt(pos + k);
      }
      pos += num_bytes;
      Str decoded(&scope, runtime->newStrWithAll(View<byte>{utf8, num_bytes}));
      runtime->strArrayAddStr(thread, dst, decoded);
      continue;
    }
    if (!is_final && result != kInvalidStart) {
      // More input may still arrive; leave the rest undecoded.
      break;
    }

    // Work out the extent of the invalid sequence and the message for it.
    word error_end = pos;
    const char* error_message = nullptr;
    switch (result) {
      case kInvalidStart:
        error_end += 1;
        error_message = "invalid start byte";
        break;
      case kInvalidContinuation1:
      case kInvalidContinuation2:
      case kInvalidContinuation3:
        // The enum value is the negated offset of the offending byte.
        error_end -= result;
        error_message = "invalid continuation byte";
        break;
      case kUnexpectedEndOfData:
        error_end = length;
        error_message = "unexpected end of data";
        break;
      default:
        UNREACHABLE(
            "valid utf-8 codepoints should have been decoded by this point");
    }

    if (handler == ID(replace)) {
      Str replacement(&scope, SmallStr::fromCodePoint(kReplacementCharacter));
      runtime->strArrayAddStr(thread, dst, replacement);
      pos = error_end;
    } else if (handler == ID(surrogateescape)) {
      // Escape every byte of the invalid sequence as a low surrogate.
      while (pos < error_end) {
        Str escaped(&scope,
                    SmallStr::fromCodePoint(Unicode::kLowSurrogateStart +
                                            bytes.byteAt(pos)));
        runtime->strArrayAddStr(thread, dst, escaped);
        ++pos;
      }
    } else if (handler == ID(ignore)) {
      pos = error_end;
    } else {
      Object error_start_obj(&scope, runtime->newInt(pos));
      Object error_end_obj(&scope, runtime->newInt(error_end));
      Object error_obj(&scope, runtime->newStrFromCStr(error_message));
      return runtime->newTupleWith3(error_start_obj, error_end_obj, error_obj);
    }
  }
  Object result_str(&scope, runtime->strFromStrArray(dst));
  Object stop_index(&scope, runtime->newInt(pos));
  Object no_message(&scope, Str::empty());
  return runtime->newTupleWith3(result_str, stop_index, no_message);
}
// Encodes the str args[0] as UTF-8 into the bytearray args[3], starting at
// code point index args[2] and using the error handler named by args[1].
// Only surrogate code points need handling; everything else is copied
// unchanged. Returns (encoded bytes, index past the last code point) on
// success, or (error start index, error end index) covering a run of
// surrogates that the handler does not deal with.
RawObject FUNC(_codecs, _utf_8_encode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object output_obj(&scope, args.get(3));
  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
         "Fourth arg to _utf_8_encode must be bytearray");
  Str data(&scope, strUnderlying(args.get(0)));
  Str errors(&scope, strUnderlying(args.get(1)));
  word char_index = intUnderlying(args.get(2)).asWord();
  Bytearray output(&scope, *output_obj);
  const SymbolId handler = lookupSymbolForErrorHandler(errors);
  const word num_data_bytes = data.length();
  word offset = thread->strOffset(data, char_index);
  while (offset < num_data_bytes) {
    word char_size;
    const int32_t codepoint = data.codePointAt(offset, &char_size);
    const word char_start = offset;
    offset += char_size;

    bool handled = true;
    if (!Unicode::isSurrogate(codepoint)) {
      for (word j = char_start; j < offset; j++) {
        bytearrayAdd(thread, runtime, output, data.byteAt(j));
      }
    } else if (handler == ID(ignore)) {
      // Silently drop the surrogate.
    } else if (handler == ID(replace)) {
      bytearrayAdd(thread, runtime, output, kASCIIReplacement);
    } else if (handler == ID(surrogateescape)) {
      if (isEscapedLatin1Surrogate(codepoint)) {
        bytearrayAdd(thread, runtime, output,
                     codepoint - Unicode::kLowSurrogateStart);
      } else {
        handled = false;
      }
    } else if (handler == ID(surrogatepass)) {
      // The code point is known to be a surrogate here; pass its three
      // bytes through as they are.
      bytearrayAdd(thread, runtime, output, data.byteAt(offset - 3));
      bytearrayAdd(thread, runtime, output, data.byteAt(offset - 2));
      bytearrayAdd(thread, runtime, output, data.byteAt(offset - 1));
    } else {
      handled = false;
    }

    if (!handled) {
      Object error_start(&scope, runtime->newInt(char_index));
      // Extend the reported range over all following surrogates.
      while (offset < num_data_bytes &&
             Unicode::isSurrogate(data.codePointAt(offset, &char_size))) {
        offset += char_size;
        char_index++;
      }
      Object error_end(&scope, runtime->newInt(char_index + 1));
      return runtime->newTupleWith2(error_start, error_end);
    }
    char_index++;
  }
  Object encoded(&scope, bytearrayAsBytes(thread, output));
  Object end_index(&scope, runtime->newInt(char_index));
  return runtime->newTupleWith2(encoded, end_index);
}
// Appends the low 16 bits of `codepoint` to `writer` as two bytes in the
// requested byte order.
static void appendUtf16ToBytearray(Thread* thread, Runtime* runtime,
                                   const Bytearray& writer, int32_t codepoint,
                                   endian endianness) {
  const int32_t upper = codepoint >> kBitsPerByte;
  if (endianness != endian::little) {
    // Big endian: most significant byte first.
    bytearrayAdd(thread, runtime, writer, upper);
    bytearrayAdd(thread, runtime, writer, codepoint);
    return;
  }
  bytearrayAdd(thread, runtime, writer, codepoint);
  bytearrayAdd(thread, runtime, writer, upper);
}
// Encodes the str args[0] as UTF-16 into the bytearray args[3], starting at
// code point index args[2] and using the error handler named by args[1].
// args[4] selects the byte order: values <= 0 produce little endian, positive
// values big endian.
// Returns (encoded bytes, index past the last code point) on success, or
// (error start index, error end index) covering a run of surrogates that the
// handler does not deal with. Raises OverflowError if args[4] does not fit in
// a C int.
RawObject FUNC(_codecs, _utf_16_encode)(Thread* thread, Arguments args) {
  Runtime* runtime = thread->runtime();
  HandleScope scope(thread);
  Object output_obj(&scope, args.get(3));
  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
         "Fourth arg to _utf_16_encode must be bytearray");
  Str data(&scope, strUnderlying(args.get(0)));
  Str errors(&scope, strUnderlying(args.get(1)));
  word index = intUnderlying(args.get(2)).asWord();
  Bytearray output(&scope, *output_obj);
  OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
  if (byteorder.error != CastError::None) {
    return thread->raiseWithFmt(LayoutId::kOverflowError,
                                "Python int too large to convert to C int");
  }
  SymbolId error_id = lookupSymbolForErrorHandler(errors);
  // The byte order does not change while encoding, so compute it once.
  const endian endianness =
      byteorder.value <= 0 ? endian::little : endian::big;
  for (word byte_offset = thread->strOffset(data, index);
       byte_offset < data.length(); index++) {
    word num_bytes;
    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
    byte_offset += num_bytes;
    if (!Unicode::isSurrogate(codepoint)) {
      // Every non-surrogate code point of the Basic Multilingual Plane
      // (including the ones above the surrogate range) is a single 16-bit
      // unit; only code points beyond U+FFFF need a surrogate pair.
      if (codepoint <= kMaxUint16) {
        appendUtf16ToBytearray(thread, runtime, output, codepoint, endianness);
      } else {
        appendUtf16ToBytearray(thread, runtime, output,
                               Unicode::highSurrogateFor(codepoint),
                               endianness);
        appendUtf16ToBytearray(thread, runtime, output,
                               Unicode::lowSurrogateFor(codepoint), endianness);
      }
    } else {
      switch (error_id) {
        case ID(ignore):
          continue;
        case ID(replace):
          appendUtf16ToBytearray(thread, runtime, output, kASCIIReplacement,
                                 endianness);
          continue;
        case ID(surrogateescape):
          if (isEscapedLatin1Surrogate(codepoint)) {
            appendUtf16ToBytearray(thread, runtime, output,
                                   codepoint - Unicode::kLowSurrogateStart,
                                   endianness);
            continue;
          }
          break;
        default:
          break;
      }
      // Unhandled surrogate: report the whole run of surrogates that starts
      // here.
      Object outpos1(&scope, runtime->newInt(index));
      while (byte_offset < data.length() &&
             Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
        byte_offset += num_bytes;
        index++;
      }
      Object outpos2(&scope, runtime->newInt(index + 1));
      return runtime->newTupleWith2(outpos1, outpos2);
    }
  }
  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
  Object index_obj(&scope, runtime->newInt(index));
  return runtime->newTupleWith2(output_bytes, index_obj);
}
// Appends `codepoint` to `writer` as four bytes in the requested byte order.
static void appendUtf32ToBytearray(Thread* thread, Runtime* runtime,
                                   const Bytearray& writer, int32_t codepoint,
                                   endian endianness) {
  const bool little = endianness == endian::little;
  for (int k = 0; k < 4; k++) {
    // Little endian emits byte 0 (least significant) first, big endian byte 3.
    const int byte_number = little ? k : 3 - k;
    bytearrayAdd(thread, runtime, writer,
                 codepoint >> (kBitsPerByte * byte_number));
  }
}
// Encodes the str args[0] as UTF-32 into the bytearray args[3], starting at
// code point index args[2] and using the error handler named by args[1].
// args[4] selects the byte order: values <= 0 produce little endian, positive
// values big endian.
// Returns (encoded bytes, index past the last code point) on success, or
// (error start index, error end index) covering a run of surrogates that the
// handler does not deal with. Raises OverflowError if args[4] does not fit in
// a C int.
RawObject FUNC(_codecs, _utf_32_encode)(Thread* thread, Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object output_obj(&scope, args.get(3));
  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
         "Fourth arg to _utf_32_encode must be bytearray");
  Str data(&scope, strUnderlying(args.get(0)));
  Str errors(&scope, strUnderlying(args.get(1)));
  word char_index = intUnderlying(args.get(2)).asWord();
  Bytearray output(&scope, *output_obj);
  OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
  if (byteorder.error != CastError::None) {
    return thread->raiseWithFmt(LayoutId::kOverflowError,
                                "Python int too large to convert to C int");
  }
  const SymbolId handler = lookupSymbolForErrorHandler(errors);
  const endian order = byteorder.value <= 0 ? endian::little : endian::big;
  const word num_data_bytes = data.length();
  word offset = thread->strOffset(data, char_index);
  while (offset < num_data_bytes) {
    word char_size;
    const int32_t codepoint = data.codePointAt(offset, &char_size);
    offset += char_size;

    bool handled = true;
    if (!Unicode::isSurrogate(codepoint)) {
      appendUtf32ToBytearray(thread, runtime, output, codepoint, order);
    } else if (handler == ID(ignore)) {
      // Silently drop the surrogate.
    } else if (handler == ID(replace)) {
      appendUtf32ToBytearray(thread, runtime, output, kASCIIReplacement, order);
    } else if (handler == ID(surrogateescape) &&
               isEscapedLatin1Surrogate(codepoint)) {
      appendUtf32ToBytearray(thread, runtime, output,
                             codepoint - Unicode::kLowSurrogateStart, order);
    } else {
      handled = false;
    }

    if (!handled) {
      Object error_start(&scope, runtime->newInt(char_index));
      // Extend the reported range over all following surrogates.
      while (offset < num_data_bytes &&
             Unicode::isSurrogate(data.codePointAt(offset, &char_size))) {
        offset += char_size;
        char_index++;
      }
      Object error_end(&scope, runtime->newInt(char_index + 1));
      return runtime->newTupleWith2(error_start, error_end);
    }
    char_index++;
  }
  Object encoded(&scope, bytearrayAsBytes(thread, output));
  Object end_index(&scope, runtime->newInt(char_index));
  return runtime->newTupleWith2(encoded, end_index);
}
// Appends every byte of the Str args[1] to the Bytearray args[0], one byte at
// a time. Returns None.
RawObject FUNC(_codecs, _bytearray_string_append)(Thread* thread,
                                                  Arguments args) {
  HandleScope scope(thread);
  Bytearray dst(&scope, args.get(0));
  Str data(&scope, args.get(1));
  word pos = 0;
  while (pos < data.length()) {
    bytearrayAdd(thread, thread->runtime(), dst, data.byteAt(pos));
    ++pos;
  }
  return NoneType::object();
}
// Encodes the str args[0] with the raw_unicode_escape codec: code points up to
// kMaxByte are copied as single bytes, code points up to kMaxUint16 become
// \uHHHH and everything above becomes \U00HHHHHH (lower-case hex digits).
// Returns (encoded bytes, number of code points in the input).
RawObject FUNC(_codecs, _raw_unicode_escape_encode)(Thread* thread,
                                                    Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Str data(&scope, strUnderlying(args.get(0)));
  const word num_codepoints = data.codePointLength();
  Bytearray dst(&scope, runtime->newBytearray());
  const word num_data_bytes = data.length();
  // 2 byte codepoints can be expanded to 4 bytes + 2 escape characters
  // 4 byte codepoints will be expanded to 8 bytes + 2 escape characters
  // To be safe we double the bytecount and add space for 2 escape characters
  // per codepoint.
  runtime->bytearrayEnsureCapacity(thread, dst,
                                   num_data_bytes * 2 + num_codepoints * 2);
  word offset = thread->strOffset(data, 0);
  while (offset < data.length()) {
    word char_size;
    const int32_t codepoint = data.codePointAt(offset, &char_size);
    offset += char_size;
    if (codepoint <= kMaxByte) {
      // U+0000-U+00ff range: Copy 8-bit characters as-is
      bytearrayAdd(thread, runtime, dst, codepoint);
      continue;
    }
    // Shift of the most significant hex digit to print.
    int top_shift;
    if (codepoint <= kMaxUint16) {
      // U+0100-U+ffff range: Map 16-bit characters to '\uHHHH'
      bytearrayAdd(thread, runtime, dst, '\\');
      bytearrayAdd(thread, runtime, dst, 'u');
      top_shift = 12;
    } else {
      // U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH'
      CHECK(codepoint <= kMaxUnicode, "expected a valid unicode code point");
      bytearrayAdd(thread, runtime, dst, '\\');
      bytearrayAdd(thread, runtime, dst, 'U');
      bytearrayAdd(thread, runtime, dst, '0');
      bytearrayAdd(thread, runtime, dst, '0');
      top_shift = 20;
    }
    for (int shift = top_shift; shift >= 0; shift -= 4) {
      bytearrayAdd(thread, runtime, dst,
                   lowerCaseHexDigit((codepoint >> shift) & 0xf));
    }
  }
  Object encoded(&scope, bytearrayAsBytes(thread, dst));
  Object size_obj(&scope, runtime->newInt(num_codepoints));
  return runtime->newTupleWith2(encoded, size_obj);
}
// Decodes bytes-like data with the raw-unicode-escape codec.
// args: (0) the bytes-like input, (1) the error handler name (str or str
//       subclass), (2) the byte index at which decoding starts, (3) the
//       StrArray that receives the decoded text.
// Only `\uXXXX` and `\UXXXXXXXX` are treated as escapes; any other byte is
// interpreted as the code point with the same ordinal.
// Returns a 3-tuple:
//   - on success: (decoded str, input length, "")
//   - on a malformed escape when the handler is neither "replace" nor
//     "ignore": (index of the backslash, current input index, error message)
RawObject FUNC(_codecs, _raw_unicode_escape_decode)(Thread* thread,
                                                    Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object data(&scope, args.get(0));
  Str errors(&scope, strUnderlying(args.get(1)));
  word index = intUnderlying(args.get(2)).asWord();
  StrArray dst(&scope, args.get(3));
  Byteslike bytes(&scope, thread, *data);
  word length = bytes.length();
  runtime->strArrayEnsureCapacity(thread, dst, length);

  // Appends one code point to `dst`, taking the ASCII fast path if possible.
  auto append_code_point = [&](int32_t code_point) {
    if (code_point <= kMaxASCII) {
      runtime->strArrayAddASCII(thread, dst, code_point);
      return;
    }
    Str encoded(&scope, SmallStr::fromCodePoint(code_point));
    runtime->strArrayAddStr(thread, dst, encoded);
  };

  word pos = index;
  while (pos < length) {
    const word escape_start = pos;
    byte ch = bytes.byteAt(pos++);

    // Anything that is not a backslash is copied through verbatim.
    if (ch != '\\') {
      append_code_point(ch);
      continue;
    }

    // A lone backslash at the very end of the input is kept as-is.
    if (pos >= length) {
      runtime->strArrayAddASCII(thread, dst, '\\');
      continue;
    }

    ch = bytes.byteAt(pos++);
    const char* message = nullptr;
    int32_t decoded;
    if (ch == 'u') {
      // \uXXXX
      decoded = decodeHexEscaped(bytes, &pos, 4);
      if (decoded < 0) {
        message = (decoded == -1 ? "truncated \\uXXXX escape"
                                 : "illegal Unicode character");
      }
    } else if (ch == 'U') {
      // \UXXXXXXXX
      decoded = decodeHexEscaped(bytes, &pos, 8);
      if (decoded < 0) {
        if (decoded == -1) {
          message = "truncated \\UXXXXXXXX escape";
        } else if (decoded == -2) {
          message = "\\Uxxxxxxxx out of range";
        } else {
          message = "illegal Unicode character";
        }
      }
    } else {
      // Not an escape this codec knows about: keep the backslash and the
      // byte that follows it.
      runtime->strArrayAddASCII(thread, dst, '\\');
      decoded = ch;
    }

    if (decoded >= 0) {
      append_code_point(decoded);
      continue;
    }
    if (message == nullptr) {
      continue;
    }

    // Malformed escape: dispatch on the requested error handler.
    SymbolId error_id = lookupSymbolForErrorHandler(errors);
    if (error_id == ID(replace)) {
      Str replacement(&scope, SmallStr::fromCodePoint(0xFFFD));
      runtime->strArrayAddStr(thread, dst, replacement);
    } else if (error_id == ID(ignore)) {
      // Drop the malformed escape silently.
    } else {
      // Report the failure to the caller instead of handling it here.
      Object start_pos_obj(&scope, runtime->newInt(escape_start));
      Object outpos_obj(&scope, runtime->newInt(pos));
      Object message_obj(&scope, runtime->newStrFromCStr(message));
      return runtime->newTupleWith3(start_pos_obj, outpos_obj, message_obj);
    }
  }

  Object dst_obj(&scope, runtime->strFromStrArray(dst));
  Object length_obj(&scope, runtime->newInt(length));
  Object message_obj(&scope, runtime->newStrFromCStr(""));
  return runtime->newTupleWith3(dst_obj, length_obj, message_obj);
}
// Implements the "backslashreplace" codec error handler.
//
// args: (0) the exception instance describing the failure.
//   - UnicodeDecodeError: every byte in [start, end) of the exception's bytes
//     `object` is replaced with `\xhh`.
//   - UnicodeEncodeError / UnicodeTranslateError: every code point in
//     [start, end) of the exception's str `object` is replaced with `\xhh`,
//     `\uhhhh` or `\Uhhhhhhhh`, depending on its magnitude.
// Returns a tuple (replacement str, end) where `end` is the exception's
// (adjusted) end attribute.
// Raises TypeError if `object` has the wrong type or the exception is not one
// of the supported kinds, and ValueError if `end` lies before `start`.
RawObject FUNC(_codecs, backslashreplace_errors)(Thread* thread,
                                                 Arguments args) {
  HandleScope scope(thread);
  Runtime* runtime = thread->runtime();
  Object error(&scope, args.get(0));
  Object object(&scope, NoneType::object());
  word start;
  word end;

  if (runtime->isInstanceOfUnicodeDecodeError(*error)) {
    UnicodeErrorBase unicode_error(&scope, *error);
    start = SmallInt::cast(unicode_error.start()).value();
    end = SmallInt::cast(unicode_error.end()).value();
    object = unicode_error.object();
    if (!runtime->isInstanceOfBytes(*object)) {
      return thread->raiseWithFmt(LayoutId::kTypeError,
                                  "object attribute must be bytes");
    }
    Bytes bytes(&scope, bytesUnderlying(*object));
    word length = bytes.length();
    // Clamp start into [0, length - 1] and end into [1, length].
    if (start >= length) start = length - 1;
    if (start < 0) start = 0;
    if (end >= length) end = length;
    if (end < 1) end = 1;
    if (end < start) {
      return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
    }
    // For an empty object the clamping above still yields end == 1. Bound the
    // range that is actually escaped by the object's length so that no byte
    // past the end of `bytes` is ever read; the reported `end` is unchanged.
    word stop = end < length ? end : length;
    // Each byte expands to the four characters `\xhh`.
    word result_size = (stop - start) * 4;
    MutableBytes result(&scope,
                        runtime->newMutableBytesUninitialized(result_size));
    word pos = 0;
    for (word i = start; i < stop; i++) {
      byte b = bytes.byteAt(i);
      result.byteAtPut(pos++, '\\');
      result.byteAtPut(pos++, 'x');
      uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, b);
      pos += 2;
    }
    DCHECK(pos == result.length(), "size mismatch");
    Object result_str(&scope, result.becomeStr());
    Object end_obj(&scope, SmallInt::fromWord(end));
    return runtime->newTupleWith2(result_str, end_obj);
  }

  if (runtime->isInstanceOfUnicodeEncodeError(*error) ||
      runtime->isInstanceOfUnicodeTranslateError(*error)) {
    UnicodeErrorBase unicode_error(&scope, *error);
    start = SmallInt::cast(unicode_error.start()).value();
    end = SmallInt::cast(unicode_error.end()).value();
    object = unicode_error.object();
    if (!runtime->isInstanceOfStr(*object)) {
      return thread->raiseWithFmt(LayoutId::kTypeError,
                                  "object attribute must be unicode");
    }
    Str str(&scope, strUnderlying(*object));
    // NOTE(review): unlike the decode branch, start/end are not clamped
    // against the length of the string here; this presumably relies on
    // offsetByCodePoints() stopping at the end of the string -- confirm.
    if (start < 0) start = 0;
    if (end < 1) end = 1;
    if (end < start) {
      return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
    }
    // Translate the code point range [start, end) into a byte range.
    word start_byte = str.offsetByCodePoints(0, start);
    word end_byte = str.offsetByCodePoints(start_byte, end - start);

    // First pass: compute the exact size of the replacement text.
    word result_size = 0;
    for (word i = start_byte; i < end_byte;) {
      word num_bytes;
      int32_t cp = str.codePointAt(i, &num_bytes);
      i += num_bytes;
      if (cp > kMaxUint16) {
        result_size += 10;  // Will replace with `\Uxxxxxxxx`
      } else if (cp > kMaxByte) {
        result_size += 6;  // Will replace with `\uxxxx`
      } else {
        result_size += 4;  // Will replace with `\xyy`
      }
    }

    // Second pass: write the escapes.
    MutableBytes result(&scope,
                        runtime->newMutableBytesUninitialized(result_size));
    word pos = 0;
    for (word i = start_byte; i < end_byte;) {
      word num_bytes;
      int32_t cp = str.codePointAt(i, &num_bytes);
      i += num_bytes;
      result.byteAtPut(pos++, '\\');
      if (cp > kMaxUint16) {
        result.byteAtPut(pos++, 'U');
        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/8, cp);
        pos += 8;
      } else if (cp > kMaxByte) {
        result.byteAtPut(pos++, 'u');
        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/4, cp);
        pos += 4;
      } else {
        result.byteAtPut(pos++, 'x');
        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, cp);
        pos += 2;
      }
    }
    DCHECK(pos == result.length(), "size mismatch");
    Object result_str(&scope, result.becomeStr());
    Object end_obj(&scope, SmallInt::fromWord(end));
    return runtime->newTupleWith2(result_str, end_obj);
  }

  return thread->raiseWithFmt(LayoutId::kTypeError,
                              "don't know how to handle %T in error callback",
                              &error);
}
} // namespace py