in runtime/under-json-module.cpp [139:193]
static RawObject maybeDecode(Thread* thread, const Object& s,
const Bytes& bytes, word length, word* next) {
  // Cannot guess the encoding from just 0 or 1 bytes; assume UTF-8.
if (length < 2) return *bytes;
  // Search for BOM sequences. If there is none, look for zero bytes, which
  // strongly suggest UTF-16/UTF-32: legal JSON must start with an ASCII
  // character, whose high byte(s) are zero in those encodings. The code
  // looks at the first 2 bytes to detect UTF-16 and the first 4 bytes to
  // detect UTF-32.
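  // Absent a BOM, a document starting with an ASCII character XX begins
  // with one of the following byte patterns:
  //   00 00 00 XX  - utf-32-be
  //   00 XX -- --  - utf-16-be
  //   XX 00 00 00  - utf-32-le
  //   XX 00 -- --  - utf-16-le (byte 2 or byte 3 is non-zero)
  //   XX -- -- --  - utf-8 (the default)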
const char* encoding;
byte b0 = bytes.byteAt(0);
byte b1 = bytes.byteAt(1);
if (b0 == UTF8::kBOM[0] && b1 == UTF8::kBOM[1] && length >= 3 &&
bytes.byteAt(2) == UTF8::kBOM[2]) {
*next += 3;
return *bytes;
}
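  // The "utf-16" and "utf-32" codecs selected below detect and consume the
  // BOM themselves, so only the UTF-8 BOM has to be skipped by hand via
  // `next`.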
if (b0 == UTF32::kBOMLittleEndian[0] && b1 == UTF32::kBOMLittleEndian[1] &&
length >= 4 && bytes.byteAt(2) == UTF32::kBOMLittleEndian[2] &&
bytes.byteAt(3) == UTF32::kBOMLittleEndian[3]) {
encoding = "utf-32";
} else if (b0 == UTF32::kBOMBigEndian[0] && b1 == UTF32::kBOMBigEndian[1] &&
length >= 4 && bytes.byteAt(2) == UTF32::kBOMBigEndian[2] &&
bytes.byteAt(3) == UTF32::kBOMBigEndian[3]) {
encoding = "utf-32";
} else if (b0 == UTF16::kBOMLittleEndian[0] &&
b1 == UTF16::kBOMLittleEndian[1]) {
encoding = "utf-16";
} else if (b0 == UTF16::kBOMBigEndian[0] && b1 == UTF16::kBOMBigEndian[1]) {
encoding = "utf-16";
} else if (b0 == 0) {
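    // With no BOM, a leading zero byte can only be the high byte of a
    // big-endian code unit. A second zero byte indicates 32-bit code units,
    // but only when at least one full UTF-32 unit (4 bytes) is present.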
if (b1 == 0 && length >= 4) {
encoding = "utf-32-be";
} else {
encoding = "utf-16-be";
}
} else if (b1 == 0) {
DCHECK(b0 != 0, "Expected b0 != 0");
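    // b0 is the low byte of a little-endian code unit. Zeros at positions 2
    // and 3 mean the first code unit is 32 bits wide; any non-zero byte
    // there implies 16-bit units.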
if (length >= 4 && bytes.byteAt(2) == 0 && bytes.byteAt(3) == 0) {
encoding = "utf-32-le";
} else {
encoding = "utf-16-le";
}
} else {
    // Default to UTF-8, which the parser consumes directly without a
    // decode step.
return *bytes;
}
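  // Decode through _codecs.decode. The "surrogatepass" error handler lets
  // lone surrogates through instead of raising, presumably leaving any
  // error reporting to the JSON parser itself.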
HandleScope scope(thread);
Object encoding_str(&scope, Runtime::internStrFromCStr(thread, encoding));
Object errors(&scope, Runtime::internStrFromCStr(thread, "surrogatepass"));
return thread->invokeFunction3(ID(_codecs), ID(decode), s, encoding_str,
errors);
}
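
// A minimal sketch of a hypothetical call site (the local names below are
// assumed for illustration, not taken from the surrounding module): the
// caller extracts the raw bytes, lets maybeDecode() pick a codec, and then
// parses whichever object comes back, starting at offset `next`.
//
//   word next = 0;
//   Object decoded(&scope, maybeDecode(thread, s, bytes, length, &next));
//   if (decoded.isErrorException()) return *decoded;
//   // `decoded` is either the original bytes (UTF-8; `next` skips a BOM if
//   // one was present) or a str produced by _codecs.decode.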