in runtime/under-codecs-module.cpp [658:758]
// Classifies the UTF-8 sequence that begins at `bytes[index]`.
//
// Returns k1Byte..k4Byte when a complete, well-formed sequence of that size
// starts at `index`. Otherwise returns:
//   - kInvalidStart           the lead byte can never start a sequence,
//   - kInvalidContinuationN   the N-th byte after the lead byte is not
//                             acceptable for this lead byte,
//   - kUnexpectedEndOfData    the buffer ends before the sequence is complete
//                             and every byte seen so far was acceptable.
//
// `index` is read with `bytes.byteAt(index)` without a bounds check here.
static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
                                              word index) {
  const word size = bytes.length();
  const byte lead = bytes.byteAt(index);

  if (lead <= kMaxASCII) {
    return k1Byte;
  }
  if (lead < 0xC2 || lead >= 0xF5) {
    // Lead bytes that can never begin a sequence:
    //   \x80-\xBF -- continuation byte
    //   \xC0-\xC1 -- fake 0000-007F (overlong)
    //   \xF5-\xFF -- beyond 10FFFF
    return kInvalidStart;
  }

  // Every remaining lead byte needs at least one continuation byte.
  if (index + 1 >= size) {
    return kUnexpectedEndOfData;
  }
  const byte second = bytes.byteAt(index + 1);
  if (!UTF8::isTrailByte(second)) {
    return kInvalidContinuation1;
  }
  if (lead < 0xE0) {
    // \xC2\x80-\xDF\xBF -- 0080-07FF
    return k2Byte;
  }

  // Some lead bytes only accept part of the continuation range in the second
  // position. This is checked before looking for further bytes, so a
  // truncated sequence with a bad second byte is reported as invalid rather
  // than as an unexpected end of data.
  bool second_out_of_range = false;
  switch (lead) {
    case 0xE0:
      // \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 (overlong)
      second_out_of_range = second < 0xA0;
      break;
    case 0xED:
      // \xED\xA0\x80-\xED\xBF\xBF would decode to the surrogates D800-DFFF,
      // which are not valid UTF-8.
      // See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
      // (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
      second_out_of_range = second >= 0xA0;
      break;
    case 0xF0:
      // \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF (overlong)
      second_out_of_range = second < 0x90;
      break;
    case 0xF4:
      // \xF4\x90\x80\x80- -- 110000- overflow
      second_out_of_range = second >= 0x90;
      break;
    default:
      break;
  }
  if (second_out_of_range) {
    return kInvalidContinuation1;
  }

  if (index + 2 >= size) {
    return kUnexpectedEndOfData;
  }
  if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
    return kInvalidContinuation2;
  }
  if (lead < 0xF0) {
    // \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF
    return k3Byte;
  }

  if (index + 3 >= size) {
    return kUnexpectedEndOfData;
  }
  if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) {
    return kInvalidContinuation3;
  }
  // \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF
  return k4Byte;
}