in runtime/bytes-builtins.cpp [358:457]
static bool bytesIsValidUTF8Impl(RawBytes bytes, bool allow_surrogates) {
for (word i = 0, length = bytes.length(); i < length;) {
byte b0 = bytes.byteAt(i++);
// ASCII bytes have the topmost bit zero.
static_assert(kMaxASCII == 0x7F, "unexpected kMaxASCII value");
if (b0 <= 0x7F) continue;
// Bytes past this point have the high bit set (0b1xxxxxxx).
// 0b110xxxxx begins a sequence with one continuation byte.
// `b0 < 0b11100000` overestimates and we filter in a 2nd comparison.
if (b0 < 0xE0) {
// b0 < 0xC0 catches 0b10xxxxxx bytes (invalid continuation bytes).
// 0xC0 + 0xC1 (0b11000000 + 0b110000001) would result in range(0x7F)
// which should have been encoded as ASCII.
if (b0 < 0xC2) {
return false;
}
if (i >= length) {
return false;
}
byte b1 = bytes.byteAt(i++);
if (!UTF8::isTrailByte(b1)) {
return false;
}
if (DCHECK_IS_ON()) {
uword decoded =
static_cast<uword>(b0 & 0x1F) << 6 | static_cast<uword>(b1 & 0x3F);
DCHECK(0x80 <= decoded && decoded <= 0x7FF, "unexpected value");
}
// 0b1110xxxx starts a sequence with two continuation bytes.
} else if (b0 < 0xF0) {
if (i + 1 >= length) {
return false;
}
byte b1 = bytes.byteAt(i++);
byte b2 = bytes.byteAt(i++);
if (!UTF8::isTrailByte(b1) || !UTF8::isTrailByte(b2)) {
return false;
}
// Catch sequences that should have been encoded in 1-2 bytes instead.
if (b0 == 0xE0) {
if (b1 < 0xA0) {
return false;
}
} else if (!allow_surrogates && b0 == 0xED && b1 >= 0xA0) {
// 0b11011xxxxxxxxxxx (0xD800 - 0xDFFF) is declared invalid by unicode
// as they look like utf-16 surrogates making it easier to detect
// mix-ups.
return false;
}
if (DCHECK_IS_ON()) {
uword decoded = static_cast<uword>(b0 & 0x0F) << 12 |
static_cast<uword>(b1 & 0x3F) << 6 |
static_cast<uword>(b2 & 0x3F);
DCHECK(0x0800 <= decoded && decoded <= 0xFFFF, "unexpected value");
}
static_assert(kMaxUnicode == 0x10FFFF, "unexpected maxunicode value");
// 0b11110xxx starts a sequence with three continuation bytes.
// However values bigger than 0x10FFFF are not valid unicode, so we test
// b0 < 0b11110101 to overestimate that.
} else if (b0 < 0xF5) {
if (i + 2 >= length) {
return false;
}
byte b1 = bytes.byteAt(i++);
byte b2 = bytes.byteAt(i++);
byte b3 = bytes.byteAt(i++);
if (!UTF8::isTrailByte(b1) || !UTF8::isTrailByte(b2) ||
!UTF8::isTrailByte(b3)) {
return false;
}
// Catch sequences that should have been encoded with 1-3 bytes instead.
if (b0 == 0xF0) {
if (b1 < 0x90) {
return false;
}
} else if (b0 == 0xF4 && b1 >= 0x90) {
// Bigger than kMaxUnicode.
return false;
}
if (DCHECK_IS_ON()) {
uword decoded = static_cast<uword>(b0 & 0x07) << 16 |
static_cast<uword>(b1 & 0x3F) << 12 |
static_cast<uword>(b2 & 0x3F) << 6 |
static_cast<uword>(b3 & 0x3F);
DCHECK(0x10000 <= decoded && decoded <= kMaxUnicode,
"unexpected value");
}
} else {
// Invalid prefix byte.
return false;
}
}
return true;
}