in hessian2/basic_codec/string_codec.cc [86:119]
// Walks `in` one UTF-8 character at a time and returns
// {number of characters, number of raw bytes covered by those characters}.
//
// The byte width of each character is looked up in UTF_8_CHAR_LENGTHS using
// the top five bits of its first byte. If that lookup yields 0 the scan stops
// immediately and {-1, 0} is returned.
//
// The width of every character is added to the byte count without checking
// that all of its bytes are actually present in `in`, so the returned byte
// count can be larger than in.size() when the input ends mid-character.
//
// `has_surrogate` is set to true when any first byte equals 0xED. It is never
// set to false here, so the caller decides its initial value.
std::pair<int64_t, size_t> getUtf8StringLength(absl::string_view in,
                                               bool &has_surrogate) {
  int64_t char_count = 0;
  size_t byte_offset = 0;

  while (byte_offset < in.size()) {
    const auto lead_byte = static_cast<uint8_t>(in[byte_offset]);

    // Cheap, deliberately coarse surrogate detection. 0xED is a 3-byte lead
    // byte (0b1110 prefix) whose low nibble 0b1101 is where surrogate code
    // points begin. Telling a real surrogate apart needs bits from the next
    // byte as well, and that byte may not be available here, so only the
    // first byte is inspected; the precise scan of the whole string is left
    // for after the string has been read from the reader.
    if (lead_byte == 0xED) {
      has_surrogate = true;
    }

    const uint8_t width = UTF_8_CHAR_LENGTHS[lead_byte >> 3];
    if (width == 0) {
      // The table has no length for this first byte: report failure.
      return {-1, 0};
    }

    byte_offset += width;
    ++char_count;
  }

  return {char_count, byte_offset};
}