in hessian2/basic_codec/string_codec.cc [127:186]
// Returns a copy of `in` in which every surrogate pair that is spelled as two
// consecutive 3-byte UTF-8 sequences (0xED 0xA? .. followed by 0xED 0xB? ..)
// is replaced by the single 4-byte UTF-8 sequence of the code point the pair
// represents. Every other sequence is copied through byte for byte.
//
// An empty string is returned when a lead byte maps to length 0 in
// UTF_8_CHAR_LENGTHS or when a sequence would run past the end of `in`.
std::string unescapeFourBytesUtf8Char(absl::string_view in) {
  const size_t total = in.size();
  std::string result;
  result.reserve(total);

  // Reads one input byte as an unsigned value so that later masks and
  // comparisons are not affected by the signedness of `char`.
  const auto byte_at = [&in](size_t pos) -> uint32_t {
    return static_cast<uint8_t>(in[pos]);
  };

  // Decodes the 16-bit payload of the 3-byte sequence starting at `pos`:
  // 4 bits from the first byte, 6 bits from each of the following two.
  const auto decode_three = [&byte_at](size_t pos) -> uint32_t {
    return ((byte_at(pos) & 0x0F) << 12) | ((byte_at(pos + 1) & 0x3F) << 6) |
           (byte_at(pos + 2) & 0x3F);
  };

  size_t pos = 0;
  while (pos < total) {
    // The top five bits of the lead byte select the sequence length.
    const uint8_t seq_len = UTF_8_CHAR_LENGTHS[byte_at(pos) >> 3];

    // A surrogate carries the 6-bit prefix 0b110110 (high) or 0b110111 (low):
    // four of those bits sit in the first byte (0xED) and two in the upper
    // nibble of the second byte (0xA? or 0xB?). The bounds test comes first so
    // that none of the six bytes is read out of range.
    const bool is_surrogate_pair =
        seq_len == 3 && pos + 5 < total && byte_at(pos) == 0xED &&
        (byte_at(pos + 1) & 0xF0) == 0xA0 && byte_at(pos + 3) == 0xED &&
        (byte_at(pos + 4) & 0xF0) == 0xB0;

    if (is_surrogate_pair) {
      const uint32_t high = decode_three(pos);
      const uint32_t low = decode_three(pos + 3);
      // Each surrogate contributes its low 10 bits; the pair is offset by
      // 0x10000 to land in the supplementary planes.
      const uint32_t scalar = (((high & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;

      // Re-encode the code point as a 4-byte UTF-8 sequence.
      const char encoded[4] = {
          static_cast<char>(0xF0 | (scalar >> 18)),
          static_cast<char>(0x80 | ((scalar >> 12) & 0x3F)),
          static_cast<char>(0x80 | ((scalar >> 6) & 0x3F)),
          static_cast<char>(0x80 | (scalar & 0x3F)),
      };
      result.append(encoded, sizeof(encoded));
      pos += 6;
      continue;
    }

    // Not expected in practice: the input is supposed to have been validated
    // as UTF-8 before reaching this function.
    if (seq_len == 0 || pos + seq_len > total) {
      return "";
    }

    // Any other sequence is forwarded unchanged.
    result.append(in.data() + pos, seq_len);
    pos += seq_len;
  }

  return result;
}