in hessian2/basic_codec/string_codec.cc [192:252]
// Rewrites every 4-byte UTF-8 character of 'in' as two 3-byte sequences, one
// for each half of the character's UTF-16 surrogate pair, and copies all other
// bytes through unchanged.
//
// 'in' is the source UTF-8 text. 'four_bytes_char_offsets' holds the byte
// offset in 'in' of the first byte of each 4-byte character; the offsets must
// be ascending and must not overlap.
//
// Returns the escaped string. An empty string is returned when an offset is
// out of order, when fewer than four bytes remain at an offset, or when the
// decoded code point is outside U+10000-U+10FFFF (note that an empty 'in'
// also yields an empty string).
std::string escapeFourBytesUtf8Char(
    absl::string_view in, const Uint64Vector &four_bytes_char_offsets) {
  std::string out;
  // Each 4-byte character is replaced by 2 * 3 = 6 bytes, i.e. 2 extra bytes.
  out.reserve(in.size() + four_bytes_char_offsets.size() * 2);
  size_t last_pos = 0;
  for (const size_t pos : four_bytes_char_offsets) {
    // Reject offsets that go backwards (which would make 'pos - last_pos'
    // wrap around) or that do not leave four readable bytes in 'in'.
    if (pos < last_pos || in.size() < 4 || pos > in.size() - 4) {
      return "";
    }
    // Copy the untouched bytes between the previous 4-byte character and
    // this one.
    const absl::string_view sub_segment = in.substr(last_pos, pos - last_pos);
    out.append(sub_segment.data(), sub_segment.size());
    // Get the code point of the 4-byte character: 3 payload bits from the
    // leading byte and 6 payload bits from each of the following bytes.
    uint32_t code_point = (static_cast<uint32_t>(in[pos] & 0x07) << 18) |
                          (static_cast<uint32_t>(in[pos + 1] & 0x3F) << 12) |
                          (static_cast<uint32_t>(in[pos + 2] & 0x3F) << 6) |
                          (static_cast<uint32_t>(in[pos + 3] & 0x3F));
    // Check the range of the code point of the 4-byte character.
    if (code_point < 0x10000 || code_point > 0x10FFFF) {
      return "";
    }
    // Convert the code point to a UTF-16 surrogate pair.
    code_point -= 0x10000;
    // The value range of 'surrogate_pair' is 0xD800-0xDFFF. It is reserved by
    // the Unicode standard for UTF-16 surrogate pairs, so it is safe to use it
    // as a flag.
    const uint16_t surrogate_pair[2] = {
        // 6 bits of prefix and 10 bits of payload (0b1101 10xx xxxx xxxx).
        // The value range is 0xD800-0xDBFF.
        static_cast<uint16_t>(0xD800 + (code_point >> 10)),
        // 6 bits of prefix and 10 bits of payload (0b1101 11xx xxxx xxxx).
        // The value range is 0xDC00-0xDFFF.
        static_cast<uint16_t>(0xDC00 + (code_point & 0x3FF))};
    // Convert the high and low surrogates to UTF-8.
    // The Java hessian2 library encodes one surrogate pair
    // (U+10000-U+10FFFF) as two UTF-8 characters. This is wrong, because one
    // surrogate pair (U+10000-U+10FFFF) should be encoded as a single 4-byte
    // UTF-8 character. However, we still need to be compatible with the
    // Java hessian2 library, so we do the same thing even though it is
    // wrong. Ref:
    // https://github.com/apache/dubbo-hessian-lite/blob/ca001b4658227d5122f85bcb45032a0dac4faf0d/src/main/java/com/alibaba/com/caucho/hessian/io/Hessian2Output.java#L1360
    for (const auto utf16_char : surrogate_pair) {
      // There is no need to check the range of 'utf16_char': it is always
      // in 0xD800-0xDFFF, which is above 0x800 and below 0xFFFF, so it always
      // takes the 3-byte UTF-8 form. Because that range is reserved for
      // surrogates, the resulting sequences are not valid UTF-8 and would
      // not appear in a correct UTF-8 string.
      out.push_back(static_cast<char>(0xE0 | (utf16_char >> 12)));
      out.push_back(static_cast<char>(0x80 | ((utf16_char >> 6) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | (utf16_char & 0x3F)));
    }
    last_pos = pos + 4;
  }
  // Copy whatever follows the last 4-byte character.
  const absl::string_view last_segment = in.substr(last_pos);
  out.append(last_segment.data(), last_segment.size());
  return out;
}