std::string escapeFourBytesUtf8Char()

in hessian2/basic_codec/string_codec.cc [192:252]


// Re-encodes every 4-byte UTF-8 sequence of 'in' as two 3-byte sequences (one
// per UTF-16 surrogate) and copies all other bytes through unchanged.
//
// 'four_bytes_char_offsets' holds the byte offset, within 'in', of the first
// byte of each 4-byte sequence. The offsets must be in ascending order, must
// not overlap, and each one must leave at least four bytes available in 'in'.
//
// Returns the escaped string. An empty string is returned when an offset is
// out of order or out of range, or when a decoded code point falls outside
// U+10000-U+10FFFF.
std::string escapeFourBytesUtf8Char(
    absl::string_view in, const Uint64Vector &four_bytes_char_offsets) {
  // Number of bytes occupied by one 4-byte UTF-8 sequence.
  constexpr size_t kFourBytesCharSize = 4;

  std::string out;
  // Every escaped character grows from 4 bytes to 2 * 3 = 6 bytes.
  out.reserve(in.size() + four_bytes_char_offsets.size() * 2);

  size_t last_pos = 0;
  for (const size_t pos : four_bytes_char_offsets) {
    // Reject offsets that go backwards / overlap the previous sequence or that
    // do not leave four readable bytes. Without this check 'pos - last_pos'
    // would underflow and 'in[pos + 3]' would read past the end of 'in'.
    // The comparison is written as a subtraction so that 'pos + 3' can never
    // overflow.
    if (pos < last_pos || pos > in.size() ||
        in.size() - pos < kFourBytesCharSize) {
      return "";
    }

    // Copy the untouched bytes between the previous sequence and this one.
    const absl::string_view sub_segment = in.substr(last_pos, pos - last_pos);
    out.append(sub_segment.data(), sub_segment.size());

    // Get code point of 4-byte character: 3 payload bits from the leading
    // byte and 6 payload bits from each of the three following bytes.
    uint32_t code_point = (static_cast<uint32_t>(in[pos] & 0x07) << 18) |
                          (static_cast<uint32_t>(in[pos + 1] & 0x3F) << 12) |
                          (static_cast<uint32_t>(in[pos + 2] & 0x3F) << 6) |
                          (static_cast<uint32_t>(in[pos + 3] & 0x3F));

    // Check the range of code point of 4-byte character.
    if (code_point < 0x10000 || code_point > 0x10FFFF) {
      return "";
    }

    // Convert the code point to UTF-16 surrogate pair.
    code_point -= 0x10000;
    // The value range of 'surrogate_pair' is 0xD800-0xDFFF, it is reserved by
    // Unicode standard for UTF-16 surrogate pair, so it is safe to use it as a
    // flag.
    const uint16_t surrogate_pair[2] = {
        // 6bit as the prefix and 10bit as the suffix (0b1101 10xx xxxx xxxx).
        // The value range is 0xD800-0xDBFF.
        static_cast<uint16_t>(0xD800 + (code_point >> 10)),
        // 6bit as the prefix and 10bit as the suffix (0b1101 11xx xxxx xxxx).
        // The value range is 0xDC00-0xDFFF.
        static_cast<uint16_t>(0xDC00 + (code_point & 0x3FF))};

    // Convert high and low surrogate to UTF-8.
    // The Java hessian2 library will encode one surrogate pair
    // (U+10000-U+10FFFF) to two UTF-8 characters. This is wrong, because one
    // surrogate pair (U+10000-U+10FFFF) should be encoded to one 4 bytes
    // UTF-8 character. However, we still need to be compatible with the
    // Java hessian2 library, so we need to do the same thing even if it is
    // wrong. Ref:
    // https://github.com/apache/dubbo-hessian-lite/blob/ca001b4658227d5122f85bcb45032a0dac4faf0d/src/main/java/com/alibaba/com/caucho/hessian/io/Hessian2Output.java#L1360
    for (const auto utf16_char : surrogate_pair) {
      // No need to check the range of 'utf16_char': it is always in
      // 0xD800-0xDFFF, i.e. larger than 0x800 and less than 0xFFFF, so it is
      // always a 3-byte UTF-8 sequence. Note that because of this value
      // range, these UTF-8 characters are actually invalid and should not
      // appear in a correct UTF-8 string.
      out.push_back(static_cast<char>(0xE0 | ((utf16_char >> 12))));
      out.push_back(static_cast<char>(0x80 | ((utf16_char >> 6) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | ((utf16_char & 0x3F))));
    }

    last_pos = pos + kFourBytesCharSize;
  }

  // Copy whatever follows the last 4-byte sequence (or the whole input when
  // there were no offsets).
  const absl::string_view last_segment = in.substr(last_pos);
  out.append(last_segment.data(), last_segment.size());

  return out;
}