std::string unescapeFourBytesUtf8Char()

in hessian2/basic_codec/string_codec.cc [127:186]


// Rewrites every surrogate pair that is encoded as two consecutive 3-byte
// UTF-8 sequences (6 bytes in total) into the single 4-byte UTF-8 sequence of
// the code point the pair represents. Every other character is copied to the
// output unchanged.
//
// Returns an empty string when the length looked up in UTF_8_CHAR_LENGTHS for
// a leading byte is 0, or when a character would extend past the end of `in`.
std::string unescapeFourBytesUtf8Char(absl::string_view in) {
  const size_t total = in.size();

  // Reads one input byte as an unsigned value so that the masks and
  // comparisons below do not depend on the signedness of `char`.
  const auto byte_at = [&in](size_t pos) -> uint32_t {
    return static_cast<uint8_t>(in[pos]);
  };

  std::string out;
  out.reserve(total);

  size_t pos = 0;
  while (pos < total) {
    const uint8_t lead = static_cast<uint8_t>(in[pos]);
    // The table is indexed by the top five bits of the leading byte.
    const uint8_t width = UTF_8_CHAR_LENGTHS[lead >> 3];

    // A surrogate always starts with the bits 0b110110 (high) or 0b110111
    // (low). Encoded as a 3-byte UTF-8 character that gives a first byte of
    // 0xED and a second byte of 0xA? (high) or 0xB? (low). The size check
    // comes first so that none of the byte reads can go out of range.
    const bool is_surrogate_pair =
        width == 3 && pos + 5 < total && lead == 0xED &&
        (byte_at(pos + 1) & 0xF0) == 0xA0 && byte_at(pos + 3) == 0xED &&
        (byte_at(pos + 4) & 0xF0) == 0xB0;

    if (is_surrogate_pair) {
      // Decode both 3-byte sequences back into their 16-bit surrogates.
      const uint32_t high = ((byte_at(pos) & 0x0F) << 12) |
                            ((byte_at(pos + 1) & 0x3F) << 6) |
                            (byte_at(pos + 2) & 0x3F);
      const uint32_t low = ((byte_at(pos + 3) & 0x0F) << 12) |
                           ((byte_at(pos + 4) & 0x3F) << 6) |
                           (byte_at(pos + 5) & 0x3F);

      // Each surrogate carries 10 payload bits; the combined 20-bit value is
      // offset by 0x10000 to obtain the code point.
      const uint32_t code_point =
          (((high & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;

      // Emit the code point as a 4-byte UTF-8 character.
      out.push_back(static_cast<char>(0xF0 | (code_point >> 18)));
      out.push_back(static_cast<char>(0x80 | ((code_point >> 12) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | ((code_point >> 6) & 0x3F)));
      out.push_back(static_cast<char>(0x80 | (code_point & 0x3F)));

      pos += 6;
      continue;
    }

    // Not expected for input that was already validated as UTF-8: the leading
    // byte has no known length or the character is truncated.
    if (width == 0 || pos + width > total) {
      return "";
    }

    // Any other character is copied through byte for byte.
    out.append(in.data() + pos, width);
    pos += width;
  }

  return out;
}