std::pair getUtf8StringLength()

in hessian2/basic_codec/string_codec.cc [86:119]


// Counts the UTF-8 characters in `in` by hopping from lead byte to lead byte.
//
// Only the first byte of each character is examined: its top five bits index
// UTF_8_CHAR_LENGTHS to obtain the encoded width, and the cursor jumps ahead
// by that width. Continuation bytes are never inspected.
//
// Returns {character count, bytes consumed}. If a lead byte maps to a width
// of 0 in UTF_8_CHAR_LENGTHS, the result is {-1, 0}.
//
// Notes:
//  - `has_surrogate` is only ever set to true here; it is never cleared or
//    read, so the caller must initialise it before the call.
//  - The consumed byte count can exceed `in.size()` when the final lead byte
//    announces more bytes than remain in `in`; no bounds check is applied to
//    the last jump.
std::pair<int64_t, size_t> getUtf8StringLength(absl::string_view in,
                                               bool &has_surrogate) {
  const size_t total_bytes = in.size();

  int64_t char_count = 0;
  size_t offset = 0;

  while (offset < total_bytes) {
    const auto lead_byte = static_cast<uint8_t>(in[offset]);

    // Coarse surrogate detection. A UTF-8 encoded surrogate code point
    // always starts with 0xED (0b1110 marks a 3-byte sequence, 0b1101 is the
    // top nibble of the code point), but telling a real surrogate apart
    // needs bits from the second byte, which may not be available yet.
    // Flagging on the lead byte alone is cheap and may yield false
    // positives; the precise scan is left for after the whole string has
    // been read.
    if (lead_byte == 0xED) {
      has_surrogate = true;
    }

    // Width of the character as announced by the top five bits of its lead
    // byte; a width of 0 marks a byte that cannot start a character.
    const uint8_t width = UTF_8_CHAR_LENGTHS[lead_byte >> 3];
    if (width == 0) {
      return {-1, 0};
    }

    ++char_count;
    offset += width;
  }

  return {char_count, offset};
}