int64_t getUtf8StringLengthAndPerChunkOffsets()

in hessian2/basic_codec/string_codec.cc [36:80]


// Walks `in` character by character, using only the leading byte of each
// UTF-8 sequence (looked up in UTF_8_CHAR_LENGTHS) to decide how many bytes
// the character occupies.
//
// @param in                       Bytes to scan.
// @param per_chunk_bytes_offsets  Appended to: for every STRING_CHUNK_SIZE
//                                 characters consumed, the byte offset just
//                                 past that chunk; a trailing partial chunk
//                                 contributes one more entry (the offset just
//                                 past its last character).
// @param four_bytes_char_offsets  Appended to: the starting byte offset of
//                                 every character whose encoded width is 4.
// @return The number of characters scanned, or -1 when a leading byte maps to
//         a width of 0 or a character would extend past the end of `in`.
//
// Notes:
//  - Continuation bytes are not inspected; only the leading byte and the
//    remaining input size are checked.
//  - Neither output vector is cleared here, and entries appended before a
//    failure is detected are left in place when -1 is returned.
//  - For empty input the result is 0 and nothing is appended.
int64_t getUtf8StringLengthAndPerChunkOffsets(
    absl::string_view in, Uint64Vector &per_chunk_bytes_offsets,
    Uint64Vector &four_bytes_char_offsets) {
  const size_t total_bytes = in.size();

  int64_t char_count = 0;     // Characters consumed so far.
  size_t byte_pos = 0;        // Offset of the next unread byte.
  size_t chars_in_chunk = 0;  // Characters in the chunk being filled.

  while (byte_pos < total_bytes) {
    const uint8_t lead_byte = static_cast<uint8_t>(in[byte_pos]);
    // The table is indexed by the top five bits of the leading byte.
    const uint8_t width = UTF_8_CHAR_LENGTHS[lead_byte >> 3];

    // A width of 0 marks a byte that cannot start a character.
    if (width == 0) {
      return -1;
    }

    // The whole character has to fit in what is left of the input.
    const size_t remaining = total_bytes - byte_pos;
    if (width > remaining) {
      return -1;
    }

    // Remember where each four-byte character begins.
    if (width == 4) {
      four_bytes_char_offsets.push_back(byte_pos);
    }

    byte_pos += width;
    ++char_count;
    ++chars_in_chunk;

    // A full chunk ends here: store the offset just past it and start anew.
    if (chars_in_chunk >= STRING_CHUNK_SIZE) {
      per_chunk_bytes_offsets.push_back(byte_pos);
      chars_in_chunk = 0;
    }
  }

  // Close off a trailing, partially filled chunk.
  if (chars_in_chunk > 0) {
    per_chunk_bytes_offsets.push_back(byte_pos);
  }

  return char_count;
}