in hessian2/basic_codec/string_codec.cc [36:80]
// Walks `in` one UTF-8 character at a time and gathers the bookkeeping needed
// to emit the string in chunks.
//
// The width of every character is looked up in UTF_8_CHAR_LENGTHS using the
// top five bits of its first byte. Only that first byte is examined; the
// bytes that follow it are skipped over without being inspected.
//
// Outputs (both vectors are appended to, never cleared):
//   per_chunk_bytes_offsets - the byte offset just past the end of each chunk.
//       A chunk is closed once it holds STRING_CHUNK_SIZE characters; a
//       trailing partially filled chunk is closed at the end of the input.
//   four_bytes_char_offsets - the byte offset at which each four-byte
//       character begins.
//
// Returns the number of characters in `in`, or -1 when a first byte maps to a
// width of zero or when a character would extend past the end of `in`. On the
// -1 path the vectors keep whatever was appended before the failure.
int64_t getUtf8StringLengthAndPerChunkOffsets(
    absl::string_view in, Uint64Vector &per_chunk_bytes_offsets,
    Uint64Vector &four_bytes_char_offsets) {
  const size_t total_bytes = in.size();
  int64_t char_count = 0;
  size_t byte_pos = 0;
  size_t chars_in_chunk = 0;

  while (byte_pos < total_bytes) {
    const uint8_t lead_byte = static_cast<uint8_t>(in[byte_pos]);
    // The table is indexed by the five most significant bits of the lead byte.
    const uint8_t width = UTF_8_CHAR_LENGTHS[lead_byte >> 3];

    // Reject lead bytes with no known width and characters that would run off
    // the end of the input.
    const bool known_width = width != 0;
    const bool fits_in_input = byte_pos + width <= total_bytes;
    if (!(known_width && fits_in_input)) {
      return -1;
    }

    // Remember where every four-byte character starts.
    if (width == 4) {
      four_bytes_char_offsets.push_back(byte_pos);
    }

    byte_pos += width;
    ++char_count;
    ++chars_in_chunk;

    // Once the chunk is full, record where it ends and start a new one.
    if (chars_in_chunk >= STRING_CHUNK_SIZE) {
      per_chunk_bytes_offsets.push_back(byte_pos);
      chars_in_chunk = 0;
    }
  }

  // Close the final chunk when it is only partially filled.
  if (chars_in_chunk != 0) {
    per_chunk_bytes_offsets.push_back(byte_pos);
  }

  return char_count;
}