in androidLibs/fbjni/cxx/fbjni/detail/utf8.cpp [102:165]
/**
 * Re-encodes a UTF-8 byte sequence as modified UTF-8 and NUL-terminates it.
 *
 * Differences from the input encoding, as implemented below:
 *   - an embedded zero byte is written as the two bytes 0xc0 0x80;
 *   - a four-byte sequence is written as a surrogate pair, each half
 *     encoded in three bytes (six output bytes in total);
 *   - every other byte is copied through unchanged.
 *
 * @param utf8            input bytes; may contain embedded zero bytes
 * @param len             number of input bytes to convert
 * @param modified        output buffer
 * @param modifiedBufLen  capacity of `modified` in bytes; it must hold the
 *                        converted data plus one byte for the trailing '\0'
 *
 * A too-small output buffer is reported through FBJNI_LOGF before the write
 * that would overflow it.
 * NOTE(review): this relies on FBJNI_LOGF not returning (fatal log) — confirm;
 * otherwise the writes below would run past the end of `modified`.
 */
void utf8ToModifiedUTF8(const uint8_t* utf8, size_t len, uint8_t* modified, size_t modifiedBufLen)
{
  size_t j = 0;
  for (size_t i = 0; i < len; ) {
    if (j >= modifiedBufLen) {
      FBJNI_LOGF("output buffer is too short");
    }
    if (utf8[i] == 0) {
      // An embedded NUL takes two output bytes, so the terminator written at
      // the end stays the only zero byte in the output.
      if (j + 1 >= modifiedBufLen) {
        FBJNI_LOGF("output buffer is too short");
      }
      modified[j] = 0xc0;
      modified[j + 1] = 0x80;
      i += 1;
      j += 2;
      continue;
    }

    if (i + 4 > len ||
        !isFourByteUTF8Encoding(utf8 + i)) {
      // If the input is too short for this to be a four-byte
      // encoding, or it isn't one for real, just copy it on through.
      modified[j] = utf8[i];
      i++;
      j++;
      continue;
    }

    // Convert 4 bytes of input to 2 * 3 bytes of output
    char32_t code = (((utf8[i]     & 0x07) << 18) |
                     ((utf8[i + 1] & 0x3f) << 12) |
                     ((utf8[i + 2] & 0x3f) <<  6) |
                     ( utf8[i + 3] & 0x3f));
    char32_t first;
    char32_t second;

    if (code < 0x010000 || code > 0x10ffff) {
      // Values above 0x10ffff cannot be represented as a surrogate pair.
      // Values below 0x010000 come from overlong (malformed) four-byte
      // sequences; subtracting 0x010000 from them would wrap around and
      // produce values that do not fit in a 16-bit surrogate.
      // In both cases encode two replacement characters, so the expected
      // output length (six bytes) lines up.
      const char32_t kUnicodeReplacementChar = 0xfffd;
      first = kUnicodeReplacementChar;
      second = kUnicodeReplacementChar;
    } else {
      // split into surrogate pair
      first = ((code - 0x010000) >> 10) | 0xd800;
      second = ((code - 0x010000) & 0x3ff) | 0xdc00;
    }

    // encode each as a 3 byte surrogate value
    if (j + 5 >= modifiedBufLen) {
      FBJNI_LOGF("output buffer is too short");
    }
    encode3ByteUTF8(first, modified + j);
    encode3ByteUTF8(second, modified + j + 3);
    i += 4;
    j += 6;
  }

  // Room for the terminator must remain after the converted data.
  if (j >= modifiedBufLen) {
    FBJNI_LOGF("output buffer is too short");
  }
  modified[j++] = '\0';
}