std::string modifiedUTF8ToUTF8()

in androidLibs/fbjni/cxx/fbjni/detail/utf8.cpp [167:209]


// Converts a buffer of modified UTF-8 into standard UTF-8.
//
// Two sequences are rewritten; every other byte is copied through unchanged:
//   * a pair of 3-byte sequences encoding a high + low UTF-16 surrogate (6 bytes) becomes the
//     single 4-byte encoding of the combined code point;
//   * the two-byte sequence 0xC0 0x80 becomes a single zero byte.
//
// modified: pointer to the input bytes; it is only dereferenced when len > 0.
// len:      number of bytes readable at modified.
// Returns the converted bytes as a std::string (which may contain embedded zero bytes).
//
// Input is not validated: malformed sequences are passed through byte-for-byte.  The function is
// noexcept, so a failure to allocate the result string terminates the program.
std::string modifiedUTF8ToUTF8(const uint8_t* modified, size_t len) noexcept {
  // Both rewrites emit fewer bytes than they consume (6 -> 4 and 2 -> 1) and everything else is
  // copied 1:1, so a buffer of len bytes is always large enough for the output.
  std::string out(len, 0);
  size_t written = 0;
  size_t pos = 0;

  while (pos < len) {
    const uint8_t* cur = modified + pos;
    const size_t remaining = len - pos;

    // A supplementary code point arrives as two 3-byte sequences, one per surrogate:
    //   surrogate pair: 1101 10xx  xxxx xxxx  1101 11xx  xxxx xxxx
    //   encoded pair:   1110 1101  1010 xxxx  10xx xxxx  1110 1101  1011 xxxx  10xx xxxx
    const bool isSurrogatePair =
        remaining >= 6 &&
        cur[0] == 0xed &&
        (cur[1] & 0xf0) == 0xa0 &&
        cur[3] == 0xed &&
        (cur[4] & 0xf0) == 0xb0;
    if (isSurrogatePair) {
      const char32_t high = decode3ByteUTF8(cur);
      const char32_t low = decode3ByteUTF8(cur + 3);
      // Each surrogate contributes its low 10 bits; the pair is offset by 0x10000.
      const char32_t codePoint = 0x10000 + (((high & 0x3ff) << 10) |
                                            ( low & 0x3ff));
      encode4ByteUTF8(codePoint, out, written);
      pos += 6;
      written += 4;
      continue;
    }

    // The two-byte form 0xC0 0x80 stands for a zero byte in the output.
    const bool isEncodedNul =
        remaining >= 2 &&
        cur[0] == 0xc0 &&
        cur[1] == 0x80;
    if (isEncodedNul) {
      out[written] = 0;
      pos += 2;
      written += 1;
      continue;
    }

    // Anything else is forwarded a single byte at a time.  That byte may belong to a one-, two-,
    // or three-byte encoding, or to an invalid one; garbage in, garbage out is acceptable here.
    out[written] = static_cast<char>(cur[0]);
    ++pos;
    ++written;
  }

  // Drop the unused tail of the over-allocated buffer.
  out.resize(written);

  return out;
}