void utf8ToModifiedUTF8()

in androidLibs/fbjni/cxx/fbjni/detail/utf8.cpp [102:165]


void utf8ToModifiedUTF8(const uint8_t* utf8, size_t len, uint8_t* modified, size_t modifiedBufLen)
{
  size_t j = 0;
  for (size_t i = 0; i < len; ) {
    if (j >= modifiedBufLen) {
      FBJNI_LOGF("output buffer is too short");
    }
    if (utf8[i] == 0) {
      if (j + 1 >= modifiedBufLen) {
        FBJNI_LOGF("output buffer is too short");
      }
      modified[j] = 0xc0;
      modified[j + 1] = 0x80;
      i += 1;
      j += 2;
      continue;
    }

    if (i + 4 > len ||
        !isFourByteUTF8Encoding(utf8 + i)) {
      // If the input is too short for this to be a four-byte
      // encoding, or it isn't one for real, just copy it on through.
      modified[j] = utf8[i];
      i++;
      j++;
      continue;
    }

    // Convert 4 bytes of input to 2 * 3 bytes of output
    char32_t code = (((utf8[i]     & 0x07) << 18) |
                     ((utf8[i + 1] & 0x3f) << 12) |
                     ((utf8[i + 2] & 0x3f) << 6) |
                     ( utf8[i + 3] & 0x3f));
    char32_t first;
    char32_t second;

    if (code > 0x10ffff) {
      // These could be valid utf-8, but cannot be represented as modified UTF-8, due to the 20-bit
      // limit on that representation.  Encode two replacement characters, so the expected output
      // length lines up.
      const char32_t kUnicodeReplacementChar = 0xfffd;
      first = kUnicodeReplacementChar;
      second = kUnicodeReplacementChar;
    } else {
      // split into surrogate pair
      first = ((code - 0x010000) >> 10) | 0xd800;
      second = ((code - 0x010000) & 0x3ff) | 0xdc00;
    }

    // encode each as a 3 byte surrogate value
    if (j + 5 >= modifiedBufLen) {
      FBJNI_LOGF("output buffer is too short");
    }
    encode3ByteUTF8(first, modified + j);
    encode3ByteUTF8(second, modified + j + 3);
    i += 4;
    j += 6;
  }

  if (j >= modifiedBufLen) {
    FBJNI_LOGF("output buffer is too short");
  }
  modified[j++] = '\0';
}