static long putCharsToUtf8()

in datasketches-memory-java8/src/main/java/org/apache/datasketches/memory/internal/Utf8.java [282:402]


  /**
   * Encodes the UTF-16 characters of <i>src</i> as UTF-8 and writes the bytes with
   * <i>unsafe.putByte(unsafeObj, index, value)</i>, starting at the absolute index
   * <i>cumBaseOffset + offsetBytes</i> and never writing at or beyond the absolute index
   * <i>cumBaseOffset + capacityBytes</i>.
   *
   * <p>Characters are encoded as 1 byte (below 0x80), 2 bytes (below 0x800), 3 bytes (any other
   * non-surrogate character) or 4 bytes (a valid high/low surrogate pair, which consumes two
   * source characters).</p>
   *
   * <p>If an exception is thrown, the bytes written for the preceding characters are left in
   * place; there is no rollback.</p>
   *
   * @param offsetBytes offset, relative to <i>cumBaseOffset</i>, of the first byte to write
   * @param src the source characters to encode
   * @param capacityBytes capacity, relative to <i>cumBaseOffset</i>, that bounds the writes
   * @param cumBaseOffset base offset added to both <i>offsetBytes</i> and <i>capacityBytes</i>
   *        to form the absolute indices handed to <i>unsafe</i>
   * @param unsafeObj the object handed to <i>unsafe.putByte</i> as its first argument
   * @return the number of UTF-8 bytes written
   * @throws Utf8CodingException (as produced by its factory methods) if there is not enough
   *         space for the next character, or if a surrogate is unpaired or illegally paired
   */
  static long putCharsToUtf8(final long offsetBytes, final CharSequence src,
        final long capacityBytes, final long cumBaseOffset, final Object unsafeObj) {

    int cIdx = 0; //src character index
    long bIdx = cumBaseOffset + offsetBytes; //absolute byte index
    long bCnt = 0; //bytes inserted

    final long byteLimit = cumBaseOffset + capacityBytes; //unsafe index limit (exclusive)

    final int utf16Length = src.length();
    //Quickly dispatch a leading ASCII sequence.
    //Within this loop bytes == characters, so (bIdx + cIdx) is the current write index and
    //bIdx itself is only advanced after the loop.
    for (char c;
        (cIdx < utf16Length) && ((cIdx + bIdx) < byteLimit) && ((c = src.charAt(cIdx)) < 0x80);
        cIdx++, bCnt++) {
      unsafe.putByte(unsafeObj, bIdx + cIdx, (byte) c);
    }
    //Either all characters were consumed, a non-ascii character was found, or space ran out.
    if (cIdx == utf16Length) { //done.
      // next relative byte index in memory is (bIdx + utf16Length) - cumBaseOffset.
      return bCnt;
    }
    bIdx += cIdx; //bytes == characters for ascii

    for (char c; cIdx < utf16Length; cIdx++) { //process the remaining characters
      c = src.charAt(cIdx);

      if ((c < 0x80) && (bIdx < byteLimit)) {
        //Encode ASCII, 0 through 0x007F.
        unsafe.putByte(unsafeObj, bIdx++, (byte) c);
        bCnt++;
      }

      //c >= 0x0080 || bIdx >= byteLimit
      else if ((c < 0x800) && (bIdx < (byteLimit - 1))) {
        //Encode 0x80 through 0x7FF.
        //This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc.
        //We must have target space for at least 2 Utf8 bytes.
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0xC0 | (c >>> 6)));   //110xxxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c)));  //10xxxxxx
        bCnt += 2;
      }

      //c >= 0x800 || bIdx >= byteLimit - 1
      else if ( !isSurrogate(c) && (bIdx < (byteLimit - 2)) ) {
        //Encode the remainder of the BMP that are not surrogates:
        //  0x0800 thru 0xD7FF; 0xE000 thru 0xFFFF, the max single-char code point
        //We must have target space for at least 3 Utf8 bytes.
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0xE0 | (c >>> 12)));          //1110xxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (c >>> 6))));  //10xxxxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c)));          //10xxxxxx
        bCnt += 3;
      }

      else {
        //c is a surrogate || bIdx >= byteLimit - 2

        //At this point we are either:
        // 1) Attempting to encode Code Points outside the BMP.
        //
        //    The only way to properly encode code points outside the BMP into Utf8 bytes is to use
        //    High/Low pairs of surrogate characters. Therefore, we must have at least 2 source
        //    characters remaining, at least 4 bytes of memory space remaining, and the next 2
        //    characters must be a valid surrogate pair.
        //
        // 2) There is insufficient MemoryImpl space to encode the current character from one of the
        //    ifs above.
        //
        // We proceed assuming (1). If the following test fails, we move to an exception.

        final char low;
        if ( (cIdx <= (utf16Length - 2))
            && (bIdx <= (byteLimit - 4))
            && isSurrogatePair(c, low = src.charAt(cIdx + 1)) ) { //we are good
          cIdx++; //skip over low surrogate
          final int codePoint = toCodePoint(c, low);
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0xF0 | (codePoint >>> 18)));           //11110xxx
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));  //10xxxxxx
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));   //10xxxxxx
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & codePoint)));           //10xxxxxx
          bCnt += 4;
        }

        else {
          //We are going to throw an exception. So we have time to figure out
          // what was wrong and hopefully throw an intelligent message!

          //Insufficient space for the current character.
          //A 1-byte char only gets here when bIdx >= byteLimit, a 2-byte char only when
          // bIdx >= byteLimit - 1, and a 3-byte char (0x0800 thru 0xFFFF inclusive, excluding
          // surrogates) only when bIdx >= byteLimit - 2. A surrogate needs 4 bytes for its pair.
          // Hence fewer than 3 bytes left is always an out-of-memory condition here.
          //Testing the space alone, rather than also requiring (c < 0xFFFF), ensures that the
          // valid character U+FFFF is not misreported as a surrogate problem below.
          if (bIdx >= (byteLimit - 2)) {
            throw Utf8CodingException.outOfMemory();
          }

          //From here on c must be a surrogate and at least 3 bytes are available.

          if (cIdx > (utf16Length - 2)) { //the last char is an unpaired surrogate
            throw Utf8CodingException.unpairedSurrogate(c);
          }

          if (bIdx > (byteLimit - 4)) {
            //4 MemoryImpl bytes required to encode a surrogate pair.
            //The value passed is 4 minus the number of bytes still available.
            final int remaining = (int) ((bIdx - byteLimit) + 4L);
            throw Utf8CodingException.shortUtf8EncodeByteLength(remaining);
          }

          if (!isSurrogatePair(c, src.charAt(cIdx + 1)) ) {
            //Not a surrogate pair.
            throw Utf8CodingException.illegalSurrogatePair(c, src.charAt(cIdx + 1));
          }

          //This should not happen :)
          throw new IllegalArgumentException("Unknown Utf8 encoding exception");
        }
      }
    }
    //final long localOffsetBytes = bIdx - cumBaseOffset;
    return bCnt;
  }