in datasketches-memory-java8/src/main/java/org/apache/datasketches/memory/internal/Utf8.java [282:402]
/**
 * Encodes the UTF-16 characters of {@code src} as UTF-8 and writes the bytes through
 * {@code unsafe.putByte}, starting at address {@code cumBaseOffset + offsetBytes} and never
 * writing at or beyond {@code cumBaseOffset + capacityBytes}.
 *
 * <p>Bytes are written as they are produced. If an exception is thrown part-way through, the
 * bytes already written are not rolled back.
 *
 * @param offsetBytes offset, relative to {@code cumBaseOffset}, of the first byte to write
 * @param src the source sequence of UTF-16 characters
 * @param capacityBytes exclusive upper limit, relative to {@code cumBaseOffset}, of writable bytes
 * @param cumBaseOffset base address added to both {@code offsetBytes} and {@code capacityBytes}
 * @param unsafeObj the object handed to {@code unsafe.putByte} as the base of the write
 *        (presumably null for off-heap targets; confirm against callers)
 * @return the number of UTF-8 bytes written
 * @throws Utf8CodingException (via its factory methods) if the target has too little room for
 *         the next character, or if {@code src} contains an unpaired or ill-formed surrogate
 */
static long putCharsToUtf8(final long offsetBytes, final CharSequence src,
    final long capacityBytes, final long cumBaseOffset, final Object unsafeObj) {
  int cIdx = 0; //src character index
  long bIdx = cumBaseOffset + offsetBytes; //byte index
  long bCnt = 0; //bytes inserted
  final long byteLimit = cumBaseOffset + capacityBytes; //unsafe index limit
  final int utf16Length = src.length();

  //Quickly dispatch a leading ASCII run. During this loop bIdx is not advanced;
  // the write address is bIdx + cIdx because one ASCII char produces exactly one byte.
  for (char c;
      (cIdx < utf16Length) && ((cIdx + bIdx) < byteLimit) && ((c = src.charAt(cIdx)) < 0x80);
      cIdx++, bCnt++) {
    unsafe.putByte(unsafeObj, bIdx + cIdx, (byte) c);
  }

  //Either the whole source was ASCII, or we hit a non-ASCII char or the byte limit.
  if (cIdx == utf16Length) { //done.
    // next relative byte index in memory is (bIdx + utf16Length) - cumBaseOffset.
    return bCnt;
  }

  bIdx += cIdx; //bytes == characters for ascii

  for (char c; cIdx < utf16Length; cIdx++) { //process the remaining characters
    c = src.charAt(cIdx);
    if ((c < 0x80) && (bIdx < byteLimit)) {
      //Encode ASCII, 0 through 0x007F.
      unsafe.putByte(unsafeObj, bIdx++, (byte) c);
      bCnt++;
    } else if ((c < 0x800) && (bIdx < (byteLimit - 1))) {
      //Here: c >= 0x0080 || bIdx >= byteLimit
      //Encode 0x80 through 0x7FF.
      //This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc.
      //We must have target space for at least 2 Utf8 bytes.
      //(0xF << 6) is 0x3C0; after the cast to byte only 0xC0 survives: the 2-byte lead marker.
      unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 6) | (c >>> 6)));
      unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c)));
      bCnt += 2;
    } else if ( !isSurrogate(c) && (bIdx < (byteLimit - 2)) ) {
      //Here: c >= 0x0800 || bIdx >= byteLimit - 1
      //Encode the remainder of the BMP that are not surrogates:
      // 0x0800 thru 0xD7FF; 0xE000 thru 0xFFFF, the max single-char code point
      //We must have target space for at least 3 Utf8 bytes.
      //(0xF << 5) is 0x1E0; after the cast to byte only 0xE0 survives: the 3-byte lead marker.
      unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 5) | (c >>> 12)));
      unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (c >>> 6))));
      unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c)));
      bCnt += 3;
    } else {
      //Here: c is a surrogate || bIdx >= byteLimit - 2
      //At this point we are either:
      // 1) Attempting to encode Code Points outside the BMP.
      //
      //    The only way to properly encode code points outside the BMP into Utf8 bytes is to use
      //    High/Low pairs of surrogate characters. Therefore, we must have at least 2 source
      //    characters remaining, at least 4 bytes of memory space remaining, and the next 2
      //    characters must be a valid surrogate pair.
      //
      // 2) There is insufficient MemoryImpl space to encode the current character from one of the
      //    ifs above.
      //
      // We proceed assuming (1). If the following test fails, we move to an exception.
      final char low;
      if ( (cIdx <= (utf16Length - 2))
          && (bIdx <= (byteLimit - 4))
          && isSurrogatePair(c, low = src.charAt(cIdx + 1)) ) { //we are good
        cIdx++; //skip over low surrogate
        final int codePoint = toCodePoint(c, low);
        //(0xF << 4) is 0xF0: the 4-byte lead marker.
        unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & codePoint)));
        bCnt += 4;
      } else {
        //We are going to throw an exception. So we have time to figure out
        // what was wrong and hopefully throw an intelligent message!
        //Check the BMP code point cases and their required memory limits.
        //The 3-byte case has no upper bound on c: every char is <= 0xFFFF, and bounding it
        // with (c < 0xFFFF) would let U+FFFF itself fall through and be misreported as a
        // surrogate problem instead of an out-of-space condition.
        if ( ((c < 0X0080) && (bIdx >= byteLimit))
            || ((c < 0x0800) && (bIdx >= (byteLimit - 1)))
            || (bIdx >= (byteLimit - 2)) ) {
          throw Utf8CodingException.outOfMemory();
        }
        if (cIdx > (utf16Length - 2)) { //the last char is an unpaired surrogate
          throw Utf8CodingException.unpairedSurrogate(c);
        }
        if (bIdx > (byteLimit - 4)) {
          //4 MemoryImpl bytes required to encode a surrogate pair.
          //NOTE(review): this value is 4 minus the bytes actually left (byteLimit - bIdx);
          // confirm that is what shortUtf8EncodeByteLength expects.
          final int remaining = (int) ((bIdx - byteLimit) + 4L);
          throw Utf8CodingException.shortUtf8EncodeByteLength(remaining);
        }
        if (!isSurrogatePair(c, src.charAt(cIdx + 1)) ) {
          //Not a surrogate pair.
          throw Utf8CodingException.illegalSurrogatePair(c, src.charAt(cIdx + 1));
        }
        //This should not happen :)
        throw new IllegalArgumentException("Unknown Utf8 encoding exception");
      }
    }
  }
  //final long localOffsetBytes = bIdx - cumBaseOffset;
  return bCnt;
}