in datasketches-memory-java8/src/main/java/org/apache/datasketches/memory/internal/Utf8.java [209:271]
/**
 * Decodes the UTF-8 bytes in the range {@code [address, addressLimit)} and appends the
 * resulting characters to {@code dst}.
 *
 * <p>Despite the name, single-byte (ASCII) values encountered in the range are appended as
 * well. Each one-, two- and three-byte sequence adds one to the returned count; each
 * four-byte sequence adds two (presumably a surrogate pair written by
 * {@code DecodeUtil.handleFourBytes} - confirm against that helper).
 *
 * @param dst the destination that receives the decoded characters
 * @param address the address of the first byte to decode, as passed to {@code unsafe.getByte}
 * @param addressLimit the exclusive upper bound of the addresses to read
 * @param unsafeObj the object argument passed through to {@code unsafe.getByte}
 * @param cumBaseOffset subtracted from the current address and from the limit so that a
 *     short-sequence error reports relative rather than raw positions
 * @return the number of characters accounted for while decoding
 * @throws IOException if appending to {@code dst} fails
 */
private static int getNonAsciiCharsFromUtf8(final Appendable dst, long address,
    final long addressLimit, final Object unsafeObj, final long cumBaseOffset)
    throws IOException {
  int charCount = 0;

  while (address < addressLimit) {
    final byte lead = unsafe.getByte(unsafeObj, address++);

    if (DecodeUtil.isOneByte(lead)) {
      dst.append((char) lead);
      charCount++;
      // ASCII bytes tend to arrive in runs, so drain the whole run here instead of going
      // back through the multi-way dispatch for every byte. The byte is only consumed
      // (address advanced) once it is known to be a one-byte value.
      for (byte next;
          address < addressLimit
              && DecodeUtil.isOneByte(next = unsafe.getByte(unsafeObj, address));
          address++) {
        dst.append((char) next);
        charCount++;
      }
      continue;
    }

    // Total length of the multi-byte sequence introduced by the lead byte. Anything that is
    // not classified as a two- or three-byte lead is treated as a four-byte lead.
    final int seqLen =
        DecodeUtil.isTwoBytes(lead) ? 2 : (DecodeUtil.isThreeBytes(lead) ? 3 : 4);

    // The lead byte is already consumed, so (seqLen - 1) continuation bytes must still lie
    // below addressLimit; that fails exactly when address >= addressLimit - (seqLen - 2).
    if (address >= (addressLimit - (seqLen - 2))) {
      throw Utf8CodingException.shortUtf8DecodeByteSequence(
          lead, address - cumBaseOffset, addressLimit - cumBaseOffset, seqLen);
    }

    switch (seqLen) {
      case 2: {
        final byte second = unsafe.getByte(unsafeObj, address++);
        DecodeUtil.handleTwoBytes(lead, second, dst);
        charCount++;
        break;
      }
      case 3: {
        final byte second = unsafe.getByte(unsafeObj, address++);
        final byte third = unsafe.getByte(unsafeObj, address++);
        DecodeUtil.handleThreeBytes(lead, second, third, dst);
        charCount++;
        break;
      }
      default: {
        final byte second = unsafe.getByte(unsafeObj, address++);
        final byte third = unsafe.getByte(unsafeObj, address++);
        final byte fourth = unsafe.getByte(unsafeObj, address++);
        DecodeUtil.handleFourBytes(lead, second, third, fourth, dst);
        // A four-byte sequence is counted as two characters.
        charCount += 2;
        break;
      }
    }
  }

  return charCount;
}