in java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java [870:923]
/**
 * Chooses a coder for {@code chars} by inspecting at most its first 64 chars.
 *
 * <p>Two counters are kept over the sample: chars below {@code 0x80} and chars not above
 * {@code 0xFF}. The result is:
 *
 * <ul>
 *   <li>{@code LATIN1} when every char of the whole array was counted as latin1, or when the
 *       whole sample was and {@code StringUtils.isLatin(chars, sampled)} returns true
 *       (presumably that call verifies the chars after the sample; confirm against its
 *       definition);
 *   <li>{@code UTF8} when at least half of the sampled chars are below {@code 0x80};
 *   <li>{@code UTF16} otherwise.
 * </ul>
 *
 * @param chars the chars to inspect
 * @return one of {@code LATIN1}, {@code UTF8} or {@code UTF16}
 */
private static byte bestCoder(char[] chars) {
  final int length = chars.length;
  // Only the leading 64 chars (or fewer, for short arrays) are sampled.
  final int sampled = Math.min(64, length);
  // Largest multiple of 4 that fits in the sample: that prefix is read 4 chars (one long) at a time.
  final int wordAligned = sampled & ~3;

  int below0x80 = 0;
  int upTo0xFF = 0;
  int idx = 0;

  while (idx < wordAligned) {
    // Each char occupies 2 bytes, so char index idx lives at byte offset idx << 1.
    long word = Platform.getLong(chars, Platform.CHAR_ARRAY_OFFSET + (idx << 1));
    final int wordEnd = idx + 4;
    if ((word & MULTI_CHARS_NON_ASCII_MASK) == 0) {
      // NOTE(review): mask value is declared elsewhere; a zero result is treated here as
      // "all four chars are below 0x80".
      below0x80 += 4;
      upTo0xFF += 4;
    } else if ((word & MULTI_CHARS_NON_LATIN_MASK) == 0) {
      // NOTE(review): likewise, a zero result is treated as "all four chars are <= 0xFF".
      upTo0xFF += 4;
      // Still need the per-char count of those below 0x80.
      for (int k = idx; k < wordEnd; k++) {
        if (chars[k] < 0x80) {
          below0x80++;
        }
      }
    } else {
      // Mixed word: classify each of the four chars individually.
      for (int k = idx; k < wordEnd; k++) {
        char c = chars[k];
        if (c < 0x80) {
          upTo0xFF++;
          below0x80++;
        } else if (c <= 0xFF) {
          upTo0xFF++;
        }
      }
    }
    idx = wordEnd;
  }

  // Remaining 0..3 sampled chars that did not fill a whole long.
  for (; idx < sampled; idx++) {
    char c = chars[idx];
    if (c < 0x80) {
      upTo0xFF++;
      below0x80++;
    } else if (c <= 0xFF) {
      upTo0xFF++;
    }
  }

  if (upTo0xFF == length) {
    // The sample covered the entire array and nothing exceeded 0xFF.
    return LATIN1;
  }
  if (upTo0xFF == sampled && StringUtils.isLatin(chars, sampled)) {
    return LATIN1;
  }
  if (below0x80 >= sampled * 0.5) {
    // At least half of the sample is below 0x80: choose UTF-8.
    return UTF8;
  }
  return UTF16;
}