in languagetool-core/src/main/java/org/languagetool/language/identifier/detector/UnicodeBasedDetector.java [38:142]
/**
 * Returns candidate language codes for the scripts that dominate the start of the given text.
 * <p>
 * Only the first {@code maxCheckLength} UTF-16 code units of {@code str} are inspected.
 * Whitespace, digits (of any script, as reported by {@link Character#isDigit(int)}) and the
 * character {@code '.'} are treated as insignificant: they count neither towards the total nor
 * towards any script. A script's language codes are added to the result when that script's
 * share of the significant characters is at least {@code THRESHOLD}.
 * <p>
 * This only finds candidates per script (e.g. both "zh" and "ja" for CJK characters); telling
 * the languages of one script apart is left to the actual language identifier in a later step.
 *
 * @param str the text to inspect, must not be {@code null}
 * @return a new, modifiable list of language codes; empty if no covered script reaches the
 *         threshold or if the inspected text contains no significant characters
 */
public List<String> getDominantLangCodes(String str) {
  // For a more complete list of script/language relations,
  // see https://unicode-org.github.io/cldr-staging/charts/37/supplemental/scripts_and_languages.html
  // Another more complete approach might be to use Character.UnicodeScript.of() for each character.
  int arabicChars = 0;
  int cyrillicChars = 0;
  int cjkChars = 0;
  int khmerChars = 0;
  int tamilChars = 0;
  int greekChars = 0;
  int devanagariChars = 0;
  int thaiChars = 0;
  int hebrewChars = 0;
  int hangulChars = 0;
  int significantChars = 0;
  int limit = Math.min(str.length(), maxCheckLength);
  for (int i = 0; i < limit; i++) {
    int val = str.charAt(i);
    if (Character.isWhitespace(val) || Character.isDigit(val) || val == '.') {
      // Insignificant characters must not be counted for a script either: several of the
      // ranges below contain native digits (e.g. Arabic-Indic U+0660..U+0669, Thai
      // U+0E50..U+0E59), and counting those only in the numerator would let a few digits
      // make a script look dominant (ratio > 1, or a division by zero).
      continue;
    }
    significantChars++;
    // The ranges below are pairwise disjoint, so at most one branch can match.
    if (val >= 0x0600 && val <= 0x06FF) {
      arabicChars++;
    } else if (val >= 0x0400 && val <= 0x04FF) {
      cyrillicChars++;
    } else if (val >= 0x4E00 && val <= 0x9FFF ||
               val >= 0x3040 && val <= 0x309F ||
               val >= 0x30A0 && val <= 0x30FF) {  // https://de.wikipedia.org/wiki/Japanische_Schrift
      // there might be a better way to tell Chinese from Japanese, but we rely
      // on the actual language identifier in a later step, so finding candidates is enough here
      cjkChars++;
    } else if (val >= 0x1780 && val <= 0x17FF) {
      khmerChars++;
    } else if (val >= 0xB82 && val <= 0xBFA) {
      tamilChars++;
    } else if (val >= 0x0370 && val <= 0x03FF || val >= 0x1F00 && val <= 0x1FFF) {
      greekChars++;
    } else if (val >= 0x0900 && val <= 0x097F) {
      devanagariChars++;
    } else if (val >= 0x0E00 && val <= 0x0E7F) {
      thaiChars++;
    } else if (val >= 0x0590 && val <= 0x05FF || val >= 0xFB1D && val <= 0xFB40) {
      hebrewChars++;
    } else if (val >= 0xAC00 && val <= 0xD7AF ||  // https://en.wikipedia.org/wiki/Hangul
               val >= 0x1100 && val <= 0x11FF ||
               val >= 0x3130 && val <= 0x318F ||
               val >= 0xA960 && val <= 0xA97F ||
               val >= 0xD7B0 && val <= 0xD7FF) {
      hangulChars++;
    }
  }
  List<String> langCodes = new ArrayList<>();
  if (isDominantScript(arabicChars, significantChars)) {
    langCodes.add("ar");
    langCodes.add("fa");
  }
  if (isDominantScript(cyrillicChars, significantChars)) {
    langCodes.add("ru");
    langCodes.add("uk");
    langCodes.add("be");
  }
  if (isDominantScript(cjkChars, significantChars)) {
    langCodes.add("zh");
    langCodes.add("ja");
    // Korean: see hangulChars
  }
  if (isDominantScript(khmerChars, significantChars)) {
    langCodes.add("km");
  }
  if (isDominantScript(tamilChars, significantChars)) {
    langCodes.add("ta");
  }
  if (isDominantScript(greekChars, significantChars)) {
    langCodes.add("el");
  }
  if (isDominantScript(devanagariChars, significantChars)) {
    langCodes.add("hi");
    langCodes.add("mr");
  }
  if (isDominantScript(thaiChars, significantChars)) {
    langCodes.add("th");
  }
  if (isDominantScript(hebrewChars, significantChars)) {
    langCodes.add("he");
  }
  if (isDominantScript(hangulChars, significantChars)) {
    langCodes.add("ko");
  }
  //
  // NOTE: if you add languages here that LT doesn't support, also update LanguageIdentifier.detectLanguage()
  // so it makes use of the fact that we have safely detected a language by its character set
  // (we can then directly assume it's not supported)
  //
  return langCodes;
}

/**
 * Tells whether a script's share of the significant characters reaches {@code THRESHOLD}.
 *
 * @param scriptChars number of significant characters that belong to the script
 * @param significantChars total number of significant characters inspected
 * @return {@code false} if there are no significant characters at all, otherwise whether
 *         {@code scriptChars / significantChars >= THRESHOLD}
 */
private boolean isDominantScript(int scriptChars, int significantChars) {
  // Explicit guard instead of relying on 0/0 == NaN comparing as false.
  return significantChars > 0 && (float) scriptChars / significantChars >= THRESHOLD;
}