in languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java [241:362]
/**
 * Scores the candidate languages for the given text and returns the detected language(s).
 * <p>
 * Detection runs in up to three stages:
 * <ol>
 *   <li>ngram (for texts of at most {@code SHORT_ALGO_THRESHOLD} characters, or when no fastText
 *       detector is available) or fastText otherwise; the scores may then be boosted with
 *       common-word counts and filtered by the preferred languages,</li>
 *   <li>a fallback via {@code detectLanguageCode} if neither detector is available or the
 *       first stage threw an exception,</li>
 *   <li>as a last resort, the first preferred language (if supported) with a fixed score of 0.1.</li>
 * </ol>
 *
 * @param cleanText the text to analyze
 * @param noopLangsTmp passed to {@code prepareDetectLanguage}, which derives the additional languages from it
 * @param preferredLangsTmp passed to {@code prepareDetectLanguage}, which derives the preferred languages from it
 * @param limitOnPreferredLangs if {@code true} (and preferred languages are given), scores of all
 *        non-preferred languages are dropped regardless of the text length
 * @param count if greater than 1, the entries returned by {@code getOrderedScores(scores, count)} are
 *        reported; otherwise only the highest-scoring language is reported
 * @return the detected languages, possibly empty; a single entry wrapping a {@code NoopLanguage}
 *         if {@code prepareDetectLanguage} returns {@code null}
 */
public List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
  String text = cleanText;
  ParsedLanguageLists parsedLanguageLists = prepareDetectLanguage(text, noopLangsTmp, preferredLangsTmp);
  if (parsedLanguageLists == null) {
    return Collections.singletonList(new DetectedLanguage(null, new NoopLanguage()));
  }
  List<String> additionalLangs = parsedLanguageLists.getAdditionalLangs();
  List<String> preferredLangs = parsedLanguageLists.getPreferredLangs();
  Map<String, Double> scores = null;
  boolean fasttextFailed = false;
  // Records which detection stages contributed to the result; handed to every DetectedLanguage.
  String source = "";
  if (fastTextDetector != null || ngram != null) {
    try {
      boolean usingFastText;
      if ((text.length() <= SHORT_ALGO_THRESHOLD || fastTextDetector == null) && ngram != null) {
        usingFastText = false;
        scores = ngram.detectLanguages(text.trim(), additionalLangs);
        source += "ngram";
      } else {
        usingFastText = true;
        scores = fastTextDetector.runFasttext(text, additionalLangs);
        source += "fasttext";
      }
      Map.Entry<String, Double> topResult = getHighestScoringResult(scores);
      boolean lowConfidence = usingFastText && topResult.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD;
      // Null-safe comparison: the key of the highest-scoring result may be null (it is null-checked
      // further down as well). A null key must not raise a NullPointerException here, because the
      // generic catch below would misreport it as a fastText failure.
      // NOTE(review): "zz" is presumably the code of the no-op/unknown language - confirm.
      if (lowConfidence || "zz".equals(topResult.getKey())) {
        Map<Language, Integer> lang2Count = COMMON_WORDS_LANG_IDENTIFIER.getKnownWordsPerLanguage(text);
        Set<String> baseLangAlreadyHandled = new HashSet<>();
        for (Map.Entry<Language, Integer> entry : lang2Count.entrySet()) {
          String langCode = entry.getKey().getShortCode();
          if (!baseLangAlreadyHandled.add(langCode)) {
            // quick hack to fix #5772: count each short code only once
            continue;
          }
          // this looks arbitrary, but gave best results with evaluation (LanguageDetectionMinLengthEval):
          scores.merge(langCode, entry.getValue().doubleValue(), Double::sum);
        }
        source += "+commonwords";
      }
      if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
        // Special case, as Norwegian easily gets detected as Danish (https://github.com/languagetool-org/languagetool/issues/5520).
        scores.remove("da");
      }
      if (!preferredLangs.isEmpty() && (text.length() <= CONSIDER_ONLY_PREFERRED_THRESHOLD || limitOnPreferredLangs)) {
        boolean wasRemoved = scores.keySet().removeIf(k -> !preferredLangs.contains(k));
        if (wasRemoved && scores.isEmpty() && limitOnPreferredLangs) {
          //TODO: just to see how often we would return no results because of that parameter -> remove later
          logger.warn("No language detected for text after remove all not preferred languages from score.");
        }
        source += "+prefLang(forced: " + limitOnPreferredLangs + ")";
      }
    } catch (FastTextDetector.FastTextException e) {
      fasttextFailed = true;
      if (e.isDisabled()) {
        reinitFasttextAfterFailure(e);
      } else {
        logger.error("Fasttext failed, fallback used", e);
      }
    } catch (Exception e) {
      fasttextFailed = true;
      reinitFasttextAfterFailure(e);
    }
  }
  // Deliberately not an 'else': the state checked here can change inside the block above.
  if ((fastTextDetector == null && ngram == null) || fasttextFailed) {
    text = textObjectFactory.forText(text).toString();
    source += "+fallback";
    if (scores == null) {
      scores = new HashMap<>();
    }
    Map.Entry<String, Double> localResult = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
    if (localResult != null) {
      scores.put(localResult.getKey(), localResult.getValue());
    }
    if (!additionalLangs.isEmpty()) {
      logger.warn("Cannot consider noopLanguages because not in fastText mode: {}", additionalLangs);
    }
  }
  List<DetectedLanguage> detectedLanguages = new LinkedList<>();
  if (count > 1) {
    for (Map.Entry<String, Double> entry : getOrderedScores(scores, count).entrySet()) {
      String code = entry.getKey();
      if (code != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(code, additionalLangs)) {
        // Convert to a non-scientific float, rounded to two decimal places
        float rate = Math.round(entry.getValue() * 100.0) / 100.0f;
        detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(code, additionalLangs), rate, source));
      }
    }
  } else {
    Map.Entry<String, Double> highestScoringResult = getHighestScoringResult(scores);
    String code = highestScoringResult.getKey();
    if (code != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(code, additionalLangs)) {
      float newScore;
      if (source.contains("fasttext")) {
        // Calculate a trivial confidence value because fasttext's confidence is often
        // wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
        // use 1.0 because we can never be totally sure...
        // The value grows linearly with the text length and is capped at 0.99 from 30 chars on.
        newScore = (float) (0.99 / (30.0 / Math.min(text.length(), 30)));
      } else {
        newScore = highestScoringResult.getValue().floatValue();
      }
      detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(code, additionalLangs), newScore, source));
    }
  }
  // Last resort: nothing usable was detected, so report the first preferred language with a low score.
  if (detectedLanguages.isEmpty() && !preferredLangs.isEmpty()) {
    String firstPreferred = preferredLangs.get(0);
    if (firstPreferred != null && !firstPreferred.trim().isEmpty() && Languages.isLanguageSupported(firstPreferred)) {
      source += "+fallbackToPrefLang";
      detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(firstPreferred), 0.1f, source));
    }
  }
  return detectedLanguages;
}