public List getDetectedLanguageScores()

in languagetool-core/src/main/java/org/languagetool/language/identifier/DefaultLanguageIdentifier.java [241:362]


  /**
   * Detects the language(s) of the given text and returns them together with a score.
   * <p>
   * Processing steps:
   * <ol>
   *   <li>If an ngram or fastText detector is available, one of them scores the text: ngram is used when
   *       the text has at most {@code SHORT_ALGO_THRESHOLD} characters or fastText is not available (and
   *       ngram is), fastText otherwise.</li>
   *   <li>If fastText's best score is below {@code FASTTEXT_CONFIDENCE_THRESHOLD}, or the best result is
   *       the code {@code "zz"} or missing, the known-word counts per language are added to the scores.</li>
   *   <li>Danish is dropped if Norwegian is preferred and Danish is not. Non-preferred languages are dropped
   *       if preferred languages are given and either the text has at most
   *       {@code CONSIDER_ONLY_PREFERRED_THRESHOLD} characters or {@code limitOnPreferredLangs} is set.</li>
   *   <li>If no detector is available or detection threw an exception, the result of
   *       {@code detectLanguageCode} is added to the scores instead.</li>
   *   <li>If nothing could be detected, the first preferred language (if supported) is returned with
   *       a score of 0.1.</li>
   * </ol>
   *
   * @param cleanText the text to analyze (presumably already cleaned by the caller)
   * @param noopLangsTmp passed to {@code prepareDetectLanguage}, which provides the additional languages
   * @param preferredLangsTmp passed to {@code prepareDetectLanguage}, which provides the preferred languages
   * @param limitOnPreferredLangs if {@code true}, languages that are not preferred are removed from the
   *                              scores regardless of the text length
   * @param count if greater than 1, the entries of {@code getOrderedScores(scores, count)} are returned,
   *              otherwise only the highest scoring result
   * @return the detected languages, possibly empty; a single entry for {@code NoopLanguage} if
   *         {@code prepareDetectLanguage} returns {@code null}
   */
  public List<DetectedLanguage> getDetectedLanguageScores(String cleanText, List<String> noopLangsTmp, List<String> preferredLangsTmp, boolean limitOnPreferredLangs, int count) {
    String text = cleanText;
    ParsedLanguageLists parsedLanguageLists = prepareDetectLanguage(text, noopLangsTmp, preferredLangsTmp);
    if (parsedLanguageLists == null) {
      return Collections.singletonList(new DetectedLanguage(null, new NoopLanguage()));
    }
    List<String> additionalLangs = parsedLanguageLists.getAdditionalLangs();
    List<String> preferredLangs = parsedLanguageLists.getPreferredLangs();

    Map<String, Double> scores = null;
    boolean fasttextFailed = false;
    // records which algorithms contributed to the result, e.g. "fasttext+commonwords"
    String source = "";
    if (fastTextDetector != null || ngram != null) {
      try {
        boolean usingFastText = false;
        if ((text.length() <= SHORT_ALGO_THRESHOLD || fastTextDetector == null) && ngram != null) {
          scores = ngram.detectLanguages(text.trim(), additionalLangs);
          source += "ngram";
        } else {
          usingFastText = true;
          scores = fastTextDetector.runFasttext(text, additionalLangs);
          source += "fasttext";
        }
        Map.Entry<String, Double> topResult = getHighestScoringResult(scores);
        String topLang = topResult.getKey();
        boolean lowFastTextConfidence = usingFastText && topResult.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD;
        // The key may be null (the single-result branch below guards against that as well). Treat a
        // missing top result like the unknown code "zz" instead of failing with a NullPointerException,
        // which would be caught below and wrongly handled as a fastText failure.
        if (lowFastTextConfidence || topLang == null || topLang.equals("zz")) {
          Map<Language, Integer> lang2Count = COMMON_WORDS_LANG_IDENTIFIER.getKnownWordsPerLanguage(text);
          Set<String> baseLangAlreadyHandled = new HashSet<>();
          for (Map.Entry<Language, Integer> entry : lang2Count.entrySet()) {
            String langCode = entry.getKey().getShortCode();
            if (!baseLangAlreadyHandled.add(langCode)) {
              // quick hack to fix #5772: only the first entry per short code is considered
              continue;
            }
            // adding the raw count looks arbitrary, but gave best results with evaluation (LanguageDetectionMinLengthEval):
            scores.merge(langCode, entry.getValue().doubleValue(), Double::sum);
          }
          source += "+commonwords";
        }
        if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
          // Special case, as Norwegian easily gets detected as Danish (https://github.com/languagetool-org/languagetool/issues/5520).
          scores.remove("da");
        }
        if (!preferredLangs.isEmpty() && (text.length() <= CONSIDER_ONLY_PREFERRED_THRESHOLD || limitOnPreferredLangs)) {
          boolean wasRemoved = scores.keySet().removeIf(k -> !preferredLangs.contains(k));
          if (wasRemoved && scores.isEmpty() && limitOnPreferredLangs) {
            //TODO: just to see how often we would return no results because of that parameter -> remove later
            logger.warn("No language detected for text after remove all not preferred languages from score.");
          }
          source += "+prefLang(forced: " + limitOnPreferredLangs + ")";
        }
      } catch (FastTextDetector.FastTextException e) {
        fasttextFailed = true;
        if (e.isDisabled()) {
          reinitFasttextAfterFailure(e);
        } else {
          logger.error("Fasttext failed, fallback used", e);
        }
      } catch (Exception e) {
        fasttextFailed = true;
        reinitFasttextAfterFailure(e);
      }
    }
    if ((fastTextDetector == null && ngram == null) || fasttextFailed) { // no else, value can change in if clause
      text = textObjectFactory.forText(text).toString();
      source += "+fallback";
      if (scores == null) {
        scores = new HashMap<>();
      }
      Map.Entry<String, Double> localResult = detectLanguageCode(text, preferredLangs, limitOnPreferredLangs);
      if (localResult != null) {
        scores.put(localResult.getKey(), localResult.getValue());
      }
      if (!additionalLangs.isEmpty()) {
        logger.warn("Cannot consider noopLanguages because not in fastText mode: {}", additionalLangs);
      }
    }

    List<DetectedLanguage> detectedLanguages = new LinkedList<>();
    if (count > 1) {
      Map<String, Double> orderedScores = getOrderedScores(scores, count);
      for (Map.Entry<String, Double> entry : orderedScores.entrySet()) {
        if (entry.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(entry.getKey(), additionalLangs)) {
          float rate = Math.round(entry.getValue() * 100.0) / 100.0f; // Convert to a non-scientific float, rounded to two decimal places
          detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(entry.getKey(), additionalLangs), rate, source));
        }
      }
    } else {
      Map.Entry<String, Double> highestScoringResult = getHighestScoringResult(scores);
      if (highestScoringResult.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(highestScoringResult.getKey(), additionalLangs)) {
        float newScore;
        if (source.contains("fasttext")) {
          // Calculate a trivial confidence value because fasttext's confidence is often
          // wrong for short cleanText (e.g. 0.99 for a test that's misclassified). Don't
          // use 1.0 because we can never be totally sure...
          // The value grows with the text length and reaches 0.99 at 30 characters.
          newScore = (float) (0.99/ (30.0 / Math.min(text.length(), 30)));
        } else {
          newScore = highestScoringResult.getValue().floatValue();
        }
        detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(highestScoringResult.getKey(), additionalLangs), newScore, source));
      }
    }
    if (detectedLanguages.isEmpty() && !preferredLangs.isEmpty()) {
      // last resort: nothing was detected, so fall back to the first preferred language with a low score
      String firstPreferredLang = preferredLangs.get(0);
      if (firstPreferredLang != null && !firstPreferredLang.trim().isEmpty() && Languages.isLanguageSupported(firstPreferredLang)) {
        source += "+fallbackToPrefLang";
        detectedLanguages.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(firstPreferredLang), 0.1f, source));
      }
    }
    return detectedLanguages;
  }