in languagetool-http-client/src/main/java/org/languagetool/remote/ArtificialErrorEval.java [319:591]
/**
 * Runs one evaluation pass over the corpus file and writes the results.
 *
 * <p>The method resets the per-direction counters in {@code results}, builds a
 * {@code CheckConfiguration} from the static settings, reads every line of
 * {@code corpusFilePath} and then, depending on the active mode flags, locates
 * candidate error positions and hands each one to {@code analyzeSentence}:
 * <ul>
 *   <li>default mode (no other flag set): occurrences of {@code words[0]} / {@code words[1]},
 *       which are compiled as regular expressions;</li>
 *   <li>{@code isParallelCorpus}: tab-separated lines holding a correct and an incorrect sentence;</li>
 *   <li>{@code isDoubleLetters}: runs of a repeated ASCII letter;</li>
 *   <li>{@code isDiacritics}: tokens containing diacritics;</li>
 *   <li>{@code inflected}: tokens having one of the two {@code lemmas}.</li>
 * </ul>
 * Finally, precision and recall are computed for one or both directions (depending on
 * {@code unidirectional}), appended to {@code verboseOutputFilename} and added to
 * {@code accumulateResults}.
 *
 * @param printSummaryDetails if {@code true}, a tab-separated summary line per direction is
 *                            also appended to {@code summaryOutputFilename}
 * @throws IOException if the corpus file cannot be read; helpers called from here
 *                     may presumably raise it as well
 */
private static void run(boolean printSummaryDetails) throws IOException {
  int ignoredLines = 0;
  Arrays.fill(results[0], 0);
  Arrays.fill(results[1], 0);
  fakeRuleIDs[0] = "rules_" + words[0] + "->" + words[1]; // rules in one direction
  fakeRuleIDs[1] = "rules_" + words[1] + "->" + words[0]; // rules in the other direction

  CheckConfigurationBuilder cfgBuilder = new CheckConfigurationBuilder(langCode);
  if (enabledOnlyRules.isEmpty()) {
    cfgBuilder.disabledRuleIds("WHITESPACE_RULE");
    if (!disabledRules.isEmpty()) {
      cfgBuilder.disabledRuleIds(disabledRules);
    }
  } else {
    cfgBuilder.enabledRuleIds(enabledOnlyRules).enabledOnly();
  }
  if (!userName.isEmpty() && !apiKey.isEmpty()) {
    // only configure the builder here; the single build() call follows below
    cfgBuilder.username(userName).apiKey(apiKey);
  }
  CheckConfiguration config = cfgBuilder.build();

  long start = System.currentTimeMillis();
  List<String> lines = Files.readAllLines(Paths.get(corpusFilePath));

  if (!inflected && !isDoubleLetters && !isDiacritics && !isParallelCorpus) {
    // default mode: search the two words (interpreted as regular expressions) in every line
    final int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
    final Pattern[] wordPatterns = new Pattern[2];
    for (int dir = 0; dir < 2; dir++) {
      // word boundaries are only added when pWordboundaries accepts the word and wholeword is set
      if (pWordboundaries.matcher(words[dir]).matches() && wholeword) {
        wordPatterns[dir] = Pattern.compile("\\b" + words[dir] + "\\b", flags);
      } else {
        wordPatterns[dir] = Pattern.compile(words[dir], flags);
      }
    }
    countLine = 0;
    checkedSentences = 0;
    for (String line : lines) {
      cachedMatches = new HashMap<>();
      countLine++;
      if (countLine > maxInputSentences || checkedSentences > maxCheckedSentences) {
        break;
      }
      // direction 0 is processed before direction 1 for every line
      for (int dir = 0; dir < 2; dir++) {
        if (words[dir].length() > 0) {
          Matcher m = wordPatterns[dir].matcher(line);
          while (m.find()) {
            analyzeSentence(line, dir, m.start(), config);
          }
        }
      }
    }
  }

  if (isParallelCorpus) {
    countLine = 0;
    checkedSentences = 0;
    for (String line : lines) {
      cachedMatches = new HashMap<>();
      countLine++;
      if (countLine > maxInputSentences || checkedSentences > maxCheckedSentences) {
        break;
      }
      String[] parts = line.split("\t");
      // columnCorrect / columnIncorrect are 1-based; skip the line if EITHER column is
      // missing, otherwise the array accesses below would be out of bounds
      if (parts.length < columnCorrect || parts.length < columnIncorrect) {
        continue;
      }
      String correctSource = parts[columnCorrect - 1];
      String incorrectSource = parts[columnIncorrect - 1];
      words[0] = null;
      words[1] = null;
      // "__" is removed from both sentences before they are compared
      String correctSentence = correctSource.replace("__", "");
      String incorrectSentence = incorrectSource.replace("__", "");
      if (correctSentence.equals(incorrectSentence)) {
        printSentenceOutput("IGNORED LINE: sentences are identical!", correctSource, 0, "");
        ignoredLines++;
        continue;
      }
      List<String> diffs = StringTools.getDifference(correctSentence, incorrectSentence);
      // element 0 is used as the common prefix (its length is the error position),
      // element 1 as the differing part of the correct sentence, element 2 as that of the incorrect one
      int posError = diffs.get(0).length();
      words[1] = diffs.get(1);
      words[0] = diffs.get(2);
      if (words[1] != null) {
        // words[0] may be null!
        // check FN
        analyzeSentence(correctSentence, 1, posError, config);
        // check FP in the correct sentence
        words[0] = words[1];
        words[1] = null;
        analyzeSentence(correctSentence, 0, posError, config);
      }
    }
  }

  if (isDoubleLetters) {
    // introduce error: nn -> n
    fakeRuleIDs[0] = "rules_double_letters";
    countLine = 0;
    checkedSentences = 0;
    final Pattern repeatedLetter = Pattern.compile("([a-zA-Z])\\1+");
    for (String line : lines) {
      cachedMatches = new HashMap<>();
      countLine++;
      if (countLine > maxInputSentences || checkedSentences > maxCheckedSentences) {
        break;
      }
      Matcher m = repeatedLetter.matcher(line);
      while (m.find()) {
        words[1] = m.group(0);
        words[0] = words[1].substring(0, 1); // the whole run collapsed to a single letter
        analyzeSentence(line, 1, m.start(), config);
      }
    }
  }

  if (isDiacritics) {
    // check missing diacritics
    countLine = 0;
    checkedSentences = 0;
    for (String line : lines) {
      cachedMatches = new HashMap<>();
      countLine++;
      if (countLine > maxInputSentences || checkedSentences > maxCheckedSentences) {
        break;
      }
      List<String> tokens = language.getWordTokenizer().tokenize(line);
      // NOTE(review): pos is the sum of the preceding token lengths; this is only the real
      // offset in the line if the tokenizer also returns whitespace tokens - confirm
      int pos = 0;
      for (String token : tokens) {
        if (StringTools.hasDiacritics(token)) {
          words[1] = token;
          words[0] = StringTools.removeDiacritics(token);
          analyzeSentence(line, 1, pos, config);
        }
        pos += token.length();
      }
    }
  }

  if (inflected) {
    // search lemma
    countLine = 0;
    checkedSentences = 0;
    for (String line : lines) {
      cachedMatches = new HashMap<>();
      countLine++;
      if (countLine > maxInputSentences || checkedSentences > maxCheckedSentences) {
        break;
      }
      List<AnalyzedSentence> analyzedSentences = localLt.analyzeText(line);
      for (AnalyzedSentence analyzedSentence : analyzedSentences) {
        for (AnalyzedTokenReadings token : analyzedSentence.getTokensWithoutWhitespace()) {
          // direction 0 is processed before direction 1 for every token
          for (int dir = 0; dir < 2; dir++) {
            int other = 1 - dir;
            if (lemmas[dir].length() > 0 && token.hasLemma(lemmas[dir])) {
              // keep the form and POS tag of the found reading, but swap in the other lemma
              AnalyzedToken found = token.readingWithLemma(lemmas[dir]);
              AnalyzedToken swapped = new AnalyzedToken(found.getToken(), found.getPOSTag(), lemmas[other]);
              String[] synthesizedWords = synth.synthesize(swapped, swapped.getPOSTag());
              // NOTE(review): assumes the synthesizer may return an empty array when no form
              // exists for the other lemma; such a token is skipped instead of aborting the run
              if (synthesizedWords.length > 0) {
                words[dir] = token.getToken();
                words[other] = synthesizedWords[0];
                analyzeSentence(line, dir, token.getStartPos(), config);
              }
            }
          }
        }
      }
    }
  }

  // print results
  // resolve the column of every classification type once instead of on each access
  final int idxTP = classifyTypes.indexOf("TP");
  final int idxFP = classifyTypes.indexOf("FP");
  final int idxTN = classifyTypes.indexOf("TN");
  final int idxFN = classifyTypes.indexOf("FN");
  final int idxTPns = classifyTypes.indexOf("TPns");
  final int idxTPws = classifyTypes.indexOf("TPws");
  int oneOrTwo = (unidirectional ? 1 : 2);
  for (int i = 0; i < oneOrTwo; i++) {
    int tp = results[i][idxTP];
    int fp = results[i][idxFP];
    int tn = results[i][idxTN];
    int fn = results[i][idxFN];
    int tpns = results[i][idxTPns];
    int tpws = results[i][idxTPws];

    // float division: an empty denominator yields NaN rather than an exception
    float precision = tp / (float) (tp + fp);
    float recall = tp / (float) (tp + fn + tpns + tpws);
    // recall including empty suggestions
    float recall2 = (tp + tpns) / (float) (tp + fn + tpns + tpws);
    int errorsTotal = tp + fp + tn + fn + tpns + tpws;
    int nCorrectSentences = fp + tn;
    int nIncorrectSentences = tp + tpns + tpws + fn;
    int nSentences = nCorrectSentences + nIncorrectSentences;
    String precisionStr = String.format(Locale.ROOT, "%.4f", precision);
    String recallStr = String.format(Locale.ROOT, "%.4f", recall);

    StringBuilder resultsString = new StringBuilder();
    resultsString.append("-------------------------------------\n");
    resultsString.append("Results for " + fakeRuleIDs[i] + "\n");
    resultsString.append("Total sentences: " + errorsTotal + "\n");
    resultsString.append(formattedAbsoluteAndPercentage("\nCorrect sentences", nCorrectSentences, nSentences));
    resultsString.append(formattedAbsoluteAndPercentage("FP", fp, nCorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage("TN", tn, nCorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage("\nIncorrect sentences", nIncorrectSentences, nSentences));
    resultsString.append(formattedAbsoluteAndPercentage("TP (total)", tpns + tpws + tp, nIncorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage(" TP (expected suggestion)", tp, nIncorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage(" TPns (no suggestion)", tpns, nIncorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage(" TPws (wrong suggestion)", tpws, nIncorrectSentences));
    resultsString.append(formattedAbsoluteAndPercentage("FN", fn, nIncorrectSentences));
    resultsString.append("\nPrecision: " + precisionStr + "\n");
    resultsString.append("Recall: " + recallStr + "\n");
    resultsString.append("Recall (including empty suggestions): " + String.format(Locale.ROOT, "%.4f", recall2) + "\n");
    if (ignoredLines > 0) {
      resultsString.append("\nIgnored lines from source: " + ignoredLines + "\n");
    }
    resultsString.append(printTimeFromStart(start, ""));
    resultsString.append("\n" + printCurrentDateTime() + "\n");
    appendToFile(verboseOutputFilename, resultsString.toString());

    if (printSummaryDetails) {
      appendToFile(summaryOutputFilename, errorCategory + "\t" + fakeRuleIDs[i]
          + "\t" + errorsTotal + "\t" + precisionStr + "\t" + recallStr + "\t"
          + tp + "\t"
          + fp + "\t"
          + tn + "\t"
          + fn + "\t"
          + tpns + "\t"
          + tpws + "\t");
    }

    accumulateResults[0] += errorsTotal;
    accumulateResults[1] += tp;
    accumulateResults[2] += fp;
    accumulateResults[3] += tn;
    accumulateResults[4] += fn;
  }
  System.out.println(printTimeFromStart(start, ""));
  System.out.println("-------------------------------------");
}