in languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceChecker.java [152:304]
/**
 * Checks the sentences read from the input files given via option {@code f} and hands the
 * rule matches of each sentence to a result handler, then prints a summary to stdout.
 * <p>
 * The handler is a {@code CSVHandler} if option {@code csv} is set, otherwise a
 * {@code DatabaseHandler} if {@code propFile} is not null, otherwise a {@code StdoutHandler}.
 * In {@code print-correct} mode no handler is called; instead, sentences without any match
 * are printed to stdout.
 *
 * @param propFile properties file passed to the {@code DatabaseHandler}; may be null
 * @param disabledRules passed to {@code applyRuleDeactivation} when neither option
 *                      {@code rulesource} nor option {@code r} is given
 * @param langCode short code of the language to check, resolved via {@code Languages}
 * @param motherTongueCode short code of the mother tongue, or null
 * @param maxSentences sentence limit passed to the result handler (values &lt;= 0 are reported as "no limit")
 * @param maxErrors error limit passed to the result handler (values &lt;= 0 are reported as "no limit")
 * @param contextSize context size passed to the {@code StdoutHandler}
 * @param options the parsed command line
 * @throws IOException declared for the language tool setup and sentence source calls made here
 * @throws RuntimeException if checking a sentence fails and option {@code skip-exceptions} is not set
 */
private void run(File propFile, Set<String> disabledRules, String langCode, String motherTongueCode,
                 int maxSentences, int maxErrors, int contextSize,
                 CommandLine options) throws IOException {
  long startTime = System.currentTimeMillis();
  String[] ruleIds = options.hasOption('r') ? options.getOptionValue('r').split(",") : null;
  String[] additionalCategoryIds = options.hasOption("also-enable-categories") ? options.getOptionValue("also-enable-categories").split(",") : null;
  String[] fileNames = options.getOptionValues('f');
  File languageModelDir = options.hasOption("languagemodel") ? new File(options.getOptionValue("languagemodel")) : null;
  File remoteRules = options.hasOption("remoterules") ? new File(options.getOptionValue("remoterules")) : null;
  Pattern filter = options.hasOption("filter") ? Pattern.compile(options.getOptionValue("filter")) : null;
  String ruleSource = options.hasOption("rulesource") ? options.getOptionValue("rulesource") : null;
  int sentencesToSkip = options.hasOption("skip") ? Integer.parseInt(options.getOptionValue("skip")) : 0;
  // These flags do not change while running, so look them up once instead of once per sentence.
  boolean printCorrect = options.hasOption("print-correct");
  boolean skipExceptions = options.hasOption("skip-exceptions");
  Language lang = Languages.getLanguageForShortCode(langCode);
  Language motherTongue = motherTongueCode != null ? Languages.getLanguageForShortCode(motherTongueCode) : null;
  GlobalConfig globalConfig = new GlobalConfig();
  System.out.println("Premium: " + Premium.isPremiumVersion());
  if (options.hasOption("nerUrl")) {
    System.out.println("Using NER service: " + options.getOptionValue("nerUrl"));
    globalConfig.setNERUrl(options.getOptionValue("nerUrl"));
  }
  if (printCorrect) {
    System.out.println("In print-correct mode, will only print sentences for which no error is found.");
  }
  MultiThreadedJLanguageTool lt = new MultiThreadedJLanguageTool(lang, motherTongue, -1, globalConfig, null);
  lt.setCleanOverlappingMatches(false);
  if (languageModelDir != null) {
    lt.activateLanguageModelRules(languageModelDir);
  }
  int activatedBySource = 0;
  for (Rule rule : lt.getAllRules()) {
    if (rule.isDefaultTempOff()) {
      System.out.println("Activating " + rule.getFullId() + ", which is default='temp_off'");
      lt.enableRule(rule.getId());
    }
    if (ruleSource != null) {
      // With a rule source given, only pattern rules from that source file that are not
      // off by default stay enabled; every other rule is disabled.
      boolean enable = false;
      if (rule instanceof AbstractPatternRule) {
        String sourceFile = rule.getSourceFile();
        if (sourceFile != null && sourceFile.endsWith("/" + ruleSource) && !rule.isDefaultOff()) {
          enable = true;
          activatedBySource++;
        }
      }
      if (enable) {
        lt.enableRule(rule.getId());
      } else {
        lt.disableRule(rule.getId());
      }
    }
  }
  lt.activateRemoteRules(remoteRules);
  if (ruleSource == null) {
    if (ruleIds != null) {
      enableOnlySpecifiedRules(ruleIds, lt);
    } else {
      applyRuleDeactivation(lt, disabledRules);
    }
  } else {
    System.out.println("Activated " + activatedBySource + " rules from " + ruleSource);
  }
  if (filter != null) {
    System.out.println("*** NOTE: only sentences that match regular expression '" + filter + "' will be checked");
  }
  activateAdditionalCategories(additionalCategoryIds, lt);
  if (options.hasOption("spelling")) {
    System.out.println("Spelling rules active: yes (only if you're using a language code like en-US which comes with spelling)");
  } else if (ruleIds == null) {
    disableSpellingRules(lt);
    System.out.println("Spelling rules active: no");
  }
  System.out.println("Working on: " + StringUtils.join(fileNames, ", "));
  System.out.println("Sentence limit: " + (maxSentences > 0 ? maxSentences : "no limit"));
  System.out.println("Context size: " + contextSize);
  System.out.println("Error limit: " + (maxErrors > 0 ? maxErrors : "no limit"));
  System.out.println("Skip: " + sentencesToSkip);
  //System.out.println("Version: " + JLanguageTool.VERSION + " (" + JLanguageTool.BUILD_DATE + ")");
  ResultHandler resultHandler = null;
  // Declared outside the try block so the summary in 'finally' can read the ignored count
  // even when the run is stopped early by a sentence or error limit.
  MixingSentenceSource mixingSource = null;
  int ruleMatchCount = 0;
  int sentenceCount = 0;
  int skipCount = 0;
  boolean skipMessageShown = false;
  try {
    if (options.hasOption("csv")) {
      resultHandler = new CSVHandler(maxSentences, maxErrors);
    } else if (propFile != null) {
      resultHandler = new DatabaseHandler(propFile, maxSentences, maxErrors);
    } else {
      resultHandler = new StdoutHandler(maxSentences, maxErrors, contextSize, options.hasOption("verbose"));
    }
    mixingSource = MixingSentenceSource.create(Arrays.asList(fileNames), lang, filter);
    while (mixingSource.hasNext()) {
      Sentence sentence = mixingSource.next();
      if (sentencesToSkip > 0 && skipCount < sentencesToSkip) {
        if (skipCount % 5000 == 0) {
          System.err.printf("%s sentences skipped...\n", NumberFormat.getNumberInstance(Locale.US).format(skipCount));
        }
        skipCount++;
        continue;
      } else if (sentencesToSkip > 0 && !skipMessageShown) {
        System.err.println("Done skipping " + sentencesToSkip + " sentences.");
        skipMessageShown = true;
      }
      try {
        AnnotatedText annotatedText = new AnnotatedTextBuilder().addText(sentence.getText()).build();
        CheckResults matches = lt.check2(annotatedText, true, JLanguageTool.ParagraphHandling.NORMAL, null,
          JLanguageTool.Mode.ALL, JLanguageTool.Level.PICKY, new HashSet<>(Arrays.asList(ToneTag.values())), null);
        if (printCorrect) {
          if (matches.getRuleMatches().isEmpty()) {
            System.out.println(sentence.getText());
          }
        } else {
          resultHandler.handleResult(sentence, matches.getRuleMatches(), lang);
        }
        sentenceCount++;
        if (sentenceCount % 5000 == 0) {
          System.err.printf("%s sentences checked...\n", NumberFormat.getNumberInstance(Locale.US).format(sentenceCount));
        }
        ruleMatchCount += matches.getRuleMatches().size();
      } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
        // Limit exceptions end the whole run; they are handled by the outer catch.
        throw e;
      } catch (Exception e) {
        if (skipExceptions) {
          e.printStackTrace();
        } else {
          throw new RuntimeException("Check failed on sentence: " + StringUtils.abbreviate(sentence.getText(), 250), e);
        }
      }
    }
  } catch (DocumentLimitReachedException | ErrorLimitReachedException e) {
    System.out.println(getClass().getSimpleName() + ": " + e);
  } finally {
    lt.shutdown();
    if (resultHandler != null) {
      // Read the count here rather than after the loop, so it is also correct when a limit
      // exception ended the loop early.
      int ignoredCount = mixingSource != null ? mixingSource.getIgnoredCount() : 0;
      // 'lang' is passed as an argument instead of being concatenated into the format string,
      // so a '%' in its string form cannot be misread as a format specifier.
      System.out.printf("%s: %d total matches\n", lang, ruleMatchCount);
      System.out.printf("%s: %d total sentences considered\n", lang, sentenceCount);
      // Avoid printing "NaN" when no sentence was checked.
      float matchesPerSentence = sentenceCount > 0 ? (float) ruleMatchCount / sentenceCount : 0f;
      System.out.printf(Locale.ENGLISH, "%s: ø%.2f rule matches per sentence\n", lang, matchesPerSentence);
      System.out.printf(Locale.ENGLISH, "%s: %d input lines ignored (e.g. not between %d and %d chars or at least %d tokens)\n", lang, ignoredCount,
        SentenceSource.MIN_SENTENCE_LENGTH, SentenceSource.MAX_SENTENCE_LENGTH, SentenceSource.MIN_SENTENCE_TOKEN_COUNT);
      if (options.hasOption("print-duration")) {
        System.out.println("The analysis took " + (System.currentTimeMillis() - startTime) + "ms");
      }
      try {
        resultHandler.close();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
}