in languagetool-dev/src/main/java/org/languagetool/rules/spelling/suggestions/SuggestionChangesTest.java [286:440]
/**
 * Runs the configured suggestion experiments over all configured datasets and records the outcome.
 *
 * <p>The configuration is read from the JSON file named by the system property {@code config}
 * (default {@code SuggestionChangesTestConfig.json}; comments in the JSON are allowed). Two files
 * are created in {@code config.logDir}, both named after the current timestamp:
 * <ul>
 *   <li>a {@code .log} file containing the configuration, the list of experiments and the
 *       per-result log messages produced by the worker threads;</li>
 *   <li>a {@code .csv} file with one row per evaluated correction, holding the input columns
 *       followed by three JSON columns (suggestions, match features, per-suggestion features)
 *       for every rule match in the result.</li>
 * </ul>
 *
 * <p>Threading: the calling thread parses the dataset CSVs and puts work items on a bounded queue,
 * one {@code SuggestionTestThread} per available processor is started on that queue, and a
 * dedicated logger thread writes the queued results to both output files. The logger drains the
 * result queue completely before the output files are closed.
 *
 * @throws IOException if the configuration, a dataset or one of the output files cannot be
 *         read or written
 * @throws InterruptedException if the calling thread is interrupted while queueing work or while
 *         waiting for the worker or logger threads
 * @throws IllegalStateException if a dataset has {@code enforceCorrect} enabled but a record
 *         lacks the {@code correction} column
 */
public void testChanges() throws IOException, InterruptedException {
  File configFile = new File(System.getProperty("config", "SuggestionChangesTestConfig.json"));
  ObjectMapper mapper = new ObjectMapper(new JsonFactory().enable(JsonParser.Feature.ALLOW_COMMENTS));
  SuggestionChangesTestConfig config = mapper.readValue(configFile, SuggestionChangesTestConfig.class);

  // NOTE(review): the timestamp contains ':' which is not a valid file name character on Windows.
  SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss");
  String timestamp = dateFormat.format(new Date());
  Path loggingFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.log", timestamp));
  Path datasetFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.csv", timestamp));

  // try-with-resources: both output files are flushed and closed on every exit path.
  try (BufferedWriter writer = Files.newBufferedWriter(loggingFile);
       BufferedWriter datasetOut = Files.newBufferedWriter(datasetFile);
       CSVPrinter datasetWriter = new CSVPrinter(datasetOut, CSVFormat.DEFAULT.withEscape('\\'))) {

    List<String> datasetHeader = new ArrayList<>(
      Arrays.asList("sentence", "correction", "covered", "replacement", "dataset_id"));
    SuggestionsChanges.init(config, writer);

    writer.write("Evaluation configuration: \n");
    String configContent = String.join("\n", Files.readAllLines(configFile.toPath()));
    writer.write(configContent);
    writer.write("\nRunning experiments: \n");

    // Every experiment contributes three CSV columns, numbered from 1.
    int experimentId = 0;
    for (SuggestionChangesExperiment experiment : SuggestionsChanges.getInstance().getExperiments()) {
      experimentId++;
      writer.write(String.format("#%d: %s%n", experimentId, experiment));
      datasetHeader.add(String.format("experiment_%d_suggestions", experimentId));
      datasetHeader.add(String.format("experiment_%d_metadata", experimentId));
      datasetHeader.add(String.format("experiment_%d_suggestions_metadata", experimentId));
    }
    writer.newLine();
    datasetWriter.printRecord(datasetHeader);

    // Bounded task queue: put() blocks the reader when the workers fall behind.
    BlockingQueue<SuggestionTestData> tasks = new LinkedBlockingQueue<>(1000);
    ConcurrentLinkedQueue<Pair<SuggestionTestResultData, String>> results = new ConcurrentLinkedQueue<>();
    List<SuggestionTestThread> threads = new ArrayList<>();
    for (int i = 0; i < Runtime.getRuntime().availableProcessors(); i++) {
      SuggestionTestThread worker = new SuggestionTestThread(tasks, results);
      worker.start();
      threads.add(worker);
    }

    // Set once no further results can arrive. A flag is used instead of Thread.interrupt()
    // because interrupting a thread that writes through an NIO-backed stream may close the
    // underlying file channel. Fully qualified since the file's import block is not touched here.
    final java.util.concurrent.atomic.AtomicBoolean producersDone =
      new java.util.concurrent.atomic.AtomicBoolean(false);

    // Thread for writing results from worker threads into CSV
    Thread logger = new Thread(() -> {
      try {
        long messages = 0;
        while (true) {
          // Read the flag BEFORE polling: if it was already set, every result is in the queue,
          // so an empty poll really means there is nothing left to write.
          boolean finished = producersDone.get();
          Pair<SuggestionTestResultData, String> message = results.poll();
          if (message == null) {
            if (finished) {
              break;
            }
            // Back off briefly instead of spinning on an empty queue.
            Thread.sleep(5);
            continue;
          }
          writer.write(message.getRight());
          SuggestionTestResultData result = message.getLeft();
          // Only results where every match carries at least one suggestion become a CSV row.
          boolean complete = result != null
            && result.getSuggestions() != null
            && !result.getSuggestions().isEmpty()
            && result.getSuggestions().stream().noneMatch(
                 m -> m.getSuggestedReplacements() == null || m.getSuggestedReplacements().isEmpty());
          if (complete) {
            // Computed only after the null check; 1-based position of the dataset in the config.
            int resultDatasetId = 1 + config.datasets.indexOf(result.getInput().getDataset());
            List<Object> row = new ArrayList<>(Arrays.asList(
              result.getInput().getSentence(), result.getInput().getCorrection(),
              result.getInput().getCovered(), result.getInput().getReplacement(), resultDatasetId));
            for (RuleMatch match : result.getSuggestions()) {
              List<String> suggestions = match.getSuggestedReplacements();
              row.add(mapper.writeValueAsString(suggestions));
              // features extracted by SuggestionsOrdererFeatureExtractor
              row.add(mapper.writeValueAsString(match.getFeatures()));
              List<SortedMap<String, Float>> suggestionsMetadata = new ArrayList<>();
              for (SuggestedReplacement replacement : match.getSuggestedReplacementObjects()) {
                suggestionsMetadata.add(replacement.getFeatures());
              }
              row.add(mapper.writeValueAsString(suggestionsMetadata));
            }
            datasetWriter.printRecord(row);
          }
          if (++messages % 1000 == 0) {
            writer.flush();
            System.out.printf("Evaluated %d corrections.%n", messages);
          }
        }
        writer.flush();
        datasetWriter.flush();
      } catch (IOException e) {
        throw new RuntimeException(e);
      } catch (InterruptedException e) {
        // Restore the flag and stop; nobody in this method interrupts the logger deliberately.
        Thread.currentThread().interrupt();
      }
    });
    logger.setDaemon(true);
    logger.start();

    // format straight from database dump
    String[] header = {"id", "sentence", "correction", "language", "rule_id", "suggestion_pos", "accept_language",
      "country", "region", "created_at", "updated_at", "covered", "replacement", "text_session_id", "client"};

    try {
      int datasetId = 0;
      // read data, send to worker threads via queue
      for (SuggestionChangesDataset dataset : config.datasets) {
        writer.write(String.format("Evaluating dataset #%d: %s.%n", ++datasetId, dataset));
        boolean isDump = "dump".equals(dataset.type);
        CSVFormat format = CSVFormat.DEFAULT;
        if (isDump) {
          // Dumps have no header row and encode NULL as \N.
          format = format.withEscape('\\').withNullString("\\N").withHeader(header);
        } else if ("artificial".equals(dataset.type)) {
          format = format.withEscape('\\').withFirstRecordAsHeader();
        }
        // NOTE(review): FileReader decodes with the platform default charset - confirm the
        // datasets are never read on a machine whose default differs from their encoding.
        try (CSVParser parser = new CSVParser(new FileReader(dataset.path), format)) {
          for (CSVRecord csvRecord : parser) {
            String lang = csvRecord.get("language");
            String rule = isDump ? csvRecord.get("rule_id") : "";
            String covered = csvRecord.get("covered");
            String replacement = csvRecord.get("replacement");
            String sentence = csvRecord.get("sentence");
            String correction = csvRecord.isSet("correction") ? csvRecord.get("correction") : "";
            String acceptLanguage = isDump ? csvRecord.get("accept_language") : "";

            if (sentence == null || sentence.trim().isEmpty()) {
              continue;
            }
            if (!config.language.equals(lang)) {
              continue; // TODO handle auto maybe?
            }
            if (isDump && !config.rule.equals(rule)) {
              continue;
            }
            // correction column missing in export from doccano; workaround
            if (dataset.enforceCorrect && !csvRecord.isSet("correction")) {
              throw new IllegalStateException("enforceCorrect in dataset configuration enabled," +
                " but column 'correction' is not set for entry " + csvRecord);
            }
            if (isDump && dataset.enforceAcceptLanguage && acceptLanguage != null) {
              // NOTE(review): the filter only applies when the header contains a comma; a
              // single-language header is never rejected - confirm this is intended.
              String[] entries = acceptLanguage.split(",", 2);
              if (entries.length == 2) {
                String userLanguage = entries[0]; // TODO: what to do with e.g. de-AT,de-DE;...
                if (!config.language.equals(userLanguage)) {
                  continue;
                }
              }
            }
            tasks.put(new SuggestionTestData(lang, sentence, covered, replacement, correction, dataset));
          }
        }
      }
      // How the workers decide to stop is defined in SuggestionTestThread (not visible here).
      for (Thread t : threads) {
        t.join();
      }
    } finally {
      // Lets the logger terminate once the queue is empty, also when reading failed.
      producersDone.set(true);
    }
    // All workers have finished, so the queue only shrinks from here on: wait until it is drained.
    logger.join();
  }
}