public void testChanges()

in languagetool-dev/src/main/java/org/languagetool/rules/spelling/suggestions/SuggestionChangesTest.java [286:440]


  /**
   * Runs every configured suggestion experiment over the configured datasets and records the outcome.
   *
   * <p>The configuration is read from the JSON file named by the system property {@code config}
   * (default {@code SuggestionChangesTestConfig.json}). Two files are created in {@code config.logDir}:
   * a text log and a CSV dataset with one row per evaluated correction that produced suggestions.
   *
   * <p>Threading: the calling thread parses the dataset CSV files and feeds a bounded task queue,
   * one {@link SuggestionTestThread} per available processor consumes it, and a single logger thread
   * writes the workers' results to the log and the CSV file. The logger is told to stop only after all
   * workers have been joined, drains whatever is left in the result queue, and is joined before the
   * output files are closed, so no result is dropped and no writer is closed while still in use.
   *
   * @throws IOException if the configuration, a dataset or one of the output files cannot be read/written
   * @throws InterruptedException if the calling thread is interrupted while queueing tasks or joining threads
   * @throws IllegalStateException if a dataset has {@code enforceCorrect} set but a record lacks the
   *         {@code correction} column
   */
  public void testChanges() throws IOException, InterruptedException {

    File configFile = new File(System.getProperty("config", "SuggestionChangesTestConfig.json"));
    ObjectMapper mapper = new ObjectMapper(new JsonFactory().enable(JsonParser.Feature.ALLOW_COMMENTS));
    SuggestionChangesTestConfig config = mapper.readValue(configFile, SuggestionChangesTestConfig.class);

    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss");
    String timestamp = dateFormat.format(new Date());
    Path loggingFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.log", timestamp));
    Path datasetFile = Paths.get(config.logDir, String.format("suggestionChangesExperiment_%s.csv", timestamp));

    // try-with-resources: both output files are flushed and closed on every exit path
    try (BufferedWriter writer = Files.newBufferedWriter(loggingFile);
         CSVPrinter datasetWriter = new CSVPrinter(Files.newBufferedWriter(datasetFile), CSVFormat.DEFAULT.withEscape('\\'))) {

      List<String> datasetHeader = new ArrayList<>(Arrays.asList("sentence", "correction", "covered", "replacement", "dataset_id"));

      SuggestionsChanges.init(config, writer);
      writer.write("Evaluation configuration: \n");
      String configContent = String.join("\n", Files.readAllLines(configFile.toPath()));
      writer.write(configContent);
      writer.write("\nRunning experiments: \n");
      int experimentId = 0;
      for (SuggestionChangesExperiment experiment : SuggestionsChanges.getInstance().getExperiments()) {
        experimentId++;
        writer.write(String.format("#%d: %s%n", experimentId, experiment));
        // three CSV columns per experiment, in the order the logger thread appends them to a record
        datasetHeader.add(String.format("experiment_%d_suggestions", experimentId));
        datasetHeader.add(String.format("experiment_%d_metadata", experimentId));
        datasetHeader.add(String.format("experiment_%d_suggestions_metadata", experimentId));
      }
      writer.newLine();
      datasetWriter.printRecord(datasetHeader);

      // bounded queue: tasks.put() blocks the reader below once 1000 tasks are pending
      BlockingQueue<SuggestionTestData> tasks = new LinkedBlockingQueue<>(1000);
      ConcurrentLinkedQueue<Pair<SuggestionTestResultData, String>> results = new ConcurrentLinkedQueue<>();
      List<SuggestionTestThread> threads = new ArrayList<>();
      int workerCount = Runtime.getRuntime().availableProcessors();
      for (int i = 0; i < workerCount; i++) {
        SuggestionTestThread worker = new SuggestionTestThread(tasks, results);
        worker.start();
        threads.add(worker);
      }

      // Set once no further results are expected; the logger then drains the queue and terminates.
      // Fully qualified because this block cannot add to the file's import section.
      java.util.concurrent.atomic.AtomicBoolean stopLogger = new java.util.concurrent.atomic.AtomicBoolean(false);
      final long idleSleepMillis = 5;

      // Thread for writing results from worker threads into CSV
      Thread logger = new Thread(() -> {
        try {
          long messages = 0;
          while (true) {
            // Read the flag BEFORE polling: if it was already set and the queue is empty afterwards,
            // nothing more can arrive and it is safe to stop.
            boolean lastPass = stopLogger.get();
            Pair<SuggestionTestResultData, String> message = results.poll();
            if (message == null) {
              if (lastPass) {
                break;
              }
              // back off instead of spinning on an empty queue
              Thread.sleep(idleSleepMillis);
              continue;
            }
            writer.write(message.getRight());

            SuggestionTestResultData result = message.getLeft();
            // only results where every match carries at least one replacement go into the CSV;
            // the null check has to come before any access to result
            if (result != null && result.getSuggestions() != null &&
              !result.getSuggestions().isEmpty() && result.getSuggestions().stream()
              .noneMatch(m -> m.getSuggestedReplacements() == null || m.getSuggestedReplacements().isEmpty())) {

              // 1-based position of the dataset in the configuration (0 if it is not found there)
              int resultDatasetId = 1 + config.datasets.indexOf(result.getInput().getDataset());
              List<Object> row = new ArrayList<>(Arrays.asList(
                result.getInput().getSentence(), result.getInput().getCorrection(),
                result.getInput().getCovered(), result.getInput().getReplacement(), resultDatasetId));
              for (RuleMatch match : result.getSuggestions()) {
                List<String> suggestions = match.getSuggestedReplacements();
                row.add(mapper.writeValueAsString(suggestions));
                // features extracted by SuggestionsOrdererFeatureExtractor
                row.add(mapper.writeValueAsString(match.getFeatures()));
                List<SortedMap<String, Float>> suggestionsMetadata = new ArrayList<>();
                for (SuggestedReplacement replacement : match.getSuggestedReplacementObjects()) {
                  suggestionsMetadata.add(replacement.getFeatures());
                }
                row.add(mapper.writeValueAsString(suggestionsMetadata));
              }
              datasetWriter.printRecord(row);
            }

            if (++messages % 1000 == 0) {
              writer.flush();
              System.out.printf("Evaluated %d corrections.%n", messages);
            }
          }
        } catch (IOException e) {
          throw new RuntimeException(e);
        } catch (InterruptedException e) {
          // interrupted while idle: keep the interrupt status and stop logging
          Thread.currentThread().interrupt();
        }
      });
      logger.setDaemon(true);
      logger.start();

      try {
        // format straight from database dump
        String[] header = {"id", "sentence", "correction", "language", "rule_id", "suggestion_pos", "accept_language",
          "country", "region", "created_at", "updated_at", "covered", "replacement", "text_session_id", "client"};

        int datasetId = 0;
        // read data, send to worker threads via queue
        for (SuggestionChangesDataset dataset : config.datasets) {

          writer.write(String.format("Evaluating dataset #%d: %s.%n", ++datasetId, dataset));

          boolean isDump = dataset.type.equals("dump");
          CSVFormat format = CSVFormat.DEFAULT;
          if (isDump) {
            // dumps have no header row; column names are supplied, "\N" marks NULL values
            format = format.withEscape('\\').withNullString("\\N").withHeader(header);
          } else if (dataset.type.equals("artificial")) {
            format = format.withEscape('\\').withFirstRecordAsHeader();
          }
          // NOTE(review): FileReader decodes with the platform default charset on JDKs before 18 -
          // confirm the datasets' encoding matches
          try (CSVParser parser = new CSVParser(new FileReader(dataset.path), format)) {
            for (CSVRecord record : parser) {

              String lang = record.get("language");
              String rule = isDump ? record.get("rule_id") : "";
              String covered = record.get("covered");
              String replacement = record.get("replacement");
              String sentence = record.get("sentence");
              String correction = record.isSet("correction") ? record.get("correction") : "";
              String acceptLanguage = isDump ? record.get("accept_language") : "";

              if (sentence == null || sentence.trim().isEmpty()) {
                continue;
              }

              if (!config.language.equals(lang)) {
                continue; // TODO handle auto maybe?
              }
              if (isDump && !config.rule.equals(rule)) {
                continue;
              }

              // correction column missing in export from doccano; workaround
              if (dataset.enforceCorrect && !record.isSet("correction")) {
                throw new IllegalStateException("enforceCorrect in dataset configuration enabled," +
                  " but column 'correction' is not set for entry " + record);
              }

              if (isDump && dataset.enforceAcceptLanguage) {
                if (acceptLanguage != null) {
                  // only the first entry of a multi-entry accept_language value is compared
                  String[] entries = acceptLanguage.split(",", 2);
                  if (entries.length == 2) {
                    String userLanguage = entries[0]; // TODO: what to do with e.g. de-AT,de-DE;...
                    if (!config.language.equals(userLanguage)) {
                      continue;
                    }
                  }
                }
              }

              tasks.put(new SuggestionTestData(lang, sentence, covered, replacement, correction, dataset));
            }
          }

        }

        // NOTE(review): relies on the worker threads terminating on their own once the task queue
        // stays empty; their implementation is not visible here
        for (Thread t : threads) {
          t.join();
        }
      } finally {
        // All results are queued once the workers are joined; let the logger drain them and wait for it,
        // so that the writers are not closed underneath a thread that is still writing.
        stopLogger.set(true);
        logger.join();
      }
    }
  }