public List generalizeTwoGroupedPhrasesDeterministic()

in opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeMatcherDeterministic.java [44:206]


  public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic(
      ParseTreeChunk chunk1, ParseTreeChunk chunk2) {
    List<String> pos1 = chunk1.getPOSs();
    List<String> pos2 = chunk2.getPOSs();
    List<String> lem1 = chunk1.getLemmas();
    List<String> lem2 = chunk2.getLemmas();

    List<String> lem1stem = new ArrayList<>();
    List<String> lem2stem = new ArrayList<>();

    for (String word : lem1) {
      try {
        lem1stem.add(ps.stem(word.toLowerCase()).toString());
      } catch (Exception e) {
        // e.printStackTrace();

        if (word.length() > 2)
          System.err.println("Unable to stem: " + word);
      }
    }
    try {
      for (String word : lem2) {
        lem2stem.add(ps.stem(word.toLowerCase()).toString());
      }
    } catch (Exception e) {
      System.err.println("problem processing word " + lem2.toString());
    }

    List<String> overlap = new ArrayList<>(lem1stem);
    overlap.retainAll(lem2stem);

    if (overlap.size() < 1)
      return null;

    List<Integer> occur1 = new ArrayList<>(), occur2 = new ArrayList<>();
    for (String word : overlap) {
      Integer i1 = lem1stem.indexOf(word);
      Integer i2 = lem2stem.indexOf(word);
      occur1.add(i1);
      occur2.add(i2);
    }

    // now we search for plausible sub-lists of overlaps
    // if at some position correspondence is inverse (one of two position
    // decreases instead of increases)
    // then we terminate current alignment accum and start a new one
    List<List<int[]>> overlapsPlaus = new ArrayList<>();
    // starts from 1, not 0
    List<int[]> accum = new ArrayList<>();
    accum.add(new int[] { occur1.get(0), occur2.get(0) });
    for (int i = 1; i < occur1.size(); i++) {

      if (occur1.get(i) > occur1.get(i - 1)
          && occur2.get(i) > occur2.get(i - 1))
        accum.add(new int[] { occur1.get(i), occur2.get(i) });
      else {
        overlapsPlaus.add(accum);
        accum = new ArrayList<>();
        accum.add(new int[] { occur1.get(i), occur2.get(i) });
      }
    }
    if (accum.size() > 0) {
      overlapsPlaus.add(accum);
    }

    List<ParseTreeChunk> results = new ArrayList<>();
    for (List<int[]> occur : overlapsPlaus) {
      List<Integer> occr1 = new ArrayList<>(), occr2 = new ArrayList<>();
      for (int[] column : occur) {
        occr1.add(column[0]);
        occr2.add(column[1]);
      }

      int ov1 = 0, ov2 = 0; // iterators over common words;
      List<String> commonPOS = new ArrayList<>(), commonLemmas = new ArrayList<>();
      // we start two words before first word
      int k1 = occr1.get(ov1) - 2, k2 = occr2.get(ov2) - 2;
      // if (k1<0) k1=0; if (k2<0) k2=0;
      boolean bReachedCommonWord = false;
      while (k1 < 0 || k2 < 0) {
        k1++;
        k2++;
      }
      int k1max = pos1.size() - 1, k2max = pos2.size() - 1;
      while (k1 <= k1max && k2 <= k2max) {
        // first check if the same POS
        String sim = posManager.similarPOS(pos1.get(k1), pos2.get(k2));
        String lemmaMatch = lemmaFormManager.matchLemmas(ps, lem1.get(k1),
            lem2.get(k2), sim);
        if ((sim != null)
            && (lemmaMatch == null || (lemmaMatch != null && !lemmaMatch.equals("fail")))) {
          commonPOS.add(pos1.get(k1));
          if (lemmaMatch != null) {
            commonLemmas.add(lemmaMatch);
            // System.out.println("Added "+lemmaMatch);
            if (k1 == occr1.get(ov1) && k2 == occr2.get(ov2))
              bReachedCommonWord = true; // now we can have different increment
                                         // opera
            else {
              if (occr1.size() > ov1 + 1 && occr2.size() > ov2 + 1
                  && k1 == occr1.get(ov1 + 1) && k2 == occr2.get(ov2 + 1)) {
                ov1++;
                ov2++;
                bReachedCommonWord = true;
              }
              // else
              // System.err.println("Next match reached '"+lemmaMatch+
              // "' | k1 - k2: "+k1 + " "+k2 +
              // "| occur index ov1-ov2 "+
              // ov1+" "+ov2+
              // "| identified positions of match: occr1.get(ov1) - occr2.get(ov1) "
              // +
              // occr1.get(ov1) + " "+ occr2.get(ov1));
            }
          } else {
            commonLemmas.add("*");
          } // the same parts of speech, proceed to the next word in both
            // expressions
          k1++;
          k2++;

        } else if (!bReachedCommonWord) {
          k1++;
          k2++;
        } // still searching
        else {
          // different parts of speech, jump to the next identified common word
          ov1++;
          ov2++;
          if (ov1 > occr1.size() - 1 || ov2 > occr2.size() - 1)
            break;
          // now trying to find
          int kk1 = occr1.get(ov1) - 2, // new positions of iterators
          kk2 = occr2.get(ov2) - 2;
          int countMove = 0;
          while ((kk1 < k1 + 1 || kk2 < k2 + 1) && countMove < 2) { // if it is
                                                                    // behind
                                                                    // current
                                                                    // position,
                                                                    // synchronously
                                                                    // move
                                                                    // towards
                                                                    // right
            kk1++;
            kk2++;
            countMove++;
          }
          k1 = kk1;
          k2 = kk2;

          if (k1 > k1max)
            k1 = k1max;
          if (k2 > k2max)
            k2 = k2max;
          bReachedCommonWord = false;
        }
      }
      ParseTreeChunk currResult = new ParseTreeChunk(commonLemmas, commonPOS, 0, 0);
      results.add(currResult);
    }

    return results;
  }