protected boolean findHeadTailAndL1Patterns()

in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/wien/Wien.java [236:376]


  /**
   * Searches for a combination of head pattern (h), tail pattern (t) and first left delimiter
   * (l1) that passes {@code testConstraint3}.
   * <p>
   * The head and tail token sequences are taken from the first example document. The head is
   * split at every possible position into a prefix (source of h candidates) and a suffix (the l1
   * candidate); for every l1 candidate each contiguous sub-pattern of the prefix is combined with
   * each contiguous sub-pattern of the tail and tested. On the first success the result is stored
   * in {@code hPattern}, {@code tPattern} and {@code patternPairs.get(0).l}.
   * <p>
   * While searching, progress is reported through {@code sendStatusUpdateToDelegate}; candidates
   * that are skipped because of pruning are still counted as processed so that the percentage
   * keeps advancing.
   *
   * @return {@code true} if a valid (h, t, l1) triple was found and stored; {@code false} if the
   *         search was aborted ({@code shouldAbort()}) or no combination succeeded
   */
  protected boolean findHeadTailAndL1Patterns() {
    List<TextRulerExampleDocument> docs = exampleDocuments.getDocuments();
    TextRulerExampleDocument doc0 = docs.get(0);
    TextRulerRulePattern head = new TextRulerRulePattern();
    TextRulerRulePattern tail = new TextRulerRulePattern();
    getPageHeadAndTailPortion(doc0, head, tail);

    // pairs a remaining head prefix with the l1 suffix that was cut off from it
    final class HLCandidate {
      final TextRulerRulePattern head = new TextRulerRulePattern();

      final TextRulerRulePattern l1 = new TextRulerRulePattern();
    }

    // a small optimization:
    // l1 is much shorter than the possible head, so bound its length by the shortest
    // inter-tuple separator of doc0 (and by head.size() - 1 at most).
    List<TextRulerRulePattern> interTupleSeparators = getInterTupleSepatators(doc0);
    int maxL1Length = head.size() - 1;
    for (TextRulerRulePattern its : interTupleSeparators) {
      maxL1Length = Math.min(maxL1Length, its.size());
    }

    List<HLCandidate> hlCandidates = new ArrayList<HLCandidate>();
    // create one candidate for each split position of the head pattern, starting with the
    // shortest l1 suffix and growing it until the length bound is reached:
    for (int separator = head.size() - 1; separator > 0; separator--) {
      HLCandidate c = new HLCandidate();
      for (int i = 0; i < head.size(); i++) {
        if (i < separator) {
          c.head.add(head.get(i));
        } else {
          // l1 items are copies so that the generalization flag does not leak into the head
          WienRuleItem it = (WienRuleItem) head.get(i).copy();
          it.getWordConstraint().setGeneralizeLinkMarkUp(true);
          c.l1.add(it);
        }
      }
      hlCandidates.add(c);
      TextRulerToolkit.log(c.head.size() + " vs. " + c.l1.size());
      if (c.l1.size() >= maxL1Length) {
        break;
      }
    }

    // total number of (h, l1, t) combinations, used only for progress reporting;
    // computed in long so that the products cannot overflow int
    final int tailSize = tail.size();
    final long tCand = ((long) tailSize * (tailSize + 1)) / 2;
    long total = 0;
    for (HLCandidate c : hlCandidates) {
      long headSize = c.head.size();
      total += ((headSize - 1) * headSize) / 2;
    }
    total *= tCand;

    long current = 0;
    int oldPercent = -1;

    for (HLCandidate c : hlCandidates) {
      // for each "candidate" which represents a l1 suffix pattern of the
      // head tokens and a rest pattern for the h pattern,
      // we have to create every sub pattern of the remaining h pattern as
      // a h candidate.
      // NOTE(review): index 0 of the candidate head is never part of an h candidate
      // (both loops stop at 1) - presumably intentional, confirm.
      TextRulerRulePattern l1 = c.l1;

      // set as soon as a test shows that this l1 candidate itself is unusable
      boolean l1Sucks = false;

      for (int endI = c.head.size() - 1; endI > 0; endI--) {
        for (int startI = endI; startI > 0; startI--) {
          TextRulerRulePattern h = new TextRulerRulePattern();
          for (int i = startI; i <= endI; i++) {
            h.add(c.head.get(i));
          }

          // now for each h candidate we have to create each t candidate:
          for (int tstartI = 0; tstartI < tailSize; tstartI++) {
            for (int tendI = tstartI; tendI < tailSize; tendI++) {
              // clamp before comparing so that "100%" is reported only once
              int percent = Math.min(100, Math.round(((float) current * 100 / total)));
              if (percent != oldPercent) {
                oldPercent = percent;
                // TextRulerToolkit.log(current+" / "+total);
                sendStatusUpdateToDelegate("Testing C3, " + percent + "%",
                        TextRulerLearnerState.ML_RUNNING, false);
              }
              if (shouldAbort()) {
                return false;
              }
              current++;

              TextRulerRulePattern t = new TextRulerRulePattern();
              for (int i = tstartI; i <= tendI; i++) {
                t.add(tail.get(i));
              }

              // now we have a possible candidate triple: h, t and l1:
              constraint3ReturnType c3Result = testConstraint3(h, t, l1);

              if (c3Result == constraint3ReturnType.C3_SUCCESS) {
                hPattern = h;
                tPattern = t;
                patternPairs.get(0).l = l1;
                return true;
              } else if (c3Result == constraint3ReturnType.C3_L1CandidateSuffixError
                      || c3Result == constraint3ReturnType.C3_L1CandidateInterTupleSeparatorSuffixError) {
                // the l1 candidate is bad: give up on it entirely and account for the
                // t candidates with the same start item that are skipped
                l1Sucks = true;
                current += tailSize - tendI - 1;
                break;
              } else if (c3Result == constraint3ReturnType.C3_TailCandidateRK_PrefixError
                      || c3Result == constraint3ReturnType.C3_TailCandidateNotFoundError) {
                // all candidates with the same start item are
                // bad, so leave this inner loop:
                current += tailSize - tendI - 1;
                break;
              }
              // Remaining results offer no pruning option, simply test the next t candidate:
              // - C3_TailCandidateH_L1Error / C3_TailCandidateSucceedsL1InTailError
              // - C3_TailCandidatePrecedesL1InterTupleSeparatorError: the cause could be l1
              // or the current tail pattern, so nothing can be concluded from it.
            }
            if (l1Sucks) {
              // skipped t candidates of all later start items: 1 + 2 + ... + remaining
              long remainingStarts = tailSize - tstartI - 1;
              current += remainingStarts * (remainingStarts + 1) / 2;
              break;
            }
          }
          if (l1Sucks) {
            // skipped h candidates with the same end item (start items startI-1 .. 1)
            current += (startI - 1) * tCand;
            break;
          }
        }
        if (l1Sucks) {
          // skipped h candidates of the remaining end items endI-1 .. 1; an end item e
          // contributes e candidates, i.e. (endI - 1) * endI / 2 in total
          current += ((long) (endI - 1) * endI / 2) * tCand;
          break;
        }
      }
    }
    return false;
  }