protected WhiskRule extendRule()

in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/token/Whisk.java [207:357]
121 lines of code
28 McCabe index (conditional complexity)

  /**
   * Tries to improve {@code rule} by adding one further term of the example to it.
   * <p>
   * For every term in {@code allTerms} that is not yet part of the rule and that lies within
   * {@code windowSize} terms of the slot (slot 0 only), one or more candidate rules are created:
   * the rule extended by the term, a variant with the regular expression hidden (if the new item
   * has a regexp word constraint) and, if {@code posTagRootTypeName} is set, variants carrying a
   * POS tag constraint. All candidates are tested and the one with the lowest Laplacian wins; on
   * equal Laplacian the candidate with fewer constraint points is preferred.
   *
   * @param rule
   *          the rule to extend; it is itself the initial "best" rule only if its Laplacian does
   *          not exceed {@code errorThreshold}
   * @param doc
   *          the example document, used to wrap POS tag annotations
   * @param example
   *          the example whose first annotation defines the slot bounds; the slot is expected to
   *          cover at least one term of {@code allTerms}
   * @param allTerms
   *          all terms of the example that may be added to the rule
   * @param subRoundNumber
   *          only used for the status message
   * @return the best rule found, the unchanged initial best rule (possibly {@code null}) if no
   *         candidate could be created or none was better, or {@code null} if learning was
   *         aborted
   */
  protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
    WhiskRule bestRule = null;
    double bestL = 1.0;
    int bestRuleConstraintPoints = -1;
    if (rule.getLaplacian() <= errorThreshold) {
      bestRule = rule;
      bestL = rule.getLaplacian();
    }

    List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
            example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
    WhiskRuleItem firstSlotTerm = slotTerms.get(0);
    WhiskRuleItem lastSlotTerm = slotTerms.get(slotTerms.size() - 1);

    // The order in which candidates are added matters: on equal Laplacian and equal constraint
    // points the earlier candidate is kept.
    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
    for (WhiskRuleItem term : allTerms) {
      if (rule.containsTerm(term)) {
        continue;
      }
      // for now this works only for slot 0 (no multislot stuff here yet!)
      if (isOutsideSlotWindow(term, firstSlotTerm, lastSlotTerm)) {
        // out of window scope -> skip to next...
        continue;
      }

      WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
      WhiskRuleItem newItem = proposedRule.searchItemWithTermNumber(term.getTermNumberInExample());
      addCandidateIfNew(rulesToTest, proposedRule);

      // add a second version where we remove the exact token content if
      // it is a regexp item:
      WhiskRule proposedRuleWithoutRegExp = null;
      if (newItem.getWordConstraint().isRegExpConstraint()) {
        proposedRuleWithoutRegExp = proposedRule.copy();
        proposedRuleWithoutRegExp.searchItemWithTermNumber(term.getTermNumberInExample())
                .setHideRegExp(true);
        proposedRuleWithoutRegExp.setNeedsCompile(true);
        addCandidateIfNew(rulesToTest, proposedRuleWithoutRegExp);
      }

      // and now, for WHISK performance testing purposes, we also add POS
      // tags. This is not dynamic feature capable, it only exists in order
      // to test WHISK with PosTag terms.
      if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
        addPosTagCandidates(rulesToTest, proposedRule, proposedRuleWithoutRegExp, term, doc,
                example);
      }
    }
    if (rulesToTest.size() == 0) {
      return bestRule;
    }

    sendStatusUpdateToDelegate(
            "Round "
                    + roundNumber
                    + "."
                    + subRoundNumber
                    + " - Testing "
                    + rulesToTest.size()
                    + " rules... "
                    + " - uncovered examples: "
                    + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                            + " ; cs=" + cachedTestedRuleStatistics.size()),
            TextRulerLearnerState.ML_RUNNING, false);

    TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
    for (TextRulerRule r : rulesToTest) {
      TextRulerToolkit.log(r.getRuleString());
    }
    testRulesIfNotCached(rulesToTest);
    if (shouldAbort()) {
      return null;
    }
    for (TextRulerRule r : rulesToTest) {
      WhiskRule wr = (WhiskRule) r;
      if (wr.getLaplacian() < bestL) {
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
        // bestRuleConstraintPoints is still -1 while the unextended input rule is the best one,
        // so a candidate that merely ties with the input rule never replaces it.
        TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
        if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
          TextRulerToolkit.log("\tYes, prefered!");
          bestL = wr.getLaplacian();
          bestRule = wr;
          bestRuleConstraintPoints = bestRule.totalConstraintPoints();
        }
      }
    }
    return bestRule;
  }

  /**
   * Tells whether {@code term} is more than {@code windowSize} terms away from the slot. Terms
   * before the slot are measured against the first slot term, terms after the slot against the
   * last slot term; terms inside the slot are never rejected.
   */
  private boolean isOutsideSlotWindow(WhiskRuleItem term, WhiskRuleItem firstSlotTerm,
          WhiskRuleItem lastSlotTerm) {
    int termNumber = term.getTermNumberInExample();
    if (termNumber < firstSlotTerm.getTermNumberInExample()) {
      return firstSlotTerm.getTermNumberInExample() - termNumber > windowSize;
    }
    if (termNumber > lastSlotTerm.getTermNumberInExample()) {
      return termNumber - lastSlotTerm.getTermNumberInExample() > windowSize;
    }
    return false;
  }

  /** Appends {@code candidate} to {@code rulesToTest} unless an equal rule is already queued. */
  private void addCandidateIfNew(List<TextRulerRule> rulesToTest, WhiskRule candidate) {
    if (!rulesToTest.contains(candidate)) {
      rulesToTest.add(candidate);
    }
  }

  /**
   * Adds the POS tag variants of a proposed rule: if the first annotation of the POS tag root type
   * found within the bounds of the term's token covers exactly that token, up to three candidates
   * are queued (all constraints plus POS tag, hidden regexp plus POS tag, POS tag only). Nothing
   * is added if the POS tag root type is not part of the CAS type system.
   */
  private void addPosTagCandidates(List<TextRulerRule> rulesToTest, WhiskRule proposedRule,
          WhiskRule proposedRuleWithoutRegExp, WhiskRuleItem term, TextRulerExampleDocument doc,
          TextRulerExample example) {
    TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
    CAS cas = example.getDocumentCAS();
    TypeSystem ts = cas.getTypeSystem();
    if (ts == null) {
      return;
    }
    Type posTagsRootType = ts.getType(posTagRootTypeName);
    if (posTagsRootType == null) {
      // configured type name is unknown to this type system -> no POS tags to look up
      return;
    }
    // POS-Tags created by our test hmm tagger.
    List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
            tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
    if (posTagAnnotations.isEmpty()) {
      return;
    }
    AnnotationFS posTag = posTagAnnotations.get(0);
    if (posTag.getBegin() != tokenAnnotation.getBegin()
            || posTag.getEnd() != tokenAnnotation.getEnd()) {
      return;
    }
    TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);
    int termNumber = term.getTermNumberInExample();

    // 1. most specific term with all constraints we have:
    addPosConstrainedCopy(rulesToTest, proposedRule, termNumber, tokenAnnotation,
            posTagAnnotation, false);

    // 2. the same without the regexp thingy:
    if (proposedRuleWithoutRegExp != null) {
      addPosConstrainedCopy(rulesToTest, proposedRuleWithoutRegExp, termNumber, tokenAnnotation,
              posTagAnnotation, false);
    }

    // 3. last but not least: a rule with only the pos tag constraint:
    addPosConstrainedCopy(rulesToTest, proposedRule, termNumber, tokenAnnotation,
            posTagAnnotation, true);
  }

  /**
   * Queues a copy of {@code baseRule} whose item for {@code termNumber} additionally carries the
   * given POS tag constraint; with {@code dropWordConstraint} the item's word constraint is
   * removed so that only the POS tag constraint remains on it.
   */
  private void addPosConstrainedCopy(List<TextRulerRule> rulesToTest, WhiskRule baseRule,
          int termNumber, TextRulerAnnotation tokenAnnotation,
          TextRulerAnnotation posTagAnnotation, boolean dropWordConstraint) {
    WhiskRule variant = baseRule.copy();
    WhiskRuleItem item = variant.searchItemWithTermNumber(termNumber);
    item.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
    if (dropWordConstraint) {
      item.setWordConstraint(null);
    }
    variant.setNeedsCompile(true);
    addCandidateIfNew(rulesToTest, variant);
  }