protected WhiskRule extendRule()

in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/generic/Whisk.java [220:415]
164 lines of code
42 McCabe index (conditional complexity)

  /**
   * Builds a set of candidate rules derived from {@code rule}, tests them, and returns the one
   * with the lowest Laplacian.
   *
   * <p>
   * Candidates are generated in two phases:
   * <ol>
   * <li>Conditions are added to items already in {@code rule}: one candidate per not yet
   * activated feature from {@code consideredFeatures} that has a value on the item's token, and
   * one candidate that un-hides a hidden regular expression constraint.</li>
   * <li>For every term in the window around the first slot annotation of {@code example} that
   * {@code rule} does not contain yet, a candidate with that term added, plus variants with the
   * regular expression shown, with a single feature activated and (if
   * {@code posTagRootTypeName} is set) with POS tag constraints.</li>
   * </ol>
   *
   * <p>
   * Among the tested candidates the one with the strictly lowest Laplacian wins. On equal
   * Laplacian the candidate with fewer total constraint points is preferred, but only once a
   * candidate has already been selected; {@code rule} itself is never displaced by a tie because
   * the constraint-point baseline is only set when a candidate wins on a strictly lower
   * Laplacian.
   *
   * @param rule
   *          the rule to extend; it is the initial best rule if its Laplacian is not above
   *          {@code errorThreshold}
   * @param doc
   *          the example document, used here only to wrap POS tag annotations
   * @param example
   *          the example whose first annotation defines the slot bounds and which provides the
   *          CAS for POS tag lookup
   * @param subRoundNumber
   *          sub round counter, used only in the status message
   * @return the best rule found; {@code rule} if no candidate beats it and it is within the
   *         error threshold; {@code null} if there is no acceptable rule or if learning was
   *         aborted
   */
  protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
          TextRulerExample example, int subRoundNumber) {
    WhiskRule bestRule = null;
    double bestL = 1.0;
    int bestRuleConstraintPoints = -1;
    if (rule.getLaplacian() <= errorThreshold) {
      bestRule = rule;
      bestL = rule.getLaplacian();
    }
    List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

    // Phase 1: only add conditions (features, visible regexp) to items the rule already has.
    List<TextRulerSlotPattern> patterns = rule.getPatterns();
    for (TextRulerSlotPattern eachPattern : patterns) {
      for (TextRulerRuleItem item : eachPattern.fillerPattern) {
        if (!(item instanceof WhiskRuleItem)) {
          continue;
        }
        WhiskRuleItem wri = (WhiskRuleItem) item;
        TextRulerWordConstraint wordConstraint = wri.getWordConstraint();
        if (wordConstraint == null) {
          // nothing to refine on an item without a word constraint
          continue;
        }

        for (String eachFeature : consideredFeatures) {
          Map<String, String> featureMap = wordConstraint.getTokenAnnotation().getFeatureMap();
          String stringValue = featureMap.get(eachFeature);
          if (stringValue != null && !wri.getActivatedFeatures().contains(eachFeature)) {
            // Toggle the feature on the live item only for the duration of the copy, so the
            // original rule is left as it was.
            wri.activateFeature(eachFeature);
            WhiskRule withFeature = rule.copy();
            wri.deactivateFeature(eachFeature);
            withFeature.setNeedsCompile(true);
            addCandidateIfAbsent(rulesToTest, withFeature);
          }
        }

        if (wordConstraint.isRegExpConstraint() && wri.isHideRegExp()) {
          // Same toggle-copy-restore pattern for the hidden regular expression.
          wri.setHideRegExp(false);
          WhiskRule withRegExp = rule.copy();
          wri.setHideRegExp(true);
          withRegExp.setNeedsCompile(true);
          addCandidateIfAbsent(rulesToTest, withRegExp);
        }
      }
    }

    // Phase 2: add new terms taken from the window around the first slot annotation.
    List<List<WhiskRuleItem>> slotTerms = getTermsWithinBounds(
            example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd(), example);
    List<List<WhiskRuleItem>> windowTerms = getTermsWithinWindow(slotTerms, example, 0);

    for (List<WhiskRuleItem> eachList : windowTerms) {
      for (WhiskRuleItem term : eachList) {
        if (rule.containsTerm(term)) {
          continue;
        }

        WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
        if (proposedRule == null) {
          continue;
        }
        addCandidateIfAbsent(rulesToTest, proposedRule);

        // add a second version where we add the exact token content if
        // it is a regexp item:
        // NOTE(review): the toggles below are applied to `term` and are expected to show up in
        // proposedRule.copy(); that only holds if proposedRule references this very item -
        // confirm against createNewRuleByAddingTerm().
        if (term.getWordConstraint().isRegExpConstraint()) {
          term.setHideRegExp(false);
          WhiskRule withRegExp = proposedRule.copy();
          term.setHideRegExp(true);
          withRegExp.setNeedsCompile(true);
          addCandidateIfAbsent(rulesToTest, withRegExp);
        }

        // extend with feature conditions, one candidate per feature that has a value
        for (String eachFeature : consideredFeatures) {
          Map<String, String> featureMap = term.getWordConstraint().getTokenAnnotation()
                  .getFeatureMap();
          String stringValue = featureMap.get(eachFeature);
          if (stringValue != null) {
            term.activateFeature(eachFeature);
            WhiskRule withFeature = proposedRule.copy();
            term.deactivateFeature(eachFeature);
            withFeature.setNeedsCompile(true);
            addCandidateIfAbsent(rulesToTest, withFeature);
          }
        }

        // and now, for WHISK performance testing purposes, we also add POS
        // tags:
        // this is not very nice code and not dynamic feature capable, but
        // for testpurposes
        // in order to test WHISK with PosTag Terms...
        if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
          TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
          CAS cas = example.getDocumentCAS();
          TypeSystem ts = cas.getTypeSystem();
          Type posTagsRootType = ts.getType(posTagRootTypeName);
          // The looked-up type is what may be missing (unknown type name); the type system
          // itself has already been dereferenced above, so checking it would be pointless.
          if (posTagsRootType != null) {
            // POS-Tags created by our test hmm tagger.
            List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
                    tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
            if (!posTagAnnotations.isEmpty()) {
              AnnotationFS posTag = posTagAnnotations.get(0);
              // only use a POS tag that spans exactly the token
              if (posTag.getBegin() == tokenAnnotation.getBegin()
                      && posTag.getEnd() == tokenAnnotation.getEnd()) {
                TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc,
                        consideredFeatures);

                // NOTE(review): in all three variants the copy is taken BEFORE `term` is
                // modified, and `term` keeps accumulating constraints. Whether the copies see
                // those modifications depends on copy() semantics not visible here - statement
                // order is kept exactly as it was; confirm the intent.

                // 1. most specific term with all constraints we
                // have:
                WhiskRule mostSpecific = proposedRule.copy();
                term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                        posTagAnnotation));
                mostSpecific.setNeedsCompile(true);
                addCandidateIfAbsent(rulesToTest, mostSpecific);

                // 2. the same without the regexp thingy:
                WhiskRule withoutRegExp = proposedRule.copy();
                term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                        posTagAnnotation));
                withoutRegExp.setNeedsCompile(true);
                addCandidateIfAbsent(rulesToTest, withoutRegExp);

                // 3. last but not least: a rule with only the pos
                // tag constraint:
                WhiskRule posTagOnly = proposedRule.copy();
                term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation,
                        posTagAnnotation));
                term.setWordConstraint(null);
                posTagOnly.setNeedsCompile(true);
                addCandidateIfAbsent(rulesToTest, posTagOnly);
              }
            }
          }
        }
      }
    }

    if (rulesToTest.isEmpty()) {
      return bestRule;
    }

    sendStatusUpdateToDelegate(
            "Round "
                    + roundNumber
                    + "."
                    + subRoundNumber
                    + " - Testing "
                    + rulesToTest.size()
                    + " rules... "
                    + " - uncovered examples: "
                    + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                            + " ; cs=" + cachedTestedRuleStatistics.size()),
            TextRulerLearnerState.ML_RUNNING, false);

    TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
    for (TextRulerRule r : rulesToTest) {
      TextRulerToolkit.log(r.getRuleString());
    }
    testRulesIfNotCached(rulesToTest);

    if (shouldAbort()) {
      return null;
    }

    // Pick the candidate with the lowest Laplacian; break ties in favour of fewer constraints.
    for (TextRulerRule r : rulesToTest) {
      WhiskRule wr = (WhiskRule) r;
      double laplacian = wr.getLaplacian();
      if (laplacian < bestL) {
        bestL = laplacian;
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      } else if (laplacian == bestL && bestRuleConstraintPoints >= 0) {
        TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
        if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
          TextRulerToolkit.log("\tYes, prefered!");
          bestRule = wr;
          bestRuleConstraintPoints = bestRule.totalConstraintPoints();
        }
      }
    }
    return bestRule;
  }

  /** Appends {@code candidate} to {@code candidates} unless an equal rule is already present. */
  private static void addCandidateIfAbsent(List<TextRulerRule> candidates,
          TextRulerRule candidate) {
    if (!candidates.contains(candidate)) {
      candidates.add(candidate);
    }
  }