protected RapierRule findNewRule()

in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/rapier/Rapier.java [276:453]


  /**
   * Tries to derive one new rule from the current slot rules.
   * <p>
   * The method works in three steps:
   * <ol>
   * <li>Pairs of rules are picked at random from {@code slotRules}, preferring rules for which
   * {@code isInitialRule()} is true, and the filler generalizations of each pair are collected.</li>
   * <li>The generalizations are tested (unless cached) and added to {@code ruleList}.</li>
   * <li>Pre- and post-filler specializations of every rule in {@code ruleList} are repeatedly
   * generated, tested and added to {@code ruleList}, until the rule at the head of the list
   * produces only valid fillers or its priority value has not decreased for more than
   * {@code limNoImprovements} consecutive rounds.</li>
   * </ol>
   * {@code ruleList} is cleared at the start of every call.
   *
   * @return the rule at the head of {@code ruleList} after the specialization loop, or
   *         {@code null} if there are fewer than two slot rules, if {@code shouldAbort()} reported
   *         an abort request, or if no candidate rule could be produced at all
   */
  protected RapierRule findNewRule() {
    Random rand = new Random(System.currentTimeMillis());

    Set<RapierRule> generalizations = new HashSet<RapierRule>();
    // 0. initialization
    ruleList.clear();

    // at least two rules are needed to build a pair
    if (slotRules.size() <= 1) {
      return null;
    }

    List<RapierRule> uncompressedRules = new ArrayList<RapierRule>();
    for (TextRulerRule r : slotRules) {
      if (((RapierRule) r).isInitialRule()) {
        uncompressedRules.add((RapierRule) r);
      }
    }

    // 1. get generalizations of the two slot filler patterns:

    // create pairs and prefer still uncompressed rules when choosing
    // "randomly":
    int pairsLeft = pairCount;
    if (uncompressedRules.size() == 1) {
      // pair the only uncompressed rule with a random, different slot rule
      RapierRule rule1 = uncompressedRules.get(0);
      RapierRule rule2 = null;
      while (rule2 == null || rule1 == rule2) {
        rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
      }
      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
      if (shouldAbort()) {
        return null;
      }
      pairsLeft--;
    } else if (uncompressedRules.size() == 2) {
      // exactly one uncompressed pair exists, so use it
      RapierRule rule1 = uncompressedRules.get(0);
      RapierRule rule2 = uncompressedRules.get(1);
      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
      if (shouldAbort()) {
        return null;
      }
      pairsLeft--;
    } else if (uncompressedRules.size() > 2) {
      // spend only half of the pair budget on uncompressed rules when the budget exceeds
      // the number of uncompressed rules
      int uPairCount = pairCount;
      if (uPairCount > uncompressedRules.size()) {
        uPairCount /= 2;
      }
      for (int i = 0; i < uPairCount; i++) {
        RapierRule rule1 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
        RapierRule rule2 = null;
        while (rule2 == null || rule1 == rule2) {
          rule2 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
        }
        generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
        // honor abort requests here as well, like every other pair-generation path does
        if (shouldAbort()) {
          return null;
        }
        pairsLeft--;
      }
    }

    for (int i = 0; i < pairsLeft; i++) {
      // TODO optimize !! don't call the machinery with the same rule pair
      // two times in one session !!!
      // randomly pick two rules:
      RapierRule rule1 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
      RapierRule rule2 = null;
      while (rule2 == null || rule1 == rule2) {
        rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
      }
      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));

      if (shouldAbort()) {
        return null;
      }
    }

    // if (TextRulerToolkit.DEBUG)
    // {
    // TextRulerToolkit.log("Rule Generalizations created: " +
    // generalizations.size());
    // for (RapierRule newRule : generalizations)
    // TextRulerToolkit.log("Rule = "+newRule.getRuleString());
    // }

    // 2. evaluate and enqueue to the priority list:
    List<RapierRule> testRules = new ArrayList<RapierRule>(generalizations);

    for (RapierRule r : testRules) {
      r.combineSenselessPatternListItems();
    }

    testRulesIfNotCached(testRules);
    if (shouldAbort()) {
      return null;
    }

    for (RapierRule newRule : generalizations) {
      if (TextRulerToolkit.DEBUG) {
        if (!RapierDebugHelper.debugCheckIfRuleCoversItsSeedRuleCoverings(newRule)) {
          TextRulerToolkit
                  .log("------------------------------------------------------------------------------------------");
          TextRulerToolkit
                  .log("ERROR, A RULE HAS TO COVER AT LEAST EVERY POSITIVE EXAMPLE OF ITS TWO SEED RULES!!!");
          TextRulerToolkit.log("\t RULE: " + newRule.getRuleString());
          TextRulerToolkit.log("\t Parent1: " + newRule.getParent1().getRuleString());
          TextRulerToolkit.log("\t Parent2: " + newRule.getParent2().getRuleString());
          TextRulerToolkit.log("--------");
          TextRulerToolkit.log("+RuleCovering: "
                  + newRule.getCoveringStatistics().getCoveredPositiveExamples());
          TextRulerToolkit.log("+P1Covering  : "
                  + newRule.getParent1().getCoveringStatistics().getCoveredPositiveExamples());
          TextRulerToolkit.log("+P2Covering  : "
                  + newRule.getParent2().getCoveringStatistics().getCoveredPositiveExamples());

        }
      }
      ruleList.add(newRule);
    }

    // 3. specialize pre and post fillers:
    int n = 0;
    double bestValue = Double.MAX_VALUE;
    int noImprovementCounter = 0;
    while (true) {
      n++;
      TextRulerToolkit.log(" --- NEW SPECIALIZATOIN ROUND; n = " + n + "  noImprovementCounter = "
              + noImprovementCounter);

      // Specializations are collected in a separate list and merged afterwards, so that
      // ruleList is not modified while it is being iterated.
      List<RapierRule> newRuleList = new ArrayList<RapierRule>();
      for (RapierRule curRule : ruleList) {

        List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePreFiller(curRule, n));

        for (RapierRule r : specTestRules) {
          r.combineSenselessPatternListItems();
        }

        testRulesIfNotCached(specTestRules);
        if (shouldAbort()) {
          return null;
        }

        newRuleList.addAll(specTestRules);
      }
      ruleList.addAll(newRuleList);

      newRuleList.clear();
      for (RapierRule curRule : ruleList) {

        List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePostFiller(curRule, n));

        for (RapierRule r : specTestRules) {
          r.combineSenselessPatternListItems();
        }

        testRulesIfNotCached(specTestRules);
        if (shouldAbort()) {
          return null;
        }

        newRuleList.addAll(specTestRules);
      }
      ruleList.addAll(newRuleList);

      RapierRule bestRule = ruleList.peek();

      // Guard against a missing head rule (e.g. no generalization was produced for any pair,
      // so nothing was ever added to ruleList); dereferencing it below would otherwise fail.
      if (bestRule == null) {
        return null;
      }

      if (TextRulerToolkit.DEBUG) {
        // for (RapierRule r: ruleList)
        // TextRulerToolkit.log("value="+r.getPriority()+" rule = "+r.getRuleString());
        TextRulerToolkit.log("------------------------------------");
        TextRulerToolkit.log("BEST RULE FOR THIS SESSION: " + bestRule.getCoveringStatistics());
        TextRulerToolkit.log(bestRule.getRuleString());
        TextRulerToolkit.log("------------------------------------");
      }
      if (bestRule.producesOnlyValidFillers()) {
        break; // todo: horizon effects ??
      }

      // a smaller priority value counts as an improvement
      if (bestRule.getPriority() < bestValue) {
        noImprovementCounter = 0;
        bestValue = bestRule.getPriority();
      } else {
        noImprovementCounter++;
        if (noImprovementCounter > limNoImprovements) {
          break;
        }
      }
    }

    return ruleList.peek();
  }