in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/rapier/Rapier.java [276:453]
/**
 * Tries to derive one new rule from the current slot rules.
 *
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>Pairs of slot rules are chosen at random, preferring rules that are still flagged as
 * initial rules, and the filler generalizations of every pair are collected.</li>
 * <li>The collected generalizations are tested and put into {@code ruleList}.</li>
 * <li>In repeated rounds, pre-filler and post-filler specializations of every rule in
 * {@code ruleList} are created, tested and added to {@code ruleList}. The loop ends as soon as
 * the head of {@code ruleList} produces only valid fillers, or after more than
 * {@code limNoImprovements} consecutive rounds in which the head's priority value did not drop
 * below the smallest value seen so far.</li>
 * </ol>
 *
 * @return the head of {@code ruleList} after the specialization loop, or {@code null} if there
 *         are fewer than two slot rules, if {@code shouldAbort()} reported an abort request, or
 *         if no candidate rule could be generated at all
 */
protected RapierRule findNewRule() {
  Random rand = new Random(System.currentTimeMillis());
  Set<RapierRule> generalizations = new HashSet<RapierRule>();

  // 0. initialization
  ruleList.clear();
  if (slotRules.size() <= 1) {
    // at least two rules are needed to build a pair
    return null;
  }
  List<RapierRule> uncompressedRules = new ArrayList<RapierRule>();
  for (TextRulerRule r : slotRules) {
    if (((RapierRule) r).isInitialRule()) {
      uncompressedRules.add((RapierRule) r);
    }
  }

  // 1. get generalizations of the two slot filler patterns:
  // create pairs and prefer still uncompressed rules when choosing
  // "randomly":
  int pairsLeft = pairCount;
  if (uncompressedRules.size() == 1) {
    // only one initial rule left: pair it with any other slot rule
    RapierRule rule1 = uncompressedRules.get(0);
    RapierRule rule2 = null;
    while (rule2 == null || rule1 == rule2) {
      rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
    }
    generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
    if (shouldAbort()) {
      return null;
    }
    pairsLeft--;
  } else if (uncompressedRules.size() == 2) {
    // exactly two initial rules: they form the one obvious pair
    RapierRule rule1 = uncompressedRules.get(0);
    RapierRule rule2 = uncompressedRules.get(1);
    generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
    if (shouldAbort()) {
      return null;
    }
    pairsLeft--;
  } else if (uncompressedRules.size() > 2) {
    // several initial rules: spend the pair budget (or half of it, when the budget exceeds
    // the number of initial rules) on pairs drawn from the initial rules only
    int uPairCount = pairCount;
    if (uPairCount > uncompressedRules.size()) {
      uPairCount /= 2;
    }
    for (int i = 0; i < uPairCount; i++) {
      RapierRule rule1 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
      RapierRule rule2 = null;
      while (rule2 == null || rule1 == rule2) {
        rule2 = uncompressedRules.get(rand.nextInt(uncompressedRules.size()));
      }
      generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
      // honor abort requests here as well, like every other pairing branch does
      if (shouldAbort()) {
        return null;
      }
      pairsLeft--;
    }
  }

  // use up the remaining pair budget with pairs drawn from all slot rules
  for (int i = 0; i < pairsLeft; i++) {
    // TODO optimize !! don't call the machinery with the same rule pair
    // two times in one session !!!
    // randomly pick two rules:
    RapierRule rule1 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
    RapierRule rule2 = null;
    while (rule2 == null || rule1 == rule2) {
      rule2 = (RapierRule) slotRules.get(rand.nextInt(slotRules.size()));
    }
    generalizations.addAll(getFillerGeneralizationsForRulePair(rule1, rule2));
    if (shouldAbort()) {
      return null;
    }
  }

  // 2. evaluate and enqueue into the priority list:
  List<RapierRule> testRules = new ArrayList<RapierRule>(generalizations);
  for (RapierRule r : testRules) {
    r.combineSenselessPatternListItems();
  }
  testRulesIfNotCached(testRules);
  if (shouldAbort()) {
    return null;
  }
  for (RapierRule newRule : generalizations) {
    if (TextRulerToolkit.DEBUG) {
      // sanity check (debug only): report generalizations that fail the seed-covering check
      if (!RapierDebugHelper.debugCheckIfRuleCoversItsSeedRuleCoverings(newRule)) {
        TextRulerToolkit
                .log("------------------------------------------------------------------------------------------");
        TextRulerToolkit
                .log("ERROR, A RULE HAS TO COVER AT LEAST EVERY POSITIVE EXAMPLE OF ITS TWO SEED RULES!!!");
        TextRulerToolkit.log("\t RULE: " + newRule.getRuleString());
        TextRulerToolkit.log("\t Parent1: " + newRule.getParent1().getRuleString());
        TextRulerToolkit.log("\t Parent2: " + newRule.getParent2().getRuleString());
        TextRulerToolkit.log("--------");
        TextRulerToolkit.log("+RuleCovering: "
                + newRule.getCoveringStatistics().getCoveredPositiveExamples());
        TextRulerToolkit.log("+P1Covering : "
                + newRule.getParent1().getCoveringStatistics().getCoveredPositiveExamples());
        TextRulerToolkit.log("+P2Covering : "
                + newRule.getParent2().getCoveringStatistics().getCoveredPositiveExamples());
      }
    }
    ruleList.add(newRule);
  }

  // 3. specialize pre and post fillers:
  int n = 0;
  double bestValue = Double.MAX_VALUE;
  int noImprovementCounter = 0;
  while (true) {
    n++;
    TextRulerToolkit.log(" --- NEW SPECIALIZATOIN ROUND; n = " + n + " noImprovementCounter = "
            + noImprovementCounter);

    // New rules are buffered in a separate list and added after the loop, so ruleList is
    // never modified while it is being iterated.
    List<RapierRule> newRuleList = new ArrayList<RapierRule>();
    for (RapierRule curRule : ruleList) {
      List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePreFiller(curRule, n));
      for (RapierRule r : specTestRules) {
        r.combineSenselessPatternListItems();
      }
      testRulesIfNotCached(specTestRules);
      if (shouldAbort()) {
        return null;
      }
      newRuleList.addAll(specTestRules);
    }
    ruleList.addAll(newRuleList);

    // the post-filler pass also sees the pre-filler specializations just added
    newRuleList.clear();
    for (RapierRule curRule : ruleList) {
      List<RapierRule> specTestRules = new ArrayList<RapierRule>(specializePostFiller(curRule, n));
      for (RapierRule r : specTestRules) {
        r.combineSenselessPatternListItems();
      }
      testRulesIfNotCached(specTestRules);
      if (shouldAbort()) {
        return null;
      }
      newRuleList.addAll(specTestRules);
    }
    ruleList.addAll(newRuleList);

    RapierRule bestRule = ruleList.peek();
    if (bestRule == null) {
      // No generalization was produced, so there is nothing to specialize or return.
      // NOTE(review): assumes peek() yields null on an empty rule list, as java.util.Queue
      // does -- confirm against the ruleList implementation.
      return null;
    }
    if (TextRulerToolkit.DEBUG) {
      TextRulerToolkit.log("------------------------------------");
      TextRulerToolkit.log("BEST RULE FOR THIS SESSION: " + bestRule.getCoveringStatistics());
      TextRulerToolkit.log(bestRule.getRuleString());
      TextRulerToolkit.log("------------------------------------");
    }
    if (bestRule.producesOnlyValidFillers()) {
      break; // todo: horizon effects ??
    }
    if (bestRule.getPriority() < bestValue) {
      // strictly smaller priority value than anything seen so far counts as an improvement
      noImprovementCounter = 0;
      bestValue = bestRule.getPriority();
    } else {
      noImprovementCounter++;
      if (noImprovementCounter > limNoImprovements) {
        break;
      }
    }
  }
  return ruleList.peek();
}