in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/token/Whisk.java [207:357]
/**
 * Tries to improve the given rule by adding one more term of the example to it.
 * <p>
 * For every term in {@code allTerms} that is not yet part of {@code rule} and that lies within
 * {@code windowSize} terms of the slot (slot 0 only), a set of candidate rules is generated:
 * <ol>
 * <li>the rule extended by the term as is,</li>
 * <li>if the new item's word constraint is a regexp constraint: a copy with the regexp hidden,</li>
 * <li>if {@code posTagRootTypeName} is set and a POS tag annotation exactly covers the token:
 * variants of 1. and 2. with an additional POS tag constraint, plus a variant that carries only
 * the POS tag constraint.</li>
 * </ol>
 * All candidates are tested via {@code testRulesIfNotCached} and the one with the smallest
 * Laplacian wins. Among candidates with the same Laplacian as an already selected candidate, the
 * one with fewer total constraint points is preferred.
 *
 * @param rule
 *          the rule to extend
 * @param doc
 *          the example document, used to wrap POS tag annotations
 * @param example
 *          the example the rule is grown from; only its first annotation (slot 0) is used
 * @param allTerms
 *          all terms of the example that are candidates for the extension
 * @param subRoundNumber
 *          only used for the status message
 * @return the best rule found; {@code rule} itself if its Laplacian is not above
 *         {@code errorThreshold} and no candidate beats it; {@code null} if neither the rule nor
 *         any candidate qualifies, or if the learner was asked to abort
 */
protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, List<WhiskRuleItem> allTerms, int subRoundNumber) {
  WhiskRule bestRule = null;
  double bestL = 1.0;
  // stays negative until a *candidate* became the best rule; the tie-break below relies on that
  int bestRuleConstraintPoints = -1;
  if (rule.getLaplacian() <= errorThreshold) {
    bestRule = rule;
    bestL = rule.getLaplacian();
  }

  List<WhiskRuleItem> slotTerms = getTermsWithinBounds(allTerms,
          example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd());
  int firstSlotTermNumber = slotTerms.get(0).getTermNumberInExample();
  int lastSlotTermNumber = slotTerms.get(slotTerms.size() - 1).getTermNumberInExample();

  List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();
  for (WhiskRuleItem term : allTerms) {
    if (rule.containsTerm(term)) {
      continue;
    }
    int termNumber = term.getTermNumberInExample();

    // for now this works only for slot 0 (no multislot stuff here yet!)
    boolean rejectTerm = false;
    if (termNumber < firstSlotTermNumber) {
      rejectTerm = firstSlotTermNumber - termNumber > windowSize;
    } else if (termNumber > lastSlotTermNumber) {
      // distance of a term behind the slot is measured from the END of the slot, mirroring the
      // branch above (previously measured from the first slot term, which shrank the right
      // window by the slot length)
      rejectTerm = termNumber - lastSlotTermNumber > windowSize;
    }
    if (rejectTerm) {
      // out of window scope -> skip to next...
      continue;
    }

    WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
    WhiskRuleItem addedItem = proposedRule.searchItemWithTermNumber(termNumber);
    if (!rulesToTest.contains(proposedRule)) {
      rulesToTest.add(proposedRule);
    }

    // add a second version where we remove the exact token content if
    // it is a regexp item:
    WhiskRule proposedRule2 = null;
    if (addedItem.getWordConstraint().isRegExpConstraint()) {
      proposedRule2 = proposedRule.copy();
      proposedRule2.searchItemWithTermNumber(termNumber).setHideRegExp(true);
      proposedRule2.setNeedsCompile(true);
      if (!rulesToTest.contains(proposedRule2)) {
        rulesToTest.add(proposedRule2);
      }
    }

    // and now, for WHISK performance testing purposes, we also add POS tags:
    // this is not very nice code and not dynamic feature capable, but it allows
    // testing WHISK with PosTag terms...
    if (posTagRootTypeName == null || posTagRootTypeName.length() == 0) {
      continue;
    }
    TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
    CAS cas = example.getDocumentCAS();
    TypeSystem ts = cas.getTypeSystem();
    Type posTagsRootType = ts.getType(posTagRootTypeName);
    if (posTagsRootType == null) {
      // getType() yields null for an unknown type name; without a type there is nothing to
      // look up (the former check tested the type system itself, which was already dereferenced)
      continue;
    }
    // POS-Tags created by our test hmm tagger.
    List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
            tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
    if (posTagAnnotations.isEmpty()) {
      continue;
    }
    AnnotationFS posTag = posTagAnnotations.get(0);
    // only use the POS tag if it covers exactly the token
    if (posTag.getBegin() != tokenAnnotation.getBegin()
            || posTag.getEnd() != tokenAnnotation.getEnd()) {
      continue;
    }
    TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc);

    // 1. most specific term with all constraints we have:
    WhiskRule proposedRule3 = proposedRule.copy();
    proposedRule3.searchItemWithTermNumber(termNumber)
            .addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
    proposedRule3.setNeedsCompile(true);
    if (!rulesToTest.contains(proposedRule3)) {
      rulesToTest.add(proposedRule3);
    }

    // 2. the same without the regexp thingy:
    if (proposedRule2 != null) {
      WhiskRule proposedRule4 = proposedRule2.copy();
      proposedRule4.searchItemWithTermNumber(termNumber)
              .addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
      proposedRule4.setNeedsCompile(true);
      if (!rulesToTest.contains(proposedRule4)) {
        rulesToTest.add(proposedRule4);
      }
    }

    // 3. last but not least: a rule with only the pos tag constraint:
    WhiskRule proposedRule5 = proposedRule.copy();
    WhiskRuleItem posOnlyItem = proposedRule5.searchItemWithTermNumber(termNumber);
    posOnlyItem.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
    posOnlyItem.setWordConstraint(null);
    proposedRule5.setNeedsCompile(true);
    if (!rulesToTest.contains(proposedRule5)) {
      rulesToTest.add(proposedRule5);
    }
  }

  if (rulesToTest.isEmpty()) {
    return bestRule;
  }

  sendStatusUpdateToDelegate(
          "Round "
                  + roundNumber
                  + "."
                  + subRoundNumber
                  + " - Testing "
                  + rulesToTest.size()
                  + " rules... "
                  + " - uncovered examples: "
                  + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                          + " ; cs=" + cachedTestedRuleStatistics.size()),
          TextRulerLearnerState.ML_RUNNING, false);

  TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
  for (TextRulerRule r : rulesToTest) {
    TextRulerToolkit.log(r.getRuleString());
  }
  testRulesIfNotCached(rulesToTest);
  if (shouldAbort()) {
    return null;
  }

  for (TextRulerRule r : rulesToTest) {
    WhiskRule wr = (WhiskRule) r;
    if (wr.getLaplacian() < bestL) {
      bestL = wr.getLaplacian();
      bestRule = wr;
      bestRuleConstraintPoints = bestRule.totalConstraintPoints();
    } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
      TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
      if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
        TextRulerToolkit.log("\tYes, prefered!");
        bestL = wr.getLaplacian();
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      }
    }
  }
  return bestRule;
}