in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/generic/Whisk.java [220:415]
/**
 * Performs one extension step for {@code rule}: builds a set of candidate rules, tests them, and
 * returns the best one.
 *
 * <p>
 * Candidates are generated in two phases:
 * <ol>
 * <li>For every {@link WhiskRuleItem} already in the rule's filler patterns: one candidate per
 * not-yet-activated considered feature that has a value on the item's token, plus one candidate
 * that un-hides the item's regular expression.</li>
 * <li>For every term in the window around the example's first annotation that the rule does not
 * contain yet: the rule extended by that term, plus regexp, feature and (if
 * {@code posTagRootTypeName} is set) POS-tag variants of it.</li>
 * </ol>
 *
 * @param rule
 *          the rule to extend; it is returned unchanged as the fallback if its Laplacian is not
 *          above {@code errorThreshold} and no candidate is strictly better
 * @param doc
 *          the example document; only used to wrap POS-tag annotations
 * @param example
 *          the example providing the slot annotation, the terms and the CAS
 * @param subRoundNumber
 *          only used for the status message
 * @return the best rule found, or {@code null} if there is none or if the learner was asked to
 *         abort while testing
 */
protected WhiskRule extendRule(WhiskRule rule, TextRulerExampleDocument doc,
        TextRulerExample example, int subRoundNumber) {
  WhiskRule bestRule = null;
  double bestL = 1.0;
  // Stays negative while the best rule is the unmodified input rule, which disables the
  // "prefer more general rule" tie-break against it.
  int bestRuleConstraintPoints = -1;
  if (rule.getLaplacian() <= errorThreshold) {
    bestRule = rule;
    bestL = rule.getLaplacian();
  }

  List<TextRulerRule> rulesToTest = new ArrayList<TextRulerRule>();

  // Phase 1: only add conditions (features, visible regexp) to items the rule already has.
  for (TextRulerSlotPattern eachPattern : rule.getPatterns()) {
    for (TextRulerRuleItem item : eachPattern.fillerPattern) {
      if (!(item instanceof WhiskRuleItem)) {
        continue;
      }
      WhiskRuleItem wri = (WhiskRuleItem) item;
      TextRulerWordConstraint wordConstraint = wri.getWordConstraint();
      if (wordConstraint == null) {
        // neither feature nor regexp variants are possible without a word constraint
        continue;
      }

      for (String eachFeature : consideredFeatures) {
        Map<String, String> featureMap = wordConstraint.getTokenAnnotation().getFeatureMap();
        String stringValue = featureMap.get(eachFeature);
        if (stringValue != null && !wri.getActivatedFeatures().contains(eachFeature)) {
          // temporarily modify the live item, snapshot the rule, then restore the item
          wri.activateFeature(eachFeature);
          WhiskRule featureRule = rule.copy();
          wri.deactivateFeature(eachFeature);
          featureRule.setNeedsCompile(true);
          addRuleIfAbsent(rulesToTest, featureRule);
        }
      }

      if (wordConstraint.isRegExpConstraint() && wri.isHideRegExp()) {
        wri.setHideRegExp(false);
        WhiskRule regExpRule = rule.copy();
        wri.setHideRegExp(true);
        regExpRule.setNeedsCompile(true);
        addRuleIfAbsent(rulesToTest, regExpRule);
      }
    }
  }

  // Phase 2: extend the rule by new terms from the window around the first slot annotation.
  List<List<WhiskRuleItem>> slotTerms = getTermsWithinBounds(
          example.getAnnotations()[0].getBegin(), example.getAnnotations()[0].getEnd(), example);
  List<List<WhiskRuleItem>> windowTerms = getTermsWithinWindow(slotTerms, example, 0);
  for (List<WhiskRuleItem> eachList : windowTerms) {
    for (WhiskRuleItem term : eachList) {
      if (rule.containsTerm(term)) {
        continue;
      }
      WhiskRule proposedRule = createNewRuleByAddingTerm(rule, term);
      if (proposedRule == null) {
        continue;
      }
      addRuleIfAbsent(rulesToTest, proposedRule);

      // add a second version where we add the exact token content if it is a regexp item
      if (term.getWordConstraint().isRegExpConstraint()) {
        term.setHideRegExp(false);
        WhiskRule regExpRule = proposedRule.copy();
        term.setHideRegExp(true);
        regExpRule.setNeedsCompile(true);
        addRuleIfAbsent(rulesToTest, regExpRule);
      }

      // extend with feature conditions
      for (String eachFeature : consideredFeatures) {
        Map<String, String> featureMap = term.getWordConstraint().getTokenAnnotation()
                .getFeatureMap();
        String stringValue = featureMap.get(eachFeature);
        if (stringValue != null) {
          term.activateFeature(eachFeature);
          WhiskRule featureRule = proposedRule.copy();
          term.deactivateFeature(eachFeature);
          featureRule.setNeedsCompile(true);
          addRuleIfAbsent(rulesToTest, featureRule);
        }
      }

      // and now, for WHISK performance testing purposes, we also add POS tags
      if (posTagRootTypeName != null && posTagRootTypeName.length() > 0) {
        addPosTagCandidates(rulesToTest, proposedRule, term, doc, example);
      }
    }
  }

  if (rulesToTest.isEmpty()) {
    return bestRule;
  }

  sendStatusUpdateToDelegate(
          "Round "
                  + roundNumber
                  + "."
                  + subRoundNumber
                  + " - Testing "
                  + rulesToTest.size()
                  + " rules... "
                  + " - uncovered examples: "
                  + (allExamplesCount - coveredExamples.size() + " / " + allExamplesCount
                          + " ; cs=" + cachedTestedRuleStatistics.size()),
          TextRulerLearnerState.ML_RUNNING, false);
  TextRulerToolkit.log("Testing " + rulesToTest.size() + " rules on training set...");
  for (TextRulerRule r : rulesToTest) {
    TextRulerToolkit.log(r.getRuleString());
  }
  testRulesIfNotCached(rulesToTest);
  if (shouldAbort()) {
    return null;
  }

  // Pick the candidate with the lowest Laplacian; among equally good candidates prefer the one
  // with fewer constraint points (the more general rule).
  for (TextRulerRule r : rulesToTest) {
    WhiskRule wr = (WhiskRule) r;
    if (wr.getLaplacian() < bestL) {
      bestL = wr.getLaplacian();
      bestRule = wr;
      bestRuleConstraintPoints = bestRule.totalConstraintPoints();
    } else if (wr.getLaplacian() == bestL && bestRuleConstraintPoints >= 0) {
      TextRulerToolkit.log("Same Laplacian! So prefer more general rule!");
      if (wr.totalConstraintPoints() < bestRuleConstraintPoints) {
        TextRulerToolkit.log("\tYes, prefered!");
        bestRule = wr;
        bestRuleConstraintPoints = bestRule.totalConstraintPoints();
      }
    }
  }
  return bestRule;
}

/**
 * Appends {@code candidate} to {@code rules} unless an equal rule (per {@code equals}) is already
 * in the list.
 */
private static void addRuleIfAbsent(List<TextRulerRule> rules, TextRulerRule candidate) {
  if (!rules.contains(candidate)) {
    rules.add(candidate);
  }
}

/**
 * Adds POS-tag based variants of {@code proposedRule} to {@code rulesToTest}, provided the type
 * named by {@code posTagRootTypeName} exists and the first annotation of that type found within
 * the token's bounds covers exactly the token of {@code term}.
 *
 * <p>
 * This is not very nice code and not dynamic feature capable; it exists in order to test WHISK
 * with POS-tag terms.
 */
private void addPosTagCandidates(List<TextRulerRule> rulesToTest, WhiskRule proposedRule,
        WhiskRuleItem term, TextRulerExampleDocument doc, TextRulerExample example) {
  TextRulerAnnotation tokenAnnotation = term.getWordConstraint().getTokenAnnotation();
  CAS cas = example.getDocumentCAS();
  TypeSystem ts = cas.getTypeSystem();
  Type posTagsRootType = ts.getType(posTagRootTypeName);
  if (posTagsRootType == null) {
    // The configured type name is not part of this CAS' type system, so there is nothing to
    // look up. (The previous check tested ts, which had already been dereferenced.)
    return;
  }

  // POS-Tags created by our test hmm tagger.
  List<AnnotationFS> posTagAnnotations = TextRulerToolkit.getAnnotationsWithinBounds(cas,
          tokenAnnotation.getBegin(), tokenAnnotation.getEnd(), null, posTagsRootType);
  if (posTagAnnotations.isEmpty()) {
    return;
  }
  AnnotationFS posTag = posTagAnnotations.get(0);
  if (posTag.getBegin() != tokenAnnotation.getBegin()
          || posTag.getEnd() != tokenAnnotation.getEnd()) {
    return;
  }
  TextRulerAnnotation posTagAnnotation = new TextRulerAnnotation(posTag, doc,
          consideredFeatures);

  // NOTE(review): in all three variants below the constraint is added to `term` AFTER the
  // copy was taken, and it is never removed again. Unlike the activate/copy/deactivate pattern
  // used for features, this presumably leaves the copies without the POS constraint and
  // permanently alters `term` (including nulling its word constraint) - confirm against
  // WhiskRule.copy() and createNewRuleByAddingTerm() before relying on these candidates.

  // 1. most specific term with all constraints we have:
  WhiskRule mostSpecificRule = proposedRule.copy();
  term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
  mostSpecificRule.setNeedsCompile(true);
  addRuleIfAbsent(rulesToTest, mostSpecificRule);

  // 2. the same without the regexp thingy:
  WhiskRule withoutRegExpRule = proposedRule.copy();
  term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
  withoutRegExpRule.setNeedsCompile(true);
  addRuleIfAbsent(rulesToTest, withoutRegExpRule);

  // 3. last but not least: a rule with only the pos tag constraint:
  WhiskRule posTagOnlyRule = proposedRule.copy();
  term.addOtherConstraint(new MLWhiskOtherConstraint(tokenAnnotation, posTagAnnotation));
  term.setWordConstraint(null);
  posTagOnlyRule.setNeedsCompile(true);
  addRuleIfAbsent(rulesToTest, posTagOnlyRule);
}