in ruta-ep-textruler/src/main/java/org/apache/uima/ruta/textruler/learner/whisk/token/Whisk.java [359:538]
/**
 * Builds a new rule by adding {@code term} to a copy of {@code baseRule}.
 * <p>
 * The term is placed according to its term number: it goes into the pre-filler, filler or
 * post-filler pattern whose items surround that number. If the rule already holds a star wildcard
 * with the same term number, the wildcard is replaced by the term. Afterwards, a wildcard is
 * inserted next to the new term on each side where the neighboring item is neither directly
 * adjacent (by term number) nor a wildcard itself.
 *
 * @param baseRule
 *          the rule to extend; it is copied, the copy is modified
 * @param term
 *          the term to add; a copy of it is stored in the new rule
 * @return the extended copy, or {@code null} if an inconsistency was detected (an item with the
 *         same term number that is not a wildcard, or a wildcard outside the chosen pattern) or
 *         if the resulting rule string equals the rule string of {@code baseRule}
 */
protected WhiskRule createNewRuleByAddingTerm(WhiskRule baseRule, WhiskRuleItem term) {
  WhiskRule newRule = baseRule.copy();
  int termNumber = term.getTermNumberInExample();

  TextRulerRulePattern targetPattern = findTargetPatternForTerm(newRule, termNumber);
  if (targetPattern == null) {
    TextRulerToolkit.log("ERROR, NO TARGET PATTERN FOR NEW RULE TERM FOUND !");
  } else {
    int indexInPattern = insertTermIntoPattern(newRule, targetPattern, term, termNumber);
    if (indexInPattern < 0) {
      // an inconsistency was detected and logged, so no rule can be built
      return null;
    }
    insertMissingWildCards(newRule, targetPattern, indexInPattern);
    newRule.setNeedsCompile(true);
  }

  // a "new" rule that looks exactly like the base rule must not be returned
  if (newRule.getRuleString().equals(baseRule.getRuleString())) {
    return null;
  }
  return newRule;
}

/**
 * Determines which pattern of {@code rule} a term with the given term number belongs to.
 *
 * @return the chosen pattern, or {@code null} if the rule has no slot patterns
 */
private TextRulerRulePattern findTargetPatternForTerm(WhiskRule rule, int termNumber) {
  TextRulerRulePattern targetPattern = null;
  TextRulerRulePattern previousSlotPostFillerPattern = null;

  for (int i = 0; i < rule.getPatterns().size(); i++) {
    TextRulerSlotPattern slotPattern = rule.getPatterns().get(i);

    // 1. pre-filler: the term lies at or before the last pre-filler item
    // NOTE(review): this check is not guarded by "targetPattern == null", so with several slots
    // the pre-filler of a later slot can replace a target chosen for an earlier slot - confirm
    // whether that is intended.
    WhiskRuleItem it = (WhiskRuleItem) slotPattern.preFillerPattern.lastItem();
    if (it != null && termNumber <= it.getTermNumberInExample()) {
      targetPattern = slotPattern.preFillerPattern;
    }

    // 2. filler
    if (targetPattern == null && slotPattern.fillerPattern.size() > 0) {
      it = (WhiskRuleItem) slotPattern.fillerPattern.firstItem();
      if (termNumber < it.getTermNumberInExample()) {
        // the term still belongs to the pre-filler; step 1 did not select it (the pre-filler is
        // empty or all of its items lie before the term)
        targetPattern = slotPattern.preFillerPattern;
      } else {
        it = (WhiskRuleItem) slotPattern.fillerPattern.lastItem();
        if (termNumber <= it.getTermNumberInExample()) {
          targetPattern = slotPattern.fillerPattern;
        }
      }
    }

    // 3. post-filler
    if (targetPattern == null && slotPattern.postFillerPattern.size() > 0) {
      it = (WhiskRuleItem) slotPattern.postFillerPattern.firstItem();
      if (termNumber < it.getTermNumberInExample()) {
        // the term still belongs to the filler; step 2 did not select it (the filler is empty
        // or all of its items lie before the term)
        targetPattern = slotPattern.fillerPattern;
      } else {
        it = (WhiskRuleItem) slotPattern.postFillerPattern.lastItem();
        if (termNumber <= it.getTermNumberInExample()) {
          targetPattern = slotPattern.postFillerPattern;
        }
      }
    }

    // 4. nothing matched in this slot: fall back to the post-filler of the previous slot
    // (which is still null while looking at the first slot)
    if (targetPattern == null) {
      targetPattern = previousSlotPostFillerPattern;
    }
    previousSlotPostFillerPattern = slotPattern.postFillerPattern;
  }

  // the term lies behind everything we have: use the post-filler of the last slot
  if (targetPattern == null) {
    targetPattern = previousSlotPostFillerPattern;
  }
  return targetPattern;
}

/**
 * Puts a copy of {@code term} into {@code targetPattern}, either by replacing the star wildcard
 * that carries the same term number or by inserting it in term number order.
 *
 * @return the index of the new term in {@code targetPattern}, or {@code -1} if an inconsistency
 *         was detected (and logged)
 */
private int insertTermIntoPattern(WhiskRule rule, TextRulerRulePattern targetPattern,
        WhiskRuleItem term, int termNumber) {
  if (targetPattern.size() == 0) {
    targetPattern.add(term.copy());
    return 0;
  }

  // 1. check whether the term would replace a wildcard:
  WhiskRuleItem wildCard = rule.searchItemWithTermNumber(termNumber);
  if (wildCard != null) {
    if (!wildCard.isStarWildCard()) {
      TextRulerToolkit
              .log("ERROR, FOUND A TERM WITH THE SAME NUMBER THAT IS NOT A WILDCARD! HOW IS THAT???");
      return -1;
    }
    if (!targetPattern.contains(wildCard)) {
      TextRulerToolkit.log("EVEN WORSE, THAT MUST NOT BE AT ALL!");
      return -1;
    }
    int wildCardIndex = targetPattern.indexOf(wildCard);
    targetPattern.set(wildCardIndex, term.copy());
    return wildCardIndex;
  }

  // 2. no wildcard to replace: insert before the first item with a larger term number
  for (int i = 0; i < targetPattern.size(); i++) {
    WhiskRuleItem it = (WhiskRuleItem) targetPattern.get(i);
    if (termNumber < it.getTermNumberInExample()) {
      targetPattern.add(i, term.copy());
      return i;
    }
  }

  // 3. all existing items have smaller (or equal) term numbers: append
  int appendIndex = targetPattern.size();
  targetPattern.add(term.copy());
  return appendIndex;
}

/**
 * Inserts a wildcard on each side of the term at {@code indexInPattern} where the neighboring
 * item of the rule is neither directly adjacent by term number nor a star wildcard itself.
 */
private void insertMissingWildCards(WhiskRule rule, TextRulerRulePattern targetPattern,
        int indexInPattern) {
  WhiskRuleItem newTerm = (WhiskRuleItem) targetPattern.get(indexInPattern);

  // left neighbor: no direct neighbor and no wildcard yet, so put a wildcard between us
  WhiskRuleItem left = rule.searchNeighborOfItem(newTerm, true);
  if (left != null && left.getTermNumberInExample() < newTerm.getTermNumberInExample() - 1
          && !left.isStarWildCard()) {
    targetPattern.add(indexInPattern,
            WhiskRuleItem.newWildCardItem(left.getTermNumberInExample() + 1));
    // the new term moved one position to the right
    indexInPattern++;
  }

  // right neighbor: no direct neighbor and no wildcard yet, so put a wildcard between us
  WhiskRuleItem right = rule.searchNeighborOfItem(newTerm, false);
  if (right != null && right.getTermNumberInExample() > newTerm.getTermNumberInExample() + 1
          && !right.isStarWildCard()) {
    WhiskRuleItem wc = WhiskRuleItem.newWildCardItem(newTerm.getTermNumberInExample() + 1);
    if (indexInPattern + 1 < targetPattern.size()) {
      targetPattern.add(indexInPattern + 1, wc);
    } else {
      targetPattern.add(wc);
    }
  }
}