in ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetectorAnnotatorBIO.java [99:206]
/**
 * Walks every character of every {@link Segment} in the CAS and assigns it a
 * B(egin) / I(nside) / O(utside) sentence label.
 * <p>
 * In training mode the label is derived from the gold {@link Sentence} annotations covered by
 * the segment and written, together with the character's features, to the data writer.
 * In prediction mode the label comes from the classifier (with a few consistency corrections)
 * and a sentence is created each time an open sentence is closed.
 *
 * @param jcas the CAS whose segments are processed
 * @throws AnalysisEngineProcessException declared by this method; presumably propagated from
 *         the data writer, the classifier, or the annotation helpers -- confirm against those APIs
 */
public void process(JCas jcas) throws AnalysisEngineProcessException {
  LOGGER.info( "Processing ..." );
  try{
    String uri = ViewUriUtil.getURI(jcas).toString();
    LOGGER.info(String.format("Processing file with uri %s", uri));
  }catch(CASRuntimeException e){
    LOGGER.debug("No uri found, probably not a big deal unless this is an evaluation.");
  }

  // The feature configuration and the training flag are consulted for every character,
  // so evaluate them once up front.
  final boolean usePositionFeats = featConfig == FEAT_CONFIG.LINE_POS
      || featConfig == FEAT_CONFIG.CHAR_POS
      || featConfig == FEAT_CONFIG.CHAR_SHAPE_POS;
  final boolean useContextFeats = featConfig == FEAT_CONFIG.CHAR
      || featConfig == FEAT_CONFIG.CHAR_POS
      || featConfig == FEAT_CONFIG.CHAR_SHAPE
      || featConfig == FEAT_CONFIG.CHAR_SHAPE_POS;
  final boolean training = this.isTraining();

  if(usePositionFeats){
    buildDocEndlineModel(jcas);
  }

  for(Segment seg : JCasUtil.select(jcas, Segment.class)){
    // Gold sentences inside this segment; only consulted during training.
    List<Sentence> sents = JCasUtil.selectCovered(jcas, Sentence.class, seg);
    // Index of the first gold sentence that has not yet ended before the current character.
    int sentInd = 0;
    // CAS offset at which the currently open predicted sentence began.
    int startInd = 0;
    // Iterate over every character in the Segment and classify it as Begin, Inside, or Outside a Sentence
    String prevOutcome = "O";
    String segText = seg.getCoveredText();

    for(int ind = 0; ind < segText.length(); ind++){
      List<Feature> feats = new ArrayList<>();
      char curChar = segText.charAt(ind);

      // Start collecting features:
      feats.add(new Feature("PrevOutcome", prevOutcome));
      // all systems get to know about the current char they're classifying (i.e. is this a period)
      feats.addAll(getCharFeatures(curChar, "Character"));
      if(useContextFeats){
        for(int window = -WINDOW_SIZE; window <= WINDOW_SIZE; window++){
          int conInd = ind + window;
          if(conInd >= 0 && conInd < segText.length()){
            feats.addAll(getCharFeatures(segText.charAt(conInd), "CharOffset_"+window));
          }
        }
      }
      String nextToken = getNextToken(segText, ind);
      String prevToken = getPrevToken(segText, ind);
      feats.addAll(getTokenFeatures(prevToken, nextToken, "Token"));
      if(usePositionFeats){
        feats.addAll(getPositionFeatures(curChar, ind, segText, nextToken));
      }

      String outcome;
      int casInd = seg.getBegin() + ind;
      if(training){
        // Annotation ends are exclusive, so a sentence whose end is <= casInd no longer
        // contains this character. Skipping with <= (and allowing the pointer to run off
        // the list) keeps the character right after a sentence, and any text after the
        // last sentence, from being labelled as inside a sentence.
        while(sentInd < sents.size() && sents.get(sentInd).getEnd() <= casInd){
          sentInd++;
        }
        if(sentInd >= sents.size() || casInd < sents.get(sentInd).getBegin()){
          // no sentence left in this segment, or current index is prior to next sentence
          outcome = "O";
        }else if(prevOutcome.equals("O")){
          // current index is in sentence but just after a character that was out of the sentence
          outcome = "B";
        }else{
          // current index is in the middle of a sentence
          // NOTE(review): directly adjacent gold sentences (no gap character) still merge here.
          outcome = "I";
        }
        this.dataWriter.write(new Instance<String>(outcome, feats));
      }else if(!prevOutcome.equals("O") && Character.isLetterOrDigit(curChar)){
        // A letter or digit directly after an in-sentence character is never a boundary,
        // so the classifier is not consulted.
        outcome = "I";
      }else{
        outcome = this.classifier.classify(feats);
        // This shouldn't be necessary, but if the learning algorithm fails, we need to correct it so
        // that our accounting works. Only a B or O can follow an O, if classifier predicts "I", switch
        // it to a "B".
        if(outcome.equals("I") && prevOutcome.equals("O")){
          outcome = "B";
        }
        boolean sentenceOpen = prevOutcome.equals("I") || prevOutcome.equals("B");
        if(sentenceOpen && (outcome.equals("O") || outcome.equals("B"))){
          // Just ended a sentence. A "B" directly after an in-sentence character also closes
          // the open sentence; otherwise resetting startInd below would silently drop it.
          makeTrimmedSentence(jcas, seg.getBegin(), segText, startInd, casInd);
        }
        if(outcome.equals("B")){
          startInd = casInd;
        }
      }
      prevOutcome = outcome;
    }

    // One final sentence at the end of the segment if we were in the middle of one when we ran out of characters.
    if(!training && !prevOutcome.equals("O")){
      makeTrimmedSentence(jcas, seg.getBegin(), segText, startInd, seg.getEnd());
    }
  }
}

/**
 * Creates a sentence over {@code [startInd, endInd)} after moving {@code endInd} back past any
 * trailing whitespace; nothing is created if no characters remain.
 *
 * @param jcas     the CAS handed on to {@code makeSentence}
 * @param segBegin CAS offset of the first character of {@code segText}
 * @param segText  covered text of the segment containing the sentence
 * @param startInd CAS offset of the first character of the sentence
 * @param endInd   CAS offset just past the last candidate character of the sentence
 * @throws AnalysisEngineProcessException declared so that anything thrown by {@code makeSentence}
 *         keeps propagating as it did when the call was inline in {@code process}
 */
private void makeTrimmedSentence(JCas jcas, int segBegin, String segText, int startInd, int endInd) throws AnalysisEngineProcessException {
  int trimmedEnd = endInd;
  // CAS offsets are converted to segment-relative indices by subtracting segBegin.
  while(trimmedEnd > startInd && Character.isWhitespace(segText.charAt(trimmedEnd - segBegin - 1))){
    trimmedEnd--;
  }
  if(trimmedEnd > startInd){
    makeSentence(jcas, startInd, trimmedEnd);
  }
}