in WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java [79:215]
/**
 * Tokenizes the document text of one or more CAS views and creates token and
 * sentence annotations for it.
 *
 * <p>If {@code sofaNames} is configured, every view returned by
 * {@code aCas.getViewIterator(name)} for each configured name is processed;
 * otherwise only {@code aCas} itself is processed. Views that carry no
 * document text are skipped.
 *
 * <p>Within a view the text is scanned character by character:
 * <ul>
 * <li>a run of letter/number characters forms one token;</li>
 * <li>whitespace and new line characters end the current token;</li>
 * <li>special and punctuation characters end the current token and become a
 * one-character token of their own;</li>
 * <li>a punctuation character contained in {@code punctuations} and directly
 * followed by a whitespace or new line character ends the current
 * sentence.</li>
 * </ul>
 * At the end of the text a still open token and the remaining sentence span
 * are annotated as well.
 *
 * @param aCas
 *          the CAS to process
 * @throws AnalysisEngineProcessException
 *           declared by the signature; nothing in this method throws it
 *           directly
 */
public void process(CAS aCas) throws AnalysisEngineProcessException {
  this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
      MESSAGE_BUNDLE, "whitespace_tokenizer_info_start_processing");

  // collect the views that have to be tokenized
  ArrayList<CAS> casList = new ArrayList<CAS>();
  if (this.sofaNames != null && this.sofaNames.length > 0) {
    // sofa names are configured: process every view they select
    for (String sofaName : this.sofaNames) {
      Iterator<?> viewIterator = aCas.getViewIterator(sofaName);
      while (viewIterator.hasNext()) {
        casList.add((CAS) viewIterator.next());
      }
    }
  } else {
    // no sofa names configured: process the CAS that was passed in
    casList.add(aCas);
  }

  for (CAS view : casList) {
    // The field is assigned before any annotation is created, exactly as
    // before. NOTE(review): createAnnotation presumably reads this.cas -
    // confirm against its implementation.
    this.cas = view;

    // A view without document text has nothing to tokenize; dereferencing
    // the null text used to fail with a NullPointerException.
    String documentText = view.getDocumentText();
    if (documentText == null) {
      continue;
    }

    char[] textContent = documentText.toCharArray();
    int textEnd = textContent.length;
    int tokenStart = UNDEFINED;
    int sentenceStart = 0;

    for (int currentCharPos = 0; currentCharPos < textEnd; currentCharPos++) {
      char currentChar = textContent[currentCharPos];
      int currentCharType = getCharacterType(currentChar);

      // character class of the following character, UNDEFINED at end of text
      int nextCharType = UNDEFINED;
      if ((currentCharPos + 1) < textEnd) {
        nextCharType = getCharacterType(textContent[currentCharPos + 1]);
      }

      if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
        // first letter/number after a gap starts a new token
        if (tokenStart == UNDEFINED) {
          tokenStart = currentCharPos;
        }
        continue;
      }

      boolean isSeparator = currentCharType == CH_WHITESPACE
          || currentCharType == CH_NEWLINE;
      boolean isSingleCharToken = currentCharType == CH_SPECIAL
          || currentCharType == CH_PUNCTUATION;
      if (!isSeparator && !isSingleCharToken) {
        // any other character class neither starts nor ends a token
        continue;
      }

      // every remaining character class terminates an open token
      if (tokenStart != UNDEFINED) {
        createAnnotation(this.tokenType, tokenStart, currentCharPos);
        tokenStart = UNDEFINED;
      }

      // sentence end: known punctuation followed by whitespace or new line
      if (currentCharType == CH_PUNCTUATION
          && (nextCharType == CH_WHITESPACE || nextCharType == CH_NEWLINE)
          && punctuations.contains(String.valueOf(currentChar))) {
        createAnnotation(this.sentenceType, sentenceStart, currentCharPos + 1);
        sentenceStart = currentCharPos + 1;
      }

      // special and punctuation characters are tokens of their own
      if (isSingleCharToken) {
        createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
      }
    }

    // end of text: terminate a still open token
    if (tokenStart != UNDEFINED) {
      createAnnotation(this.tokenType, tokenStart, textEnd);
    }

    // end of text: terminate the open sentence.
    // NOTE(review): sentenceStart starts at 0 and is only ever moved forward,
    // so unless UNDEFINED is a reachable offset this check always passes and
    // a final sentence annotation is created even when the remaining span is
    // empty or whitespace only - behavior kept as is, confirm it is intended.
    if (sentenceStart != UNDEFINED) {
      createAnnotation(this.sentenceType, sentenceStart, textEnd);
    }
  }

  this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
      MESSAGE_BUNDLE, "whitespace_tokenizer_info_stop_processing");
}