in ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java [62:439]
/**
 * Splits one segment of text into tokens following Penn Treebank style rules.
 *
 * <p>The segment is scanned left to right. For each token start found by
 * {@code findFirstCharOfNextToken}, the first character (and where needed the
 * characters after it) decide the token length and the token class: newline,
 * punctuation, symbol, number, word, or contraction. Contractions may yield
 * two or three tokens for a single whitespace-delimited chunk.
 *
 * @param jcas the CAS handed through to {@code createToken}
 * @param textSegment the text to tokenize; may be {@code null} or empty
 * @param offsetAdjustment value handed through to {@code createToken} together with the
 *        begin/end positions of each token within {@code textSegment}
 * @param includeTextNotJustOffsets not read by this method body
 * @return the tokens in order of appearance; {@code emptyTokenList} when the input is
 *         {@code null}, empty, or contains no token start
 * @throws RuntimeException if a computed token length is negative, or if the contraction
 *         helper reports a contraction whose next character is neither {@code 'n'} nor an
 *         apostrophe
 */
public List<?> tokenizeTextSegment(JCas jcas, String textSegment, int offsetAdjustment, boolean includeTextNotJustOffsets) {
    // If input was null or empty, return empty token list.
    // This guard must run before textSegment is dereferenced; otherwise a null
    // input raises a NullPointerException instead of yielding the empty list.
    if (textSegment == null || textSegment.length() == 0) {
        return emptyTokenList;
    }

    // NOTE(review): toLowerCase() uses the default locale, and positions found in
    // textSegment are used below as indexes into lowerCasedText. That assumes both
    // strings have the same length - confirm this holds for the expected input.
    String lowerCasedText = textSegment.toLowerCase();
    ArrayList<Object> tokens = new ArrayList<Object>();
    Class<? extends BaseToken> tokenClass = null;

    // find first character of a token
    int currentPosition = 0;
    currentPosition = findFirstCharOfNextToken(textSegment, currentPosition);
    // if no token start was found, return empty token list
    if (currentPosition < 0) {
        return emptyTokenList;
    }

    while ((currentPosition = findFirstCharOfNextToken(textSegment, currentPosition)) >= 0) {
        // The first character of the token, and in several branches the character
        // after it, drive the decisions below.
        char firstCharOfToken = textSegment.charAt(currentPosition);
        int NOT_SET_INDICATOR = -999;
        int tokenLen = NOT_SET_INDICATOR; // should set it below to a real value
        Object bta;

        if (currentPosition + 1 >= textSegment.length()) {
            // we found the start of a token, but it was the last character in the input,
            // so it is a 1-character token
            tokenLen = 1;
            tokenClass = null; // null indicates that we don't know yet what the class is
        }
        // else we have at least 2 characters to consider
        else if (isWhitespace(textSegment.charAt(currentPosition + 1))) {
            // Since the following character is whitespace, and the current character
            // is the first character of a token, the current character is a one-character token
            tokenLen = 1;
            tokenClass = null; // null indicates that we don't know yet what the class is
        }
        else if (firstCharOfToken == NEWLINE) {
            tokenLen = 1;
            tokenClass = NewlineToken.class;
        }
        else if (firstCharOfToken == CR) {
            char peekAhead;
            peekAhead = textSegment.charAt(currentPosition + 1);
            if (peekAhead != NEWLINE) {
                tokenLen = 1;
                tokenClass = NewlineToken.class;
            }
            else {
                // create CR followed by LF as single end-of-line marker
                tokenLen = 2; // skip an extra one to skip both the CR and the LF
                tokenClass = NewlineToken.class;
            }
        }
        else if (firstCharOfToken == PERIOD) {
            // check if decimal number without the leading digits
            int len = getLengthIfIsNumberThatStartsWithPeriod(currentPosition, textSegment);
            if (len > 0) {
                tokenClass = NumToken.class;
                tokenLen = len;
            }
            else if (isEllipsis(currentPosition, textSegment)) {
                tokenLen = 3;
                tokenClass = PunctuationToken.class;
            } else {
                // Abbreviation does not start with period, and not part of some other token, so it is punctuation
                tokenLen = 1;
                tokenClass = PunctuationToken.class;
            }
        }
        else if (firstCharOfToken == HYPHEN_OR_MINUS_SIGN) {
            // If it's the first character of a token, then this is not a hyphenated term that
            // was supposed to be kept as one token, or we would have included it in the previous token.
            // Also telephone numbers do not start with a dash.
            // So assume the hyphen/dash char is a one-character token like in 5-6 or in -400
            tokenLen = 1;
            tokenClass = PunctuationToken.class;
        }
        else if (firstCharOfToken == APOSTROPHE) {
            // "can't" is not part of this case because the n is the start of the second token.
            // The 've part of should've is not handled here; when something like should've or he'll
            // is found, 2 tokens are created (elsewhere).
            // Check if start of a Name
            int len = getLengthIfNameStartingWithApostrophe(currentPosition, textSegment);
            if (len > 0) {
                tokenLen = len;
                tokenClass = WordToken.class;
            } else if (ContractionsPTB.isContractionThatStartsWithApostrophe(currentPosition, lowerCasedText)) {
                // 'tis and 'twas which get tokenized as "'t is" and "'t was"
                tokenLen = 2;
                tokenClass = ContractionToken.class;
                // the "is" or "was" part will become a token on the next iteration
                // TODO potential place to add some self-checking code
            } else { // is separate punctuation mark
                tokenLen = 1;
                tokenClass = PunctuationToken.class;
            }
        }
        else if (isPunctuation(firstCharOfToken)) { // other than any handled above
            // Already handled minus sign and leading period (which could be part of a decimal).
            // Since not processing 'web-text', no need to look for things like :)
            // so is some type of 1-character punctuation token
            tokenLen = 1;
            tokenClass = PunctuationToken.class;
        }
        else if (isLetterOrDigit(firstCharOfToken)) {
            boolean obviouslyIsWord = true; // until we find a non alphanum before a whitespace
            boolean obviouslyIsNumber = true; // until we find a non digit before a whitespace
            int nextWhitespaceOrEndOfSegment = -1;
            int nextNonLetterOrNonDigit = -1;
            int nextNonLetterDigitApostrophe = -1;
            int nextNonTelephoneOrPostalChar = -1; // digits and dash aka hyphen
            int nextNonNumericChar = -1; // 9,876.012345 is an example with all the numeric chars
            int nextNonDigit = -1;

            // First check the easy case - if just letters and digits until next whitespace (or until end of segment)
            // then that is a word or a number, can skip all the other logic to check for hyphens
            // or contractions etc.
            // The scan records the first position of each kind of "breaking" character
            // and stops after the first whitespace or at the end of the segment.
            int i = currentPosition;
            char ch;
            do {
                ch = textSegment.charAt(i);
                if (isWhitespace(ch)) {
                    if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
                    if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = i;
                    if (nextNonDigit < 0) nextNonDigit = i;
                    if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = i;
                    if (nextNonNumericChar < 0) nextNonNumericChar = i;
                    nextWhitespaceOrEndOfSegment = i;
                } else if (!isLetterOrDigit(ch)) {
                    obviouslyIsWord = false; // not sure if it will be word all the way to whitespace
                    obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
                    if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
                    if (nextNonLetterDigitApostrophe < 0 && ch != APOSTROPHE) {
                        nextNonLetterDigitApostrophe = i;
                    }
                    if (nextNonDigit < 0) nextNonDigit = i;
                    if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
                        nextNonTelephoneOrPostalChar = i;
                    }
                    if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
                        nextNonNumericChar = i;
                    }
                    // don't break here though, keep going to set nextWhitespace correctly for other uses
                } else if (!isDigit(ch)) {
                    obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
                    // ch passed the isLetterOrDigit test above, so it must be a letter:
                    // nextNonLetterOrNonDigit and nextNonLetterDigitApostrophe are not changed here
                    if (nextNonDigit < 0) nextNonDigit = i;
                    if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
                        nextNonTelephoneOrPostalChar = i;
                    }
                    if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
                        nextNonNumericChar = i;
                    }
                } else {
                    // else is a digit, none of the flags need to be set for digit characters.
                }
                i++;
            } while (i < textSegment.length() && !isWhitespace(ch));

            if (i >= textSegment.length()) {
                // Ran off the end of the segment: any marker still unset points at the end.
                if (nextWhitespaceOrEndOfSegment < 0) nextWhitespaceOrEndOfSegment = textSegment.length();
                if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = textSegment.length();
                if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = textSegment.length();
                if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = textSegment.length();
                if (nextNonNumericChar < 0) nextNonNumericChar = textSegment.length();
            }

            if (obviouslyIsNumber) {
                tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
                tokenClass = NumToken.class;
            } else if (obviouslyIsWord) {
                // Check for things like "cannot" and "gonna" that appear to be one token but
                // are supposed to be more than one according to PTB rules.
                String lowerCasedSubstring = textSegment.substring(currentPosition, nextWhitespaceOrEndOfSegment).toLowerCase();
                int len = ContractionsPTB.lenOfFirstTokenInContraction(lowerCasedSubstring);
                if (len > 0) {
                    // is a contraction that doesn't contain an apostrophe, like "gonna":
                    // create WordToken for first part, and create ContractionToken for other token(s)
                    tokenLen = len;
                    tokenClass = WordToken.class;
                    bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition + tokenLen, offsetAdjustment);
                    tokens.add(bta);
                    currentPosition += tokenLen;
                    len = ContractionsPTB.lenOfSecondTokenInContraction(lowerCasedSubstring);
                    tokenLen = len;
                    tokenClass = ContractionToken.class;
                    len = ContractionsPTB.lenOfThirdTokenInContraction(lowerCasedSubstring);
                    if (len > 0) { // if there is a 3rd, create the 2nd and set up for the 3rd to be created later
                        bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition + tokenLen, offsetAdjustment);
                        tokens.add(bta);
                        currentPosition += tokenLen;
                        tokenLen = len;
                        tokenClass = ContractionToken.class;
                    }
                } else {
                    tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
                    tokenClass = WordToken.class;
                }
            } else { // Still within the "isLetterOrDigit(firstCharOfToken)" but not obviously number or word
                int len;
                ContractionResult cr;
                // Not sure what the token is, the token could extend to
                //   include all to the end of an email address,
                //   or include all to the end of a URL,
                //   or through the next period (for an abbreviation)
                //   or to the next hyphen,
                //   or beyond,
                //   or to the next whitespace (note already handled case of all alphanums to whitespace)
                //   or to the end of input (note already handled case of all alphanums to end of input)
                //   or the next apostrophe (for most contractions)
                //   or until "n't" for such contractions
                //   or the next other punctuation symbol
                //   or beyond (for 80's)
                //   or could include some punctuation like 3,245.51
                // Need to check for things like 80's before checking for contractions or else 80's looks like a contraction
                if (nextNonLetterOrNonDigit < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit) == APOSTROPHE) {
                    String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
                    len = ContractionsPTB.tokenLengthCheckingForSingleQuoteWordsToKeepTogether(lowerCasedSubstring);
                    if (len > nextNonLetterOrNonDigit - currentPosition) { // if keeping the apostrophe attached
                        tokenLen = len;
                        tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                    } // else let contraction checking later determine what to do
                }
                if (tokenLen == NOT_SET_INDICATOR) { // not found yet
                    if ((cr = ContractionsPTB.getLengthIfNextApostIsMiddleOfContraction(currentPosition, nextNonLetterOrNonDigit, lowerCasedText)) != null) {
                        len = cr.getWordTokenLen();
                        tokenLen = len;
                        tokenClass = WordToken.class;
                        char c = lowerCasedText.charAt(currentPosition + len);
                        if (c == 'n' || c == APOSTROPHE) { // if a "n't" contraction or a contraction where contraction token starts with '
                            if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
                            // First create the WordToken (no apostrophe)
                            if (tokenLen > 0) {
                                bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition + tokenLen, offsetAdjustment);
                                tokens.add(bta);
                                currentPosition += tokenLen;
                            }
                            // Set up to create the second token. For other contractions, the next token will start with an
                            // apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
                            // so just go ahead and handle it here instead of having to keep track of previous
                            // and handle n't in next loop.
                            tokenLen = cr.getContractionTokenLen();
                            tokenClass = ContractionToken.class;
                        } else {
                            throw new RuntimeException("ERROR: getLengthIfNextApostIsMiddleOfContraction returned " + len + " but the character (" + c +") after that is not 'n' or apostrophe ");
                        }
                    } else if ((len = lenIfIsTelephoneNumber(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
                        tokenLen = len;
                        tokenClass = WordToken.class;
                    } else if ((len = lenIfIsPostalCode(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
                        tokenLen = len;
                        tokenClass = WordToken.class;
                    } else if ((len = lenIfIsUrl(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
                        tokenLen = len;
                        tokenClass = WordToken.class;
                    } else if ((len = lenIfIsEmailAddress(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
                        tokenLen = len;
                        tokenClass = WordToken.class;
                    } else if ((len = lenIfIsAbbreviation(currentPosition, textSegment, nextWhitespaceOrEndOfSegment)) > 0) {
                        tokenLen = len;
                        tokenClass = WordToken.class;
                    } else { // Still within the "isLetterOrDigit(firstCharOfToken)".
                        // not obviously a word or number (already checked those)
                        // and not Url, EmailAddress, or Abbreviation.
                        // There could be a hyphen before the next white space,
                        // or a symbol before the next whitespace
                        // or apostrophe like in 80's or P'yongyang (one token each) or James' or Ted's (2 tokens each).
                        // Take alphanums, but consider hyphenated words and names with apostrophes
                        // and consider tele numbers and postal codes
                        if (nextNonLetterOrNonDigit < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit) == HYPHEN_OR_MINUS_SIGN) {
                            // telephone numbers and postal codes handled above already
                            String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
                            len = HyphenatedPTB.tokenLengthCheckingForHyphenatedTerms(lowerCasedSubstring);
                            tokenLen = len;
                            if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition + " nextNonLetterOrNonDigit = " + nextNonLetterOrNonDigit);
                            tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                        } else if (nextNonNumericChar > 0 && (len = lenIfIsNumberContainingComma(currentPosition, lowerCasedText, nextNonNumericChar)) > 0) {
                            tokenLen = len;
                            tokenClass = NumToken.class;
                        } else if (nextNonLetterDigitApostrophe < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterDigitApostrophe) == PERIOD) {
                            // see if is a number with a decimal place (without commas, comma-containing numbers are handled above)
                            if (nextNonDigit == lowerCasedText.length() - 1) {
                                // end of sentence, don't include the period as part of the number,
                                // count it as end of sentence marker (punctuation)
                                tokenLen = nextNonDigit - currentPosition;
                                tokenClass = NumToken.class;
                            } else if (nextNonLetterDigitApostrophe == nextNonDigit) {
                                // if not end of sentence, do include period (decimal point) in the NumToken
                                tokenLen = nextNonDigit + 1 + getLenToNextNonDigit(lowerCasedText, nextNonDigit + 1) - currentPosition;
                                tokenClass = NumToken.class;
                            }
                            else {
                                // something like 2J3. which is not a number or 2'3.
                                tokenLen = nextNonLetterOrNonDigit - currentPosition;
                                tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                            }
                        } else { // breaking character is not - character and not ' character, so stop there
                            tokenLen = nextNonLetterOrNonDigit - currentPosition;
                            tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
                        }
                    }
                }
            }
        } else { // some other symbol or punctuation not included in isPunctuation
            // Since not processing 'web-text', no need to look for things like :)
            // so it is some type of 1-character symbol token
            tokenLen = 1;
            tokenClass = SymbolToken.class;
        }

        // add the token created
        if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
        bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition + tokenLen, offsetAdjustment);
        tokens.add(bta);
        currentPosition += tokenLen;
    } // end while loop

    return tokens;
}