public List tokenizeTextSegment()

in ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java [62:439]
237 lines of code
76 McCabe index (conditional complexity)

	/**
	 * Splits one segment of text into tokens, applying Penn Treebank (PTB) style rules
	 * for contractions, hyphenated terms, numbers, abbreviations and punctuation.
	 * <p>
	 * The segment is scanned left to right. At each token start the first character
	 * selects a branch (end of input, newline/CR, period, hyphen, apostrophe, other
	 * punctuation, letter/digit, or other symbol) which determines the token length and
	 * the token class handed to {@code createToken}. Contractions may produce more than
	 * one token in a single loop iteration.
	 *
	 * @param jcas passed through unchanged to {@code createToken}
	 * @param textSegment the text to tokenize; may be {@code null} or empty
	 * @param offsetAdjustment passed through unchanged to {@code createToken}
	 *        (presumably added to the token offsets - confirm against createToken)
	 * @param includeTextNotJustOffsets not referenced by this method body
	 * @return the objects returned by {@code createToken}, in the order they were found;
	 *         {@code emptyTokenList} if the input is {@code null}, empty, or
	 *         {@code findFirstCharOfNextToken} finds no token start
	 * @throws RuntimeException if no non-negative token length could be determined, or if
	 *         a contraction helper returns a result inconsistent with the text
	 */
	public List<?> tokenizeTextSegment(JCas jcas, String textSegment, int offsetAdjustment, boolean includeTextNotJustOffsets) {

		// If input was null or empty, return empty token list.
		// This guard must come before anything dereferences textSegment (such as
		// toLowerCase() below); otherwise a null input throws NullPointerException
		// instead of producing the empty list.
		if (textSegment==null || textSegment.length()==0) return emptyTokenList;

		String lowerCasedText = textSegment.toLowerCase();
		ArrayList<Object> tokens = new ArrayList<Object>();
		Class<? extends BaseToken> tokenClass = null;

		// find first character of a token
		int currentPosition = 0;
		currentPosition = findFirstCharOfNextToken(textSegment, currentPosition);
		// if no token start was found at all, return empty token list
		if (currentPosition < 0) return emptyTokenList;

		while ((currentPosition = findFirstCharOfNextToken(textSegment, currentPosition))>=0) {

			// The first character of the token drives most of the decisions below.
			char firstCharOfToken = textSegment.charAt(currentPosition);
			int NOT_SET_INDICATOR = -999;
			int tokenLen = NOT_SET_INDICATOR; // should set it below to a real value
			Object bta;

			if (currentPosition+1 >= textSegment.length()) {
				// we found the start of a token, but it was the last character in the input,
				// so it is a 1-character token
				tokenLen = 1;
				tokenClass = null; // null indicates that we don't know yet what the class is

			}

			// else we have at least 2 characters to consider

			else if (isWhitespace(textSegment.charAt(currentPosition+1))) {
				// Since the following character is whitespace, and the current character
				// is the first character of a token, the current character is a one-character token
				// NOTE(review): if isWhitespace() is true for NEWLINE, a CR immediately followed
				// by LF is caught here rather than by the CR branch below - confirm.
				tokenLen = 1;
				tokenClass = null; // null indicates that we don't know yet what the class is

			}

			else if (firstCharOfToken == NEWLINE) {
				tokenLen = 1;
				tokenClass = NewlineToken.class;
			}

			else if (firstCharOfToken == CR) {

				char peekAhead;
				peekAhead = textSegment.charAt(currentPosition+1);
				if (peekAhead != NEWLINE) {
					tokenLen = 1;
					tokenClass = NewlineToken.class;
				}
				else {
					// create CR followed by LF as single end-of-line marker
					tokenLen = 2; // skip an extra one to skip both the CR and the LF
					tokenClass = NewlineToken.class;
				}

			}

			else if (firstCharOfToken==PERIOD) {
				// check if decimal number without the leading digits
				int len = getLengthIfIsNumberThatStartsWithPeriod(currentPosition, textSegment);
				if (len > 0) {
					tokenClass = NumToken.class;
					tokenLen = len;
				}

				else if (isEllipsis(currentPosition, textSegment)) {
					tokenLen = 3;
					tokenClass = PunctuationToken.class;
				} else {
					// Abbreviation does not start with period, and not part of some other token, so it is punctuation
					tokenLen = 1;
					tokenClass = PunctuationToken.class;
				}

			}

			else if (firstCharOfToken==HYPHEN_OR_MINUS_SIGN) {
				// If it's the first character of a token, then this is not a hyphenated term that
				// was supposed to be kept as one token, or we would have included it in the previous token
				// Also telephone numbers do not start with a dash
				// So assume the hyphen/dash char is a one-character token like in 5-6 or in -400
				tokenLen = 1;
				tokenClass = PunctuationToken.class;
			}

			else if (firstCharOfToken==APOSTROPHE) {
				// "can't" is not part of this case because the n is the start of the second token
				// The 've part of should've is not handled here, when something like should've or he'll
				// is found, 2 tokens are created (elsewhere)

				// Check if start of a Name
				int len = getLengthIfNameStartingWithApostrophe(currentPosition, textSegment);
				if (len > 0) {
					tokenLen = len;
					tokenClass = WordToken.class;
				} else if (ContractionsPTB.isContractionThatStartsWithApostrophe(currentPosition, lowerCasedText)) {
					// 'tis and 'twas which get tokenized as  "'t is"  and  "'t was"
					tokenLen = 2;
					tokenClass = ContractionToken.class;
					// the "is" or "was" part will become a token on the next iteration
					// TODO potential place to add some self-checking code
				} else { // is separate punctuation mark
					tokenLen = 1;
					tokenClass = PunctuationToken.class;
				}
			}

			else if (isPunctuation(firstCharOfToken)) { // other than any handled above
				// Already handled minus sign and leading period (which could be part of a decimal)
				// Since not processing 'web-text', no need to look for things like :)
				// so is some type of 1-character punctuation token
				tokenLen = 1;
				tokenClass = PunctuationToken.class;
			}

			else if (isLetterOrDigit(firstCharOfToken)) {

				boolean obviouslyIsWord = true; // until we find a non alphanum before a whitespace
				boolean obviouslyIsNumber = true; // until we find a non digit before a whitespace
				// Each index below is the position of the first character (at or after
				// currentPosition) that falls outside the named character class; -1 means not found yet.
				int nextWhitespaceOrEndOfSegment = -1;
				int nextNonLetterOrNonDigit = -1;
				int nextNonLetterDigitApostrophe = -1;
				int nextNonTelephoneOrPostalChar = -1; // digits and dash aka hyphen
				int nextNonNumericChar = -1; // 9,876.012345  is an example with all the numeric chars
				int nextNonDigit = -1;

				// First check the easy case - if just letters and digits until next whitespace (or until end of segment)
				// then that is a word or a number, can skip all the other logic to check for +hyphens
				// or contractions etc
				int i = currentPosition;
				char ch;
				do {
					ch = textSegment.charAt(i);
					if (isWhitespace(ch)) {
						// whitespace terminates every character class that is still open
						if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
						if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = i;
						if (nextNonDigit < 0) nextNonDigit = i;
						if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = i;
						if (nextNonNumericChar < 0) nextNonNumericChar = i;
						nextWhitespaceOrEndOfSegment = i;
					} else if (!isLetterOrDigit(ch)) {
						obviouslyIsWord = false; // not sure if it will be word all the way to whitespace
						obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
						if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = i;
						if (nextNonLetterDigitApostrophe < 0 && ch!=APOSTROPHE) {
							nextNonLetterDigitApostrophe = i;
						}
						if (nextNonDigit < 0) nextNonDigit = i;
						if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
							nextNonTelephoneOrPostalChar = i;
						}
						if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
							nextNonNumericChar = i;
						}
						// don't break here though, keep going to set nextWhitespace correctly for other uses
					} else if (!isDigit(ch)) {
						obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
						// since passed the !isLetterOrDigit test above, must be a letter, so
						// nextNonLetterOrNonDigit and nextNonLetterDigitApostrophe are not changed here
						if (nextNonDigit < 0) nextNonDigit = i;
						if (nextNonTelephoneOrPostalChar < 0 && !isTelephoneNumberChar(ch)) {
							nextNonTelephoneOrPostalChar = i;
						}
						if (nextNonNumericChar < 0 && !isNumericChar(ch)) {
							nextNonNumericChar = i;
						}
					} else {
						// else is a digit, none of the flags need to be set for digit characters.
					}

					i++;
				} while (i < textSegment.length() && !isWhitespace(ch));

				if (i>=textSegment.length()) {
					// Ran off the end of the segment: any class still open ends at the segment end.
					// nextNonDigit is not defaulted here; it can only still be unset when every
					// character was a digit, and that case is handled by obviouslyIsNumber below.
					if (nextWhitespaceOrEndOfSegment < 0) nextWhitespaceOrEndOfSegment = textSegment.length();
					if (nextNonLetterOrNonDigit < 0) nextNonLetterOrNonDigit = textSegment.length();
					if (nextNonLetterDigitApostrophe < 0) nextNonLetterDigitApostrophe = textSegment.length();
					if (nextNonTelephoneOrPostalChar < 0) nextNonTelephoneOrPostalChar = textSegment.length();
					if (nextNonNumericChar < 0) nextNonNumericChar = textSegment.length();
				}

				if (obviouslyIsNumber) {
					tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
					tokenClass = NumToken.class;
				} else if (obviouslyIsWord) {
					// Check for things like "cannot" and "gonna" that appear to be one token but
					// are supposed to be more than one according to PTB rules.
					String lowerCasedSubstring = textSegment.substring(currentPosition, nextWhitespaceOrEndOfSegment).toLowerCase();
					int len = ContractionsPTB.lenOfFirstTokenInContraction(lowerCasedSubstring);
					if (len > 0) { // is a contraction that doesn't contain an apostrophe, like "gonna", create WordToken for first part,
							// and create ContractionToken for other token(s)
						tokenLen = len;
						tokenClass = WordToken.class;
						bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
						tokens.add(bta);
						currentPosition+=tokenLen; // currentPosition

						len = ContractionsPTB.lenOfSecondTokenInContraction(lowerCasedSubstring);

						tokenLen = len;
						tokenClass = ContractionToken.class;

						len = ContractionsPTB.lenOfThirdTokenInContraction(lowerCasedSubstring);
						if (len>0) { // if there is a 3rd, create the 2nd and set up for the 3rd to be created later
							bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
							tokens.add(bta);
							currentPosition+=tokenLen; // currentPosition

							tokenLen = len;
							tokenClass = ContractionToken.class;
						}
					} else {
						tokenLen = nextWhitespaceOrEndOfSegment - currentPosition;
						tokenClass = WordToken.class;
					}
				} else { // Still within the "isLetterOrDigit(firstCharOfToken)" but not obviously number or word

					int len;

					ContractionResult cr;

					// Not sure what the token is, the token could extend to
					// include all to the end of an email address,
					// or include all to the end of a URL,
					// or through the next period (for an abbreviation)
					// or to the next hyphen,
					// or beyond,
					// or to the next whitespace (note already handle case of all alphanums to whitespace
					// or to the end of input (note already handle case of all alphanums to end of input
					// or the next apostrophe (for a most contractions)
					// or until "n't" for such contractions
					// or the next other punctuation symbol
					// or beyond (for 80's)
					// or could include some punctuation like 3,245.51

					// Need to check for things like 80's before checking for contractions or else 80's looks like a contraction
					if (nextNonLetterOrNonDigit < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit)==APOSTROPHE) {
						String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
						len = ContractionsPTB.tokenLengthCheckingForSingleQuoteWordsToKeepTogether(lowerCasedSubstring);
						if (len > nextNonLetterOrNonDigit-currentPosition) { // if keeping the apostrophe attached
							tokenLen = len;
							tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
						} // else let contraction checking later determine what to do

					}
					if (tokenLen == NOT_SET_INDICATOR) { // not found yet
						if ((cr = ContractionsPTB.getLengthIfNextApostIsMiddleOfContraction(currentPosition, nextNonLetterOrNonDigit, lowerCasedText)) != null) {
							len = cr.getWordTokenLen();
							tokenLen = len;
							tokenClass = WordToken.class;
							char c = lowerCasedText.charAt(currentPosition+len);
							if (c=='n' || c==APOSTROPHE) { // if a "n't" contraction or a contraction where contraction token starts with '
								if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
								// First create the WordToken (no apostrophe)
								if(tokenLen > 0){
									bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
									tokens.add(bta);
									currentPosition+=tokenLen; // currentPosition
								}
								// Set up to create the second token, for other contractions, the next token will start with an
								// apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
								// so just go ahead and handle it here instead of having to keep track of previous
								// and handle n't in next loop.
								tokenLen = cr.getContractionTokenLen();
								// if (tokenLen!=3) throw new RuntimeException("getContractionTokenLen != 3 for n't");
								tokenClass = ContractionToken.class;
							} else {
								throw new RuntimeException("ERROR: getLengthIfNextApostIsMiddleOfContraction returned " + len + " but the character (" + c +") after that is not 'n' or apostrophe ");
							}

						} else if ((len = lenIfIsTelephoneNumber(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
							tokenLen = len;
							tokenClass = WordToken.class;
						} else if ((len = lenIfIsPostalCode(currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar)) > 0) {
							tokenLen = len;
							tokenClass = WordToken.class;
						} else if ((len = lenIfIsUrl(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
							tokenLen = len;
							tokenClass = WordToken.class;
						} else if ((len = lenIfIsEmailAddress(currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment)) > 0) {
							tokenLen = len;
							tokenClass = WordToken.class;
						} else if ((len = lenIfIsAbbreviation(currentPosition, textSegment, nextWhitespaceOrEndOfSegment)) > 0) {
							// note: the abbreviation check is given the original-case text, not the lower-cased copy
							tokenLen = len;
							tokenClass = WordToken.class;
						} else { // Still within the "isLetterOrDigit(firstCharOfToken)".
							// not obviously a word or number (already checked those)
							// and not Url, EmailAddress, or Abbreviation
							// There could be a hyphen before the next white space,
							// or a symbol before the next whitespace
							// or apostrophe like in 80's or P'yongyang (one token each) or James' or Ted's (2 tokens each)
							// Take alphanums, but consider hyphenated words and names with apostrophes
							// and consider tele numbers and postal codes

							if (nextNonLetterOrNonDigit<lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterOrNonDigit)==HYPHEN_OR_MINUS_SIGN) {
								// telephone numbers and postal codes handled above already
								String lowerCasedSubstring = lowerCasedText.substring(currentPosition, nextWhitespaceOrEndOfSegment);
								len = HyphenatedPTB.tokenLengthCheckingForHyphenatedTerms(lowerCasedSubstring);
								tokenLen = len;
								if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition + " nextNonLetterOrNonDigit = " + nextNonLetterOrNonDigit);
								tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
							} else if (nextNonNumericChar > 0 && (len = lenIfIsNumberContainingComma(currentPosition, lowerCasedText, nextNonNumericChar)) > 0) {
								tokenLen = len;
								tokenClass = NumToken.class;
							} else if (nextNonLetterDigitApostrophe < lowerCasedText.length() && lowerCasedText.charAt(nextNonLetterDigitApostrophe)==PERIOD) {
								// see if is a number with a decimal place (without commas, comma-containing numbers are handled above)
								if (nextNonDigit==lowerCasedText.length()-1) {
									// end of sentence, don't include the period as part of the number, count it as end of sentence marker (punctuation)
									tokenLen = nextNonDigit - currentPosition;
									tokenClass = NumToken.class;
								} else if (nextNonLetterDigitApostrophe==nextNonDigit) {
									// if not end of sentence, do include period (decimal point) in the NumToken
									tokenLen = nextNonDigit + 1 + getLenToNextNonDigit(lowerCasedText, nextNonDigit+1) - currentPosition;
									tokenClass = NumToken.class;
								}
								else {
									// something like 2J3. which is not a number or 2'3.
									tokenLen = nextNonLetterOrNonDigit - currentPosition;
									tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
								}
							} else { // breaking character is not - character and not ' character, so stop there
								tokenLen = nextNonLetterOrNonDigit - currentPosition;
								tokenClass = wordTokenOrNumToken(lowerCasedText, currentPosition, tokenLen);
							}

						}

					}
				}

			} else { // some other symbol or punctuation not included in isPunctuation
				// Since not processing 'web-text', no need to look for things like :)
				// so it is some type of 1-character symbol token
				tokenLen = 1;
				tokenClass = SymbolToken.class;
			}

			// add the token created; a negative length (including NOT_SET_INDICATOR) means
			// no branch above managed to determine the token's extent
			if (tokenLen < 0) throw new RuntimeException("tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
			bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
			tokens.add(bta);
			currentPosition+=tokenLen; // currentPosition

		} // end while loop

		return tokens;

	}