public void process()

in WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java [79:215]


   /**
    * Creates token and sentence annotations for the text of the given CAS.
    *
    * <p>If {@code sofaNames} is set and non-empty, every view returned by
    * {@code aCas.getViewIterator(name)} for each configured name is
    * processed; otherwise {@code aCas} itself is processed. Views whose
    * document text is {@code null} are skipped.</p>
    *
    * <p>Per view, the text is scanned character by character using
    * {@code getCharacterType}:</p>
    * <ul>
    * <li>runs of letter/number characters form one token;</li>
    * <li>whitespace and newline characters only end the current token;</li>
    * <li>special and punctuation characters end the current token and are
    * annotated as a one-character token of their own;</li>
    * <li>a punctuation character contained in {@code punctuations} that is
    * directly followed by a whitespace or newline character also ends the
    * current sentence (the sentence span includes that character).</li>
    * </ul>
    * <p>At the end of the text an open token is closed and the remaining text
    * from the last sentence start is annotated as a sentence.</p>
    *
    * @param aCas
    *           the CAS to tokenize, or the CAS whose views are looked up via
    *           the configured sofa names
    * @throws AnalysisEngineProcessException
    *            declared by the signature; nothing in this method throws it
    *            directly (presumably it can originate from the helpers called
    *            here - TODO confirm)
    */
   public void process(CAS aCas) throws AnalysisEngineProcessException {

      this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
            MESSAGE_BUNDLE, "whitespace_tokenizer_info_start_processing");

      // collect the views that have to be processed
      ArrayList<CAS> casList = new ArrayList<CAS>();
      if (this.sofaNames != null && this.sofaNames.length > 0) {
         for (String sofaName : this.sofaNames) {
            // wildcard iterator + cast works for both raw and generic
            // return types of getViewIterator
            Iterator<?> views = aCas.getViewIterator(sofaName);
            while (views.hasNext()) {
               casList.add((CAS) views.next());
            }
         }
      } else {
         // no sofa names configured, process the given CAS itself
         casList.add(aCas);
      }

      for (CAS view : casList) {

         // the view currently worked on is kept in a field
         // (presumably read by createAnnotation - TODO confirm)
         this.cas = view;

         // getDocumentText() returns null if no text is set for the view;
         // there is nothing to tokenize in that case
         String documentText = this.cas.getDocumentText();
         if (documentText == null) {
            continue;
         }
         char[] textContent = documentText.toCharArray();
         int textEnd = textContent.length;

         int tokenStart = UNDEFINED;
         int sentenceStart = 0;

         for (int currentCharPos = 0; currentCharPos < textEnd; currentCharPos++) {
            char currentChar = textContent[currentCharPos];
            int currentCharType = getCharacterType(currentChar);

            // character class of the following character, UNDEFINED at the
            // end of the text
            int nextCharType = UNDEFINED;
            if ((currentCharPos + 1) < textEnd) {
               nextCharType = getCharacterType(textContent[currentCharPos + 1]);
            }

            boolean isWordChar = currentCharType == CH_LETTER
                  || currentCharType == CH_NUMBER;
            boolean isTokenBoundary = currentCharType == CH_WHITESPACE
                  || currentCharType == CH_SPECIAL
                  || currentCharType == CH_NEWLINE
                  || currentCharType == CH_PUNCTUATION;

            if (isWordChar) {
               // first letter/number of a token starts a new token
               if (tokenStart == UNDEFINED) {
                  tokenStart = currentCharPos;
               }
            } else if (isTokenBoundary) {

               // every boundary character terminates the open token
               if (tokenStart != UNDEFINED) {
                  createAnnotation(this.tokenType, tokenStart, currentCharPos);
                  tokenStart = UNDEFINED;
               }

               if (currentCharType == CH_SPECIAL) {
                  // special characters are tokens of their own
                  createAnnotation(this.tokenType, currentCharPos,
                        currentCharPos + 1);
               } else if (currentCharType == CH_PUNCTUATION) {

                  // sentence end: a sentence punctuation character that is
                  // followed by whitespace or a newline
                  boolean followedByGap = nextCharType == CH_WHITESPACE
                        || nextCharType == CH_NEWLINE;
                  if (followedByGap
                        && punctuations.contains(String.valueOf(currentChar))) {
                     createAnnotation(this.sentenceType, sentenceStart,
                           currentCharPos + 1);
                     sentenceStart = currentCharPos + 1;
                  }

                  // punctuation characters are tokens of their own
                  createAnnotation(this.tokenType, currentCharPos,
                        currentCharPos + 1);
               }
            }
            // characters of any other type neither start nor end a token
         } // end of character loop

         // end of text reached, terminate an open token annotation
         if (tokenStart != UNDEFINED) {
            createAnnotation(this.tokenType, tokenStart, textEnd);
         }

         // end of text reached, terminate the open sentence annotation
         // NOTE(review): sentenceStart is only ever 0 or a position + 1, so
         // this check presumably always passes (assuming UNDEFINED is
         // negative) - the trailing span is annotated even if it is empty or
         // whitespace only; confirm this is intended
         if (sentenceStart != UNDEFINED) {
            createAnnotation(this.sentenceType, sentenceStart, textEnd);
         }
      }
      this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
            MESSAGE_BUNDLE, "whitespace_tokenizer_info_stop_processing");
   }