public void buildTokens()

in uimafit-core/src/main/java/org/apache/uima/fit/testing/factory/TokenBuilder.java [207:278]


  /**
   * Sets {@code aText} as the document text of the JCas and creates token and sentence
   * annotations from a whitespace/newline-delimited description of the tokens.
   * <p>
   * Each line of {@code aTokensString} describes one sentence and the whitespace-separated items
   * on a line are its tokens. Token offsets are found by searching {@code aText} for each token
   * string, starting at the end of the previously located token, so the tokens must occur in
   * {@code aText} in the given order. Empty lines (e.g. caused by a leading newline or a blank
   * tokens string) produce neither tokens nor sentences.
   * <p>
   * Part-of-speech tags and stems are whitespace-separated and are assigned to the tokens in
   * order, counted across all sentences. Surplus tags or stems are ignored.
   *
   * @param aJCas
   *          the JCas to set the text on and to add the annotations to
   * @param aText
   *          the document text
   * @param aTokensString
   *          the tokens, separated by whitespace, with one sentence per line
   * @param aPosTagsString
   *          the part-of-speech tags, one per token, separated by whitespace; may be
   *          {@code null} to set no tags
   * @param aStemsString
   *          the stems, one per token, separated by whitespace; may be {@code null} to set no
   *          stems
   * @throws IllegalArgumentException
   *           if tags or stems are given although no corresponding feature name is configured,
   *           if fewer tags or stems than tokens are given, or if a token cannot be found in
   *           {@code aText} after the preceding token
   */
  public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString,
          String aStemsString) {
    aJCas.setDocumentText(aText);

    if (aPosTagsString != null && posFeatureName == null) {
      throw new IllegalArgumentException("posTagsString must be null if TokenBuilder is "
              + "not initialized with a feature name corresponding to the part-of-speech "
              + "feature of the token type (assuming your token type has such a feature).");
    }

    if (aStemsString != null && stemFeatureName == null) {
      throw new IllegalArgumentException("stemsString must be null if TokenBuilder is not "
              + "initialized with a feature name corresponding to the stem feature "
              + "of the token type (assuming your token type has such a feature).");
    }

    Feature posFeature = null;
    if (posFeatureName != null) {
      posFeature = aJCas.getTypeSystem().getType(tokenClass.getName())
              .getFeatureByBaseName(posFeatureName);
    }
    Feature stemFeature = null;
    if (stemFeatureName != null) {
      stemFeature = aJCas.getTypeSystem().getType(tokenClass.getName())
              .getFeatureByBaseName(stemFeatureName);
    }

    // Collapse whitespace around line breaks so that every line is exactly one sentence.
    String tokensString = aTokensString.replaceAll("\\s*\n\\s*", "\n");
    String[] sentenceStrings = tokensString.split("\n");
    String[] posTags = aPosTagsString != null ? aPosTagsString.split("\\s+") : null;
    String[] stems = aStemsString != null ? aStemsString.split("\\s+") : null;

    int offset = 0;
    // Index into posTags/stems; runs across sentence boundaries.
    int tokenIndex = 0;

    for (String sentenceString : sentenceStrings) {
      String[] tokenStrings = sentenceString.trim().split("\\s+");
      List<Annotation> tokenAnnotations = new ArrayList<Annotation>();
      for (String tokenString : tokenStrings) {
        // Splitting an empty line yields a single empty string. It is not a token and must not
        // consume a tag/stem, otherwise all following tags/stems would be shifted by one.
        if (tokenString.length() == 0) {
          continue;
        }

        // move the offset up to the beginning of the token
        int start = aText.indexOf(tokenString, offset);
        if (start < 0) {
          throw new IllegalArgumentException(
                  String.format("unable to find string %s", tokenString));
        }

        // add the Token
        offset = start + tokenString.length();
        Annotation token = AnnotationFactory.createAnnotation(aJCas, start, offset, tokenClass);
        tokenAnnotations.add(token);

        // set the stem and part of speech if present
        if (posTags != null) {
          if (tokenIndex >= posTags.length) {
            throw new IllegalArgumentException(String.format(
                    "only %d part-of-speech tags were given but there are more tokens; "
                            + "no tag for token '%s'",
                    posTags.length, tokenString));
          }
          token.setStringValue(posFeature, posTags[tokenIndex]);
        }
        if (stems != null) {
          if (tokenIndex >= stems.length) {
            throw new IllegalArgumentException(String.format(
                    "only %d stems were given but there are more tokens; no stem for token '%s'",
                    stems.length, tokenString));
          }
          token.setStringValue(stemFeature, stems[tokenIndex]);
        }
        tokenIndex++;
      }

      // The sentence spans from its first to its last token; lines without tokens are skipped.
      if (!tokenAnnotations.isEmpty()) {
        int begin = tokenAnnotations.get(0).getBegin();
        int end = tokenAnnotations.get(tokenAnnotations.size() - 1).getEnd();
        AnnotationFactory.createAnnotation(aJCas, begin, end, sentenceClass);
      }
    }
  }