in uimafit-core/src/main/java/org/apache/uima/fit/testing/factory/TokenBuilder.java [207:278]
public void buildTokens(JCas aJCas, String aText, String aTokensString, String aPosTagsString,
    String aStemsString) {
  // Overview: sets aText as the document text of aJCas, then creates one tokenClass annotation
  // per whitespace-separated entry of aTokensString and one sentenceClass annotation per line
  // of aTokensString. Each token is located in aText by searching forward from the end of the
  // previous token, so the tokens must occur in aText in the order given.
  //
  // aPosTagsString / aStemsString are optional (may be null). When given, they are split on
  // whitespace and the i-th entry is assigned to the i-th token overall (the index is NOT reset
  // per sentence). Fewer entries than tokens fails on the array access below; surplus entries
  // are ignored.
  //
  // Throws IllegalArgumentException if tags/stems are supplied without the corresponding
  // feature name being configured, or if a token cannot be found in the remaining text.
  aJCas.setDocumentText(aText);

  if (aPosTagsString != null && posFeatureName == null) {
    throw new IllegalArgumentException("posTagsString must be null if TokenBuilder is "
        + "not initialized with a feature name corresponding to the part-of-speech "
        + "feature of the token type (assuming your token type has such a feature).");
  }
  if (aStemsString != null && stemFeatureName == null) {
    // Message previously said "part-of-speech feature" (copy-paste from the check above).
    throw new IllegalArgumentException("stemsString must be null if TokenBuilder is not "
        + "initialized with a feature name corresponding to the stem feature "
        + "of the token type (assuming your token type has such a feature).");
  }

  // Resolve the optional features by base name on the token type of this CAS's type system.
  Feature posFeature = null;
  if (posFeatureName != null) {
    posFeature = aJCas.getTypeSystem().getType(tokenClass.getName())
        .getFeatureByBaseName(posFeatureName);
  }
  Feature stemFeature = null;
  if (stemFeatureName != null) {
    stemFeature = aJCas.getTypeSystem().getType(tokenClass.getName())
        .getFeatureByBaseName(stemFeatureName);
  }

  // Collapse whitespace around newlines so that each remaining line is exactly one sentence.
  String tokensString = aTokensString.replaceAll("\\s*\n\\s*", "\n");
  String[] sentenceStrings = tokensString.split("\n");
  String[] posTags = aPosTagsString != null ? aPosTagsString.split("\\s+") : null;
  String[] stems = aStemsString != null ? aStemsString.split("\\s+") : null;

  int offset = 0; // position in aText just past the most recently created token
  int tokenIndex = 0; // running token count across all sentences; indexes posTags and stems
  for (String sentenceString : sentenceStrings) {
    // NOTE: an empty sentence line splits into a single empty string, which produces a
    // zero-length token at the current offset (behavior kept as-is).
    String[] tokenStrings = sentenceString.trim().split("\\s+");
    List<Annotation> tokenAnnotations = new ArrayList<Annotation>();
    for (String tokenString : tokenStrings) {
      // Find the next occurrence of the token at or after the current offset.
      int start = aText.indexOf(tokenString, offset);
      if (start < 0) {
        throw new IllegalArgumentException(
            String.format("unable to find string %s", tokenString));
      }
      offset = start + tokenString.length();

      // add the Token
      Annotation token = AnnotationFactory.createAnnotation(aJCas, start, offset, tokenClass);
      tokenAnnotations.add(token);

      // set the stem and part of speech if present
      if (posTags != null) {
        token.setStringValue(posFeature, posTags[tokenIndex]);
      }
      if (stems != null) {
        token.setStringValue(stemFeature, stems[tokenIndex]);
      }
      tokenIndex++;
    }

    // The sentence spans from the start of its first token to the end of its last token.
    if (!tokenAnnotations.isEmpty()) {
      int begin = tokenAnnotations.get(0).getBegin();
      int end = tokenAnnotations.get(tokenAnnotations.size() - 1).getEnd();
      AnnotationFactory.createAnnotation(aJCas, begin, end, sentenceClass);
    }
  }
}