in ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/cr/DependencyFileCollectionReader.java [127:288]
/**
 * Reads the remaining tab-separated lines from {@code input} and populates the
 * given CAS with one {@code BaseToken} per non-blank line and one
 * {@code Sentence} per run of non-blank lines.
 *
 * <p>Column 1 (zero-based) of each line is used as the token text. Token texts
 * are appended to the document text, each followed by a single space, and the
 * token offsets are computed against that reconstructed text. Which further
 * columns are read depends on {@code inputFormat}:
 * <ul>
 *   <li>contains "tok": text only; no dependency nodes are created</li>
 *   <li>contains "min": text only</li>
 *   <li>contains "mpos": part of speech from column 2</li>
 *   <li>contains "mlem": normalized form from column 2</li>
 *   <li>contains "dep": normalized form from column 2, part of speech from column 3</li>
 *   <li>otherwise (CONLL-x assumed): normalized form from column 2, part of speech from column 4</li>
 * </ul>
 *
 * <p>A blank line or the end of input terminates the current sentence. Runs of
 * blank lines (including a blank first line) are skipped and never produce an
 * empty sentence. If the first read already hits end of input, the method
 * returns without touching the CAS.
 *
 * @param cas the CAS to fill with document text, tokens, sentences and
 *            (for non-"tok" formats) dependency nodes
 * @throws IOException if reading from {@code input} fails
 * @throws CollectionException if the JCas view cannot be obtained from {@code cas}
 */
public void getNext(CAS cas) throws IOException, CollectionException {
    // Same pattern the reader has always used to recognise sentence separators.
    final String blankLine = "\\A\\s*\\Z";
    try {
        JCas jCas = cas.getJCas();
        int wordNumber = 0;
        int sentNumber = 0;
        int wordStart = 0;
        int wordEnd = 0;
        ArrayList<String> lines = new ArrayList<String>(50);
        StringBuffer documentText = new StringBuffer();

        // Resolve the column layout once; it does not change between sentences.
        final boolean tokensOnly = inputFormat.contains("tok");
        int lemmaColumn = -1; // -1: this format carries no normalized form
        int posColumn = -1;   // -1: this format carries no part of speech
        boolean warnAssumedConll = false;
        if (tokensOnly || inputFormat.contains("min")) {
            // token text only
        } else if (inputFormat.contains("mpos")) {
            posColumn = 2;
        } else if (inputFormat.contains("mlem")) {
            lemmaColumn = 2;
        } else if (inputFormat.contains("dep")) {
            lemmaColumn = 2;
            posColumn = 3;
        } else { // CONLL format assumed
            lemmaColumn = 2;
            posColumn = 4;
            warnAssumedConll = !inputFormat.contains("conll");
        }

        // First line
        line = input.readLine();
        if (line == null) {
            return;
        }
        if (!line.matches(blankLine)) {
            lines.add(line);
        }

        while (true) {
            line = input.readLine();
            // Test for end of input BEFORE dereferencing line.
            final boolean endOfInput = (line == null);

            if (!endOfInput && !line.matches(blankLine)) {
                lines.add(line);
                continue;
            }

            // A separator (blank line or end of input) was reached; flush the
            // pending sentence, if any. Skipping empty buffers keeps repeated
            // blank lines from producing a Sentence with begin > end.
            if (!lines.isEmpty()) {
                ArrayList<ConllDependencyNode> depNodes =
                        new ArrayList<ConllDependencyNode>(lines.size());
                final int sentStart = wordStart;

                if (!tokensOnly) {
                    // The root node (id 0) spans every token plus its trailing space.
                    int sentEnd = sentStart;
                    for (String aline : lines) {
                        sentEnd += aline.split("\t")[1].length() + 1;
                    }
                    ConllDependencyNode root = new ConllDependencyNode(jCas, sentStart, sentEnd);
                    depNodes.add(root);
                    root.setId(0);
                    root.addToIndexes(jCas);
                }

                if (warnAssumedConll) {
                    System.err.println("Warning: Assuming CONLL-x input format");
                }

                /** Create tokens */
                for (String aline : lines) {
                    String[] tokens = aline.split("\t");
                    wordEnd = wordStart + tokens[1].length();
                    if (!tokensOnly && trainingMode) {
                        depNodes.add(new ConllDependencyNode(jCas, wordStart, wordEnd));
                    }
                    BaseToken btoken = new BaseToken(jCas, wordStart, wordEnd);
                    btoken.setTokenNumber(wordNumber++);
                    if (lemmaColumn >= 0) {
                        btoken.setNormalizedForm(tokens[lemmaColumn]);
                    }
                    if (posColumn >= 0) {
                        btoken.setPartOfSpeech(tokens[posColumn]);
                    }
                    btoken.addToIndexes();
                    documentText.append(tokens[1]).append(' ');
                    wordStart = wordEnd + 1; // skip the separating space
                }

                // The sentence ends at the last token, excluding the trailing space.
                Sentence sentence = new Sentence(jCas, sentStart, wordEnd);
                sentence.setSentenceNumber(sentNumber);
                sentence.addToIndexes();

                if (!tokensOnly && trainingMode) {
                    setDependencyNodesFromTabbedText(jCas, lines, documentText, depNodes);
                }

                sentNumber++;
                lines = new ArrayList<String>(50);
            }

            if (endOfInput) {
                jCas.setDocumentText(documentText.toString());
                break;
            }
        }
    } catch (CASException ce) {
        throw new CollectionException(ce);
    }
    line = null;
}