public void getNext()

in ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/cr/DependencyFileCollectionReader.java [127:288]


	/**
	 * Reads the remaining lines of {@code input} and populates the given CAS
	 * with tokens, sentences and (optionally) dependency nodes.
	 *
	 * <p>Each non-blank line is split on tabs and describes one word; column
	 * index 1 is the word form. A blank line (or end of input) terminates the
	 * current sentence. The document text is built by appending every word
	 * form followed by a single space, and annotation offsets refer to that
	 * text.
	 *
	 * <p>{@code inputFormat} selects which extra columns are read:
	 * <ul>
	 *   <li>contains "tok": tokens only, no dependency nodes</li>
	 *   <li>contains "min": no extra columns</li>
	 *   <li>contains "mpos": column 2 is the part of speech</li>
	 *   <li>contains "mlem": column 2 is the normalized form</li>
	 *   <li>contains "dep": column 2 is the normalized form, column 3 the part of speech</li>
	 *   <li>anything else: column 2 is the normalized form, column 4 the part
	 *       of speech; a warning is printed unless the format contains "conll"</li>
	 * </ul>
	 * The checks are applied in the order listed; the first match wins.
	 *
	 * <p>When the format is not "tok" and {@code trainingMode} is set, one
	 * {@code ConllDependencyNode} is created per word and the sentence is
	 * passed to {@code setDependencyNodesFromTabbedText}.
	 *
	 * @param cas the CAS to fill; left untouched if the input is already exhausted
	 * @throws IOException if reading from {@code input} fails
	 * @throws CollectionException if the JCas view cannot be obtained from {@code cas}
	 */
	public void getNext(CAS cas) throws IOException, CollectionException {
	    try {
	        JCas jCas = cas.getJCas();
	        int wordNumber = 0;
	        int sentNumber = 0;
	        int wordStart  = 0;
	        int wordEnd    = 0;
	        int sentStart  = 0;
	        int sentEnd    = 0;
	        ArrayList<String> lines = new ArrayList<String>(50);
	        StringBuffer documentText = new StringBuffer();

	        // Matches an empty or whitespace-only line.
	        final String blankLine = "\\A\\s*\\Z";

	        // First line: nothing to do if the input is already exhausted.
	        line = input.readLine();
	        if (line == null) {
	            return;
	        }
	        lines.add(line);

	        while (true) {

	            // Read the line
	            line = input.readLine();
	            final boolean endOfInput = (line == null);

	            // Last line buffered for the current sentence ("" when none is pending).
	            String testline = "";
	            if (lines.size() > 0) {
	                testline = lines.get(lines.size() - 1);
	            }

	            // Check if document is done: end of input with no pending sentence content.
	            if (endOfInput && testline.matches(blankLine)) {
	                jCas.setDocumentText(documentText.toString());
	                break;
	            }

	            // Process when sentence is done: either a blank separator line was
	            // read, or the input ended while a sentence was still pending.
	            // The end-of-input test must come first so that line.matches(...)
	            // is never invoked on null.
	            else if (endOfInput || line.matches(blankLine)) {

	                lines.trimToSize();
	                ArrayList<ConllDependencyNode> depNodes = new ArrayList<ConllDependencyNode>(lines.size());

	                final boolean tokensOnly = inputFormat.contains("tok");

	                // Sentence span: every word form contributes its length plus
	                // the single space appended after it in the document text.
	                sentStart = wordStart;
	                sentEnd   = sentStart;
	                for (String aline : lines) {
	                    sentEnd += aline.split("\t")[1].length() + 1;
	                }

	                // Node with id 0 covering the whole sentence; added before
	                // the per-word nodes.
	                if (!tokensOnly) {
	                    ConllDependencyNode sentenceNode = new ConllDependencyNode(jCas, sentStart, sentEnd);
	                    depNodes.add(sentenceNode);
	                    sentenceNode.setId(0);
	                    sentenceNode.addToIndexes(jCas);
	                }

	                // Select which columns hold the normalized form and the part
	                // of speech for this input format (-1 means "not present").
	                int lemmaColumn = -1;
	                int posColumn   = -1;
	                if (tokensOnly || inputFormat.contains("min")) {
	                    // word form only
	                } else if (inputFormat.contains("mpos")) {
	                    posColumn = 2;
	                } else if (inputFormat.contains("mlem")) {
	                    lemmaColumn = 2;
	                } else if (inputFormat.contains("dep")) {
	                    lemmaColumn = 2;
	                    posColumn   = 3;
	                } else { // CONLL format assumed
	                    if (!inputFormat.contains("conll")) { System.err.println("Warning: Assuming CONLL-x input format"); }
	                    lemmaColumn = 2;
	                    posColumn   = 4;
	                }

	                /** Create tokens (and per-word dependency nodes when training) */
	                for (String aline : lines) {
	                    String[] tokens = aline.split("\t");
	                    wordEnd = wordStart + tokens[1].length();
	                    if (!tokensOnly && trainingMode) {
	                        depNodes.add(new ConllDependencyNode(jCas, wordStart, wordEnd));
	                    }
	                    BaseToken btoken = new BaseToken(jCas, wordStart, wordEnd);
	                    btoken.setTokenNumber(wordNumber++);
	                    if (lemmaColumn >= 0) {
	                        btoken.setNormalizedForm(tokens[lemmaColumn]);
	                    }
	                    if (posColumn >= 0) {
	                        btoken.setPartOfSpeech(tokens[posColumn]);
	                    }
	                    btoken.addToIndexes();
	                    documentText.append(tokens[1] + " ");
	                    wordStart = wordEnd + 1; // skip the separating space
	                }

	                // The sentence ends at the last word, excluding the trailing space.
	                Sentence sentence = new Sentence(jCas, sentStart, wordEnd);
	                sentence.setSentenceNumber(sentNumber);
	                sentence.addToIndexes();

	                if (!tokensOnly && trainingMode) {
	                    setDependencyNodesFromTabbedText(jCas, lines, documentText, depNodes);
	                }

	                if (endOfInput) {
	                    jCas.setDocumentText(documentText.toString());
	                    break;
	                }

	                // wordNumber / wordStart / wordEnd are deliberately NOT reset:
	                // all sentences share one document text, so offsets and token
	                // numbers keep running across sentence boundaries.
	                sentNumber++;
	                lines = new ArrayList<String>(50);
	            } else {
	                lines.add(line);
	            }
	        }

	    } catch (CASException ce) {
	        throw new CollectionException(ce);
	    }
	    line = null;

	}