private boolean findnextdoc()

in uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java [167:305]


  /**
   * Locates the next document in {@code buffer}, where documents are separated
   * by two consecutive newline bytes, and stores it in {@code nextDoc} /
   * {@code nextDocOffset}.
   *
   * <p>On the first call for a new work item ({@code newWI} set) the block of
   * {@code bytelength} bytes is read from {@code fis} into the start of
   * {@code buffer} and {@code currentindex} is reset to 0.
   *
   * <p>Behaviour by {@code condition}:
   * <ul>
   *   <li>{@code SEP_IN_LASTBLOCK} - the separator ended the previous block; the
   *       document is expected to start at (or one newline after)
   *       {@code currentindex} and must end inside the data already buffered.</li>
   *   <li>{@code FIRSTDOC} - skips leading newlines and switches
   *       {@code strategy} to {@code NORMAL}.</li>
   *   <li>{@code NORMAL} - advances to the next separator first (skipping the
   *       tail of a document that started in an earlier block), then skips the
   *       separator newlines.</li>
   * </ul>
   * If the document runs past the end of the block and this is not the last
   * block, one more read is appended to {@code buffer}, {@code spilled} is set
   * and {@code fis} is closed.
   *
   * @param condition which of the scanning modes described above to use
   * @return {@code true} if a document was stored in {@code nextDoc};
   *         {@code false} if no further document is available in this block
   * @throws IOException if the block (or the spill-over read) cannot be read
   * @throws RuntimeException in {@code SEP_IN_LASTBLOCK} mode when no
   *         terminating separator is found in the buffered data
   */
  private boolean findnextdoc(NextDoc condition) throws IOException {
    if (newWI) {
      newWI = false;
      // A single read() may return fewer bytes than requested without being at
      // end of stream (presumably fis follows the java.io.InputStream contract
      // - confirm), so keep reading until the block is complete or the stream
      // stops producing data.
      int len = 0;
      while (len < bytelength) {
        int n = fis.read(buffer, len, bytelength - len);
        if (n <= 0) {
          break;
        }
        len += n;
      }
      if (len != bytelength) {
        throw new IOException("Read "+len+" bytes, expected "+bytelength);
      }
      currentindex = 0;
    }

    if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
      // separator found at end of last block
      if (atDocSeparator()) {
        return false;
      }
      if ('\n' == buffer[currentindex]) {
        currentindex++; // point at first char in Doc
      }
      int startloc = currentindex;

      // find end of next doc; 0 doubles as the "no separator found" marker
      int endloc = advanceToDocSeparator() ? currentindex - 1 : 0;
      if (endloc == 0) {
        throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
      }
      captureDoc(startloc, endloc);
      return true;
    }

    if (condition.equals(NextDoc.FIRSTDOC)) {
      // special handling at beginning of first block
      // skip any leading EOL to find start of first doc
      // only execute this once
      strategy = NextDoc.NORMAL;
      while ('\n' == buffer[currentindex]) {
        currentindex++;
        if (currentindex == bytelength) {
          if (firstdoc) {
            return false; // nothing but newlines in this block
          }
        }
      }
    }

    if (condition.equals(NextDoc.NORMAL)) {
      // currentindex is either already on a separator or, for a new block,
      // possibly in the middle of a document spilled from the previous block.
      // Either way move to the next separator (no-op when already on one).
      advanceToDocSeparator();
      if (currentindex == bytelength-1) {
        // no separator in the rest of this block
        fis.close();
        return false;
      }
      // now pointing at start of a separator, skip it to find start of next Doc
      while ('\n' == buffer[currentindex]) {
        currentindex++;
        if (currentindex == bytelength) {
          if (lastblock) {
            fis.close();
            return false;
          }
          // the separator ran to the very end of the block: pull in the next
          // block and look for the document there
          readSpilloverBlock();
          return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
        }
      }
    }

    int startloc = currentindex;
    // find end of Doc; 0 doubles as the "no separator found" marker
    int endloc = advanceToDocSeparator() ? currentindex - 1 : 0;

    if (endloc == 0) {
      if (!lastblock) {
        // read next block and continue looking for end of Doc
        readSpilloverBlock();
      }
      advanceToDocSeparator();
      // whether or not a separator was found, the document ends just before
      // the position the scan stopped at
      endloc = currentindex - 1;
    }
    captureDoc(startloc, endloc);
    return true;
  }

  /** Returns true when the bytes at currentindex and currentindex+1 are both newlines. */
  private boolean atDocSeparator() {
    return '\n' == buffer[currentindex] && '\n' == buffer[currentindex+1];
  }

  /**
   * Advances currentindex to the next double-newline separator, stopping at
   * bytelength-1 if there is none; returns true only when a separator was found.
   */
  private boolean advanceToDocSeparator() {
    while (currentindex < (bytelength-1)) {
      if (atDocSeparator()) {
        return true;
      }
      currentindex++;
    }
    return false;
  }

  /**
   * Appends one more read from fis to buffer at offset bytelength, grows
   * bytelength by the number of bytes read, marks the work item as spilled and
   * closes fis.
   */
  private void readSpilloverBlock() throws IOException {
    int len = fis.read(buffer,bytelength,bytelength);
    if (len <= 0) {
      // parentheses matter: without them the digit 1 is appended to the index
      throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
    }
    fis.close();
    spilled = true;
    bytelength += len;
  }

  /** Decodes buffer[startloc, endloc) into nextDoc and records startloc as its offset. */
  private void captureDoc(int startloc, int endloc) throws IOException {
    // NOTE(review): callers pass endloc as the index of the last document byte,
    // but copyOfRange treats its upper bound as exclusive, so that byte is not
    // copied. Left unchanged because emitted text may be relied on - confirm
    // whether this is intended.
    byte [] docbytes = Arrays.copyOfRange(buffer, startloc, endloc);
    nextDoc = new String(docbytes, encoding);
    nextDocOffset = startloc;
  }