in uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java [167:305]
/**
 * Locates the next document in the current block and publishes it through
 * {@code nextDoc} (decoded with {@code encoding}) and {@code nextDocOffset}
 * (index of the document's first byte in {@code buffer}).
 *
 * <p>Documents are separated by two consecutive line feeds (byte value 10).
 * On the first call for a new work item ({@code newWI} set) the block is read
 * from {@code fis} into {@code buffer} and {@code currentindex} is reset to 0.
 *
 * <p>Behaviour per {@code condition}:
 * <ul>
 *   <li>{@code SEP_IN_LASTBLOCK}: the separator was found at the end of the
 *       previous block; extract the document that starts at
 *       {@code currentindex}, which must terminate inside the buffer.</li>
 *   <li>{@code FIRSTDOC}: skip leading line feeds, switch {@code strategy} to
 *       {@code NORMAL}, then extract the document.</li>
 *   <li>{@code NORMAL}: advance to the next separator, skip it, then extract
 *       the document that follows. If the document is not terminated inside
 *       the block and this is not the last block, one more block is read
 *       behind the current one, {@code spilled} is set and
 *       {@code bytelength} grows by the number of bytes read.</li>
 * </ul>
 *
 * <p>NOTE(review): reading a second block at offset {@code bytelength} assumes
 * {@code buffer} has capacity for at least two blocks - confirm against the
 * allocation site.
 *
 * @param condition how to interpret the current position in the buffer
 * @return {@code true} if a document was stored in {@code nextDoc};
 *         {@code false} if the block holds no further document
 * @throws IOException if the block cannot be read completely, or no bytes can
 *         be read for the following block
 * @throws RuntimeException if, for {@code SEP_IN_LASTBLOCK}, no separator is
 *         found before the end of the buffered data
 */
private boolean findnextdoc(NextDoc condition) throws IOException {
  if (newWI) {
    // First call for this work item: load the whole block.
    newWI = false;
    int len = readFullyInto(buffer, 0, bytelength);
    if (len != bytelength) {
      throw new IOException("Read "+len+" bytes, expected "+bytelength);
    }
    currentindex = 0;
  }

  if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
    // Separator was found at the end of the last block.
    if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
      return false;
    }
    if (10 == buffer[currentindex]) {
      currentindex++; // point at first byte of the document
    }
    int spillStart = currentindex;
    if (!advanceToSeparator()) {
      throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
    }
    // currentindex is the first line feed of the separator, i.e. the exclusive
    // end of the document - exactly what copyOfRange expects as 'to'.
    nextDoc = new String(Arrays.copyOfRange(buffer, spillStart, currentindex), encoding);
    nextDocOffset = spillStart;
    return true;
  }

  if (condition.equals(NextDoc.FIRSTDOC)) {
    // Special handling at the beginning of the first block: skip any leading
    // line feeds to find the start of the first document. Executed once; all
    // later calls use the NORMAL strategy.
    strategy = NextDoc.NORMAL;
    // NOTE(review): when firstdoc is false this loop keeps scanning past
    // bytelength (stale bytes, or an ArrayIndexOutOfBoundsException at the end
    // of the array). Left unchanged because the intended behaviour for that
    // case is not visible here.
    while (10 == buffer[currentindex]) {
      currentindex++;
      if (currentindex == bytelength && firstdoc) {
        return false; // nothing but line feeds in this block
      }
    }
  }

  if (condition.equals(NextDoc.NORMAL)) {
    // currentindex points either at a separator or, for a new block, possibly
    // into the middle of a document that started in the previous block.
    if (!(10 == buffer[currentindex] && 10 == buffer[currentindex+1])) {
      // In the middle of a spilled document: move to the next separator.
      advanceToSeparator();
    }
    if (currentindex == bytelength-1) {
      // No separator left in this block, hence no further document starts here.
      fis.close();
      return false;
    }
    // Pointing at a separator: skip its line feeds to reach the next document.
    while (10 == buffer[currentindex]) {
      currentindex++;
      if (currentindex == bytelength) {
        if (lastblock) {
          fis.close();
          return false;
        }
        // The separator ends the block; the document lives in the next block.
        int len = readFullyInto(buffer, bytelength, bytelength);
        if (len <= 0) {
          throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
        }
        fis.close();
        spilled = true;
        bytelength += len;
        return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
      }
    }
  }

  int startloc = currentindex;
  boolean terminated = advanceToSeparator();
  if (!terminated && !lastblock) {
    // Document spills into the following block: append that block and keep
    // looking for the terminating separator from where the scan stopped.
    int len = readFullyInto(buffer, bytelength, bytelength);
    if (len <= 0) {
      throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
    }
    fis.close();
    spilled = true;
    bytelength += len;
    terminated = advanceToSeparator();
  }
  // Exclusive end of the document: the separator position when one was found,
  // otherwise the end of the buffered data (the document runs to the end).
  int endExclusive = terminated ? currentindex : bytelength;
  nextDoc = new String(Arrays.copyOfRange(buffer, startloc, endExclusive), encoding);
  nextDocOffset = startloc;
  return true;
}

/**
 * Advances {@code currentindex} until it points at the first of two
 * consecutive line feeds, or until it reaches {@code bytelength-1}.
 *
 * @return {@code true} if {@code currentindex} now points at a separator
 */
private boolean advanceToSeparator() {
  while (currentindex < bytelength-1) {
    if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
      return true;
    }
    currentindex++;
  }
  return false;
}

/**
 * Reads from {@code fis} until {@code length} bytes have been stored or the
 * stream ends; a single {@code read} call may legally deliver fewer bytes.
 *
 * @param dest destination array
 * @param offset index in {@code dest} of the first byte to store
 * @param length maximum number of bytes to read
 * @return number of bytes stored, or the result of the first {@code read}
 *         call if that call delivered nothing (-1 at end of stream)
 * @throws IOException if the underlying read fails
 */
private int readFullyInto(byte[] dest, int offset, int length) throws IOException {
  int total = 0;
  while (total < length) {
    int n = fis.read(dest, offset + total, length - total);
    if (n <= 0) {
      return (total == 0) ? n : total;
    }
    total += n;
  }
  return total;
}