in streampipes-extensions/streampipes-pipeline-elements-experimental-flink/src/main/java/com/kohlschutter/boilerpipe/sax/BoilerpipeHTMLContentHandler.java [287:365]
/**
 * Turns the text collected in {@code textBuffer}/{@code tokenBuffer} into a {@link TextBlock}
 * and hands it to {@code addTextBlock}.
 *
 * <p>While {@code inBody} is 0 no block is produced: the buffered tokens are stored as the title
 * if the last start tag was {@code TITLE}, and both buffers are discarded. Otherwise the token
 * buffer is tokenized and word, linked-word and wrapped-line statistics (using an 80 character
 * line) are computed for the new block.
 */
public void flushBlock() {
  if (inBody == 0) {
    // The enclosing guard already established inBody == 0, so only the tag needs checking.
    if ("TITLE".equalsIgnoreCase(lastStartTag)) {
      setTitle(tokenBuffer.toString().trim());
    }
    textBuffer.setLength(0);
    tokenBuffer.setLength(0);
    return;
  }

  final int bufferedChars = tokenBuffer.length();
  if (bufferedChars == 0) {
    return;
  }
  if (bufferedChars == 1 && sbLastWasWhitespace) {
    // A single buffered character after whitespace: drop it without emitting a block.
    textBuffer.setLength(0);
    tokenBuffer.setLength(0);
    return;
  }

  final int wrapColumn = 80;
  int tokenCount = 0;
  int wordCount = 0;
  int linkedWordCount = 0;
  int wrappedLineCount = 0;
  int wordsOnCurrentLine = 0;
  int lineWidth = -1; // don't count the first space

  for (final String current : UnicodeTokenizer.tokenize(tokenBuffer)) {
    // Anchor markers only toggle the link state; they are not counted as tokens.
    if (ANCHOR_TEXT_START.equals(current)) {
      inAnchorText = true;
      continue;
    }
    if (ANCHOR_TEXT_END.equals(current)) {
      inAnchorText = false;
      continue;
    }

    tokenCount++;
    if (!isWord(current)) {
      continue;
    }

    wordCount++;
    wordsOnCurrentLine++;
    if (inAnchorText) {
      linkedWordCount++;
    }

    final int width = current.length();
    lineWidth += width + 1;
    if (lineWidth > wrapColumn) {
      // The word does not fit: it becomes the first word of a new line.
      wrappedLineCount++;
      lineWidth = width;
      wordsOnCurrentLine = 1;
    }
  }

  if (tokenCount == 0) {
    // Nothing countable was found; note that the buffers are left untouched on this path.
    return;
  }

  final int wordsInWrappedLines;
  final int lineCount;
  if (wrappedLineCount == 0) {
    // Everything fits on one line, which is then treated as a single wrapped line.
    wordsInWrappedLines = wordCount;
    lineCount = 1;
  } else {
    // Words on the trailing, unfinished line are excluded.
    wordsInWrappedLines = wordCount - wordsOnCurrentLine;
    lineCount = wrappedLineCount;
  }

  final String blockText = textBuffer.toString().trim();
  final TextBlock block = new TextBlock(blockText, currentContainedTextElements, wordCount,
      linkedWordCount, wordsInWrappedLines, lineCount, offsetBlocks);

  // Reset the per-block state before the block is published.
  currentContainedTextElements = new BitSet();
  offsetBlocks++;
  textBuffer.setLength(0);
  tokenBuffer.setLength(0);

  block.setTagLevel(blockTagLevel);
  addTextBlock(block);
  blockTagLevel = -1;
}