in core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java [81:112]
public String text(Element element) {
    // Builds the text for the given element, honouring the configured
    // inclusion patterns and excluded tags:
    //  - returns "" straight away when text extraction is switched off;
    //  - with no inclusion patterns and no excluded tags, delegates the
    //    whole element to _text;
    //  - otherwise uses the matches of the first inclusion pattern that
    //    selects at least one node, falling back to the element itself.
    // The accumulated text is trimmed before being returned.
    if (noText) {
        return "";
    }

    final StringBuilder buffer = new StringBuilder();

    // Fast path: nothing is configured, so there is nothing to select.
    final boolean unrestricted = inclusionPatterns.size() == 0 && excludedTags.size() == 0;
    if (unrestricted) {
        _text(element, buffer);
        return buffer.toString().trim();
    }

    // Patterns are tried in order; the first one yielding matches wins
    // and the remaining patterns are not evaluated.
    Elements selected = new Elements();
    for (String selector : inclusionPatterns) {
        final Elements candidates = element.select(selector);
        if (!candidates.isEmpty()) {
            selected = candidates;
            break;
        }
    }

    // No pattern defined, or none of them matched: use the element passed in.
    if (selected.isEmpty()) {
        selected.add(element);
    }

    // Each selected node contributes its text followed by a line break;
    // the final trim() removes the trailing one.
    for (Element match : selected) {
        _text(match, buffer);
        buffer.append("\n");
    }

    return buffer.toString().trim();
}