in ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java [268:381]
/**
 * Parses the document text of the input view, keeps only the spans reported as visible by the
 * {@link HtmlConverterVisitor}, and writes the resulting text to the view named
 * {@code modifiedViewName}. While the new text is assembled, an offset map from the original
 * text to the new text is filled in {@code map} and then passed to {@code mapAnnotations}.
 *
 * <p>
 * Steps: select the input view ({@code inputViewName}, or the given CAS if it is
 * {@code null}); reuse or create the target view; parse and visit the text; optionally handle
 * line breaks ({@code replaceLinebreaks}) and apply the configured {@code conversionPolicy}
 * ({@code "heuristic"} or {@code "explicit"}); add the line break and gap spans derived from
 * tags; build the new text and the offset map; map the annotations.
 *
 * @param jcaz
 *          the CAS handed to this engine; the input view is looked up from it
 * @throws AnalysisEngineProcessException
 *           if the input view cannot be obtained or the parser reports an error
 */
public void process(JCas jcaz) throws AnalysisEngineProcessException {
  JCas jcas;
  try {
    if (inputViewName != null) {
      jcas = jcaz.getView(inputViewName);
    } else {
      jcas = jcaz;
    }
  } catch (CASException e1) {
    // wrap the exception itself: its cause may be null, which would lose message and trace
    throw new AnalysisEngineProcessException(e1);
  }
  // init:
  String documentText = jcas.getDocumentText();
  String splitSeq = documentText.contains("\r\n") ? "\r\n" : "\n";
  // one entry per source character plus one for the end-of-document position
  map = new int[documentText.length() + 1];
  JCas modview = null;
  try {
    // check if view already exists:
    Iterator<JCas> viewIterator = jcas.getViewIterator();
    while (viewIterator.hasNext()) {
      JCas jCas2 = viewIterator.next();
      if (jCas2.getViewName().equals(modifiedViewName)) {
        modview = jCas2;
        getContext().getLogger().log(Level.WARNING,
                "view with name \"" + modifiedViewName + "\" already exists.");
      }
    }
    if (modview == null) {
      modview = jcas.createView(modifiedViewName);
    }
  } catch (CASException e) {
    // best effort as before: without a target view nothing can be written, so give up quietly,
    // but report through the framework logger instead of stderr
    getContext().getLogger().log(Level.SEVERE,
            "could not access or create view \"" + modifiedViewName + "\"", e);
    return;
  }
  SortedSet<HtmlConverterPSpan> visibleSpansSoFar;
  final SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags;
  final SortedSet<HtmlConverterPSpan> gapsFromHtmlTags;
  // process
  try {
    Parser parser = new Parser(documentText);
    NodeList list = parser.parse(null);
    HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags,
            newlineInducingTagRegExp, gapInducingTags, gapText, skipWhitespaces, processAll);
    list.visitAllNodesWith(visitor);
    visibleSpansSoFar = visitor.getTextSpans();
    linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
    gapsFromHtmlTags = visitor.getGapsFromHtmlTags();
  } catch (ParserException e) {
    throw new AnalysisEngineProcessException(e);
  }
  if (replaceLinebreaks) {
    visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq);
  }
  if (conversionPolicy.equals("heuristic")) {
    visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar);
  } else if (conversionPolicy.equals("explicit")) {
    // patterns and replacements are paired by index
    for (int i = 0; i < conversionPatterns.length; i++) {
      String pat = conversionPatterns[i];
      String rep = conversionReplacements[i];
      visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, pat, rep);
    }
  }
  visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
  visibleSpansSoFar.addAll(gapsFromHtmlTags);
  // create new doc-text and the map from deletions and visible-text-spans:
  StringBuilder sbu = new StringBuilder(documentText.length());
  int originalOffsetI = 0; // next position in the original text that still needs a map entry
  int outOffset = 0; // current position in the text being built
  for (HtmlConverterPSpan vis : visibleSpansSoFar) {
    final int begin = vis.getBegin();
    final int end = vis.getEnd();
    // map text before annotation: skipped source characters all point at the current output
    // position
    while (originalOffsetI < begin) {
      map[originalOffsetI++] = outOffset;
    }
    // get and map text/replacement:
    String s = "";
    if (vis instanceof HtmlConverterPSpanReplacement) {
      // conversion/replacement:
      s = vis.getTxt();
      // map source and replacement characters one-to-one, but never beyond the end of the
      // replaced source range: a replacement longer than its source must not write map
      // entries that belong to following spans (or lie outside the array)
      final int oneToOneEnd = Math.min(end, begin + s.length());
      while (originalOffsetI < oneToOneEnd) {
        map[originalOffsetI++] = outOffset++;
      }
      if (begin + s.length() > end) {
        // surplus replacement characters have no source position of their own, but they
        // still take up room in the output
        outOffset += begin + s.length() - end;
      }
      // remaining source characters of a shorter replacement collapse onto one output position
      while (originalOffsetI < end) {
        map[originalOffsetI++] = outOffset;
      }
    } else {
      // simple annotation: text is copied unchanged, so offsets advance in lockstep
      s = documentText.substring(begin, end);
      while (originalOffsetI < end) {
        map[originalOffsetI++] = outOffset++;
      }
    }
    sbu.append(s);
  }
  // trailing source text that is not part of any span
  while (originalOffsetI < documentText.length()) {
    map[originalOffsetI++] = outOffset;
  }
  // NOTE(review): the end-of-document entry is one larger than the offset given to trailing
  // skipped characters; how mapAnnotations relies on this is not visible here - confirm before
  // changing it.
  map[documentText.length()] = outOffset + 1; // handle doc end separately
  String modTxt = sbu.toString();
  modview.setDocumentText(modTxt);
  // copy annotations using the 'map':
  try {
    mapAnnotations(jcas, map, modifiedViewName);
  } catch (CASException e) {
    // the converted text is already set; keep going as before, but report via the logger
    getContext().getLogger().log(Level.SEVERE,
            "could not map annotations to view \"" + modifiedViewName + "\"", e);
  }
}