public void process()

in ruta-core/src/main/java/org/apache/uima/ruta/engine/HtmlConverter.java [268:381]


  public void process(JCas jcaz) throws AnalysisEngineProcessException {
    JCas jcas;
    try {
      if (inputViewName != null) {
        jcas = jcaz.getView(inputViewName);
      } else {
        jcas = jcaz;
      }
    } catch (CASException e1) {
      throw new AnalysisEngineProcessException(e1.getCause());
    }
    // init:
    String documentText = jcas.getDocumentText();
    String splitSeq = documentText.contains("\r\n") ? "\r\n" : "\n";
    map = new int[documentText.length() + 1];
    JCas modview = null;
    try {
      // check if view already exists:
      Iterator<JCas> viewIterator = jcas.getViewIterator();
      while (viewIterator.hasNext()) {
        JCas jCas2 = viewIterator.next();
        if (jCas2.getViewName().equals(modifiedViewName)) {
          modview = jCas2;
          getContext().getLogger().log(Level.WARNING,
                  "view with name \"" + modifiedViewName + "\" already exists.");
        }
      }
      if (modview == null) {
        modview = jcas.createView(modifiedViewName);
      }
    } catch (CASException e) {
      e.printStackTrace();
      return;
    }
    SortedSet<HtmlConverterPSpan> visibleSpansSoFar = new TreeSet<HtmlConverterPSpan>();
    SortedSet<HtmlConverterPSpan> linebreaksFromHtmlTags = new TreeSet<HtmlConverterPSpan>();
    SortedSet<HtmlConverterPSpan> gapsFromHtmlTags = new TreeSet<HtmlConverterPSpan>();

    // process
    try {
      Parser parser = new Parser(documentText);
      NodeList list = parser.parse(null);
      HtmlConverterVisitor visitor = new HtmlConverterVisitor(newlineInducingTags,
              newlineInducingTagRegExp, gapInducingTags, gapText, skipWhitespaces, processAll);
      list.visitAllNodesWith(visitor);
      visibleSpansSoFar = visitor.getTextSpans();
      linebreaksFromHtmlTags = visitor.getLinebreaksFromHtmlTags();
      gapsFromHtmlTags = visitor.getGapsFromHtmlTags();
    } catch (ParserException e) {
      throw new AnalysisEngineProcessException(e);
    }
    if (replaceLinebreaks) {
      visibleSpansSoFar = this.handleLinebreaksInDocumentText(visibleSpansSoFar, splitSeq);
    }
    if (conversionPolicy.equals("heuristic")) {
      visibleSpansSoFar = this.htmlDecoding(visibleSpansSoFar);
    } else if (conversionPolicy.equals("explicit")) {
      for (int i = 0; i < conversionPatterns.length; i++) {
        String pat = conversionPatterns[i];
        String rep = conversionReplacements[i];
        visibleSpansSoFar = this.handleConversion(visibleSpansSoFar, pat, rep);
      }
    }
    visibleSpansSoFar.addAll(linebreaksFromHtmlTags);
    visibleSpansSoFar.addAll(gapsFromHtmlTags);

    // create new doc-text and the map from deletions and visible-text-spans:
    StringBuffer sbu = new StringBuffer(documentText.length());
    int originalOffsetI = 0;
    int outOffset = 0;
    for (HtmlConverterPSpan vis : visibleSpansSoFar) {
      final int begin = vis.getBegin();
      final int end = vis.getEnd();

      // map text before annotation:
      while (originalOffsetI < begin) {
        map[originalOffsetI++] = outOffset;
      }

      // get and map text/replacement:
      String s = "";
      if (vis instanceof HtmlConverterPSpanReplacement) {
        // conversion/replacement:
        s = vis.getTxt();
        // asserts that s is shorter than the original source
        while (originalOffsetI < begin + s.length()) {
          map[originalOffsetI++] = outOffset++;
        }
        while (originalOffsetI < end) {
          map[originalOffsetI++] = outOffset;
        }
      } else {
        // simple annotation:
        s = documentText.substring(begin, end);
        while (originalOffsetI < end) {
          map[originalOffsetI++] = outOffset++;
        }
      }
      sbu.append(s);
    }
    while (originalOffsetI < documentText.length()) {
      map[originalOffsetI++] = outOffset;
    }
    map[documentText.length()] = outOffset + 1; // handle doc end separately
    String modTxt = sbu.toString();
    modview.setDocumentText(modTxt);

    // copy annotations using the 'map':
    try {
      mapAnnotations(jcas, map, modifiedViewName);
    } catch (CASException e) {
      e.printStackTrace();
    }
  }