public void process()

in wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java [74:169]


    /**
     * Converts one published article page into a CAS and serializes it as an
     * XMI file in {@code outputFolder}.
     *
     * <p>A page is converted only when its integer namespace is 0,
     * {@code page.isMain()} is true and its text contains the tag
     * <code>{publish}</code> (matched case-insensitively). All other pages
     * are ignored.
     *
     * <p>For a converted page the method:
     * <ol>
     *   <li>truncates the wiki text at the earliest occurrence of any entry of
     *       {@code endOfArticleMarkers};</li>
     *   <li>renders the remaining markup to plain text;</li>
     *   <li>builds the document text as title, two newlines, rendered body;</li>
     *   <li>adds a Headline annotation over the title and Paragraph,
     *       SubHeadline and WikiLink annotations (shifted by the body offset)
     *       reported by the markup parser;</li>
     *   <li>stores {@code page.toString()} in an additional view named
     *       "WikiMarkup";</li>
     *   <li>writes the CAS to
     *       {@code <outputFolder>/<titleToUri(title)>.xmi}.</li>
     * </ol>
     *
     * <p>An {@link IOException} during serialization is reported on standard
     * error and does not abort processing of further pages.
     *
     * @param page the article to convert
     * @param siteinfo site information supplied by the caller; not used here
     * @throws SAXException declared by the signature; this method's own code
     *         does not throw it explicitly
     */
    public void process(WikiArticle page, Siteinfo siteinfo)
        throws SAXException {

      // Guard: only namespace-0 pages flagged as main are candidates.
      if (page.getIntegerNamespace() != 0 || !page.isMain()) {
        return;
      }

      String pageText = page.getText();

      // Locale.ROOT keeps the match independent of the JVM default locale
      // (e.g. Turkish lower-cases 'I' to a dotless 'ı', which would hide an
      // upper-case "{PUBLISH}" tag).
      if (pageText == null
          || !pageText.toLowerCase(java.util.Locale.ROOT).contains("{publish}")) {
        return;
      }

      // Cut the text at the earliest end-of-article marker, if any is present.
      int cutIndex = pageText.length();
      for (String endMarker : endOfArticleMarkers) {
        int endMarkerIndex = pageText.indexOf(endMarker);
        if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
          cutIndex = endMarkerIndex;
        }
      }
      if (cutIndex < pageText.length()) {
        pageText = pageText.substring(0, cutIndex);
      }

      // Render the wiki markup to plain text; the parser collects the
      // paragraph, header and link annotations while rendering.
      WikinewsWikiModel wikiModel = new WikinewsWikiModel(
          "https://en.wikinews.org/wiki/${image}",
          "https://en.wikinews.org/wiki/${title}");
      AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
      String plainStr = wikiModel.render(converter, pageText);

      CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);

      // TODO: find a way to nicely add title ..
      // Document layout: <title> \n \n <rendered body>
      StringBuilder articleText = new StringBuilder();
      articleText.append(page.getTitle());
      int endOffsetTitle = articleText.length();
      articleText.append("\n");
      articleText.append("\n");

      // Parser offsets are relative to the rendered body, so every body
      // annotation is shifted by this amount.
      int bodyOffset = articleText.length();
      articleText.append(plainStr);

      articleCAS.setDocumentLanguage("en");
      articleCAS.setDocumentText(articleText.toString());

      // Resolve each annotation type once instead of once per annotation.
      Type headlineType = articleCAS.getTypeSystem()
          .getType("org.apache.opennlp.annotations.Headline");
      Type paragraphType = articleCAS.getTypeSystem()
          .getType("org.apache.opennlp.annotations.Paragraph");
      Type subHeadlineType = articleCAS.getTypeSystem()
          .getType("org.apache.opennlp.annotations.SubHeadline");
      Type wikiLinkType = articleCAS.getTypeSystem()
          .getType("org.apache.opennlp.annotations.WikiLink");
      Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");

      AnnotationFS headlineAnnotation =
          articleCAS.createAnnotation(headlineType, 0, endOffsetTitle);
      articleCAS.addFsToIndexes(headlineAnnotation);

      for (Annotation paraAnn : converter.getParagraphAnnotations()) {
        AnnotationFS paraAnnFS = articleCAS.createAnnotation(paragraphType,
            bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);
        articleCAS.addFsToIndexes(paraAnnFS);
      }

      for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
        AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(subHeadlineType,
            bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
        articleCAS.addFsToIndexes(subHeadAnnFS);
      }

      for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
        AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
            bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
        wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);
        articleCAS.addFsToIndexes(wikiLinkAnnFS);
      }

      // Keep page.toString() alongside the plain text in a separate view.
      CAS markupCas = articleCAS.createView("WikiMarkup");
      markupCas.setDocumentText(page.toString());

      // now serialize CAS
      String xmiPath = outputFolder.getAbsolutePath()
          + File.separator + titleToUri(page.getTitle()) + ".xmi";

      try (OutputStream casOut = new FileOutputStream(xmiPath)) {
        UimaUtil.serializeCASToXmi(articleCAS, casOut);
      }
      catch (IOException e) {
        // Best effort: report the failed article and carry on with the rest
        // of the import instead of aborting it.
        System.err.println("Failed to write " + xmiPath);
        e.printStackTrace();
      }
    }