in wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java [74:169]
/**
 * Converts one published main-namespace article into a CAS and serializes it as an XMI file.
 *
 * <p>A page is skipped unless it is in namespace 0, is a main page, and its text contains
 * the tag {@code {publish}} (matched case-insensitively). For accepted pages the wiki markup
 * is truncated at the earliest occurrence of any entry of {@code endOfArticleMarkers},
 * rendered to plain text, and stored in the CAS as
 * {@code <title> + "\n\n" + <rendered body>}. The CAS receives a {@code Headline}
 * annotation over the title plus {@code Paragraph}, {@code SubHeadline} and
 * {@code WikiLink} annotations taken from the markup parser, shifted by the length of the
 * title prefix. A second view named {@code WikiMarkup} holds {@code page.toString()}.
 *
 * <p>The result is written to {@code <outputFolder>/<titleToUri(title)>.xmi}. An
 * {@link IOException} while writing is reported on standard error and otherwise ignored,
 * so that one failing article does not stop the processing of further pages.
 *
 * @param page the wiki page to convert
 * @param siteinfo site metadata; not used by this method
 * @throws SAXException declared by the signature; this method contains no explicit throw
 *     of it, so it can only originate from a callee
 */
public void process(WikiArticle page, Siteinfo siteinfo)
    throws SAXException {
  if (page.getIntegerNamespace() != 0 || !page.isMain()) {
    return;
  }

  String pageText = page.getText();
  // Defensive: a page without text cannot carry the publish tag.
  // Locale.ROOT keeps the case-insensitive match independent of the JVM default locale
  // (e.g. under a Turkish locale "PUBLISH" would lower-case to a dotless i and not match).
  if (pageText == null
      || !pageText.toLowerCase(java.util.Locale.ROOT).contains("{publish}")) {
    return;
  }

  // Cut the markup at the earliest end-of-article marker, if any marker is present.
  int cutIndex = pageText.length();
  for (String endMarker : endOfArticleMarkers) {
    int endMarkerIndex = pageText.indexOf(endMarker);
    if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) {
      cutIndex = endMarkerIndex;
    }
  }
  if (cutIndex < pageText.length()) {
    pageText = pageText.substring(0, cutIndex);
  }

  WikinewsWikiModel wikiModel = new WikinewsWikiModel(
      "https://en.wikinews.org/wiki/${image}",
      "https://en.wikinews.org/wiki/${title}");
  AnnotatingMarkupParser converter = new AnnotatingMarkupParser();
  String plainStr = wikiModel.render(converter, pageText);

  CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc);

  // TODO: find a way to nicely add title ..
  String title = page.getTitle();
  StringBuilder articleText = new StringBuilder();
  articleText.append(title);
  int endOffsetTitle = articleText.length();
  articleText.append("\n");
  articleText.append("\n");
  // The rendered body starts here; every offset reported by the parser is relative to
  // plainStr and therefore has to be shifted by bodyOffset.
  int bodyOffset = articleText.length();
  articleText.append(plainStr);

  articleCAS.setDocumentLanguage("en");
  articleCAS.setDocumentText(articleText.toString());

  // Resolve the annotation types once instead of once per annotation.
  Type headlineType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.Headline");
  Type paragraphType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.Paragraph");
  Type subHeadlineType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.SubHeadline");
  Type wikiLinkType = articleCAS.getTypeSystem()
      .getType("org.apache.opennlp.annotations.WikiLink");
  Feature linkFeature = wikiLinkType.getFeatureByBaseName("link");

  AnnotationFS headlineAnnotation =
      articleCAS.createAnnotation(headlineType, 0, endOffsetTitle);
  articleCAS.addFsToIndexes(headlineAnnotation);

  for (Annotation paraAnn : converter.getParagraphAnnotations()) {
    AnnotationFS paraAnnFS = articleCAS.createAnnotation(paragraphType,
        bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end);
    articleCAS.addFsToIndexes(paraAnnFS);
  }

  for (Annotation subHeadAnn : converter.getHeaderAnnotations()) {
    AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(subHeadlineType,
        bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end);
    articleCAS.addFsToIndexes(subHeadAnnFS);
  }

  for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) {
    AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(wikiLinkType,
        bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end);
    wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value);
    articleCAS.addFsToIndexes(wikiLinkAnnFS);
  }

  // Keep the page's string form in a separate view next to the rendered text.
  CAS markupCas = articleCAS.createView("WikiMarkup");
  markupCas.setDocumentText(page.toString());

  // now serialize CAS
  try (OutputStream casOut = new FileOutputStream(outputFolder.getAbsolutePath()
      + File.separator + titleToUri(title) + ".xmi")) {
    UimaUtil.serializeCASToXmi(articleCAS, casOut);
  } catch (IOException e) {
    // Best effort: report which article failed and continue with the next page.
    System.err.println("Failed to write XMI for article \"" + title + "\"");
    e.printStackTrace();
  }
}