in TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java [115:198]
/**
 * Parses the content of the original view with Tika, stores the result in a
 * newly created text view and records the metadata reported by Tika on a
 * {@code SourceDocumentAnnotation} in that view.
 *
 * <p>The view named {@code originalViewName} is used as input; if it cannot be
 * retrieved, the current view of {@code cas} is used instead and a warning is
 * logged. If the input view has no readable data, or the parser fails, a
 * warning is logged and the method returns without creating the text view.
 *
 * @param cas the CAS holding the document to convert
 * @throws AnalysisEngineProcessException declared by the method signature;
 *         the failures handled in this method are logged rather than rethrown
 */
public void process(CAS cas) throws AnalysisEngineProcessException {
    CAS originalCas = null;
    try {
        originalCas = cas.getView(originalViewName);
    }
    catch (Exception e) {
        // can't find originalViewName: fall back to the current view
        String viewName = cas.getViewName();
        this.getContext().getLogger().log(Level.WARNING,
                "can't find view " + originalViewName + " using " + viewName + " instead");
        originalCas = cas.getCurrentView();
    }

    // A view without Sofa data yields no stream; bail out here instead of
    // hitting a NullPointerException when the stream is closed further down.
    InputStream originalStream = null;
    if (originalCas.getSofa() != null) {
        originalStream = originalCas.getSofa().getSofaDataStream();
    }
    if (originalStream == null) {
        this.getContext().getLogger().log(Level.WARNING,
                "No content to convert in view " + originalCas.getViewName());
        return;
    }

    // parsing with TIKA
    // TODO if content type is known then we use it
    // otherwise we guess
    Parser parser = new AutoDetectParser(config);
    Metadata md = new Metadata();
    MarkupHandler handler = new MarkupHandler();
    try {
        parser.parse(originalStream, handler, md);
    }
    catch (Exception e) {
        // if we have a problem just log it (with its cause) and continue
        // Original author's note: trying to serialize binary content in XML crashes.
        this.getContext().getLogger().log(Level.WARNING,
                "Problem converting file : " + e.getMessage(), e);
        return;
    }
    finally {
        try {
            originalStream.close();
        } catch (IOException ignored) {
            // best effort: the stream has been consumed (or parsing already
            // failed), so a failure to close it changes nothing for the caller
        }
    }

    CAS plainTextView = cas.createView(textViewName);
    handler.populateCAS(plainTextView);

    // get additional metadata about the document
    // e.g content type etc...
    // TODO add possibility to define type as parameter and discover
    // feature names on the fly
    JCas ptv = null;
    try {
        ptv = plainTextView.getJCas();
    } catch (CASException e) {
        this.getContext().getLogger().log(Level.WARNING,
                "Cannot get a JCas for view " + textViewName, e);
        return;
    }

    /* identify language */
    extractLanguage(ptv);

    Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.tika.SourceDocumentAnnotation");
    Iterator iter = ptv.getAnnotationIndex(docAnnotationType).iterator();

    // reuse the annotation if the view already has one, otherwise create it
    SourceDocumentAnnotation docAnnotation = null;
    boolean newlyCreated = false;
    if (iter.hasNext()) {
        docAnnotation = (SourceDocumentAnnotation) iter.next();
    } else {
        docAnnotation = new SourceDocumentAnnotation(ptv);
        newlyCreated = true;
    }

    // now copy the metadata found by Tika into the annotation's features;
    // the names are fetched once instead of on every loop iteration
    String[] names = md.names();
    FSArray features = docAnnotation.getFeatures();
    if (features == null || features.size() < names.length) {
        // missing or too small to hold every entry: allocate a fitting array
        // rather than writing past the end of the existing one
        features = (FSArray) cas.createArrayFS(names.length);
        docAnnotation.setFeatures(features);
    }
    for (int i = 0; i < names.length; i++) {
        FeatureValue fv = new FeatureValue(ptv);
        fv.setName(names[i]);
        fv.setValue(md.get(names[i]));
        docAnnotation.setFeatures(i, fv);
    }

    // an annotation obtained from the index is already indexed; only a newly
    // created one has to be added
    if (newlyCreated) {
        docAnnotation.addToIndexes();
    }
}