public void process()

in TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java [115:198]


  /**
   * Parses the content of the original view with Tika and stores the outcome in a new view.
   *
   * <p>The view named by {@code originalViewName} supplies the raw data; when that view cannot
   * be obtained the current view of {@code cas} is used instead. The data stream is run through
   * an {@link AutoDetectParser}, a view named by {@code textViewName} is created and handed to
   * the {@code MarkupHandler} for population, and each metadata entry reported by Tika is stored
   * as a {@code FeatureValue} on a {@code SourceDocumentAnnotation} in that view.
   *
   * <p>The method returns early, after logging a warning, when the source view has no data,
   * when Tika fails to parse the content, or when the JCas of the new view cannot be obtained.
   *
   * @param cas the CAS to process
   * @throws AnalysisEngineProcessException declared by the signature; the code in this method
   *         does not throw it explicitly
   */
  public void process(CAS cas) throws AnalysisEngineProcessException {
    CAS originalCas = null;
    try {
      originalCas = cas.getView(originalViewName);
    }
    catch (Exception e) {
      // can't find originalViewName: fall back to the current view
      String viewName = cas.getViewName();
      this.getContext().getLogger().log(Level.WARNING,
              "can't find view " + originalViewName + " using " + viewName + " instead");
      originalCas = cas.getCurrentView();
    }

    // A view without a sofa (or without sofa data) has nothing to parse. Checking here avoids
    // a NullPointerException both below and in the finally block that closes the stream.
    InputStream originalStream = (originalCas.getSofa() == null)
            ? null
            : originalCas.getSofa().getSofaDataStream();
    if (originalStream == null) {
      this.getContext().getLogger().log(Level.WARNING,
              "No sofa data available in view " + originalCas.getViewName() + ", nothing to convert");
      return;
    }

    // parsing with TIKA

    // TODO if content type is known then we use it
    // otherwise we guess

    Parser parser = new AutoDetectParser(config);

    Metadata md = new Metadata();
    MarkupHandler handler = new MarkupHandler();

    try {
      parser.parse(originalStream, handler, md);
    }
    catch (Exception e) {
      // if we have a problem just log the message (with its cause) and give up on this document
      this.getContext().getLogger().log(Level.WARNING,
              "Problem converting file : " + e.getMessage(), e);
      // PROBLEM => trying to serialize binary content in XML crash!
      return;
    }
    finally {
      try {
        originalStream.close();
      } catch (IOException ignored) {
        // closing is best effort: the content has already been consumed (or rejected) by Tika
        this.getContext().getLogger().log(Level.FINE,
                "Could not close the sofa data stream : " + ignored.getMessage());
      }
    }

    CAS plainTextView = cas.createView(textViewName);

    handler.populateCAS(plainTextView);

    // get additional metadata about the document
    // e.g content type etc...
    // TODO add possibility to define type as parameter and discover
    // feature names on the fly
    JCas ptv = null;
    try {
      ptv = plainTextView.getJCas();
    } catch (CASException e) {
      this.getContext().getLogger().log(Level.WARNING,
              "Could not obtain the JCas for view " + textViewName, e);
      return;
    }
    /* identify language */
    extractLanguage(ptv);

    Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.tika.SourceDocumentAnnotation");
    Iterator<?> iter = ptv.getAnnotationIndex(docAnnotationType).iterator();

    // do we already have one? otherwise let's create a new annotation
    boolean alreadyIndexed = iter.hasNext();
    SourceDocumentAnnotation docAnnotation;
    if (alreadyIndexed) {
      docAnnotation = (SourceDocumentAnnotation) iter.next();
    } else {
      docAnnotation = new SourceDocumentAnnotation(ptv);
    }

    // names() is read once; the original asked for it again on every loop iteration
    String[] names = md.names();

    // now iterate on the metadata found by Tika and add them to the info.
    // A missing array, or one too small to hold every entry, is replaced so that the indexed
    // writes below cannot go out of bounds; a large enough existing array is reused as before.
    FSArray features = docAnnotation.getFeatures();
    if (features == null || features.size() < names.length) {
      docAnnotation.setFeatures(new FSArray(ptv, names.length));
    }

    for (int i = 0; i < names.length; i++) {
      String name = names[i];
      FeatureValue fv = new FeatureValue(ptv);
      fv.setName(name);
      fv.setValue(md.get(name));
      docAnnotation.setFeatures(i, fv);
    }

    // an annotation obtained from the index is already indexed; only a new one must be added
    if (!alreadyIndexed) {
      docAnnotation.addToIndexes();
    }
  }