public void processDocuments()

in connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java [487:925]


  /**
   * Processes a batch of document identifiers, fetching each referenced email (or email attachment)
   * and handing it to the framework for indexing.
   *
   * <p>An identifier for which {@code extractAttachmentNumberFromDocumentIdentifier} returns
   * {@code null} is treated as a whole email; otherwise it is treated as the attachment with that
   * index inside the email. Folders opened while processing the batch are cached and closed again
   * before this method returns.
   *
   * @param documentIdentifiers the identifiers to process
   * @param statuses not read by this method
   * @param spec the job specification; {@code EmailConfig.NODE_METADATA} children select the metadata
   *        fields to attach and an {@code EmailConfig.NODE_EXTRACT_EMAIL} child switches on
   *        {@code extractEmailAddress} for TO/FROM values
   * @param activities the framework callback used for version checks, ingestion, deletion and
   *        activity recording
   * @param jobMode not read by this method
   * @param usesDefaultAuthority not read by this method
   * @throws ManifoldCFException on processing errors, including interruption
   *         (error code {@code ManifoldCFException.INTERRUPTED})
   * @throws ServiceInterruption propagated from the session, activity and error-handling helpers
   */
  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
    throws ManifoldCFException, ServiceInterruption {

    // Metadata names are lower-cased once here so the per-document code can compare them directly
    // against the EmailConfig constants.
    List<String> requiredMetadata = new ArrayList<String>();
    boolean useEmailExtractor = false;
    for (int i = 0; i < spec.getChildCount(); i++) {
      SpecificationNode sn = spec.getChild(i);
      if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
        String metadataAttribute = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
        // A node without a name attribute cannot match any known field; skip it rather than fail later.
        if (metadataAttribute != null) {
          requiredMetadata.add(metadataAttribute.toLowerCase(Locale.ROOT));
        }
      }
      if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
        useEmailExtractor = true;
      }
    }

    // Keep a cached set of open folders
    Map<String,Folder> openFolders = new HashMap<String,Folder>();
    try {
      for (String documentIdentifier : documentIdentifiers) {
        final Integer attachmentIndex = extractAttachmentNumberFromDocumentIdentifier(documentIdentifier);
        if (attachmentIndex == null) {
          // It's an email
          processEmailDocument(documentIdentifier, activities, openFolders, requiredMetadata, useEmailExtractor);
        } else {
          // It's a specific attachment
          processAttachmentDocument(documentIdentifier, attachmentIndex.intValue(), activities, openFolders,
            requiredMetadata, useEmailExtractor);
        }
      }
    } finally {
      for (Folder f : openFolders.values()) {
        try {
          CloseFolderThread cft = new CloseFolderThread(session, f);
          cft.start();
          cft.finishUp();
        } catch (InterruptedException e) {
          throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
        } catch (MessagingException e) {
          handleMessagingException(e, "closing folders");
        }
      }
    }

  }

  /**
   * Fetches one email, applies the indexability checks, ingests its body and queues its attachments.
   * An {@code EmailConfig.ACTIVITY_FETCH} activity is recorded whenever a result code was set.
   */
  private void processEmailDocument(String documentIdentifier, IProcessActivity activities,
    Map<String,Folder> openFolders, List<String> requiredMetadata, boolean useEmailExtractor)
    throws ManifoldCFException, ServiceInterruption {

    // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
    final String version = "_" + urlTemplate;

    // Check if we need to index
    if (!activities.checkDocumentNeedsReindexing(documentIdentifier, version)) {
      return;
    }

    final String folderName = extractFolderNameFromDocumentIdentifier(documentIdentifier);
    final String id = extractEmailIDFromDocumentIdentifier(documentIdentifier);

    String errorCode = null;
    String errorDesc = null;
    Long fileLengthLong = null;
    final long startTime = System.currentTimeMillis();
    try {
      try {
        final Message msg = findMessageForDocument(openFolders, folderName, id, documentIdentifier);
        final String msgURL = makeDocumentURI(urlTemplate, folderName, id);

        if (msg == null) {
          // email was not found
          activities.deleteDocument(documentIdentifier);
          return;
        }

        if (!activities.checkURLIndexable(msgURL)) {
          errorCode = activities.EXCLUDED_URL;
          errorDesc = "Excluded because of URL ('"+msgURL+"')";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final long fileLength = msg.getSize();
        if (!activities.checkLengthIndexable(fileLength)) {
          errorCode = activities.EXCLUDED_LENGTH;
          errorDesc = "Excluded because of length ("+fileLength+")";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final Date sentDate = msg.getSentDate();
        if (!activities.checkDateIndexable(sentDate)) {
          errorCode = activities.EXCLUDED_DATE;
          errorDesc = "Excluded because of date ("+sentDate+")";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final String mimeType = "text/plain";
        if (!activities.checkMimeTypeIndexable(mimeType)) {
          errorCode = activities.EXCLUDED_MIMETYPE;
          errorDesc = "Excluded because of mime type ('"+mimeType+"')";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final RepositoryDocument rd = new RepositoryDocument();
        rd.setFileName(msg.getFileName());
        rd.setMimeType(mimeType);
        rd.setCreatedDate(sentDate);
        rd.setModifiedDate(sentDate);

        for (String metadata : requiredMetadata) {
          if (!addCommonMessageMetadata(rd, msg, metadata, sentDate, useEmailExtractor, EmailConfig.EMAIL_SUBJECT)) {
            addAttachmentDescriptorMetadata(rd, msg, metadata);
          }
        }

        //Content includes both body and attachments,
        //Body will be set as content and attachments will be indexed as separate documents.
        // NOTE(review): when no body content is extracted, nothing is ingested or recorded for the
        // email itself — confirm this is intended.
        final EmailContent bodyContent = extractBodyContent(msg);
        if (bodyContent != null) {
          rd.setMimeType(bodyContent.getMimeType());
          final byte[] bodyBytes = bodyContent.getContent().getBytes(StandardCharsets.UTF_8);
          final InputStream is = new ByteArrayInputStream(bodyBytes);
          try {
            // The declared length must describe the stream actually handed over (the encoded body),
            // not the size of the whole message as reported by the server.
            rd.setBinary(is, bodyBytes.length);
            activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
            errorCode = "OK";
            fileLengthLong = Long.valueOf(fileLength);
          } finally {
            is.close();
          }
        }

        // If we're supposed to deal with attachments, this is the time to queue them up
        if (attachmentUrlTemplate != null) {
          final Object content = msg.getContent();
          if (content instanceof Multipart) {
            final Multipart mp = (Multipart) content;
            final int numAttachments = mp.getCount();
            for (int i = 0; i < numAttachments; i++) {
              if (isAttachment(mp.getBodyPart(i))) {
                activities.addDocumentReference(documentIdentifier + ":" + i);
              }
            }
          }
        }

      } catch (InterruptedException e) {
        throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
      } catch (MessagingException e) {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
        handleMessagingException(e, "processing email");
      } catch (IOException e) {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
        handleIOException(e, "processing email");
        throw new ManifoldCFException(e.getMessage(), e);
      }
    } catch (ManifoldCFException e) {
      // An interruption is not a fetch result, so suppress the activity record for it.
      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
        errorCode = null;
      }
      throw e;
    } finally {
      if (errorCode != null) {
        activities.recordActivity(Long.valueOf(startTime), EmailConfig.ACTIVITY_FETCH,
          fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
      }
    }
  }

  /**
   * Fetches the email that owns an attachment, applies the indexability checks to the attachment part
   * and ingests that part. An {@code EmailConfig.ACTIVITY_FETCH} activity is recorded whenever a
   * result code was set.
   */
  private void processAttachmentDocument(String documentIdentifier, int attachmentNumber,
    IProcessActivity activities, Map<String,Folder> openFolders, List<String> requiredMetadata,
    boolean useEmailExtractor)
    throws ManifoldCFException, ServiceInterruption {

    // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
    final String version = "_" + attachmentUrlTemplate;

    // Check if we need to index
    if (!activities.checkDocumentNeedsReindexing(documentIdentifier, version)) {
      return;
    }

    final String folderName = extractFolderNameFromDocumentIdentifier(documentIdentifier);
    final String id = extractEmailIDFromDocumentIdentifier(documentIdentifier);

    String errorCode = null;
    String errorDesc = null;
    Long fileLengthLong = null;
    final long startTime = System.currentTimeMillis();
    try {
      try {
        final Message msg = findMessageForDocument(openFolders, folderName, id, documentIdentifier);
        final String msgURL = makeDocumentURI(attachmentUrlTemplate, folderName, id, attachmentNumber);

        if (msg == null) {
          // email was not found
          activities.deleteDocument(documentIdentifier);
          return;
        }

        if (!activities.checkURLIndexable(msgURL)) {
          errorCode = activities.EXCLUDED_URL;
          errorDesc = "Excluded because of URL ('"+msgURL+"')";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final Date sentDate = msg.getSentDate();
        if (!activities.checkDateIndexable(sentDate)) {
          errorCode = activities.EXCLUDED_DATE;
          errorDesc = "Excluded because of date ("+sentDate+")";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final Object content = msg.getContent();
        if (!(content instanceof Multipart)) {
          // Without multipart content there is no part at any index, so the attachment is gone;
          // handle it the same way as an out-of-range index below.
          activities.deleteDocument(documentIdentifier);
          return;
        }
        final Multipart mp = (Multipart) content;
        if (mp.getCount() <= attachmentNumber) {
          activities.deleteDocument(documentIdentifier);
          return;
        }
        final Part part = mp.getBodyPart(attachmentNumber);

        final long fileLength = part.getSize();
        if (!activities.checkLengthIndexable(fileLength)) {
          errorCode = activities.EXCLUDED_LENGTH;
          errorDesc = "Excluded because of length ("+fileLength+")";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final String origMimeType = part.getContentType();
        final String mimeType;
        //MSExchange puts crap after the mime type so it has to be munged.
        // Example: "application/msword; name=SampleDOCFile_100kb.doc"
        final int parameterStart = (origMimeType == null) ? -1 : origMimeType.indexOf(";");
        if (parameterStart == -1) {
          mimeType = origMimeType;
        } else {
          mimeType = origMimeType.substring(0, parameterStart);
        }
        if (!activities.checkMimeTypeIndexable(mimeType)) {
          errorCode = activities.EXCLUDED_MIMETYPE;
          errorDesc = "Excluded because of mime type ('"+mimeType+"')";
          activities.noDocument(documentIdentifier, version);
          return;
        }

        final RepositoryDocument rd = new RepositoryDocument();
        rd.setFileName(part.getFileName());
        rd.setMimeType(mimeType);
        rd.setCreatedDate(sentDate);
        rd.setModifiedDate(sentDate);

        for (String metadata : requiredMetadata) {
          //Attachments may have a field named "subject". So, different field name is used not to clash.
          addCommonMessageMetadata(rd, msg, metadata, sentDate, useEmailExtractor, EmailConfig.MAILSUBJECT_FIELD);
        }

        final InputStream is = part.getInputStream();
        try {
          rd.setBinary(is, fileLength);
          activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
          errorCode = "OK";
          fileLengthLong = Long.valueOf(fileLength);
        } finally {
          is.close();
        }

      } catch (InterruptedException e) {
        throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
      } catch (MessagingException e) {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
        handleMessagingException(e, "processing email attachment");
      } catch (IOException e) {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
        handleIOException(e, "processing email attachment");
        throw new ManifoldCFException(e.getMessage(), e);
      }
    } catch (ManifoldCFException e) {
      // An interruption is not a fetch result, so suppress the activity record for it.
      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
        errorCode = null;
      }
      throw e;
    } finally {
      if (errorCode != null) {
        activities.recordActivity(Long.valueOf(startTime), EmailConfig.ACTIVITY_FETCH,
          fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
      }
    }
  }

  /**
   * Opens the folder (reusing and updating the cache) and searches it for the given message ID.
   * Returns the last match of the search, or {@code null} when the search found nothing.
   */
  private Message findMessageForDocument(Map<String,Folder> openFolders, String folderName, String id,
    String documentIdentifier)
    throws ManifoldCFException, ServiceInterruption, InterruptedException, MessagingException, IOException {

    Folder folder = openFolders.get(folderName);
    if (folder == null) {
      getSession();
      OpenFolderThread oft = new OpenFolderThread(session, folderName);
      oft.start();
      folder = oft.finishUp();
      openFolders.put(folderName, folder);
    }

    if (Logging.connectors.isDebugEnabled()) {
      Logging.connectors.debug("Email: Processing document identifier '"
        + documentIdentifier + "'");
    }
    final SearchTerm messageIDTerm = new MessageIDTerm(id);

    getSession();
    SearchMessagesThread smt = new SearchMessagesThread(session, folder, messageIDTerm);
    smt.start();
    final Message[] matches = smt.finishUp();

    // When several messages share the ID, the last one returned wins.
    return matches.length == 0 ? null : matches[matches.length - 1];
  }

  /**
   * Adds the TO, FROM, SUBJECT or DATE field named by {@code metadata} (already lower-cased) to the
   * document. Absent headers (null recipients, null senders, null sent date) add no field.
   *
   * @param subjectField the field name under which the message subject is stored
   * @return {@code true} if {@code metadata} named one of these four fields, {@code false} otherwise
   */
  private boolean addCommonMessageMetadata(RepositoryDocument rd, Message msg, String metadata, Date sentDate,
    boolean useEmailExtractor, String subjectField)
    throws ManifoldCFException, ServiceInterruption, InterruptedException, MessagingException, IOException {

    if (metadata.equals(EmailConfig.EMAIL_TO)) {
      final Address[] to = msg.getRecipients(Message.RecipientType.TO);
      if (to != null) {
        rd.addField(EmailConfig.EMAIL_TO, formatAddresses(to, useEmailExtractor));
      }
      return true;
    }
    if (metadata.equals(EmailConfig.EMAIL_FROM)) {
      final Address[] from = msg.getFrom();
      // The From header may be missing; treat it like missing recipients instead of failing.
      if (from != null) {
        rd.addField(EmailConfig.EMAIL_FROM, formatAddresses(from, useEmailExtractor));
      }
      return true;
    }
    if (metadata.equals(EmailConfig.EMAIL_SUBJECT)) {
      final String subject = msg.getSubject();
      rd.addField(subjectField, subject);
      return true;
    }
    if (metadata.equals(EmailConfig.EMAIL_DATE)) {
      // The sent date may be unavailable; there is nothing meaningful to store in that case.
      if (sentDate != null) {
        rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
      }
      return true;
    }
    return false;
  }

  /**
   * Adds the per-attachment encoding, mime type or file name field named by {@code metadata}
   * (already lower-cased). For multipart content the value array has one slot per body part, filled
   * only for parts that {@code isAttachment} accepts; for plain string content a single empty value
   * is stored. Any other metadata name is ignored.
   */
  private void addAttachmentDescriptorMetadata(RepositoryDocument rd, Message msg, String metadata)
    throws ManifoldCFException, ServiceInterruption, InterruptedException, MessagingException, IOException {

    final boolean wantEncoding = metadata.equals(EmailConfig.EMAIL_ATTACHMENT_ENCODING);
    final boolean wantMimeType = !wantEncoding && metadata.equals(EmailConfig.EMAIL_ATTACHMENT_MIMETYPE);
    final boolean wantName = !wantEncoding && !wantMimeType && metadata.equals(EmailConfig.EMAIL_ATTACHMENTNAME);

    final String fieldName;
    if (wantEncoding) {
      fieldName = EmailConfig.ENCODING_FIELD;
    } else if (wantMimeType) {
      fieldName = EmailConfig.MIMETYPE_FIELD;
    } else if (wantName) {
      fieldName = EmailConfig.ATTACHMENTNAME_FIELD;
    } else {
      return;
    }

    final Object content = msg.getContent();
    if (content instanceof Multipart) {
      final Multipart mp = (Multipart) content;
      final int partCount = mp.getCount();
      final String[] values = new String[partCount];
      for (int k = 0; k < partCount; k++) {
        final Part part = mp.getBodyPart(k);
        if (!isAttachment(part)) {
          continue;
        }
        if (wantEncoding) {
          // The encoding is taken as the text between the first and second '?' of the file name;
          // a part without a file name has no encoding to report.
          final String partFileName = part.getFileName();
          final String[] fileSplit = (partFileName == null) ? new String[0] : partFileName.split("\\?");
          values[k] = (fileSplit.length > 1) ? fileSplit[1] : "";
        } else if (wantMimeType) {
          values[k] = part.getContentType();
        } else {
          values[k] = part.getFileName();
        }
      }
      rd.addField(fieldName, values);
    } else if (content instanceof String) {
      rd.addField(fieldName, "");
    }
  }

  /**
   * Converts addresses to strings, passing each through {@code extractEmailAddress} when
   * {@code useEmailExtractor} is set.
   */
  private String[] formatAddresses(Address[] addresses, boolean useEmailExtractor)
    throws ManifoldCFException, ServiceInterruption, InterruptedException, MessagingException, IOException {

    final String[] formatted = new String[addresses.length];
    for (int j = 0; j < addresses.length; j++) {
      final String raw = addresses[j].toString();
      formatted[j] = useEmailExtractor ? extractEmailAddress(raw) : raw;
    }
    return formatted;
  }