in connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java [487:925]
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
    throws ManifoldCFException, ServiceInterruption {
  // Overview
  // --------
  // 1. Scan the specification: every NODE_METADATA child names a metadata field to attach to
  //    indexed documents; a NODE_EXTRACT_EMAIL child makes address values pass through
  //    extractEmailAddress() instead of being used verbatim.
  // 2. For each document identifier, decide whether it denotes an email (no attachment index in
  //    the identifier) or one specific attachment of an email, look the message up by message id
  //    in its folder, run the indexability checks offered by 'activities', and ingest it.
  // 3. Folders opened along the way are cached in 'openFolders' and closed in the outer finally.
  //
  // Each document records one ACTIVITY_FETCH history entry whenever an error/result code was set,
  // except when processing was interrupted.

  final List<String> requiredMetadata = new ArrayList<String>();
  boolean useEmailExtractor = false;
  for (int i = 0; i < spec.getChildCount(); i++) {
    final SpecificationNode sn = spec.getChild(i);
    if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
      requiredMetadata.add(sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME));
    }
    if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
      useEmailExtractor = true;
    }
  }

  // Keep a cached set of open folders
  final Map<String,Folder> openFolders = new HashMap<String,Folder>();
  try {
    for (String documentIdentifier : documentIdentifiers) {
      final Integer attachmentIndex = extractAttachmentNumberFromDocumentIdentifier(documentIdentifier);
      if (attachmentIndex == null) {
        // It's an email
        // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
        final String versionString = "_" + urlTemplate;

        // Check if we need to index
        if (!activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
          continue;
        }

        final String compositeID = documentIdentifier;
        final String version = versionString;
        final String folderName = extractFolderNameFromDocumentIdentifier(compositeID);
        final String id = extractEmailIDFromDocumentIdentifier(compositeID);

        String errorCode = null;
        String errorDesc = null;
        Long fileLengthLong = null;
        final long startTime = System.currentTimeMillis();
        try {
          try {
            Folder folder = openFolders.get(folderName);
            if (folder == null) {
              getSession();
              OpenFolderThread oft = new OpenFolderThread(session, folderName);
              oft.start();
              folder = oft.finishUp();
              openFolders.put(folderName, folder);
            }

            if (Logging.connectors.isDebugEnabled()) {
              Logging.connectors.debug("Email: Processing document identifier '"
                + compositeID + "'");
            }

            final SearchTerm messageIDTerm = new MessageIDTerm(id);
            getSession();
            SearchMessagesThread smt = new SearchMessagesThread(session, folder, messageIDTerm);
            smt.start();
            final Message[] message = smt.finishUp();
            final String msgURL = makeDocumentURI(urlTemplate, folderName, id);

            // If the search returns several hits, the last one wins.
            Message msg = null;
            for (Message msg2 : message) {
              msg = msg2;
            }
            if (msg == null) {
              // email was not found
              activities.deleteDocument(documentIdentifier);
              continue;
            }

            if (!activities.checkURLIndexable(msgURL)) {
              errorCode = activities.EXCLUDED_URL;
              errorDesc = "Excluded because of URL ('"+msgURL+"')";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final long fileLength = msg.getSize();
            if (!activities.checkLengthIndexable(fileLength)) {
              errorCode = activities.EXCLUDED_LENGTH;
              errorDesc = "Excluded because of length ("+fileLength+")";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final Date sentDate = msg.getSentDate();
            if (!activities.checkDateIndexable(sentDate)) {
              errorCode = activities.EXCLUDED_DATE;
              errorDesc = "Excluded because of date ("+sentDate+")";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final String mimeType = "text/plain";
            if (!activities.checkMimeTypeIndexable(mimeType)) {
              errorCode = activities.EXCLUDED_MIMETYPE;
              errorDesc = "Excluded because of mime type ('"+mimeType+"')";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final RepositoryDocument rd = new RepositoryDocument();
            rd.setFileName(msg.getFileName());
            rd.setMimeType(mimeType);
            rd.setCreatedDate(sentDate);
            rd.setModifiedDate(sentDate);

            for (String metadata : requiredMetadata) {
              final String metadataName = metadata.toLowerCase(Locale.ROOT);
              if (metadataName.equals(EmailConfig.EMAIL_TO)) {
                final Address[] to = msg.getRecipients(Message.RecipientType.TO);
                if (to != null) {
                  final String[] toStr = new String[to.length];
                  for (int j = 0; j < to.length; j++) {
                    final String address = to[j].toString();
                    toStr[j] = useEmailExtractor ? extractEmailAddress(address) : address;
                  }
                  rd.addField(EmailConfig.EMAIL_TO, toStr);
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_FROM)) {
                final Address[] from = msg.getFrom();
                // A message without a From header yields null; skip the field rather than fail.
                if (from != null) {
                  final String[] fromStr = new String[from.length];
                  for (int j = 0; j < from.length; j++) {
                    final String address = from[j].toString();
                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address) : address;
                  }
                  rd.addField(EmailConfig.EMAIL_FROM, fromStr);
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_SUBJECT)) {
                rd.addField(EmailConfig.EMAIL_SUBJECT, msg.getSubject());
              } else if (metadataName.equals(EmailConfig.EMAIL_DATE)) {
                // The sent date may be unavailable; only emit the field when we have one.
                if (sentDate != null) {
                  rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_ATTACHMENT_ENCODING)) {
                final Object o = msg.getContent();
                if (o instanceof Multipart) {
                  final Multipart mp = (Multipart) o;
                  final int n = mp.getCount();
                  // One slot per body part; slots of non-attachment parts stay null.
                  final String[] encoding = new String[n];
                  for (int k = 0; k < n; k++) {
                    final Part part = mp.getBodyPart(k);
                    if (isAttachment(part)) {
                      // The encoding is taken from the second '?'-separated token of the file name.
                      final String partFileName = part.getFileName();
                      final String[] fileSplit = (partFileName == null)
                        ? new String[0]
                        : partFileName.split("\\?");
                      if (fileSplit.length > 1) {
                        encoding[k] = fileSplit[1];
                      } else {
                        encoding[k] = "";
                      }
                    }
                  }
                  rd.addField(EmailConfig.ENCODING_FIELD, encoding);
                } else if (o instanceof String) {
                  rd.addField(EmailConfig.ENCODING_FIELD, "");
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_ATTACHMENT_MIMETYPE)) {
                final Object o = msg.getContent();
                if (o instanceof Multipart) {
                  final Multipart mp = (Multipart) o;
                  final int n = mp.getCount();
                  final String[] MIMEType = new String[n];
                  for (int k = 0; k < n; k++) {
                    final Part part = mp.getBodyPart(k);
                    if (isAttachment(part)) {
                      MIMEType[k] = part.getContentType();
                    }
                  }
                  rd.addField(EmailConfig.MIMETYPE_FIELD, MIMEType);
                } else if (o instanceof String) {
                  rd.addField(EmailConfig.MIMETYPE_FIELD, "");
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_ATTACHMENTNAME)) {
                final Object o = msg.getContent();
                if (o instanceof Multipart) {
                  final Multipart mp = (Multipart) o;
                  final int n = mp.getCount();
                  final String[] attachmentNames = new String[n];
                  for (int k = 0; k < n; k++) {
                    final Part part = mp.getBodyPart(k);
                    if (isAttachment(part)) {
                      attachmentNames[k] = part.getFileName();
                    }
                  }
                  rd.addField(EmailConfig.ATTACHMENTNAME_FIELD, attachmentNames);
                } else if (o instanceof String) {
                  rd.addField(EmailConfig.ATTACHMENTNAME_FIELD, "");
                }
              }
            }

            //Content includes both body and attachments,
            //Body will be set as content and attachments will be indexed as separate documents.
            final EmailContent bodyContent = extractBodyContent(msg);
            if (bodyContent != null) {
              rd.setMimeType(bodyContent.getMimeType());
              // The stream carries the UTF-8 encoded body, so its declared length must be the
              // number of encoded bytes, not the size of the whole message.
              final byte[] bodyBytes = bodyContent.getContent().getBytes(StandardCharsets.UTF_8);
              final InputStream is = new ByteArrayInputStream(bodyBytes);
              try {
                rd.setBinary(is, bodyBytes.length);
                activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
                errorCode = "OK";
                // The history entry keeps reporting the message size, as before.
                fileLengthLong = Long.valueOf(fileLength);
              } finally {
                is.close();
              }
            }

            // If we're supposed to deal with attachments, this is the time to queue them up
            if (attachmentUrlTemplate != null) {
              final Object content = msg.getContent();
              if (content instanceof Multipart) {
                final Multipart mp = (Multipart) content;
                final int numAttachments = mp.getCount();
                for (int i = 0; i < numAttachments; i++) {
                  if (isAttachment(mp.getBodyPart(i))) {
                    // Attachment identifiers are the email identifier plus ":<part index>".
                    activities.addDocumentReference(documentIdentifier + ":" + i);
                  }
                }
              }
            }
          } catch (InterruptedException e) {
            throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED);
          } catch (MessagingException e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            handleMessagingException(e, "processing email");
          } catch (IOException e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            handleIOException(e, "processing email");
            throw new ManifoldCFException(e.getMessage(), e);
          }
        } catch (ManifoldCFException e) {
          // Interruptions are not recorded in the history.
          if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
            errorCode = null;
          }
          throw e;
        } finally {
          if (errorCode != null) {
            activities.recordActivity(Long.valueOf(startTime), EmailConfig.ACTIVITY_FETCH,
              fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
          }
        }
      } else {
        // It's a specific attachment
        final int attachmentNumber = attachmentIndex;
        // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
        final String versionString = "_" + attachmentUrlTemplate;

        // Check if we need to index
        if (!activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
          continue;
        }

        final String compositeID = documentIdentifier;
        final String version = versionString;
        final String folderName = extractFolderNameFromDocumentIdentifier(compositeID);
        final String id = extractEmailIDFromDocumentIdentifier(compositeID);

        String errorCode = null;
        String errorDesc = null;
        Long fileLengthLong = null;
        final long startTime = System.currentTimeMillis();
        try {
          try {
            Folder folder = openFolders.get(folderName);
            if (folder == null) {
              getSession();
              OpenFolderThread oft = new OpenFolderThread(session, folderName);
              oft.start();
              folder = oft.finishUp();
              openFolders.put(folderName, folder);
            }

            if (Logging.connectors.isDebugEnabled()) {
              Logging.connectors.debug("Email: Processing document identifier '"
                + documentIdentifier + "'");
            }

            final SearchTerm messageIDTerm = new MessageIDTerm(id);
            getSession();
            SearchMessagesThread smt = new SearchMessagesThread(session, folder, messageIDTerm);
            smt.start();
            final Message[] message = smt.finishUp();
            final String msgURL = makeDocumentURI(attachmentUrlTemplate, folderName, id, attachmentNumber);

            // If the search returns several hits, the last one wins.
            Message msg = null;
            for (Message msg2 : message) {
              msg = msg2;
            }
            if (msg == null) {
              // email was not found
              activities.deleteDocument(documentIdentifier);
              continue;
            }

            if (!activities.checkURLIndexable(msgURL)) {
              errorCode = activities.EXCLUDED_URL;
              errorDesc = "Excluded because of URL ('"+msgURL+"')";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final Date sentDate = msg.getSentDate();
            if (!activities.checkDateIndexable(sentDate)) {
              errorCode = activities.EXCLUDED_DATE;
              errorDesc = "Excluded because of date ("+sentDate+")";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            // The attachment can only exist if the message is multipart and still has a part at
            // the requested index; otherwise the attachment document is gone.
            final Object content = msg.getContent();
            if (!(content instanceof Multipart)) {
              activities.deleteDocument(documentIdentifier);
              continue;
            }
            final Multipart mp = (Multipart) content;
            if (mp.getCount() <= attachmentNumber) {
              activities.deleteDocument(documentIdentifier);
              continue;
            }

            final Part part = mp.getBodyPart(attachmentNumber);
            final long fileLength = part.getSize();
            if (!activities.checkLengthIndexable(fileLength)) {
              errorCode = activities.EXCLUDED_LENGTH;
              errorDesc = "Excluded because of length ("+fileLength+")";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final String origMimeType = part.getContentType();
            final String mimeType;
            // MS Exchange appends parameters after the mime type, so they have to be stripped.
            // Example: "application/msword; name=SampleDOCFile_100kb.doc"
            if (origMimeType == null || origMimeType.indexOf(";") == -1) {
              mimeType = origMimeType;
            } else {
              mimeType = origMimeType.substring(0, origMimeType.indexOf(";"));
            }
            if (!activities.checkMimeTypeIndexable(mimeType)) {
              errorCode = activities.EXCLUDED_MIMETYPE;
              errorDesc = "Excluded because of mime type ('"+mimeType+"')";
              activities.noDocument(documentIdentifier, version);
              continue;
            }

            final RepositoryDocument rd = new RepositoryDocument();
            rd.setFileName(part.getFileName());
            rd.setMimeType(mimeType);
            rd.setCreatedDate(sentDate);
            rd.setModifiedDate(sentDate);

            for (String metadata : requiredMetadata) {
              final String metadataName = metadata.toLowerCase(Locale.ROOT);
              if (metadataName.equals(EmailConfig.EMAIL_TO)) {
                final Address[] to = msg.getRecipients(Message.RecipientType.TO);
                if (to != null) {
                  final String[] toStr = new String[to.length];
                  for (int j = 0; j < to.length; j++) {
                    final String address = to[j].toString();
                    toStr[j] = useEmailExtractor ? extractEmailAddress(address) : address;
                  }
                  rd.addField(EmailConfig.EMAIL_TO, toStr);
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_FROM)) {
                final Address[] from = msg.getFrom();
                // A message without a From header yields null; skip the field rather than fail.
                if (from != null) {
                  final String[] fromStr = new String[from.length];
                  for (int j = 0; j < from.length; j++) {
                    final String address = from[j].toString();
                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address) : address;
                  }
                  rd.addField(EmailConfig.EMAIL_FROM, fromStr);
                }
              } else if (metadataName.equals(EmailConfig.EMAIL_SUBJECT)) {
                //Attachments may have a field named "subject". So, different field name is used not to clash.
                rd.addField(EmailConfig.MAILSUBJECT_FIELD, msg.getSubject());
              } else if (metadataName.equals(EmailConfig.EMAIL_DATE)) {
                // The sent date may be unavailable; only emit the field when we have one.
                if (sentDate != null) {
                  rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
                }
              }
            }

            final InputStream is = part.getInputStream();
            try {
              rd.setBinary(is, fileLength);
              activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
              errorCode = "OK";
              fileLengthLong = Long.valueOf(fileLength);
            } finally {
              is.close();
            }
          } catch (InterruptedException e) {
            throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED);
          } catch (MessagingException e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            handleMessagingException(e, "processing email attachment");
          } catch (IOException e) {
            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
            errorDesc = e.getMessage();
            handleIOException(e, "processing email attachment");
            throw new ManifoldCFException(e.getMessage(), e);
          }
        } catch (ManifoldCFException e) {
          // Interruptions are not recorded in the history.
          if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
            errorCode = null;
          }
          throw e;
        } finally {
          if (errorCode != null) {
            activities.recordActivity(Long.valueOf(startTime), EmailConfig.ACTIVITY_FETCH,
              fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
          }
        }
      }
    }
  }
  finally
  {
    // Close every folder that was opened while processing this batch.
    for (Folder f : openFolders.values())
    {
      try
      {
        CloseFolderThread cft = new CloseFolderThread(session, f);
        cft.start();
        cft.finishUp();
      }
      catch (InterruptedException e)
      {
        throw new ManifoldCFException(e.getMessage(),ManifoldCFException.INTERRUPTED);
      }
      catch (MessagingException e)
      {
        handleMessagingException(e, "closing folders");
      }
    }
  }
}