backend/app/extraction/email/msg/MsgEmailExtractor.scala (84 lines of code) (raw):
package extraction.email.msg
import java.io.{ByteArrayInputStream, InputStream}
import java.nio.file.{Files, StandardCopyOption}
import java.security.DigestInputStream
import cats.syntax.either._
import com.auxilii.msgparser.OutlookMessageParser
import com.auxilii.msgparser.model.{OutlookFileAttachment, OutlookMessage, OutlookMsgAttachment}
import extraction.{ExtractionParams, Extractor}
import ingestion.IngestionContextBuilder
import model.manifest.{Blob, MimeType}
import model.{Email, Priority, Recipient, Sensitivity, Uri, _}
import org.apache.commons.io.FileUtils
import services.ingestion.IngestionServices
import services.{FingerprintServices, ScratchSpace, Tika}
import utils.attempt.{Failure, UnknownFailure}
import utils.{DateTimeUtils, Logging}
import java.util.stream.Collectors
import scala.jdk.CollectionConverters._
/**
  * Extractor for Outlook `.msg` files (`application/vnd.ms-outlook`).
  *
  * Each parsed message is converted to an [[Email]] and ingested. Nested message attachments are
  * processed recursively; file attachments are copied to a temporary scratch directory,
  * fingerprinted and ingested as files with the email as their parent.
  *
  * @param scratch           scratch space used for temporary copies of file attachments
  * @param ingestionServices services used to ingest the email and its file attachments
  * @param tika              used to detect an attachment's MIME type when the message does not declare one
  */
class MsgEmailExtractor(scratch: ScratchSpace, ingestionServices: IngestionServices, tika: Tika) extends Extractor with Logging {
  val mimeTypes = Set(
    "application/vnd.ms-outlook"
  )

  // Name used on disk when an attachment has no usable filename of its own.
  private val fallbackAttachmentName = "attachment"

  override def canProcessMimeType = mimeTypes.contains

  override def indexing = true

  override def priority = 4

  /**
    * Looks up a single header in the raw transport headers of `message`.
    *
    * Header names are matched case-insensitively and folded values (continuation lines that begin
    * with a space or tab) are joined back into one line separated by single spaces.
    *
    * @param message the parsed Outlook message
    * @param header  the header name, without the trailing colon
    * @return the trimmed header value, or `None` if there are no headers, the header is missing,
    *         or its value is empty
    */
  private def getHeaderValue(message: OutlookMessage, header: String): Option[String] = {
    val prefix = s"$header:"
    Option(message.getHeaders).flatMap { rawHeaders =>
      val headerLines = rawHeaders.lines.collect(Collectors.toList[String]).asScala
      val start = headerLines.indexWhere(_.regionMatches(true, 0, prefix, 0, prefix.length))
      if (start < 0) {
        None
      } else {
        // A folded header continues on the following lines that start with whitespace.
        val continuation = headerLines
          .drop(start + 1)
          .takeWhile(line => line.startsWith(" ") || line.startsWith("\t"))
          .toList
        val segments = (headerLines(start).substring(prefix.length) :: continuation)
          .map(_.trim)
          .filter(_.nonEmpty)
        Some(segments.mkString(" "))
      }
    }.filter(_.nonEmpty)
  }

  /**
    * Derives a filename that is safe to resolve inside a scratch working directory.
    *
    * Only the final path segment of the attachment's long filename is kept, so names containing
    * `/` or `\` cannot point outside the working directory. A missing, blank, `.` or `..` name
    * falls back to a generic name.
    */
  private def safeAttachmentFilename(attachment: OutlookFileAttachment): String =
    Option(attachment.getLongFilename)
      .flatMap(_.split("[/\\\\]").lastOption)
      .map(_.trim)
      .filter(name => name.nonEmpty && name != "." && name != "..")
      .getOrElse(fallbackAttachmentName)

  /**
    * Converts `msg` into an [[Email]], ingests it, then processes its attachments.
    *
    * @param blob   the blob the message was read from; its URI is used as the parent of the email
    * @param msg    the parsed Outlook message
    * @param params extraction parameters passed through to the ingestion context builders
    */
  private def processMessage(blob: Blob, msg: OutlookMessage, params: ExtractionParams): Unit = {
    val uri = msg.getMessageId.hasTextOrNone().map(id => Uri(Email.cleanUri(id)))
    val from = Option(msg.getFromEmail).map(e => Recipient(Option(msg.getFromName), e))
    val sentAt = getHeaderValue(msg, "Date").flatMap(DateTimeUtils.rfc1123ToIsoDateString)
    val subject = Option(msg.getSubject).getOrElse("")
    val priority: Option[String] = getHeaderValue(msg, "X-Priority").map(v => Priority.withRfcValue(v))
    // A non-numeric Sensitivity header is treated as absent instead of failing the whole
    // extraction with a NumberFormatException.
    val sensitivity: Option[Sensitivity] = getHeaderValue(msg, "Sensitivity")
      .flatMap(_.toIntOption)
      .flatMap(id => Sensitivity.withPstIdOption(id))
    val inReplyTo: List[String] = getHeaderValue(msg, "In-Reply-To").toList.flatMap(Email.cleanInReplyTo)
    val references: List[String] = getHeaderValue(msg, "References").toList.flatMap(Email.cleanInReplyTo)

    // Recipients without an address are dropped.
    val recipients: List[Recipient] = msg.getRecipients.asScala
      .flatMap(r => Option(r.getAddress).map(e => Recipient(Option(r.getName), e)))
      .toList

    val attachments = msg.getOutlookAttachments.asScala
    val msgAttachments = attachments.collect { case m: OutlookMsgAttachment => m }
    val fileAttachments = attachments.collect { case f: OutlookFileAttachment => f }

    val body = msg.getBodyText
    // Only file attachments that carry a content id are offered for inlining into the HTML body.
    val html = Option(msg.getBodyHTML).map(msgHtml => Email.inlineAttachmentsIntoHtml(msgHtml, fileAttachments.iterator)(a =>
      Option(a.getContentId).map { id =>
        (a.getMimeTag, id.removeChevrons(), new ByteArrayInputStream(a.getData))
      }
    ))

    // Nested messages always count; file attachments count unless their disposition starts with "inline".
    val attachmentCount = msgAttachments.length + fileAttachments.count { attachment =>
      Option(attachment.getContentDisposition).forall(!_.startsWith("inline"))
    }

    val email = Email.createFrom(uri, from, recipients, sentAt, sensitivity, priority, subject, body, inReplyTo, references, html, attachmentCount)
    val context = IngestionContextBuilder(blob.uri, params).finishWithEmail(email)
    ingestionServices.ingestEmail(context, "application/vnd.ms-outlook")

    val attachmentBuilder = IngestionContextBuilder(email.uri, params)

    // NOTE(review): nested messages are ingested with the outer blob as their parent, whereas file
    // attachments below use this email's URI - confirm this difference is intended.
    msgAttachments.foreach { m =>
      processMessage(blob, m.getOutlookMessage, params)
    }

    fileAttachments.foreach { attachment =>
      val attachmentStream = new ByteArrayInputStream(attachment.getData)
      val workingDir = scratch.createWorkingDir(s"emails/${email.uri.value}/")

      try {
        // Copy the attachment into scratch space and fingerprint it to obtain its blob URI
        val localPath = workingDir.resolve(safeAttachmentFilename(attachment))
        val attachmentFile = scratch.copyToScratchSpace(localPath, attachmentStream)
        val blobUri = Uri(FingerprintServices.createFingerprintFromFile(attachmentFile))

        // Prefer the MIME type declared in the message, otherwise ask Tika to detect it
        val mimeType = Option(attachment.getMimeTag)
          .getOrElse(tika.detectType(attachmentFile.toPath).map(_.toString)
            .getOrElse(throw new Exception("Failed to get MIME type for attachment")))

        // Ingest
        val attachmentBlob = Blob(blobUri, attachmentFile.length(), Set(MimeType(mimeType)))
        val attachmentContext = attachmentBuilder.finishWithFile(attachmentFile.toPath)
        ingestionServices.ingestFile(attachmentContext, attachmentBlob.uri, attachmentFile.toPath)
      } finally {
        // The scratch copy is only needed for the duration of the ingest call
        attachmentStream.close()
        FileUtils.deleteDirectory(workingDir.toFile)
      }
    }
  }

  /**
    * Parses `stream` as an Outlook message and ingests it together with its attachments.
    *
    * Exceptions thrown while parsing or ingesting propagate to the caller; no `Left` is produced here.
    *
    * @param blob   the blob being extracted
    * @param stream the raw `.msg` content
    * @param params extraction parameters
    * @return `Right(())` once processing has completed
    */
  override def extract(blob: Blob, stream: InputStream, params: ExtractionParams): Either[Failure, Unit] = {
    processMessage(blob, new OutlookMessageParser().parseMsg(stream), params)
    Right(())
  }
}