backend/app/extraction/email/eml/EmlParser.scala (129 lines of code) (raw):

package extraction.email.eml import java.io.InputStream import java.nio.file.attribute.FileTime import java.nio.file.{Files, StandardCopyOption} import java.security.DigestInputStream import com.amazonaws.util.IOUtils import com.google.common.net.MediaType import ingestion.IngestionContextBuilder import jakarta.mail.Message import jakarta.mail.internet._ import model._ import model.manifest.{Blob, MimeType} import org.apache.commons.io.FileUtils import services.ingestion.IngestionServices import services.{FingerprintServices, ScratchSpace} import utils.{HtmlToPlainText, DateTimeUtils, Logging} import scala.jdk.CollectionConverters._ import scala.util.control.NonFatal class EmlParser(val scratch: ScratchSpace, val ingestionServices: IngestionServices) extends Logging { def parseMessage(message: Message): Option[(Email, Seq[MimeBodyPart])] = { val uri = getMessageUri(message) val senderAddress = Option(message.getFrom).flatMap(_.headOption.map(_.asInstanceOf[InternetAddress])) val from = senderAddress.map { addr => Recipient(Option(addr.getPersonal), addr.getAddress) } val sentAt = Option(message.getHeader("Date")).flatMap(_.headOption.flatMap(DateTimeUtils.rfc1123ToIsoDateString)) val subject = Option(message.getSubject).map(MimeUtility.decodeText).orNull val priority: Option[String] = Option(message.getHeader("X-Priority")).flatMap(_.headOption.map(v => Priority.withRfcValue(v))) val sensitivity: Option[Sensitivity] = Option(message.getHeader("Sensitivity")).flatMap(_.headOption.flatMap(v => Sensitivity.withRfcOption(v))) val inReplyTo: List[String] = Option(message.getHeader("In-Reply-To")).map(_.toList).getOrElse(Nil) val references: List[String] = Option(message.getHeader("References")).map(_.toList).getOrElse(Nil) val recipients: List[Recipient] = Option(message.getAllRecipients).map(_.toList).getOrElse(Nil) .collect { case c: InternetAddress => c } .flatMap { r => Option(r.getAddress).map(Recipient(Option(r.getPersonal), _)) } message.getContent match { case content: MimeMultipart => val parts = (for (a <- 0 until content.getCount) yield content.getBodyPart(a)) .collect { case p: MimeBodyPart => p } .flatMap(flattenMultipart) val attachments = parts.filter(p => Option(p.getEncoding).filter(_.toLowerCase() == "base64").nonEmpty && getFilename(p).nonEmpty) val nonAttachments = parts.filter(p => getFilename(p).isEmpty) val bodyPart = nonAttachments.find(_.getContentType.toLowerCase().startsWith("text/plain")) val htmlPart = nonAttachments.find(_.getContentType.toLowerCase().startsWith("text/html")) val body = (bodyPart, htmlPart) match { case (Some(body), _) => body.getContent.asInstanceOf[String] case (None, Some(html)) => HtmlToPlainText.convert(html.getContent.asInstanceOf[String]) case _ => "" } val html: Option[String] = htmlPart .map(_.getContent.asInstanceOf[String]) .map(Email.inlineAttachmentsIntoHtml(_, attachments.iterator)(a => Option(a.getContentID).map { id => (a.getContentType, id.removeChevrons(), a.getInputStream) } )) val attachmentCount = attachments.flatMap(getRawContentDisposition).count(!_.startsWith("inline")) val email = Email.createFrom(uri, from, recipients, sentAt, sensitivity, priority, subject, body, inReplyTo, references, html, attachmentCount) Some((email, attachments)) case plainText: String => val email = Email.createFrom(uri, from, recipients, sentAt, sensitivity, priority, subject, plainText, inReplyTo, references, None, 0) Some((email, Nil)) case is: InputStream => // Just a single attachment, no message text body val headers = new InternetHeaders() headers.addHeader("Content-Type", message.getContentType) headers.addHeader("Content-Disposition", message.getDisposition) val email = Email.createFrom(uri, from, recipients, sentAt, sensitivity, priority, subject, "<empty>", inReplyTo, references, None, 0) val attachment = new MimeBodyPart(headers, IOUtils.toByteArray(is)) Some((email, Seq(attachment))) case other => logger.info(s"Unknown EML message content type ${other.getClass}") None } } def getMessageUri(message: Message): Option[Uri] = message match { case mimeMessage: MimeMessage => Option(mimeMessage.getMessageID).map(id => Uri(Email.cleanUri(id))).filter(_.value.trim.nonEmpty) case _ => None } def ingestAttachment(context: IngestionContextBuilder, email: Email, attachment: MimeBodyPart): Unit = { val attachmentStream = attachment.getInputStream val attachmentRoot = scratch.createWorkingDir(s"emails/${email.uri.value}/") try { val name = getFilename(attachment).getOrElse(throw new IllegalArgumentException(s"Missing Content-Disposition for attachment in ${email.uri}")) val rawContentType = attachment.getContentType val semicolonIndex = rawContentType.indexOf(";") val mimeType = if (semicolonIndex > 0) rawContentType.substring(0, semicolonIndex) else rawContentType // Create Blob URI val attachmentFile = scratch.copyToScratchSpace(attachmentStream) val blobUri = Uri(FingerprintServices.createFingerprintFromFile(attachmentFile)) // Ingest val blob = Blob(blobUri, attachmentFile.length(), Set(MimeType(mimeType))) // https://tools.ietf.org/html/rfc2183 val creationTime = headerDateToFileTime(attachment, "Creation-Date").orElse(email.sentAtMillis().map(FileTime.fromMillis)) val lastAccessTime = headerDateToFileTime(attachment, "Read-Date") val lastModificationTime = headerDateToFileTime(attachment, "Modification-Date") val attachmentContext = context.finish(name, attachmentFile.toPath, creationTime, lastAccessTime, lastModificationTime) ingestionServices.ingestFile(attachmentContext, blob.uri, attachmentFile.toPath) } finally { attachmentStream.close() FileUtils.deleteDirectory(attachmentRoot.toFile) } } private def headerDateToFileTime(attachment: MimeBodyPart, name: String): Option[FileTime] = { Option(attachment.getHeader(name)) .flatMap(_.headOption.flatMap(DateTimeUtils.rfc1123ToEpochMillis)) .map(FileTime.fromMillis) } private def flattenMultipart(part: MimeBodyPart): List[MimeBodyPart] = { part.getContent match { case p: MimeMultipart => (for (a <- 0 until p.getCount) yield p.getBodyPart(a)).collect { case p: MimeBodyPart => p }.flatMap(flattenMultipart).toList case _ => List(part) } } private def getParameter(name: String, tpe: MediaType): Option[String] = { val params = tpe.parameters().asMap().asScala params.get(name).flatMap(_.asScala.headOption) } private val filenamesRegex = """filename\*\d+=\"(.+)\"""".r private def getRawContentDisposition(part: MimeBodyPart): Option[String] = { Option(part.getHeader("Content-Disposition", null)) } private def getFilename(part: MimeBodyPart): Option[String] = try { Option(part.getFileName).map(MimeUtility.decodeText) } catch { case e: ParseException => // Try to handle cases where javax.mail (even with strict off) can't hack it getRawContentDisposition(part).map { value => // Try and handle folded filename entries, which may be UTF-8 base64 encoded. // I don't know how prevalent this is in the real world but I have seen it in a real dataset and the Platform for // Investigations is just a curated set of special cases right!! // // The format is: // Content-Disposition: attachment; // filename*0="=?UTF-8?B?<base 64 encoded stuff>" // filename*1="<more base64 encoded stuff>?=" // // Note the lack of "?=" terminator until right at the end. The values (when treated in aggregate) may or may // not be UTF-8 base-64 encoded. val encoded = filenamesRegex.findAllIn(value).matchData.map(_.group(1)).toList.mkString("") MimeUtility.decodeText(encoded) } } }