backend/app/model/Email.scala (176 lines of code) (raw):

package model

import java.io.InputStream
import java.security.MessageDigest
import java.util
import java.util.{Base64, Locale}

import com.pff._
import enumeratum.EnumEntry.Snakecase
import enumeratum.{EnumEntry, PlayEnum}
import extraction.email.pst.iterators.{AttachmentIterator, RecipientIterator}
import model.index.IndexedResource
import org.apache.commons.io.IOUtils
import play.api.libs.json._
import utils.{DateTimeUtils, Logging, UriCleaner}

import java.util.stream.Collectors
import scala.jdk.CollectionConverters._

/** String constants for email priority, plus conversions from the PST and RFC encodings. */
object Priority {
  val NotUrgent = "not_urgent"
  val Normal = "normal"
  val Urgent = "urgent"

  /** Maps a PST priority id (-1, 0 or 1) to its priority string; any other id yields `None`. */
  def withPstIdOption(id: Int): Option[String] = id match {
    case -1 => Some(NotUrgent)
    case 0 => Some(Normal)
    case 1 => Some(Urgent)
    case _ => None
  }

  /**
    * Maps an RFC-style priority header value to a priority string.
    *
    * Values starting with 1 or 2 are urgent, 3 is normal, 4 or 5 are not urgent. Anything else is
    * returned lower-cased as-is.
    */
  def withRfcValue(id: String): String = id match {
    case _ if id.startsWith("1") || id.startsWith("2") => Urgent
    case _ if id.startsWith("3") => Normal
    case _ if id.startsWith("4") || id.startsWith("5") => NotUrgent
    // cover cases where clients have written the priority as a string
    case _ => id.toLowerCase(Locale.UK)
  }
}

/** Email sensitivity level; `pstId` is the numeric id used to look an entry up from a PST message. */
sealed abstract class Sensitivity(val pstId: Int) extends EnumEntry with Snakecase

object Sensitivity extends PlayEnum[Sensitivity] {
  // NB: inside this object `None` is this entry, so the Option value must be written `scala.None`.
  case object None extends Sensitivity(0)
  case object Personal extends Sensitivity(1)
  case object Private extends Sensitivity(2)
  case object CompanyConfidential extends Sensitivity(3)

  val values = findValues

  /** Finds the entry whose `pstId` equals `id`, if there is one. */
  def withPstIdOption(id: Int): Option[Sensitivity] = values.find(_.pstId == id)

  /** Maps an RFC-style sensitivity header value to an entry; unrecognised values yield `scala.None`. */
  def withRfcOption(id: String): Option[Sensitivity] = id match {
    case "Personal" => Some(Personal)
    case "Private" => Some(Private)
    case "Company-Confidential" => Some(CompanyConfidential)
    case _ => scala.None
  }
}

case class Recipient (displayName: Option[String], email: String)

object Recipient {
  implicit val recipientFormat: OFormat[Recipient] = Json.format[Recipient]

  /** Placeholder recipient with a fixed display name and address. */
  val unknown = Recipient(Some("Unknown Recipient"), "unknown@recipient.com")

  /** Builds a [[Recipient]] from a PST recipient's display name and email address. */
  def fromPSTRecipient(r: PSTRecipient): Recipient =
    Recipient(r.getDisplayName.hasTextOrNone(), r.getEmailAddress.removeChevrons())
}

case class Email(
  uri: Uri,
  from: Option[Recipient],
  recipients: List[Recipient],
  sentAt: Option[String],
  sensitivity: Option[Sensitivity],
  priority: Option[String],
  subject: String,
  body: String,
  inReplyTo: List[String],
  references: List[String],
  html: Option[String],
  attachmentCount: Int,
  metadata: Map[String, Seq[String]],
  flag: Option[String] = None) extends IndexedResource {

  /**
    * `sentAt` converted to epoch milliseconds, if present and parseable.
    *
    * Tries the ISO conversion first and falls back to the variant for timestamps that are
    * missing a time zone.
    */
  def sentAtMillis(): Option[Long] = {
    sentAt.flatMap { ts =>
      DateTimeUtils.isoToEpochMillis(ts) orElse DateTimeUtils.isoMissingTimeZoneToMillis(ts)
    }
  }

  // Omit body from toString (to avoid filling up logs); html is left out as well.
  override def toString: String = {
    s"Email(uri=$uri, from: $from, recipients: [${recipients.mkString(",")}], sentAt: $sentAt, " +
      s"sensitivity: $sensitivity, priority: $priority, subject: $subject, inReplyTo: [${inReplyTo.mkString(",")}], " +
      s"references: [${references.mkString(",")}])"
  }
}

object Email extends Logging {
  implicit val format: OFormat[Email] = Json.format[Email]

  /**
    * Replaces `cid:<id>` references in `html` with base64 `data:` URIs built from attachments.
    *
    * @param html        the HTML text to rewrite
    * @param attachments the attachments to consider, consumed once from first to last
    * @param getContent  returns `(mimeType, contentId, stream)` for an attachment that should be
    *                    inlined, or `None` to skip it; every returned stream is fully read and
    *                    then closed, even if reading fails
    * @return the HTML with each matching `cid:` reference replaced
    */
  def inlineAttachmentsIntoHtml[T](html: String, attachments: Iterator[T])(getContent: T => Option[(String, String, InputStream)]): String = {
    attachments.foldLeft(html) { (htmlText, a) =>
      getContent(a) match {
        case Some((mimeType, id, attachmentStream)) =>
          try {
            val bytes = IOUtils.toByteArray(attachmentStream)
            val encoded = Base64.getEncoder.encodeToString(bytes)
            htmlText.replace(s"cid:$id", s"data:$mimeType;base64,$encoded")
          } finally {
            attachmentStream.close()
          }

        case None =>
          htmlText
      }
    }
  }

  // trim to avoid trailing NUL characters from PSTs
  def cleanUri(original: String): String = UriCleaner.clean(original.removeChevrons())

  /** Splits a space-separated list of message ids and removes the chevrons from each entry. */
  def cleanInReplyTo(original: String): List[String] = original.splitListClean(' ').map(_.removeChevrons())

  /**
    * An alternative endpoint that can be used if we are not certain we'll have a useful message ID.
    *
    * In this case if we don't have an explicit message ID we make a best effort to create one that is unique from the
    * information that we do have.
    *
    * We include the main components but have deliberately omitted some of the metadata flags. The aim is a good
    * balance between having enough entropy without causing problems if we change the way that some of the more
    * esoteric features are represented.
    *
    * The synthesised ID is `no_id:` followed by the unpadded URL-safe base64 of a SHA-512 digest over
    * from, recipients, sentAt, subject, body, inReplyTo, references and html. Blank entries are
    * dropped from `inReplyTo` and `references` (and the rest trimmed), with a warning logged
    * whenever that changes a list.
    */
  def createFrom(maybeUri: Option[Uri], from: Option[Recipient], recipients: List[Recipient], sentAt: Option[String],
                 sensitivity: Option[Sensitivity], priority: Option[String], subject: String, body: String,
                 inReplyTo: List[String], references: List[String], html: Option[String], attachmentCount: Int,
                 metadata: Map[String, Seq[String]] = Map.empty, flag: Option[String] = None): Email = {
    val uri = maybeUri.getOrElse {
      // NB: the hash uses the *uncleaned* inReplyTo/references lists as passed in.
      val toBeHashed = s"$from/$recipients/$sentAt/$subject/$body/$inReplyTo/$references/$html"
      val uri = {
        val digest = MessageDigest.getInstance("SHA-512")
        digest.update(toBeHashed.getBytes("UTF-8"))
        Uri(s"no_id:${Base64.getUrlEncoder.withoutPadding.encodeToString(digest.digest())}")
      }
      logger.warn(s"Synthesised message ID $uri")
      uri
    }

    // ensure that the IDs for reply-to and references are clean and valid
    val cleanReplyTo = inReplyTo.map(_.trim).filter(_.nonEmpty)
    if (cleanReplyTo != inReplyTo) logger.warn(s"In-Reply-To list was cleaned up for $uri. Was: $inReplyTo Now: $cleanReplyTo")
    val cleanReferenced = references.map(_.trim).filter(_.nonEmpty)
    if (cleanReferenced != references) logger.warn(s"Referenced list was cleaned up for $uri. Was: $references Now: $cleanReferenced")

    Email(
      uri = uri,
      from = from,
      recipients = recipients,
      sentAt = sentAt,
      sensitivity = sensitivity,
      priority = priority,
      subject = subject,
      body = body,
      inReplyTo = cleanReplyTo,
      references = cleanReferenced,
      html = html,
      attachmentCount = attachmentCount,
      metadata = metadata
      ,
      flag = flag
    )
  }

  /**
    * Builds an [[Email]] from a PST message.
    *
    * `References` and `Date` are read from the transport message headers; the first line starting
    * with the header name is used. Attachments that have a content id are inlined into the HTML
    * body, and `attachmentCount` counts the attachments whose content id is empty.
    */
  def fromPSTMessage(message: PSTMessage): Email = {
    val inReplyTo = message.getInReplyToId.hasTextOrNone().toList.flatMap(Email.cleanInReplyTo)

    // Split the transport headers into lines once; both header lookups below reuse the result.
    val headers = message.getTransportMessageHeaders
    val headerLines = headers.lines.collect(Collectors.toList[String]).asScala

    // Value of the first header line starting with `prefix`, with the prefix removed (not trimmed).
    def headerValue(prefix: String): Option[String] =
      headerLines.find(_.startsWith(prefix)).map(_.stripPrefix(prefix))

    val references = headerValue("References:").toList.flatMap(Email.cleanInReplyTo)

    // Strip just "Date:" and trim afterwards, so a header written without a space after the
    // colon is parsed too.
    val date = headerValue("Date:").flatMap { rawDate =>
      DateTimeUtils.rfc1123ToIsoDateString(rawDate.trim())
    }

    val imageInlinedHtml = message.getBodyHTML.hasTextOrNone().map { rawHtml =>
      inlineAttachmentsIntoHtml(rawHtml, new AttachmentIterator(message))(a =>
        a.getContentId.hasTextOrNone().map { id =>
          (a.getMimeTag, id, a.getFileInputStream)
        }
      )
    }

    // A second iterator is created here because the one above is consumed by the inlining pass.
    val attachmentCount = new AttachmentIterator(message).count(_.getContentId.isEmpty)

    Email.createFrom(
      maybeUri = message.getInternetMessageId.hasTextOrNone().map(id => Uri(cleanUri(id))),
      from = message.getSenderEmailAddress.hasTextOrNone().map(Recipient(message.getSenderName.hasTextOrNone(), _)),
      recipients = new RecipientIterator(message).map(Recipient.fromPSTRecipient).toList,
      sentAt = date,
      sensitivity = Sensitivity.withPstIdOption(message.getOriginalSensitivity),
      priority = Priority.withPstIdOption(message.getPriority),
      subject = message.getSubject.replace("\u0000", ""),
      body = message.getBody,
      inReplyTo = inReplyTo,
      references = references,
      html = imageInlinedHtml,
      attachmentCount = attachmentCount,
      metadata = Map.empty
    )
  }
}