backend/app/extraction/email/mbox/MBoxEmailDetector.scala (33 lines of code) (raw):
package extraction.email.mbox
import java.io.InputStream
import java.io.InputStream
import java.util.Properties
import extraction.email.{CustomTikaDetector, JakartaMail}
import org.apache.tika.io.TikaInputStream
import org.apache.tika.mime.MediaType
import scala.util.control.NonFatal
/*
* Not really a standard, but pretty commonplace:
* https://www.loc.gov/preservation/digital/formats/fdd/fdd000383.shtml
*/
/**
  * Tika detector that recognises mbox mail archives by asking JakartaMail to open the
  * file backing the stream and counting the messages it contains.
  */
object MBoxEmailDetector extends CustomTikaDetector {
  /** Media type string reported for files recognised as mbox archives. */
  val MBOX_MIME_TYPE: String = "application/mbox"

  /**
    * Attempts to identify `input` as an mbox archive.
    *
    * Only a [[org.apache.tika.io.TikaInputStream]] can be inspected, because the check needs
    * the path of the file behind the stream; any other stream yields `None`.
    *
    * @param input the stream to inspect
    * @return `Some(application/mbox)` when the store reports at least two messages;
    *         `None` when it reports fewer, when the file cannot be opened or read as
    *         an mbox store, or when `input` is not a `TikaInputStream`
    */
  override def detectType(input: InputStream): Option[MediaType] = input match {
    case tikaInput: TikaInputStream =>
      try {
        // Resolving the backing file and opening the store can both fail, so they are kept
        // inside the try: a failure here means "not detected", not a crashed detector.
        val url = s"mbox:${tikaInput.getFile.getAbsolutePath}"
        val mbox = JakartaMail.openStore(url)
        try {
          val messageCount = mbox.getMessageCount()
          // Fewer than two messages is treated as "not an mbox" - presumably because a file
          // that was not parsed properly shows up with zero or one message.
          if (messageCount >= 2) {
            Some(MediaType.parse(MBOX_MIME_TYPE))
          } else {
            logger.error(s"Mbox file had ${messageCount} messages - this could be because JakartaMail failed to properly open the MBOX file. ")
            None
          }
        } finally {
          // A failure while closing must not discard a result that was already computed.
          try {
            mbox.close()
          } catch {
            case NonFatal(e) =>
              logger.error("Failed to close mbox", e)
          }
        }
      } catch {
        case NonFatal(e) =>
          logger.error("Failed to open mbox", e)
          None
      }
    case _ =>
      None
  }
}