image-loader/app/lib/imaging/FileMetadataReader.scala (205 lines of code) (raw):

package lib.imaging import java.io.File import java.util.concurrent.Executors import com.adobe.internal.xmp.XMPMetaFactory import com.drew.imaging.ImageMetadataReader import com.drew.metadata.exif.{ExifDirectoryBase, ExifIFD0Directory, ExifSubIFDDirectory} import com.drew.metadata.icc.IccDirectory import com.drew.metadata.iptc.IptcDirectory import com.drew.metadata.jpeg.JpegDirectory import com.drew.metadata.png.PngDirectory import com.drew.metadata.xmp.XmpDirectory import com.drew.metadata.{Directory, Metadata} import com.gu.mediaservice.lib.{ImageWrapper, StorableImage} import com.gu.mediaservice.lib.imaging.im4jwrapper.ImageMagick._ import com.gu.mediaservice.lib.logging.{GridLogging, LogMarker} import com.gu.mediaservice.lib.metadata.ImageMetadataConverter import com.gu.mediaservice.model._ import model.upload.UploadRequest import org.joda.time.{DateTime, DateTimeZone} import org.joda.time.format.ISODateTimeFormat import play.api.libs.json.JsValue import scala.jdk.CollectionConverters._ import scala.collection.compat._ import scala.concurrent.{ExecutionContext, Future} object FileMetadataReader extends GridLogging { /* The XMPMetaFactory in the Adobe xmpcore library keeps a stateful list of previously seen prefix (namespace) to schema mappings in a `XMPSchemaRegistry` Let: - Image A namespace Getty schema (http://xmp.gettyimages.com/gift/1.0/) with `prefix0` - Image B namespace Getty schema (http://xmp.gettyimages.com/gift/1.0/) with `GettyImagesGIFT` If we process Image A first, the `XMPSchemaRegistry` will say the Getty namespace is prefixed with `prefix0`. When we process Image B, we'll see it uses the Getty namespace with the `GettyImagesGIFT` prefix, but that the namespace is in the `XMPSchemaRegistry` from Image A, so we'll set the prefix to `prefix0`. Conversely, if we process Image B first, the `XMPSchemaRegistry` cache will be in the desired state and Image A will be ingested correctly. That is, it's pot luck! As a workaround, register the Getty prefix as `GettyImagesGIFT` early to force Getty metadata to use the correct prefix. This is what ExifTool does - https://github.com/exiftool/exiftool/blob/3339862c31076f9db30270b3965ac1c49ee0687a/lib/Image/ExifTool/XMP.pm#L184 */ private val namespaces = Map( "GettyImagesGIFT" -> "http://xmp.gettyimages.com/gift/1.0/" ) for ((prefix, namespaceUri) <- namespaces) XMPMetaFactory.getSchemaRegistry.registerNamespace(namespaceUri, prefix) private implicit val ctx: ExecutionContext = ExecutionContext.fromExecutor(Executors.newCachedThreadPool) def fromIPTCHeaders(image: File, imageId:String): Future[FileMetadata] = for { metadata <- readMetadata(image) } yield getMetadataWithIPTCHeaders(metadata, imageId) // FIXME: JPEG, JFIF, Photoshop, GPS, File def fromIPTCHeadersWithColorInfo(image: ImageWrapper)(implicit logMarker: LogMarker): Future[FileMetadata] = fromIPTCHeadersWithColorInfo(image.file, image.id, image.mimeType) def fromIPTCHeadersWithColorInfo(image: File, imageId:String, mimeType: MimeType)(implicit logMarker: LogMarker): Future[FileMetadata] = for { metadata <- readMetadata(image) colourModelInformation <- getColorModelInformation(image, metadata, mimeType) } yield getMetadataWithIPTCHeaders(metadata, imageId).copy(colourModelInformation = colourModelInformation) private def getMetadataWithIPTCHeaders(metadata: Metadata, imageId:String): FileMetadata = FileMetadata( iptc = exportDirectory(metadata, classOf[IptcDirectory]), exif = exportDirectory(metadata, classOf[ExifIFD0Directory]), exifSub = exportDirectory(metadata, classOf[ExifSubIFDDirectory]), xmp = exportXmpPropertiesInTransformedSchema(metadata, imageId), icc = redactLongFieldValues(imageId, "ICC")(exportDirectory(metadata, classOf[IccDirectory])), getty = exportGettyDirectory(metadata, imageId), colourModel = None, colourModelInformation = Map() ) // Export all the metadata in the directory private def exportDirectory[T <: Directory](metadata: Metadata, directoryClass: Class[T]): Map[String, String] = Option(metadata.getFirstDirectoryOfType(directoryClass)) map { directory => val metaTagsMap = directory.getTags.asScala. filter(tag => tag.hasTagName). // Ignore seemingly useless "Padding" fields // see: https://github.com/drewnoakes/metadata-extractor/issues/100 filter(tag => tag.getTagName != "Padding"). // Ignore meta-metadata filter(tag => tag.getTagName != "XMP Value Count"). flatMap { tag => nonEmptyTrimmed(tag.getDescription) map { value => tag.getTagName -> value } }.toMap directory match { case d: IptcDirectory => val dateTimeCreated = Option(d.getDateCreated).map(d => dateToUTCString(new DateTime(d))).map("Date Time Created Composite" -> _) val digitalDateTimeCreated = Option(d.getDigitalDateCreated).map(d => dateToUTCString(new DateTime(d))).map("Digital Date Time Created Composite" -> _) metaTagsMap ++ dateTimeCreated ++ digitalDateTimeCreated case d: ExifSubIFDDirectory => val dateTimeCreated = Option(d.getDateOriginal).map(d => dateToUTCString(new DateTime(d))).map("Date/Time Original Composite" -> _) metaTagsMap ++ dateTimeCreated case _ => metaTagsMap } } getOrElse Map() private val datePattern = "(.*[Dd]ate.*)".r private def xmpDirectoryToMap(directory: XmpDirectory, imageId: String): Map[String, String] = { directory.getXmpProperties.asScala.view.mapValues(nonEmptyTrimmed).collect { case (datePattern(key), Some(value)) => key -> ImageMetadataConverter.cleanDate(value, key, imageId) case (key, Some(value)) => key -> value }.toMap } private val redactionThreshold = 5000 val redactionReplacementValue = s"REDACTED (value longer than $redactionThreshold characters, please refer to the metadata stored in the file itself)" private def redactLongFieldValues(imageId: String, metadataType: String, exceptions: List[String] = Nil)(props: Map[String, String]) = props.map { case (fieldName, value) if value.length > redactionThreshold && !exceptions.exists(fieldName.contains) => logger.warn(s"Redacting '$fieldName' $metadataType field for image $imageId, as it's problematically long (longer than $redactionThreshold characters") fieldName -> redactionReplacementValue case keyValuePair => keyValuePair } // We redact most xmp fields because they are usually short in length, or are not required for usual grid operation. // These fields are the exceptions - they may be long, and they are displayed to users, so are allowed as an exception. private val allowedLongXmpFields = List( "dc:description", "photoshop:Headline", "photoshop:Instructions", ) private def exportRawXmpProperties(metadata: Metadata, imageId:String): Map[String, String] = { val directories = metadata.getDirectoriesOfType(classOf[XmpDirectory]).asScala.toList val props: Map[String, String] = directories.foldLeft[Map[String, String]](Map.empty)((acc, dir) => { // An image can have multiple xmp directories. A directory has multiple xmp properties. // A property can be repeated across directories and its value may not be unique. // Keep the first value encountered on the basis that there will only be multiple directories // if there is no space in the previous one as directories have a maximum size. acc ++ xmpDirectoryToMap(dir, imageId).view.filterKeys(k => !acc.contains(k)).toMap }) redactLongFieldValues(imageId, "XMP", allowedLongXmpFields)(props) } private def exportXmpPropertiesInTransformedSchema(metadata: Metadata, imageId:String): Map[String, JsValue] = { val props = exportRawXmpProperties(metadata, imageId) FileMetadataAggregator.aggregateMetadataMap(props) } // Getty made up their own XMP namespace. // We're awaiting actual documentation of the properties available, so // this only extracts a small subset of properties as a means to identify Getty images. private def exportGettyDirectory(metadata: Metadata, imageId:String): Map[String, String] = { val xmpProperties = exportRawXmpProperties(metadata, imageId) def readProperty(name: String): Option[String] = xmpProperties.get(name) def readAssetId: Option[String] = readProperty("GettyImagesGIFT:AssetId").orElse(readProperty("GettyImagesGIFT:AssetID")) // Not to live in a glass house and throw stones, but this looks awfully like a case class // Don't change the field names mind. Map( "Asset ID" -> readAssetId, "Call For Image" -> readProperty("GettyImagesGIFT:CallForImage"), "Camera Filename" -> readProperty("GettyImagesGIFT:CameraFilename"), "Camera Make Model" -> readProperty("GettyImagesGIFT:CameraMakeModel"), "Composition" -> readProperty("GettyImagesGIFT:Composition"), "Exclusive Coverage" -> readProperty("GettyImagesGIFT:ExclusiveCoverage"), "Image Rank" -> readProperty("GettyImagesGIFT:ImageRank"), "Original Create Date Time" -> readProperty("GettyImagesGIFT:OriginalCreateDateTime"), "Original Filename" -> readProperty("GettyImagesGIFT:OriginalFilename"), "Personality" -> readProperty("GettyImagesGIFT:Personality"), "Time Shot" -> readProperty("GettyImagesGIFT:TimeShot") ).flattenOptions } private def dateToUTCString(date: DateTime): String = ISODateTimeFormat.dateTime.print(date.withZone(DateTimeZone.UTC)) def dimensions(image: File, mimeType: Option[MimeType]): Future[Option[Dimensions]] = for { metadata <- readMetadata(image) } yield { mimeType match { case Some(Jpeg) => for { jpegDir <- Option(metadata.getFirstDirectoryOfType(classOf[JpegDirectory])) } yield Dimensions(jpegDir.getImageWidth, jpegDir.getImageHeight) case Some(Png) => for { pngDir <- Option(metadata.getFirstDirectoryOfType(classOf[PngDirectory])) } yield { val width = pngDir.getInt(PngDirectory.TAG_IMAGE_WIDTH) val height = pngDir.getInt(PngDirectory.TAG_IMAGE_HEIGHT) Dimensions(width, height) } case Some(Tiff) => for { exifDir <- Option(metadata.getFirstDirectoryOfType(classOf[ExifIFD0Directory])) } yield { val width = exifDir.getInt(ExifDirectoryBase.TAG_IMAGE_WIDTH) val height = exifDir.getInt(ExifDirectoryBase.TAG_IMAGE_HEIGHT) Dimensions(width, height) } case _ => None } } def getColorModelInformation(image: File, metadata: Metadata, mimeType: MimeType)(implicit logMarker: LogMarker): Future[Map[String, String]] = { val source = addImage(image) val formatter = format(source)("%r") runIdentifyCmd(formatter, useImageMagick = false).map{ imageType => getColourInformation(metadata, imageType.headOption, mimeType) } .recover { case _ => getColourInformation(metadata, None, mimeType) } } // bits per sample might be a useful value, eg. "1", "8"; or it might be annoying like "1 bits/component/pixel", "8 8 8 bits/component/pixel" // either way we want everything up to the first space private def extractBitsPerSample(data: String): Option[String] = data.split(" ").headOption private def getFromDirectory(maybeDir: Option[Directory])(value: Int): Option[String] = maybeDir.flatMap(dir => Option(dir.getDescription(value))) private def getColourInformation(metadata: Metadata, maybeImageType: Option[String], mimeType: MimeType): Map[String, String] = { val hasAlpha = maybeImageType.map(imageType => if (imageType.contains("Matte")) "true" else "false") val exifDirectory = Option(metadata.getFirstDirectoryOfType(classOf[ExifIFD0Directory])) val getFromExifDirectory = getFromDirectory(exifDirectory) _ val photometricInterpretation = getFromExifDirectory(ExifDirectoryBase.TAG_PHOTOMETRIC_INTERPRETATION) mimeType match { case Png => val pngDirectory = Option(metadata.getFirstDirectoryOfType(classOf[PngDirectory])) val getFromPngDirectory = getFromDirectory(pngDirectory) _ Map( "hasAlpha" -> hasAlpha, "colorType" -> getFromPngDirectory(PngDirectory.TAG_COLOR_TYPE), "bitsPerSample" -> getFromPngDirectory(PngDirectory.TAG_BITS_PER_SAMPLE).flatMap(extractBitsPerSample), "paletteHasTransparency" -> getFromPngDirectory(PngDirectory.TAG_PALETTE_HAS_TRANSPARENCY), "paletteSize" -> getFromPngDirectory(PngDirectory.TAG_PALETTE_SIZE), "iccProfileName" -> getFromPngDirectory(PngDirectory.TAG_ICC_PROFILE_NAME) ).flattenOptions case Jpeg => Map( "hasAlpha" -> Some("false"), "colorType" -> maybeImageType, "photometricInterpretation" -> photometricInterpretation, "bitsPerSample" -> Some("8") ).flattenOptions case Tiff => Map( "hasAlpha" -> hasAlpha, "colorType" -> maybeImageType, "photometricInterpretation" -> photometricInterpretation, "bitsPerSample" -> getFromExifDirectory(ExifDirectoryBase.TAG_BITS_PER_SAMPLE).flatMap(extractBitsPerSample) ).flattenOptions } } private def nonEmptyTrimmed(nullableStr: String): Option[String] = Option(nullableStr) map (_.trim) filter (_.nonEmpty) private def readMetadata(file: File): Future[Metadata] = Future { ImageMetadataReader.readMetadata(file) } // Helper to flatten maps of options implicit class MapFlattener[K, V](val map: Map[K, Option[V]]) { def flattenOptions: Map[K, V] = map.collect { case (key, Some(value)) => key -> value } } }