backend/app/commands/DeleteResource.scala (63 lines of code) (raw):
package commands
import model.{Language, Uri}
import play.api.mvc.Result
import play.api.mvc.Results.NoContent
import services.ObjectStorage
import services.index.Index
import services.manifest.Manifest
import services.observability.PostgresClient
import services.previewing.PreviewService
import utils.{Logging, Timing}
import utils.attempt.{Attempt, DeleteFailure, IllegalStateFailure}
import scala.concurrent.ExecutionContext
/**
  * Deletes a blob and the data derived from it.
  *
  * The stores touched, in order, are: observability events (Postgres), page and
  * document previews (preview object storage), the ingested object (object
  * storage), the manifest and finally the search index. The index is
  * deliberately last; see the comment in `deleteResource`.
  *
  * @param manifest       manifest service, queried for OCR languages / resources and used to delete the blob
  * @param index          search index the blob is removed from as the final step
  * @param previewStorage object storage holding full-document and per-page previews
  * @param objectStorage  object storage holding the ingested blob itself
  * @param postgresClient client used to remove the blob's ingestion events and metadata
  */
class DeleteResource(
  manifest: Manifest,
  index: Index,
  previewStorage: ObjectStorage,
  objectStorage: ObjectStorage,
  postgresClient: PostgresClient
)(implicit ec: ExecutionContext) extends Timing {

  // Number of keys sent per batched delete call to preview storage.
  // 1000 objects is the limit for a batch, so this stays comfortably under it:
  // https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html
  private val PreviewDeleteBatchSize = 500

  /**
    * Deletes the full-document preview and all supplied page previews from preview storage.
    *
    * @param blobUri         the blob whose full-document preview should be removed
    * @param pagePreviewKeys storage keys of the individual page previews
    * @return one result per batch of keys that was deleted
    */
  private def deleteFromS3Preview(blobUri: Uri, pagePreviewKeys: Set[String]): Attempt[Iterator[Unit]] = {
    // The full-document preview, as well as all the previews of individual pages
    val keys = pagePreviewKeys + blobUri.toStoragePath

    logger.info(s"Deleting ${keys.size} objects from preview storage")

    // Group, just in case we have thousands of pages.
    Attempt.traverse(keys.grouped(PreviewDeleteBatchSize)) { batchOfS3Keys =>
      Attempt.fromEither(previewStorage.deleteMultiple(batchOfS3Keys))
    }
  }

  /**
    * Lists every page-preview key stored for the blob, across the legacy prefixes
    * and the per-language prefixes.
    *
    * @param uri          the blob whose page previews are being looked up
    * @param ocrLanguages languages the blob was OCR'd in; may be empty
    * @return the de-duplicated set of keys found under all prefixes
    */
  private def getPagePreviewS3Keys(uri: Uri, ocrLanguages: List[Language]): Attempt[Set[String]] = {
    if (ocrLanguages.isEmpty) {
      // This typically means the blob was not processed by the OcrMyPdfExtractor,
      // either because it's not a PDF or because it was processed before
      // we introduced the OcrMyPdfExtractor.
      logger.info(s"No OCR languages found for blob ${uri.value}")
    } else {
      logger.info(s"Deleting page previews for ${uri.value} in languages: ${ocrLanguages.map(_.key).mkString(", ")}")
    }

    // Try and delete from these legacy paths as well.
    // If there's nothing under them, we just won't find any objects and there will be nothing extra to delete.
    // We can delete this code once we've deleted or reprocessed everything under these folders.
    val legacyPagePreviewPrefixes = List("ocr.english", "text").map(folder => s"pages/${folder}/${uri.toStoragePath}")
    val pagePreviewPrefixes = ocrLanguages.map(PreviewService.getPageStoragePrefix(uri, _))
    val prefixesToDelete = legacyPagePreviewPrefixes ::: pagePreviewPrefixes

    logger.info(s"Deleting prefixes: ${prefixesToDelete.mkString(", ")}")

    Attempt.traverse(prefixesToDelete) { prefix =>
      Attempt.fromEither(previewStorage.list(prefix))
    }.map(_.flatten.toSet)
  }

  /**
    * Removes the blob from every store, timing each step. The steps run
    * sequentially and the first failing step short-circuits the rest.
    *
    * @param uri the blob to delete
    */
  private def deleteResource(uri: Uri): Attempt[Unit] = timeAsync("Total to delete resource", {
    for {
      // clean up observability data
      _ <- Attempt.fromEither(timeSync("Deleting blob observability events", postgresClient.deleteBlobIngestionEventsAndMetadata(uri.value)))
      // For blobs not processed by the OcrMyPdfExtractor, ocrLanguages will be an empty list
      ocrLanguages <- timeAsync("Getting langs from neo4j", manifest.getLanguagesProcessedByOcrMyPdf(uri))
      // Not everything has a preview but S3 returns success for deleting an object that doesn't exist so we're fine
      pagePreviewS3Keys <- timeAsync("Get page preview S3 keys", getPagePreviewS3Keys(uri, ocrLanguages))
      _ <- timeAsync("Preview storage S3 delete", deleteFromS3Preview(uri, pagePreviewS3Keys))
      _ <- Attempt.fromEither(timeSync("Ingest storage S3 delete", objectStorage.delete(uri.toStoragePath)))
      _ <- timeAsync("Delete blob from neo4j", manifest.deleteBlob(uri))
      // We use the index to determine what blobs are in a collection.
      // So we should delete from the index last, so that if any of the above
      // operations fails, we are still able to clear things up
      // by restarting the delete collection operation. (Otherwise,
      // it would think the blob no longer exists even though there may
      // be traces in neo4j or S3).
      _ <- timeAsync("Delete blob from elasticsearch", index.delete(uri.value))
    } yield ()
  })

  /**
    * Deletes the resource only if it has no child nodes.
    *
    * @param id the URI of the resource, as a string
    * @return a failed attempt with `IllegalStateFailure` if the resource has children,
    *         or with `DeleteFailure` if the resource could not be fetched
    */
  def deleteBlobCheckChildren(id: String): Attempt[Unit] = {
    val uri = Uri(id)

    // The lookup result is reduced to an Option, so the underlying fetch failure is
    // discarded and reported to the caller as a generic DeleteFailure.
    manifest.getResource(uri).toOption match {
      case Some(resource) if resource.children.isEmpty =>
        deleteResource(uri)
      case Some(_) =>
        Attempt.Left[Unit](IllegalStateFailure(s"Cannot delete $uri as it has child nodes"))
      case None =>
        Attempt.Left[Unit](DeleteFailure("Failed to fetch resource"))
    }
  }

  /**
    * Deletes the blob unconditionally, without checking for child nodes.
    *
    * @param id the URI of the blob, as a string
    */
  def deleteBlob(id: String): Attempt[Unit] = deleteResource(Uri(id))
}