in src/main/scala/ophan/google/indexing/observatory/DataStore.scala [25:53]
/**
 * Fetches the availability records already held in the table for the given URIs.
 *
 * Every result of the `getAll` lookup is passed through `toOption`; results that yield no value
 * are dropped without being reported (presumably items that could not be read or decoded -
 * confirm against the behaviour of the table's `getAll`).
 *
 * @param uris the URIs to look up, matched against `Field.Uri`
 * @return the records that were found and produced a value
 */
def fetchExistingRecordsFor(uris: Set[URI]): Future[Set[AvailabilityRecord]] = {
  val lookup = table.getAll(Field.Uri in uris)
  scanamoAsync.exec(lookup).map { results =>
    results.flatMap(result => result.toOption)
  }
}
/**
 * When we initially store an availability record in the DynamoDB table, we don't store anything about its
 * availability - just its URL, whether it redirects, and the time we first saw this URL.
 */
def storeNewRecordsFor(sitemapDownload: SitemapDownload, resolutionsForUrisNotSeenBefore: Set[Resolution]): Future[Unit] = {
  // Log-context entries built by `contextSampleOf` from the `originalUri` of each resolution's
  // redirect path, keyed under "sitemap.uris.fresh.<fieldSuffix>".
  // The type parameter is needed because immutable `Set` is invariant.
  def freshUrisContext[R <: Resolution](fieldSuffix: String, resolutions: Set[R]): Map[String, _] =
    contextSampleOf(s"sitemap.uris.fresh.$fieldSuffix", resolutions.map(_.redirectPath.originalUri))

  // Only `Resolved` resolutions are written to the table; `Unresolved` ones are logged only.
  val freshResolved = resolutionsForUrisNotSeenBefore.collect { case resolved: Resolved => resolved }

  val siteContext = Map(
    "site" -> sitemapDownload.site.url,
    "sitemap.uris.all" -> sitemapDownload.allUris.size
  )
  val freshUnresolved = resolutionsForUrisNotSeenBefore.collect { case unresolved: Unresolved => unresolved }
  val unresolvedContext = freshUrisContext("unresolved", freshUnresolved)
  val resolvedContext = freshUrisContext("resolved", freshResolved)
  val notOkContext = freshUrisContext("resolved.notOK", freshResolved.filterNot(_.conclusion.isOk))
  val redirectingContext =
    freshUrisContext("resolved.redirecting", freshResolved.filter(_.redirectPath.doesRedirect))

  logger.info(
    siteContext ++ unresolvedContext ++ resolvedContext ++ notOkContext ++ redirectingContext,
    s"Storing ${freshResolved.size} new resolved uris for ${sitemapDownload.site.url}"
  )

  // Skip the table write entirely when there is nothing new to store.
  if (freshResolved.nonEmpty) {
    scanamoAsync.exec(
      table.putAll(freshResolved.map(AvailabilityRecord(_, sitemapDownload.timestamp)))
    )
  } else Future.unit
}