archive/app/controllers/ArchiveController.scala (133 lines of code) (raw):
package controllers
import commercial.campaigns.ShortCampaignCodes
import common._
import model.Cached.{CacheableResult, WithoutRevalidationResult}
import play.api.mvc._
import services.{GoogleBotMetric, RedirectService}
import java.net.URLDecoder
import javax.ws.rs.core.UriBuilder
import model.{CacheTime, Cached}
import org.apache.http.HttpStatus
import play.api.libs.ws.WSClient
import services.RedirectService.{ArchiveRedirect, Destination, PermanentRedirect}
import scala.concurrent.Future
class ArchiveController(redirects: RedirectService, val controllerComponents: ControllerComponents, ws: WSClient)
extends BaseController
with GuLogging
with ImplicitControllerExecutionContext {
// Legacy R1 (Vignette) artifact URL: captures the leading path, the (optionally negative)
// numeric id in the third comma-separated field, and whatever follows the fourth field.
// NOTE(review): `[0|1]` is a character class, so it also admits a literal `|`;
// presumably `[01]` was intended - confirm before changing (see also R1Redirect below).
private val R1ArtifactUrl = """^/(.*)/[0|1]?,[\d]*,(-?\d+),[\d]*(.*)""".r
// Short URL: captures the leading "/p/<code>" and ignores anything after it.
private val ShortUrl = """^(/p/[\w\d]+).*$""".r
// R1-style URL: captures everything after the first path segment.
private val R1Redirect = """^/[\w\d-]+(.*/[0|1]?,[\d]*,-?\d+,[\d]*.*)""".r
// Path containing a "+": captures the first path segment.
private val CombinerSection = """^(/[\w\d-]+)[\w\d-/]*\+[\w\d-/]+$""".r
// Same as CombinerSection, but the path must end in "/rss".
private val CombinerSectionRss = """^(/[\w\d-]+)[\w\d-/]*\+[\w\d-/]+/rss$""".r
// Path prefixed with "/Guardian": captures the remainder, starting at its "/".
private val Guardian = """^/Guardian(/.*)$""".r
// "<first segment>/<anything>/(week|lead)": captures all three parts.
private val DatedSpecialIndexPage = """^(/[\w\d-]+)/(.*)/(week|lead)$""".r
// "<first segment>/(week|lead)": captures both parts.
private val SectionSpecialIndex = """^(/[\w\d-]+)/(week|lead)$""".r
// "/theguardian" or "/theobserver", then a 4-digit/3-word-char/2-digit date, then the rest;
// captures the paper, the date and the rest.
private val NewspaperPage = "^(/theguardian|/theobserver)/(\\d{4}/\\w{3}/\\d{2})/(.+)".r
// Status code used for the redirects built by redirectTo and processLookupDestination.
private val redirectHttpStatus = HttpStatus.SC_MOVED_PERMANENTLY
/** Builds the locally rendered "not found" response.
  *
  * The `notFound` view is rendered inside the returned future and wrapped with the
  * `CacheTime.NotFound` caching policy, without revalidation.
  */
def getLocal404Page(implicit request: RequestHeader): Future[Result] =
  Future {
    val notFoundPage = WithoutRevalidationResult(NotFound(views.html.notFound()))
    Cached(CacheTime.NotFound)(notFoundPage)
  }
/** Action that resolves an archived path.
  *
  * Resolution order:
  *  1. a destination found by `lookupPath`, cached with `CacheTime.ArchiveRedirect`;
  *  2. otherwise a pattern-based redirect from `redirectForPath`;
  *  3. otherwise the local 404 page.
  *
  * @param path the requested path
  */
def lookup(path: String): Action[AnyContent] =
  Action.async { implicit request =>
    lookupPath(path).flatMap { found =>
      val resolved = found
        .map(Cached(CacheTime.ArchiveRedirect))
        .orElse(redirectForPath(path))
      resolved match {
        case Some(result) => Future.successful(result)
        case None => getLocal404Page
      }
    }
  }
/** Converts a requested path into the form used as the redirect lookup key.
  *
  * Note this code is duplicated in R2PressedController - changes here should be reflected there.
  *
  * Our redirects are 'normalised' Vignette URLs, i.e. `path/to/0,<n>,123,<n>.html`
  * becomes `path/to/0,,123,.html`. Short URLs are cut down to their `/p/<code>` prefix,
  * and any other path is returned unchanged.
  *
  * @param path  the requested path
  * @param zeros text placed in the last comma-separated field of a normalised Vignette URL
  * @return the normalised path
  */
def normalise(path: String, zeros: String = ""): String =
  path match {
    case R1ArtifactUrl(leadingPath, id, _) => s"/$leadingPath/0,,$id,$zeros.html"
    case ShortUrl(shortPath) => shortPath
    case unchanged => unchanged
  }
/** Tells whether a redirect destination points back at the requested R1 path.
  *
  * @param path        the requested path
  * @param destination the redirect target
  * @return true only when `path` is an R1-style URL and `destination` ends with the part of
  *         `path` that follows its first segment; false for every other path
  */
def linksToItself(path: String, destination: String): Boolean =
  PartialFunction.cond(path) {
    case R1Redirect(afterFirstSegment) => destination.endsWith(afterFirstSegment)
  }
/** Carries the campaign of a short URL over to its redirect destination.
  *
  * If `path` is a short URL with a trailing campaign code and the destination does not
  * already carry a campaign, the full campaign looked up for that code (when there is one)
  * is set as the destination's `CMP` query parameter. In every other case the destination
  * is returned untouched.
  *
  * @param path             the requested URL
  * @param redirectLocation the destination the request is about to be redirected to
  * @return the destination, possibly with a `CMP` query parameter added
  */
def retainShortUrlCampaign(path: String, redirectLocation: String): String = {
  val shortUrlWithCampaign = """.*www\.theguardian\.com/p/[\w\d]+/([\w\d]+)$""".r
  // The `?` must be escaped: unescaped, `.*?` is merely a reluctant quantifier, so any
  // "CMP=" after the host (e.g. inside the path) was treated as an existing campaign
  // parameter. Escaped, it requires the literal query-string delimiter before "CMP=".
  val urlWithCampaignParam = """.*www\.theguardian\.com.*\?.*CMP=.*$""".r

  val destinationHasCampaign = redirectLocation match {
    case shortUrlWithCampaign(_) => true
    case urlWithCampaignParam() => true
    case _ => false
  }

  path match {
    case shortUrlWithCampaign(campaign) if !destinationHasCampaign =>
      val uri = UriBuilder.fromPath(redirectLocation)
      // Only add the parameter when the short code maps to a known full campaign.
      ShortCampaignCodes.getFullCampaign(campaign).foreach(uri.replaceQueryParam("CMP", _))
      uri.build().toString
    case _ => redirectLocation
  }
}
/** Asks the redirect service for the destination stored under the normalised form of `path`. */
private def destinationFor(path: String): Future[Option[Destination]] = {
  val lookupKey = normalise(path)
  redirects.getDestination(lookupKey)
}
/** Extractor for combiner paths whose `+` separators arrived as (encoded) spaces.
  *
  * Yields the URL-decoded path with every space written as `+`, but only when that result
  * differs from both the decoded path and the original path. Paths that cannot be
  * URL-decoded never match.
  */
private object Combiner {
  def unapply(path: String): Option[String] =
    decode(path).flatMap { decodedPath =>
      val combinerPath = decodedPath.replace(" ", "+") // the + is for combiner pages
      if (combinerPath != decodedPath && combinerPath != path) Some(combinerPath) else None
    }

  // URLDecoder throws IllegalArgumentException on malformed percent-escapes (for example a
  // trailing "%"). Such a path is simply not a combiner, so report "no match" instead of
  // letting the exception fail the whole request.
  private def decode(path: String): Option[String] =
    try Some(URLDecoder.decode(path, "UTF-8"))
    catch { case _: IllegalArgumentException => None }
}
/** Extractor for paths containing "/gallery/": yields the path with each "/gallery/"
  * replaced by "/pictures/"; any other path does not match.
  */
private object Gallery {
  def unapply(path: String): Option[String] =
    Some(path)
      .filter(_.contains("/gallery/"))
      .map(_.replace("/gallery/", "/pictures/"))
}
/** Extractor for the old "century" pages.
  *
  * "/century" and a bare "/yyyy-yyyy" path (each with an optional trailing slash) map to
  * the `ngCenturyFront` page; a "/yyyy-yyyy/Story/<r1 id><rest>" path is rewritten under
  * "/century". Anything else does not match.
  */
private object Century {
  private val CenturyUrlEx = """/century(\/)?$""".r
  private val CenturyDecadeUrlEx = """(\/\d{4}-\d{4})(\/)?$""".r
  private val CenturyStoryUrlEx = """\/(\d{4}-\d{4})\/Story\/([0|1]?,[\d]*,-?\d+,[\d]*)(.*)""".r
  private val ngCenturyFront = "/world/2014/jul/31/-sp-how-the-guardian-covered-the-20th-century"

  def unapply(path: String): Option[String] =
    path match {
      // Both the century landing page and the per-decade pages go to the same front.
      case CenturyUrlEx(_) | CenturyDecadeUrlEx(_, _) => Some(ngCenturyFront)
      case CenturyStoryUrlEx(years, story, suffix) => Some(s"/century/$years/Story/$story$suffix")
      case _ => None
    }
}
/** Extractor for paths whose first segment contains upper-case characters.
  *
  * Yields the path with its first segment lower-cased. The path is rebuilt from
  * `split("/")`, so trailing slashes are dropped. Paths whose first segment cannot be
  * URL-decoded, or whose first segment would not change, never match.
  */
private object Lowercase {
  def unapply(path: String): Option[String] =
    path.split("/").toList match {
      case "" :: section :: other if hasUpperCase(section) =>
        val lowered = section.toLowerCase
        // The upper-case test runs on the decoded section, but the raw section is what
        // gets lower-cased. A section whose only upper-case characters are percent-encoded
        // (e.g. "%41bc") would otherwise produce a redirect to the very same path.
        if (lowered != section) Some(("" :: lowered :: other).mkString("/")) else None
      case _ => None
    }

  // URLDecoder throws IllegalArgumentException on malformed percent-escapes; treat such a
  // section as "no upper case" so the request falls through instead of failing.
  private def hasUpperCase(section: String): Boolean =
    try URLDecoder.decode(section, "UTF-8").exists(_.isUpper)
    catch { case _: IllegalArgumentException => false }
}
/** Builds a cached redirect to `path`, optionally extended with further path segments.
  *
  * The target is `LinkTo(path)` followed by each suffix prefixed with "/". The redirect is
  * logged, uses `redirectHttpStatus`, and is cached with `CacheTime.ArchiveRedirect`
  * without revalidation.
  *
  * @param path         base path of the redirect target
  * @param pathSuffixes extra segments appended to the target, in order
  */
private def redirectTo(path: String, pathSuffixes: String*)(implicit request: RequestHeader): Result = {
  val suffix = pathSuffixes.map(segment => s"/$segment").mkString
  val target = LinkTo(path) + suffix
  logInfoWithRequestId(s"""Archive $redirectHttpStatus, redirect to $target""")
  val response = Redirect(target, redirectHttpStatus)
  Cached(CacheTime.ArchiveRedirect)(WithoutRevalidationResult(response))
}
/** Looks up the stored destination for `path` and converts it into a cacheable result.
  *
  * Yields `None` when no destination is stored, or when `processLookupDestination` is not
  * defined for the stored destination.
  */
private def lookupPath(path: String)(implicit request: RequestHeader): Future[Option[CacheableResult]] =
  destinationFor(path).map { maybeDestination =>
    maybeDestination.collect(processLookupDestination(path))
  }
/** Pattern-based fallback redirects, used when no destination is stored for `path`.
  *
  * The cases are tried top to bottom and the first match wins; `None` means no rule applies.
  */
private def redirectForPath(path: String)(implicit request: RequestHeader): Option[Result] =
  PartialFunction.condOpt(path) {
    case Gallery(gallery) => redirectTo(gallery)
    case Century(century) => redirectTo(century)
    case Guardian(endOfUrl) => redirectTo(endOfUrl)
    case Lowercase(lower) => redirectTo(lower)
    // Googlebot hits a bunch of really old combiners and combiner RSS
    // bounce these to the section
    case CombinerSectionRss(section) => redirectTo(s"$section/rss")
    case CombinerSection(section) => redirectTo(section)
    case Combiner(combiner) => redirectTo(combiner)
    case DatedSpecialIndexPage(section, rest, _) => redirectTo(section, rest, "all")
    case SectionSpecialIndex(section, _) => redirectTo(section, "all")
    case NewspaperPage(paper, date, book) => redirectTo(paper, book, date, "all")
  }
/** Turns a stored destination into the response for `path`.
  *
  * The returned partial function is defined for:
  *  - a `PermanentRedirect` whose location does not link back to `path`: a redirect with
  *    `redirectHttpStatus` to the location, keeping any short-URL campaign;
  *  - an `ArchiveRedirect`: an empty OK response carrying an `X-Accel-Redirect` header
  *    that points at the archived copy under `/s3-archive/`.
  *
  * It is not defined for a `PermanentRedirect` that links to itself.
  *
  * @param path the requested path
  */
def processLookupDestination(
    path: String
)(implicit request: RequestHeader): PartialFunction[Destination, CacheableResult] = {
  def permanentRedirect(location: String): CacheableResult = {
    val target = LinkTo(retainShortUrlCampaign(path, location))
    WithoutRevalidationResult(Redirect(target, redirectHttpStatus))
  }

  // http://wiki.nginx.org/X-accel
  def archivedCopy(archivePath: String): CacheableResult =
    WithoutRevalidationResult(Ok.withHeaders("X-Accel-Redirect" -> s"/s3-archive/$archivePath"))

  {
    case PermanentRedirect(_, location) if !linksToItself(path, location) => permanentRedirect(location)
    case ArchiveRedirect(_, archivePath) => archivedCopy(archivePath)
  }
}
}