common/app/views/support/HtmlCleaner.scala (766 lines of code) (raw):
package views.support
import java.net.URI
import java.util.regex.{Matcher, Pattern}
import common.{Edition, GuLogging, LinkTo}
import conf.Configuration.affiliateLinks._
import conf.Configuration.site.host
import conf.switches.Switches._
import layout.ContentWidths
import layout.ContentWidths._
import model._
import model.content._
import model.dotcomrendering.pageElements.TextBlockElement
import navigation.ReaderRevenueSite
import org.joda.time.DateTime
import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element, TextNode}
import play.api.mvc.RequestHeader
import services.SkimLinksCache
import scala.jdk.CollectionConverters._
import scala.collection.mutable
import scala.util.Try
/** Common interface implemented by the HTML post-processing steps in this file.
  *
  * An implementation receives a parsed Jsoup `Document` and returns a `Document`.
  */
trait HtmlCleaner {
/** Applies this cleaner to `d` and returns the resulting document. */
def clean(d: Document): Document
}
/** Turns `<!-- Block N -->` marker nodes into `id="block-N"` attributes on the node that follows them,
  * then strips the markers from the document.
  */
object BlockNumberCleaner extends HtmlCleaner {
  private val Block = """<!-- Block (\d*) -->""".r

  /** Extracts the captured block number when `markup` is exactly a block marker. */
  private def blockNumber(markup: String): Option[String] =
    markup match {
      case Block(num) => Some(num)
      case _          => None
    }

  override def clean(document: Document): Document = {
    for (element <- document.getAllElements.asScala) {
      // Collect the markers first (tagging their next sibling on the way) and remove them afterwards,
      // so the child list is not modified while it is being traversed.
      val markers = for {
        node <- element.childNodes.asScala
        num <- blockNumber(node.toString.trim)
      } yield {
        Option(node.nextSibling).foreach(_.attr("id", s"block-$num"))
        node
      }
      markers.foreach(_.remove())
    }
    document
  }
}
/** Wraps the child elements of every `<blockquote class="quoted">` in a `div.quoted__contents`
  * and puts the inline quote icon in front of that wrapper.
  */
object BlockquoteCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val targets = document.getElementsByTag("blockquote").asScala.filter(_.hasClass("quoted"))
    val quoteSvg = views.html.fragments.inlineSvg("quote", "icon").toString()

    for (blockquote <- targets) {
      val wrapper = document.createElement("div")
      wrapper.addClass("quoted__contents")
      // Snapshot the child elements before the wrapper is inserted, so the wrapper is not among them.
      val originalChildren = blockquote.children()
      blockquote.prependChild(wrapper)
      wrapper.insertChildren(0, originalChildren)
      blockquote.prepend(quoteSvg)
    }

    document
  }
}
/** Decorates `<aside class="element-pullquote">` blocks: prepends the quote icon and adds
  * styling classes to the paragraphs and citations inside them.
  */
object PullquoteCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val openingQuoteSvg = views.html.fragments.inlineSvg("quote", "icon", List("inline-tone-fill")).toString()

    for {
      aside <- document.getElementsByTag("aside").asScala
      if aside.hasClass("element-pullquote")
    } {
      aside.prepend(openingQuoteSvg)
      aside.getElementsByTag("p").addClass("pullquote-paragraph")
      aside.getElementsByTag("cite").addClass("pullquote-cite")
    }

    document
  }
}
/** Wraps `<video class="gu-video">` elements whose parent is not a `<figure>` in a figure element. */
case object R2VideoCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    for {
      video <- document.getElementsByTag("video").asScala
      if video.hasClass("gu-video") && video.parent().tagName() != "figure"
    } video.wrap("<figure class=\"test element element-video\"></figure>")

    document
  }
}
/** For recipe articles, removes body images, pullquote asides and rich links from the document.
  *
  * @param isRecipeArticle when false the document is returned untouched
  */
case class RecipeBodyImage(isRecipeArticle: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isRecipeArticle) {
      val images = document.getElementsByClass("element-image").asScala
      images.foreach(_.remove())

      val pullquotes = document.getElementsByTag("aside").asScala.filter(_.hasClass("element-pullquote"))
      pullquotes.foreach(_.remove())

      val richLinks = document.getElementsByClass("element-rich-link").asScala
      richLinks.foreach(_.remove())
    }
    document
  }
}
/** Replaces in-body `<figure>` images with the markup rendered by the `imageFigure` fragment.
  *
  * Figures with the classes `element-comment`, `element-witness` or `element-atom` are skipped, as are
  * figures without an `<img>` or without a matching image element on the article.
  */
case class PictureCleaner(article: Article)(implicit request: RequestHeader)
    extends HtmlCleaner
    with implicits.Numbers {

  // Figures carrying any of these classes are never rewritten.
  private val skippedFigureClasses = List("element-comment", "element-witness", "element-atom")

  def clean(body: Document): Document = {
    body.getElementsByTag("figure").asScala.foreach { figure =>
      for {
        imgTag <- figure.getElementsByTag("img").asScala.headOption
        if !skippedFigureClasses.exists(figure.hasClass)
        container <- findContainerFromId(figure.attr("data-media-id"), imgTag.attr("src"))
        largest <- container.images.largestImage
      } {
        val hinting = findBreakpointWidths(figure)

        val relation =
          if (article.isLiveBlog) LiveBlogMedia
          else if (article.isTheMinute) MinuteMedia
          else if (article.isImmersive) ImmersiveMedia
          else BodyMedia
        val widths = ContentWidths.getWidthsFromContentElement(hinting, relation)

        val orientationClass = largest.orientation match {
          case Portrait => Some("img--portrait")
          case _        => Some("img--landscape")
        }
        // Narrow images are rendered inline unless the figure is hinted as a thumbnail.
        val smallImageClass = hinting match {
          case Thumbnail => None
          case _         => if (largest.width <= 220) Some("img--inline") else None
        }
        val inlineClass =
          if (article.isTheMinute && !figure.hasClass("element--thumbnail")) Some("element--inline") else None
        val figureClasses =
          List(orientationClass, smallImageClass, hinting.className, inlineClass).flatten.mkString(" ")

        // lightbox uses the images in the order mentioned in the header array;
        // the index passed to the template is 1-based.
        val lightboxPosition = article.lightbox.lightboxImages.indexOf(container)
        val lightboxInfo: Option[(Int, ImageAsset)] =
          if (lightboxPosition == -1) None
          else
            container.images.largestEditorialCrop
              .filter(_ => !article.isLiveBlog)
              .map(crop => (lightboxPosition + 1, crop))

        val shareInfo = lightboxInfo.map { case (index, crop) =>
          (
            article.sharelinks.elementShares(s"img-$index", crop.url),
            article.metadata.contentType.getOrElse(DotcomContentType.Unknown),
          )
        }

        val html = views.html.fragments
          .imageFigure(
            container.images,
            lightboxIndex = lightboxInfo.map(_._1),
            widthsByBreakpoint = widths,
            image_figureClasses = Some((largest, figureClasses)),
            shareInfo = shareInfo,
          )
          .toString()

        figure.replaceWith(Jsoup.parseBodyFragment(html).body().child(0))
      }
    }
    body
  }

  /** Finds the article body image for a figure.
    *
    * Prefers a container with media id `id` that also has a crop whose URL contains the path of `src`;
    * otherwise falls back to the first container with that id.
    */
  def findContainerFromId(id: String, src: String): Option[ImageElement] = {
    // It is possible that a single data media id can appear multiple times in the elements array.
    val srcPath = Try(new URI(src.trim).getPath).toOption
    val sameIdContainers = article.elements.bodyImages.filter(_.properties.id == id)

    def hasCropMatchingSrc(container: ImageElement): Boolean =
      container.images.imageCrops.exists { asset =>
        asset.url.exists(url => srcPath.exists(path => url.contains(path)))
      }

    sameIdContainers.find(hasCropMatchingSrc) orElse sameIdContainers.headOption
  }

  /** Maps the figure's CSS classes to a content hint; the first match in the order below wins,
    * with `Inline` as the default.
    */
  def findBreakpointWidths(figure: Element): ContentHinting = {
    val figureClasses = figure.classNames().asScala
    List[ContentHinting](Supporting, Showcase, Thumbnail, Immersive, Halfwidth)
      .find(hint => hint.className.exists(cls => figureClasses.contains(cls)))
      .getOrElse(Inline)
  }
}
/** Wraps every bullet character in `body` in a `<span class="bullet">`. */
object BulletCleaner {
  def apply(body: String): String = {
    val bullet = "•"
    val wrappedBullet = """<span class="bullet">•</span>"""
    body.replace(bullet, wrappedBullet)
  }
}
/** Mixin that rewrites `http:` to `https:` in URL strings. */
trait HttpsUrl {

  /** Replaces every occurrence of `http:` in `url` with `https:`; other text is left as is. */
  def ensureHttps(url: String): String = {
    val insecureScheme = "http:"
    val secureScheme = "https:"
    url.replace(insecureScheme, secureScheme)
  }
}
/** Strips newline characters from a video encoding URL and then rewrites `http:` to `https:`. */
object VideoEncodingUrlCleaner extends HttpsUrl {
  def apply(url: String): String = {
    val withoutNewlines = url.filterNot(_ == '\n')
    ensureHttps(withoutNewlines)
  }
}
/** Rewrites a media source URL to https for use in AMP pages. */
object AmpSrcCleaner extends HttpsUrl {
  // All media sources need to start with https for AMP.
  // Temporary code until all media urls returned from CAPI are https
  def apply(videoSrc: String): String = ensureHttps(videoSrc)
}
/** Normalises links inside article bodies.
  *
  *  - `<a href>` elements get their href passed through `LinkTo`, a `data-link-name` attribute
  *    and the `u-underline` class.
  *  - Any element whose href is a reader-revenue URL gets the `js-acquisition-link` class.
  *  - `<a name>` anchors are emptied and their own text is re-inserted after the anchor.
  *
  * @param dataLinkName value written to `data-link-name` on each rewritten `<a>`
  */
case class InBodyLinkCleaner(dataLinkName: String)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {
  def clean(body: Document): Document = {
    for (link <- body.getElementsByAttribute("href").asScala) {
      if (link.tagName == "a") {
        val rewrittenHref = LinkTo(link.attr("href"), edition)
        link.attr("href", rewrittenHref)
        link.attr("data-link-name", dataLinkName)
        link.addClass("u-underline")
      }
      // Checked after the rewrite above, so it sees the rewritten href on anchors.
      if (ReaderRevenueSite.isReaderRevenueSiteUrl(link.attr("href"))) {
        link.addClass("js-acquisition-link")
      }
    }

    // Prevent text in non clickable anchors from looking like links
    // <a name="foo">bar</a> -> <a name="foo"></a>bar
    for {
      anchor <- body.getElementsByAttribute("name").asScala
      if anchor.tagName == "a"
    } {
      val anchorText = anchor.ownText()
      anchor.empty().after(anchorText)
    }

    body
  }
}
/** Truncates the text of a document to roughly `limit` characters.
  *
  * Text nodes are visited in document order with a running character budget. The node that exhausts
  * the budget is cut (trailing "." dropped, "…" appended); later text nodes are emptied.
  *
  * @param limit total number of text characters to keep
  */
case class TruncateCleaner(limit: Int)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {
  def clean(body: Document): Document = {

    // Returns the budget left after this node; a negative value means the budget is spent.
    def truncateTextNode(charLimit: Int, textNode: TextNode): Int = {
      val remaining = charLimit - textNode.text.length
      if (remaining < 0) {
        val kept = textNode.text.take(charLimit.max(0)).trim.stripSuffix(".")
        val ellipsis = if (charLimit > 0) "…" else ""
        textNode.text(kept + ellipsis)
      }
      remaining
    }

    // Threads the budget through all child nodes, recursing into nested elements.
    def truncateElement(charLimit: Int, element: Element): Int =
      element.childNodes.asScala.foldLeft(charLimit) {
        case (budget, text: TextNode)  => truncateTextNode(budget, text)
        case (budget, nested: Element) => truncateElement(budget, nested)
        case (budget, _)               => budget
      }

    truncateElement(limit, body)
    body
  }
}
/** Rebuilds embedded tweets (`.element-tweet .twitter-tweet`) into the `js-tweet tweet` markup:
  * user name, user id, tweet body, a hidden link and the date link, optionally preceded by the
  * tweet's first image.
  *
  * @param content source of the tweet data, looked up by the last path segment of the
  *                element's `data-canonical-url`
  */
class TweetCleaner(content: Content) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    for (tweet <- document.getElementsByClass("element-tweet").asScala) {
      val tweetData: Option[Tweet] = for {
        url <- Option(tweet.attr("data-canonical-url"))
        id <- url.split('/').lastOption
        found <- content.tweets.find(_.id == id)
      } yield found
      val tweetImage = tweetData.flatMap(_.firstImage)

      for (element <- tweet.getElementsByClass("twitter-tweet").asScala) {
        // Work on a copy so the pieces can be read after the original element is emptied.
        val copy = element.clone()
        if (copy.children.size > 1) {
          val body = copy.child(0).attr("class", "tweet-body")
          val date = copy.child(1).attr("class", "tweet-date")

          // Remove the '-' and split at the '(@' username but keep delimiter
          val userParts = copy.ownText().replaceFirst("— ", "").split("""(?=\(@)""")
          val userName = userParts.headOption.getOrElse("")
          val userId = userParts.lift(1).getOrElse("")

          val userNameEl = document.createElement("span").attr("class", "tweet__user-name").text(userName)
          val userIdEl = document.createElement("span").attr("class", "tweet__user-id").text(userId)
          val link = document.createElement("a").attr("href", date.attr("href")).attr("style", "display: none;")

          element.empty().removeClass("twitter-tweet").addClass("js-tweet tweet")

          tweetImage.foreach { image =>
            val img = document.createElement("img")
            img.attr("src", image)
            img.attr("alt", "")
            img.attr("rel", "nofollow")
            img.addClass("js-tweet-main-image tweet-main-image")
            element.appendChild(img)
          }

          List(userNameEl, userIdEl, body, link, date).foreach(child => element.appendChild(child))
        }
      }
    }
    document
  }
}
/** Auto-links the first suitable mention of each of the article's keyword tags.
  *
  * Only runs when `article.content.showInRelated` is true. Paragraphs inside pullquote asides or
  * `instagram-media` blocks, and paragraphs that already contain a link, are never touched.
  * For each keyword at most one paragraph is rewritten.
  */
case class TagLinker(article: Article)(implicit val edition: Edition, implicit val request: RequestHeader)
    extends HtmlCleaner {
  // Back-references used to rebuild the paragraph around the inserted link.
  private val group1 = "$1"
  private val group2 = "$2"
  private val group4 = "$4"
  private val group5 = "$5"
  private val dot = Pattern.quote(".")
  private val question = Pattern.quote("?")

  /** Regex matching a whole paragraph that contains the tag name as a standalone word, i.e.
    * preceded by a space or the start and followed by a space, comma, end, full stop or question mark.
    */
  private def keywordRegex(tag: Tag) = {
    // The name is used as a *pattern*, so it must be quoted with Pattern.quote only.
    // (Matcher.quoteReplacement is for replacement strings; applying it here inserted backslashes
    // in front of '$' and '\', which made names containing those characters impossible to match.)
    val tagName = Pattern.quote(tag.name)
    s"""(.*)( |^)($tagName)( |,|$$|$dot|$question)(.*)""".r
  }

  def clean(doc: Document): Document = {
    if (article.content.showInRelated) {
      // Get all paragraphs which are not contained in a pullquote or in an instagram caption
      val paragraphs = doc
        .getElementsByTag("p")
        .asScala
        .filterNot(p =>
          p.parents.asScala.exists { ancestor =>
            val inPullquote = ancestor.tagName() == "aside" && ancestor.hasClass("element-pullquote")
            val inInstagramBlock = ancestor.hasClass("instagram-media")
            inPullquote || inInstagramBlock
          },
        )

      // order by length of name so we do not make simple match errors
      // e.g 'Northern Ireland' & 'Ireland'
      article.tags.keywords.filterNot(_.isSectionTag).sortBy(_.name.length).reverse.foreach { keyword =>
        // don't link again in paragraphs that already have links
        // (re-evaluated per keyword because earlier keywords may have added links)
        val unlinkedParas = paragraphs.filterNot(_.html.contains("<a"))

        // pre-filter paragraphs so we do not do multiple regexes on every single paragraph in every single article
        val candidateParagraphs = unlinkedParas.filter(_.html.contains(keyword.name))

        if (candidateParagraphs.nonEmpty) {
          val regex = keywordRegex(keyword)
          val firstMatch =
            candidateParagraphs.map(p => (regex.pattern.matcher(p.html), p)).find(_._1.matches())

          firstMatch.foreach { case (matcher, p) =>
            val tagLink = doc.createElement("a")
            tagLink.attr("href", LinkTo(keyword.metadata.url, edition))
            tagLink.text(keyword.name)
            tagLink.attr("data-link-name", "auto-linked-tag")
            tagLink.attr("data-component", "auto-linked-tag")
            tagLink.addClass("u-underline")

            // The link markup is literal text inside a replacement string, so any '$' or '\' in it
            // must be escaped or replaceFirst would treat it as a group reference / escape.
            val tagLinkHtml = Matcher.quoteReplacement(tagLink.toString)
            val newHtml = matcher.replaceFirst(s"$group1$group2$tagLinkHtml$group4$group5")
            p.html(newHtml)
          }
        }
      }
    }
    doc
  }
}
/** Removes `<figure class="element">` embeds whose type is not in the supported set,
  * but only while `ShowAllArticleEmbedsSwitch` is switched off.
  */
object InBodyElementCleaner extends HtmlCleaner {
  private val supportedElements = Set(
    "element-tweet",
    "element-video",
    "element-image",
    "element-witness",
    "element-comment",
    "element-interactive",
  )

  private def isSupported(embed: Element): Boolean =
    supportedElements.exists(cls => embed.hasClass(cls))

  override def clean(document: Document): Document = {
    // this code REMOVES unsupported embeds
    if (ShowAllArticleEmbedsSwitch.isSwitchedOff) {
      for {
        figure <- document.getElementsByTag("figure").asScala
        if figure.hasClass("element") && !isSupported(figure)
      } figure.remove()
    }
    document
  }
}
/** Cuts a document body down to a summary by removing trailing top-level elements.
  *
  * If the body has top-level `<p>` elements, the cut point is the `amount`-th one (or the last one
  * when there are fewer): that paragraph and everything after it is removed. Otherwise everything
  * after the first `amount` top-level elements is removed.
  */
case class Summary(amount: Int) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val topLevel = document.body().children().asScala.toList
    val cutParagraph: Option[Element] = topLevel.filter(_.nodeName() == "p").take(amount).lastOption

    // if there are no p's, just take the first n things (could be a blog)
    val toRemove = cutParagraph.fold(topLevel.drop(amount)) { p =>
      topLevel.drop(topLevel.indexOf(p))
    }
    toRemove.foreach(_.remove())

    document
  }
}
/** For photo essays, tags image figures with `element-image--photo-essay` and removes
  * `block-share--article` elements.
  */
case class PhotoEssayImages(isPhotoEssay: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      for {
        figure <- document.getElementsByTag("figure").asScala
        if figure.hasClass("element-image")
      } figure.addClass("element-image--photo-essay")

      for (share <- document.getElementsByClass("block-share--article").asScala) {
        share.remove()
      }
    }
    document
  }
}
/** For photo essays, adds `element-pullquote--photo-essay` to every pullquote element. */
case class PhotoEssayQuotes(isPhotoEssay: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      for (quote <- document.getElementsByClass("element-pullquote").asScala) {
        quote.addClass("element-pullquote--photo-essay")
      }
    }
    document
  }
}
/** For photo essays, removes every element with the `caption--img` class. */
case class PhotoEssayCaptions(isPhotoEssay: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      for (caption <- document.getElementsByClass("caption--img").asScala) {
        caption.remove()
      }
    }
    document
  }
}
/** For photo essays, marks every other half-width figure (1st, 3rd, 5th, ...) with `half-width-odd`. */
case class PhotoEssayHalfWidth(isPhotoEssay: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      val halfWidthFigures = document.getElementsByTag("figure").asScala.filter(_.hasClass("element--halfWidth"))
      for {
        (figure, index) <- halfWidthFigures.zipWithIndex
        // zero-based even index == odd position in reading order
        if index % 2 == 0
      } figure.addClass("half-width-odd")
    }
    document
  }
}
/** For photo essays, adds `photo-essay-block-quote` to blockquotes that have no
  * `.pullquote-paragraph` child element.
  */
case class PhotoEssayBlockQuote(isPhotoEssay: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isPhotoEssay) {
      for {
        blockquote <- document.getElementsByTag("blockquote").asScala
        if !blockquote.children().is(".pullquote-paragraph")
      } blockquote.addClass("photo-essay-block-quote")
    }
    document
  }
}
/** For immersive articles, adds `in-body-link--immersive` to every `<a>` element. */
case class ImmersiveLinks(isImmersive: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isImmersive) {
      for (anchor <- document.getElementsByTag("a").asScala) {
        anchor.addClass("in-body-link--immersive")
      }
    }
    document
  }
}
/** For immersive articles, folds an `<h2>` into the immersive image element directly before it.
  *
  * When the previous element sibling has both `element--immersive` and `element-image`, it gets the
  * `section-image` class and a `<h2 class="section-title">` with the heading text prepended, and the
  * original `<h2>` is removed.
  */
case class ImmersiveHeaders(isImmersive: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isImmersive) {
      for {
        h2 <- document.getElementsByTag("h2").asScala
        // previousElementSibling() is null for a first child
        previous <- Option(h2.previousElementSibling())
        if previous.hasClass("element--immersive") && previous.hasClass("element-image")
      } {
        previous.addClass("section-image")
        previous.prepend("""<h2 class="section-title">""" + h2.text() + "</h2>")
        h2.remove()
      }
    }
    document
  }
}
/** Adds drop-cap markup to long opening paragraphs.
  *
  *  - For features that are not recipe articles, the first top-level element gets a drop cap if it is a `<p>`.
  *  - For immersive articles, every `<h2>` whose text is `* * *` is replaced by an
  *    `<hr class="section-rule" />`, and a directly following `<p>` gets a drop cap.
  */
case class DropCaps(isFeature: Boolean, isImmersive: Boolean, isRecipeArticle: Boolean = false) extends HtmlCleaner {

  /** Returns the paragraph's inner HTML, with its first letter (and any leading quote characters)
    * wrapped in drop-cap spans when the paragraph text is longer than 199 characters and the
    * HTML matches the pattern; otherwise the HTML is returned unchanged.
    */
  private def setDropCap(p: Element): String =
    if (p.text.length <= 199) p.html
    else
      p.html.replaceFirst(
        "^([\"'“‘]*[a-zA-Z])(.{199,})",
        """<span class="drop-cap"><span class="drop-cap__inner">$1</span></span>$2""",
      )

  override def clean(document: Document): Document = {
    if (isFeature && !isRecipeArticle) {
      document
        .body()
        .children()
        .asScala
        .toList
        .headOption
        .filter(_.nodeName() == "p")
        .foreach(first => first.html(setDropCap(first)))
    }

    for {
      h2 <- document.getElementsByTag("h2").asScala
      if isImmersive && h2.text() == "* * *"
    } {
      h2.before("""<hr class="section-rule" />""")
      Option(h2.nextElementSibling())
        .filter(_.nodeName() == "p")
        .foreach(next => next.html(setDropCap(next)))
      h2.remove()
    }

    document
  }
}
/** Adds styling hooks used by numbered-list articles: `stars` on star-rating paragraphs,
  * `article-link` on single-item lists and `falseH3` on paragraphs acting as sub-headings.
  */
case class NumberedListFurniture(isNumberedList: Boolean) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (isNumberedList) {
      // Adds yellow styling to star ratings mid article
      for (starParagraph <- document.select("p:containsOwn(★)").asScala) {
        starParagraph.addClass("stars")
      }

      // Styled link/section end
      for (onlyItem <- document.select("ul > li:only-child").asScala) {
        onlyItem.parent().addClass("article-link")
      }

      // Faux h3 headings, for second level of heading hierarchy in numbered list articles:
      // a paragraph with a <strong> child, no text of its own and no <a> child element.
      for (strong <- document.select("p > strong").asScala) {
        val paragraph = strong.parent()
        val hasNoOwnText = paragraph.is("p:matchesOwn(^$)")
        if (hasNoOwnText && !paragraph.children().is("a")) {
          paragraph.addClass("falseH3")
        }
      }
    }
    document
  }
}
// Gallery captions don't come back as structured data.
// This is a hack to serve the correct html: the first <strong> becomes an <h2> title.
object GalleryCaptionCleaner extends HtmlCleaner {
  override def clean(galleryCaption: Document): Document = {
    // There is an inconsistent number of <br> tags in gallery captions.
    // To create some consistency, we will remove them all.
    galleryCaption.getElementsByTag("br").remove()

    val leadingStrong = Option(galleryCaption.getElementsByTag("strong").first())
    val titleHtml = leadingStrong.fold("")(_.html())

    // <strong> is removed in place of having a <h2> element
    leadingStrong.foreach(_.remove())

    // The title element is always prepended, with empty content when there was no <strong>.
    val title = galleryCaption.createElement("h2")
    title.addClass("gallery__caption__title")
    title.html(titleHtml)
    galleryCaption.body.prependChild(title)

    galleryCaption
  }
}
/** While `interactivePressing` is switched on, replaces the document's first `<iframe>` with the
  * contents of the `<noscript>` element(s) found in its non-empty `srcdoc` attribute.
  */
object InteractiveSrcdocCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    if (interactivePressing.isSwitchedOn) {
      Option(document.getElementsByTag("iframe").first()).foreach { iframe =>
        val srcdoc = iframe.attr("srcdoc")
        if (srcdoc.nonEmpty) {
          // noscript is added for immersive interactives, no idea why
          // see https://github.com/guardian/flexible-content/pull/1597
          // hopefully we can remove all of this soon anyway
          val noscriptHtml = Jsoup.parse(srcdoc).getElementsByTag("noscript").html()
          iframe.after(noscriptHtml).remove()
        }
      }
    }
    document
  }
}
/** Adds the `caption caption--img` classes to every `<figcaption>`. */
object FigCaptionCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    for (figcaption <- document.getElementsByTag("figcaption").asScala) {
      figcaption.addClass("caption caption--img")
    }
    document
  }
}
/** Adds the `caption caption--img caption--main` classes to every `<figcaption>`. */
object MainFigCaptionCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    for (figcaption <- document.getElementsByTag("figcaption").asScala) {
      figcaption.addClass("caption caption--img caption--main")
    }
    document
  }
}
/** Prepares `element-rich-link` elements for client-side upgrade.
  *
  * Every rich link is marked `element-rich-link--not-upgraded`, given `data-component="rich-link"`
  * and a `data-link-name` of the form `rich-link-<total> | <position>` (position is 1-based).
  * Its content is then replaced by the `richLinkDefault` fragment built from its first `<a>`.
  */
case class RichLinkCleaner()(implicit val request: RequestHeader) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val richLinks = document.getElementsByClass("element-rich-link")
    val total = richLinks.size

    richLinks
      .addClass("element-rich-link--not-upgraded")
      .attr("data-component", "rich-link")

    richLinks.asScala.zipWithIndex.foreach { case (el, index) =>
      el.attr("data-link-name", s"rich-link-$total | ${index + 1}")
    }

    richLinks.asScala.foreach { richLink =>
      // `first()` returns null when the rich link contains no anchor; such elements are left
      // as they are instead of failing the whole clean with a NullPointerException.
      Option(richLink.getElementsByTag("a").first()).foreach { link =>
        val href = link.attr("href")
        val html = views.html.fragments.richLinkDefault(link.text(), href).toString()
        richLink.empty().prepend(html)
      }
    }

    document
  }
}
/** Marks `element-membership` elements as not upgraded and adds tracking attributes:
  * `data-component="membership-events"` and a `data-link-name` of the form
  * `membership-event-<total> | <position>` (position is 1-based).
  */
object MembershipEventCleaner extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val membershipEvents = document.getElementsByClass("element-membership")

    val tagged = membershipEvents
      .addClass("element-membership--not-upgraded")
      .attr("data-component", "membership-events")
      .asScala

    for ((el, index) <- tagged.zipWithIndex) {
      el.attr("data-link-name", s"membership-event-${membershipEvents.asScala.length} | ${index + 1}")
    }

    document
  }
}
/** Renders atoms into their `element-atom` containers while `UseAtomsSwitch` is on.
  *
  * For each `<gu-atom>` inside a container, the atom is looked up by `data-atom-id`. A container
  * whose atom cannot be found is removed; otherwise the `<gu-atom>` is replaced by the rendered
  * atom fragment. Audio atoms are only processed when `RenderInArticleAudioAtomSwitch` is on.
  *
  * @param atoms               atoms available for lookup
  * @param shouldFence         passed through to the atom fragment
  * @param mediaWrapper        passed through to the fragment; `MainMedia` also adds a container class
  * @param posterImageOverride passed through to the atom fragment
  */
case class AtomsCleaner(
    atoms: Option[Atoms],
    shouldFence: Boolean = true,
    mediaWrapper: Option[MediaWrapper] = None,
    posterImageOverride: Option[ImageMedia] = None,
)(implicit val request: RequestHeader, context: ApplicationContext)
    extends HtmlCleaner {

  private def findAtom(id: String): Option[Atom] =
    for {
      available <- atoms
      atom <- available.all.find(_.id == id)
    } yield atom

  override def clean(document: Document): Document = {
    if (UseAtomsSwitch.isSwitchedOn) {
      document.getElementsByClass("element-atom").asScala.foreach { atomContainer =>
        atomContainer.getElementsByTag("gu-atom").asScala.foreach { bodyElement =>
          val atomId = bodyElement.attr("data-atom-id")
          val atomType = bodyElement.attr("data-atom-type")

          // Audio atoms are gated behind their own switch; every other type is always rendered.
          val shouldRender = atomType != "audio" || RenderInArticleAudioAtomSwitch.isSwitchedOn

          if (shouldRender) {
            findAtom(atomId) match {
              case None =>
                atomContainer.remove()

              case Some(atomData) =>
                if (mediaWrapper.contains(MediaWrapper.MainMedia)) {
                  atomContainer.addClass("element-atom--main-media")
                }
                atomData match {
                  case _: MediaAtom => atomContainer.addClass("element-atom--media")
                  case _            => ()
                }
                atomContainer.attr("data-atom-id", atomId)
                atomContainer.attr("data-atom-type", atomType)

                val html = views.html.fragments.atoms
                  .atom(
                    atomData,
                    shouldFence,
                    mediaWrapper,
                    posterImageOverride,
                  )
                  .toString()
                bodyElement.remove()
                atomContainer.append(html)
            }
          }
        }
      }
    }
    document
  }
}
/** Parses an SVG snippet, adds `<class>__svg` for each given class to its `<svg>` element(s),
  * optionally sets an `aria-label`, and returns the resulting `<svg>` markup.
  */
object setSvgClasses {
  def apply(svg: String, classes: Seq[String] = List(), label: Option[String] = None): String = {
    val svgElements = Jsoup.parse(svg).getElementsByTag("svg")

    val suffixedClasses = classes.map(cls => cls.concat("__svg")).mkString(" ")
    svgElements.addClass(suffixedClasses)

    label.foreach(text => svgElements.attr("aria-label", text))

    svgElements.toString
  }
}
/** Inserts MPU ad slots into a front.
  *
  * Mobile slots are added after selected `fc-container` elements; desktop slots are appended to
  * every `fc-slice__item--mpu-candidate` element.
  */
case class CommercialMPUForFronts()(implicit val request: RequestHeader) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val sliceSlot = views.html.fragments.items.facia_cards.sliceSlot

    def nextSiblingHasClass(element: Element, cssClass: String): Boolean =
      Option(element.nextElementSibling()).exists(_.hasClass(cssClass))

    // A container is eligible unless one of the exclusion rules below applies.
    def isEligible(element: Element, position: Int): Boolean = {
      val isLeadingThrasher = position == 0 && element.hasClass("fc-container--thrasher")
      val isOrPrecedesCommercial =
        element.hasClass("fc-container--commercial") || nextSiblingHasClass(element, "fc-container--commercial")
      val precedesThrasher = nextSiblingHasClass(element, "fc-container--thrasher")
      val isMostViewed =
        Option(element.id()).exists(id => id.contains("most-viewed") || id.contains("popular-in"))
      !(isLeadingThrasher || isOrPrecedesCommercial || precedesThrasher || isMostViewed)
    }

    val containers: List[Element] = document.getElementsByClass("fc-container").asScala.toList

    // On mobile, we remove the first container if it is a thrasher
    // and remove a container if it, or the next sibling, is a commercial container
    // we also exclude any containers that are directly before a thrasher
    // then we take every other container, up to a maximum of 10, for targeting MPU insertion
    val containersForCommercialMPUs = containers.zipWithIndex
      .filter { case (container, position) => isEligible(container, position) }
      .map(_._1)
      .zipWithIndex
      .filter { case (_, position) => position % 2 == 0 }
      .map(_._1)
      .take(10)

    containersForCommercialMPUs.foreach { container =>
      val slot = sliceSlot(
        containersForCommercialMPUs.indexOf(container),
        isMobile = true,
      )
      container.after(s"""<section class="fc-container__mpu--mobile">$slot</section>""")
    }

    // On desktop, a MPU slot is simply inserted when there is a slice available
    val slices: List[Element] = document.getElementsByClass("fc-slice__item--mpu-candidate").asScala.toList
    slices.foreach { slice =>
      val slot = sliceSlot(slices.indexOf(slice) + 1)
      slice.append(s"$slot")
    }

    document
  }
}
/** Inserts the "commercial component high" ad slot after one of the front's containers.
  *
  * Nothing is inserted when the front has fewer than the minimum number of containers
  * (1 for paid content, 2 otherwise). With fewer than 4 containers the slot goes after the first;
  * otherwise after the fourth on network fronts and after the third elsewhere.
  */
case class CommercialComponentHigh(isPaidContent: Boolean, isNetworkFront: Boolean, hasPageSkin: Boolean)(
    implicit val edition: Edition,
    implicit val request: RequestHeader,
) extends HtmlCleaner {
  override def clean(document: Document): Document = {
    val containers: List[(Element, Int)] = document.getElementsByClass("fc-container").asScala.toList.zipWithIndex
    val minContainers = if (isPaidContent) 1 else 2

    if (containers.length >= minContainers) {
      val containerIndex =
        if (containers.length < 4) 0
        else if (isNetworkFront) 3
        else 2

      val adSlotHtml = views.html.fragments.commercial.commercialComponentHigh(isPaidContent, hasPageSkin)
      val adSlot: Option[Element] =
        Jsoup.parseBodyFragment(adSlotHtml.toString).body().children().asScala.toList.headOption

      containers.lift(containerIndex).foreach { case (container, _) =>
        adSlot.foreach { slot =>
          container.after(slot)
          slot.wrap("""<div class="fc-container fc-container--commercial"></div>""")
        }
      }
    }
    document
  }
}
/** Replaces every `inline-quote` element with the inline `garnett-quote` SVG icon. */
object GarnettQuoteCleaner extends HtmlCleaner {
  val garnettQuote = views.html.fragments.inlineSvg("garnett-quote", "icon").toString

  override def clean(document: Document): Document = {
    document.getElementsByClass("inline-quote").asScala.foreach { quote =>
      // Insert the icon first, then drop the placeholder it stands in for.
      quote.before(garnettQuote)
      quote.remove()
    }
    document
  }
}
/** Rewrites affiliatable links in a document to Skimlinks URLs when affiliate links are enabled.
  *
  * The rewrite happens only when the `AffiliateLinks` switch is on and
  * `AffiliateLinksCleaner.shouldAddAffiliateLinks` agrees; otherwise the document is returned as is.
  *
  * @param pageUrl            passed to the Skimlinks URL builder as the referring page
  * @param showAffiliateLinks per-content opt-in flag
  * @param appendDisclaimer   not read by this cleaner
  * @param tags               tag paths checked against the always-off tags
  */
case class AffiliateLinksCleaner(
    pageUrl: String,
    showAffiliateLinks: Option[Boolean],
    appendDisclaimer: Option[Boolean] = None,
    tags: List[String],
) extends HtmlCleaner
    with GuLogging {

  override def clean(document: Document): Document = {
    def enabledForContent: Boolean =
      AffiliateLinksCleaner.shouldAddAffiliateLinks(
        AffiliateLinks.isSwitchedOn,
        showAffiliateLinks,
        alwaysOffTags,
        tags,
      )

    if (AffiliateLinks.isSwitchedOn && enabledForContent)
      AffiliateLinksCleaner.replaceLinksInHtml(document, pageUrl, skimlinksId)
    else
      document
  }
}
/** Helpers for detecting affiliatable links and rewriting them to Skimlinks redirect URLs. */
object AffiliateLinksCleaner {

  /** All elements with an `href` attribute that pass [[isAffiliatable]]. */
  def getAffiliateableLinks(html: Document): mutable.Seq[Element] =
    html.getElementsByAttribute("href").asScala.filter(isAffiliatable)

  // Points every affiliatable link in `doc` at its Skimlinks URL, marks it rel="sponsored"
  // and returns the links that were rewritten.
  private def rewriteLinks(doc: Document, pageUrl: String, id: String): mutable.Seq[Element] = {
    val affiliatable: mutable.Seq[Element] = getAffiliateableLinks(doc)
    for (link <- affiliatable) {
      val skimLink = linkToSkimLink(link.attr("href"), pageUrl, id)
      link.attr("href", skimLink).attr("rel", "sponsored")
    }
    affiliatable
  }

  /** Rewrites the affiliatable links of `html` in place and returns the same document. */
  def replaceLinksInHtml(
      html: Document,
      pageUrl: String,
      skimlinksId: String,
  ): Document = {
    rewriteLinks(html, pageUrl, skimlinksId)
    html
  }

  /** Parses `html` as a body fragment and rewrites its affiliatable links.
    *
    * Returns the re-serialised body when at least one link was rewritten, and the original
    * string otherwise.
    */
  def replaceLinksInElement(html: String, pageUrl: String): TextBlockElement = {
    val doc = Jsoup.parseBodyFragment(html)
    val rewritten = rewriteLinks(doc, pageUrl, skimlinksId)
    val resultHtml = if (rewritten.nonEmpty) doc.body().html() else html
    TextBlockElement(resultHtml)
  }

  /** True for `<a>` elements whose href is recognised by `SkimLinksCache`. */
  def isAffiliatable(element: Element): Boolean = {
    val isAnchor = element.tagName == "a"
    isAnchor && SkimLinksCache.isSkimLink(element.attr("href"))
  }

  /** Builds the Skimlinks redirect URL for `link`; only the `url` parameter is URL-encoded. */
  def linkToSkimLink(link: String, pageUrl: String, skimlinksId: String): String = {
    val urlEncodedLink = URLEncode(link)
    s"https://go.skimresources.com/?id=$skimlinksId&url=$urlEncodedLink&sref=$host$pageUrl"
  }

  /** True when any of `tagPaths` is in `alwaysOffTags`. */
  def contentHasAlwaysOffTag(tagPaths: List[String], alwaysOffTags: Set[String]): Boolean =
    tagPaths.exists(alwaysOffTags.contains)

  /** Affiliate links are added only when the content has no always-off tag, the switch is on
    * and `showAffiliateLinks` is explicitly `Some(true)`.
    */
  def shouldAddAffiliateLinks(
      switchedOn: Boolean,
      showAffiliateLinks: Option[Boolean],
      alwaysOffTags: Set[String],
      tagPaths: List[String],
  ): Boolean = {
    // Never include affiliate links if it is tagged with an always off tag
    val allowedByTags = !contentHasAlwaysOffTag(tagPaths, alwaysOffTags)
    allowedByTags && switchedOn && showAffiliateLinks.contains(true)
  }

  /** True when the HTML fragment `s` contains at least one affiliatable link. */
  def stringContainsAffiliateableLinks(s: String): Boolean = {
    val fragment = Jsoup.parseBodyFragment(s)
    getAffiliateableLinks(fragment).nonEmpty
  }
}