common/app/pagepresser/HtmlCleaner.scala

package pagepresser import common.{GuLogging} import org.jsoup.Jsoup import org.jsoup.nodes.{Element, Document} import conf.Configuration import scala.jdk.CollectionConverters._ abstract class HtmlCleaner extends GuLogging { lazy val fallbackCacheBustId = Configuration.r2Press.fallbackCachebustId lazy val staticRegEx = """//static.guim.co.uk/static/(?<cacheBustId>\w+)/(?<paths>.+)(?<extension>\.\w+)$""".r lazy val nonDigitRegEx = """\D+""".r def canClean(document: Document): Boolean def clean(document: Document, convertToHttps: Boolean): Document protected def universalClean(document: Document): Document = { removeAds(document) removeByClass(document, "top-search-box") removeByClass(document, "share-links") // removeRelatedComponent(document) This removes the byline on datablog pieces, but may be needed elsewhere removeByClass(document, "user-details") removeByClass(document, "initially-off") removeByClass(document, "comment-count") replaceLinks(document) repairStaticLinks(document) repairStaticSources(document) deComboLinks(document) } def repairStaticLinks(document: Document): Document = { document.getAllElements.asScala .filter { el => el.hasAttr("href") && el.attr("href").contains("/static/") } .foreach { el => val staticLink = staticRegEx.findFirstMatchIn(el.attr("href")) staticLink.foreach { link => val cacheBustId = link.group("cacheBustId") val extension = link.group("extension") val paths = link.group("paths").split('+') paths.map { path => val newPath = if (nonDigitRegEx.findFirstMatchIn(cacheBustId).isEmpty) { s"//static.guim.co.uk/static/$fallbackCacheBustId/$path$extension" } else { s"//static.guim.co.uk/static/$cacheBustId/$path$extension" } val newEl = el.clone.attr("href", newPath) el.after(newEl) } } el.remove() } document } def repairStaticSources(document: Document): Document = { val elementsWithSrc = document.getAllElements.asScala.filter { el => el.hasAttr("src") && el.attr("src").contains("/static/") } elementsWithSrc.foreach { el => val staticSrc = staticRegEx.findFirstMatchIn(el.attr("src")) staticSrc.foreach { src => val cacheBustId = src.group("cacheBustId") val extension = src.group("extension") val paths = src.group("paths").split('+') paths.map { path => val newPath = if (nonDigitRegEx.findFirstMatchIn(cacheBustId).isEmpty) { s"//static.guim.co.uk/static/$fallbackCacheBustId/$path$extension" } else { s"//static.guim.co.uk/static/$cacheBustId/$path$extension" } val newEl = el.clone.attr("src", newPath) el.after(newEl) } } el.remove() } document } def replaceLinks(document: Document): Document = { try { document.getAllElements.asScala .filter { el => (el.hasAttr("href") && el.attr("href").contains("http://")) || (el .hasAttr("src") && el.attr("src").contains("http://")) } .foreach { el => if (el.hasAttr("href")) { el.attr("href", el.attr("href").replace("http://", "//")) } else { el.attr("src", el.attr("src").replace("http://", "//")) } } document } catch { case e: Exception => log.warn("Unable to convert links for document from http to protocol relative url.") document } } def removeScripts(document: Document): Document = { document.getElementsByTag("script").asScala.toList.foreach(_.remove()) document } def removeAds(document: Document): Document = { val element = document.getElementById("sub-header") if (element != null) { val ads = element .children() .asScala .toList .filterNot(e => e.attr("class") == "top-navigation twelve-col top-navigation-js") ads.foreach(_.remove()) val htmlComments = element.childNodes().asScala.filter(node => node.nodeName().equals("#comment")) htmlComments.foreach(_.remove()) val promo = document.getElementById("promo") if (promo != null) promo.remove() } document } def removeRelatedComponent(document: Document): Document = { val element = document.getElementById("related") if (element != null) element.remove() document } def removeByClass(document: Document, className: String): Document = { document.getElementsByClass(className).asScala.foreach(_.remove()) document } def removeByTagName(document: Document, tagName: String): Document = { document.getElementsByTag(tagName).asScala.foreach(_.remove()) document } private def elementContainsCombo(el: Element): Boolean = { val comboHost = "combo.guim.co.uk" val attr = if (el.hasAttr("href")) { el.attr("href") } else if (el.hasAttr("src")) { el.attr("src") } else { "" } attr.contains(comboHost) } def deComboLinks(document: Document): Document = { document.getAllElements.asScala.filter(elementContainsCombo).foreach { el => val combinerRegex = """//combo.guim.co.uk/(?<cacheBustId>\w+)/(?<paths>.+)(?<extension>\.\w+)$""".r val href = if (el.hasAttr("href")) { el.attr("href") } else { el.attr("src") } val combiner = combinerRegex.findFirstMatchIn(href) combiner.foreach { combiner => val cacheBustId = combiner.group("cacheBustId") val extension = combiner.group("extension") val paths = combiner.group("paths").split('+') paths.map { path => val newPath = if (nonDigitRegEx.findFirstMatchIn(cacheBustId).isEmpty) { s"//static.guim.co.uk/static/$fallbackCacheBustId/$path$extension" } else { s"//static.guim.co.uk/static/$cacheBustId/$path$extension" } val newEl = if (el.hasAttr("href")) { el.clone.attr("href", newPath) } else { el.clone.attr("src", newPath) } el.after(newEl) } el.remove() } } document } def secureDocument(document: Document): Document = { val tmpDoc = Jsoup.parse(secureSource(document.html())) document.head().replaceWith(tmpDoc.head()) document.body().replaceWith(tmpDoc.body()) document } private def secureSource(src: String): String = { src.replace("http://", "//") } }

common/app/pagepresser/HtmlCleaner.scala (179 lines of code) (raw):