obelics/utils/simplification_utils.py (176 lines of code) (raw):

"""Helpers that turn HTML media nodes (img / video / audio) into plain dictionaries."""

import os
import re
from urllib.parse import urlparse


# Separator emitted for each block-level HTML tag, keyed by tag name.
TAG_TO_SEP = {
    "address": "\n",
    "article": "\n",
    "aside": "\n",
    "blink": "",
    "blockquote": "\n\n",
    "body": "",
    "caption": "",
    "center": "\n",
    "dd": "\n",
    "dl": "\n\n",
    "dt": "\n",
    "div": "\n",
    "figcaption": "\n",
    "h": "",
    "h1": "\n\n",
    "h2": "\n\n",
    "h3": "\n\n",
    "h4": "\n\n",
    "h5": "\n\n",
    "h6": "\n\n",
    "hgroup": "\n",
    "html": "",
    "legend": "\n",
    "main": "\n",
    "marquee": "\n",
    "ol": "\n\n",
    "p": "\n\n",
    "section": "\n",
    "summary": "\n",
    "title": "",
    "ul": "\n\n",
}

# Attributes that may carry the URL of an <img>, in priority order.
_IMG_SOURCE_ATTRIBUTES = (
    "src",
    "data-src",
    "data-src-fg",
    "data-scroll-image",
    "srcset",
    "data-lateloadsrc",
    "data-img-src",
    "data-original",
    "data-gt-lazy-src",
    "data-lazy",
    "data-lazy-src",
    "src2",
)

# Compiled once at import time instead of on every `is_url_valid` call.
_URL_REGEX = re.compile(
    r"^(?:http|ftp)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))"  # domain...
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)


def _has_url_separator(value):
    """Return True if `value` contains a comma or a space."""
    return ("," in value) or (" " in value)


def _find_nested_source(node):
    """Return the `src` of the first `source` node yielded by `node.iter()` that has one."""
    for child in node.iter():
        if child.tag != "source":
            continue
        child_attributes = child.attributes
        if ("src" in child_attributes) and child_attributes["src"]:
            return child_attributes["src"]
    return None


def get_media_src(node):
    """Extract the media URL of an `img`, `video` or `audio` node.

    Args:
        node: Object exposing `tag`, `attributes` (a mapping) and `iter()`.
            Presumably an HTML parser node -- confirm against the callers.

    Returns:
        The raw source string, or None when the tag is not a media tag, when
        no source is found, or when the source contains a comma or a space.
    """
    node_attributes = node.attributes
    node_tag = node.tag
    src = None

    if node_tag == "img":
        # Check all possible source types, and keep the first valid one.
        for source_type in _IMG_SOURCE_ATTRIBUTES:
            if source_type in node_attributes and node_attributes[source_type]:
                candidate = node_attributes[source_type]
                if not _has_url_separator(candidate):
                    src = candidate
                    break
    elif node_tag in ("video", "audio"):
        # The node's own `src` wins; otherwise fall back to a nested `source`.
        if ("src" in node_attributes) and node_attributes["src"]:
            src = node_attributes["src"]
        else:
            src = _find_nested_source(node)
    else:
        return None
    # TODO iframes

    # Check on comma because it's non-canonical and should not be used anyway in urls.
    # TODO: have checks on valid URLs
    # Useless (at least for images) since already checked
    if src is not None and _has_url_separator(src):
        return None
    return src


def _parse_number_as_int(text):
    """Parse `text` as an integer, truncating a fractional value.

    Raises:
        ValueError: If `text` is not a finite number. An infinite value makes
            `int()` raise OverflowError, which is converted so that callers
            only ever have to catch ValueError.
    """
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text))
    except OverflowError as err:
        raise ValueError(f"Unrecognized size for image: `{text}`") from err


def format_image_size(size):
    """Normalize the value of a `width` / `height` attribute.

    Args:
        size: Raw attribute value (a string).

    Returns:
        An int for pixel-like values (fractional values are truncated), or a
        string for relative values: the stripped input when it contains `%`,
        and `"100%"` for `full-width` / `auto`.

    Raises:
        ValueError: If the value matches none of the recognized formats.
    """
    size = re.sub('[”"<>]', "", size)
    try:
        return int(size)
    except ValueError:
        pass

    if "px" in size:
        # Drops every `p`, `x` and `;` character; also accepts e.g. `12.5px`.
        return _parse_number_as_int(re.sub("[px;]", "", size))
    if "%" in size:
        # That should be the only case where we can't return an integer.
        return size.strip()
    if "." in size:
        # If it's a float, then to make it simple, truncate it.
        return _parse_number_as_int(size)
    if size in ("full-width", "auto"):
        return "100%"
    raise ValueError(f"Unrecognized size for image: `{size}`")


def format_filename(filename):
    """Turn the last component of a path or URL into a human-readable name.

    Args:
        filename: Path or URL of the file.

    Returns:
        The cleaned-up name, or `""` when the name looks like a query string
        or when at most one character is left after cleaning.
    """
    # TODO: refine this function. fairly imperfect.
    # Potential improvements: `Untitled`, `untitled`, `blank`, check whether each word is in a dictionary
    _, simp_filename = os.path.split(filename)
    # Keep only what precedes the first dot (drops the extension and anything after it).
    simp_filename = simp_filename.split(".")[0]
    if re.findall(r"\?[A-Za-z0-9]+=", simp_filename):
        # Example `it?ids=2019042515182454151475%3A027064510%3A001&ca=n&coo=y`
        return ""

    simp_filename = re.sub(r"[_-]", " ", simp_filename)  # Example: `Chocolate_Berry_Frozen_Yogurt_Bark`
    # Example: `hearts%2Band%2Bhome%20Bbadge`
    # NOTE(review): this pattern only consumes `%2` and following zeros, so `%2B` leaves a stray `B`.
    simp_filename = re.sub(r"%2[0]*", " ", simp_filename)
    simp_filename = re.sub(r"[0-9]+x[0-9]+", "", simp_filename)  # Example: `104x403`
    simp_filename = re.sub(r"[0-9]+", " ", simp_filename)  # Example: `icon18_wrench_allbkg`
    simp_filename = re.sub(r"[ ]{2,}", " ", simp_filename)  # Example: `icon  wrenchallbkg`
    for r in ["\n", "+", "%B", "%"]:
        simp_filename = simp_filename.replace(r, " ")
    simp_filename = simp_filename.strip()

    if len(simp_filename) <= 1:
        return ""
    return simp_filename


def format_relative_to_absolute_path(page_url, relative_path):
    """Build an absolute URL from a path found in the page at `page_url`.

    A path starting with `//` is prefixed with `http:`. Any other path is
    appended to `https://` plus the network location of `page_url`, after
    removing every run of dots followed by a slash (`./`, `../`, ...).

    NOTE(review): dot segments are dropped rather than resolved, and the path
    is always anchored at the domain root, which differs from what
    `urllib.parse.urljoin` would return. Kept as is to preserve existing output.

    Args:
        page_url: URL of the document containing the path.
        relative_path: Path to make absolute.

    Returns:
        The absolute URL as a string.
    """
    if relative_path.startswith("//"):
        return "http:" + relative_path

    if "./" in relative_path:
        relative_path = re.sub(r"\.+\/", "", relative_path)
    if not relative_path.startswith("/"):
        relative_path = "/" + relative_path
    domain_name = urlparse(page_url).netloc
    return "https://" + domain_name + relative_path


def is_url_valid(url):
    """Return True if `url` matches the http(s) / ftp(s) URL pattern `_URL_REGEX`."""
    return _URL_REGEX.match(url) is not None


def simplify_media_node(node, page_url):
    """Convert a media node into a dictionary describing it.

    Args:
        node: Object exposing `tag`, `attributes` and `iter()` (see `get_media_src`).
        page_url: URL of the document the node comes from.

    Returns:
        A dict with at least `document_url` and `src`, plus for images
        `unformatted_src` and, when available, `formatted_filename`,
        `alt_text`, `rendered_width` and `rendered_height`. None when no
        usable source is found or the resulting URL is not valid.
    """
    src = get_media_src(node)
    if not src:
        return None

    unformatted_src = src
    # NOTE(review): any source starting with `http` is treated as already absolute.
    if not src.startswith("http"):
        src = format_relative_to_absolute_path(page_url=page_url, relative_path=src)
    if not is_url_valid(src):
        return None

    node_attributes = node.attributes
    node_tag = node.tag

    if node_tag == "img":
        new_image = {
            "document_url": page_url,
            "unformatted_src": unformatted_src,
            "src": src,
        }
        formatted_filename = format_filename(src)
        if formatted_filename:
            new_image["formatted_filename"] = formatted_filename
        if ("alt" in node_attributes) and node_attributes["alt"]:
            new_image["alt_text"] = node_attributes["alt"]
        # TODO: eventually, for image sizes we could parse cases like
        # `{'src': 'http://wellbeingteams.org/wp-content/uploads/2017/04/spread600300.jpg',
        # 'width': None, 'height': None, 'alt': None, 'title': 'spread600300',
        # 'class': 'img-responsive wp-image-122',
        # 'srcset': 'https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300-200x100.jpg 200w,
        # https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300-400x200.jpg 400w,
        # https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300.jpg 600w',
        # 'sizes': '(max-width: 800px) 100vw, 400px'}`
        for size in ["width", "height"]:
            if size in node_attributes and node_attributes[size] is not None:
                try:
                    new_image[f"rendered_{size}"] = format_image_size(node_attributes[size])
                except ValueError:
                    pass  # Unrecognized format, generally an error, skipping
        return new_image

    if node_tag == "video":
        new_video = {"document_url": page_url, "src": src}
        if "width" in node_attributes:
            new_video["width"] = node_attributes["width"]
        if "height" in node_attributes:
            new_video["height"] = node_attributes["height"]
        return new_video

    if node_tag == "audio":
        return {"document_url": page_url, "src": src}

    return None