obelics/processors/dom_tree_simplificator.py (195 lines of code) (raw):

import re from obelics.utils import ( INTERESTING_TAGS_SET, MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET, UNWRAP_TAGS, get_media_src, make_selectolax_tree, ) class DOMTreeSimplificator: def __init__( self, strip_multiple_linebreaks=True, strip_multiple_spaces=True, remove_html_comments=True, replace_line_break_tags=True, unwrap_tags=True, strip_tags=True, strip_special_divs=True, remove_dates=True, remove_empty_leaves=True, unnest_nodes=True, remake_tree=True, css_rules=None, css_rules_replace_with_text=None, ): self.strip_multiple_linebreaks = strip_multiple_linebreaks self.strip_multiple_spaces = strip_multiple_spaces self.remove_html_comments = remove_html_comments self.replace_line_break_tags = replace_line_break_tags self.unwrap_tags = unwrap_tags self.strip_tags = strip_tags self.strip_special_divs = strip_special_divs self.remove_dates = remove_dates self.remove_empty_leaves = remove_empty_leaves self.unnest_nodes = unnest_nodes self.remake_tree = remake_tree self.css_rules = css_rules self.css_rules_replace_with_text = css_rules_replace_with_text def __call__( self, html_str, type_return, ): if type_return not in ["str", "selectolax_tree"]: raise ValueError("`type_return` must be `str` or `selectolax_tree`") if self.strip_multiple_linebreaks: html_str = self._strip_multiple_linebreaks(html_str) if self.strip_multiple_spaces: html_str = self._strip_multiple_spaces(html_str) if self.remove_html_comments: html_str = self._remove_html_comments(html_str) if self.replace_line_break_tags: html_str = self._replace_line_break_tags(html_str) selectolax_tree = make_selectolax_tree(html_str) if self.css_rules: selectolax_tree = self._remove_nodes_matching_css_rules(selectolax_tree) if self.css_rules_replace_with_text: selectolax_tree = self._replace_nodes_matching_css_rules_with_text(selectolax_tree) if self.unwrap_tags: selectolax_tree = self._unwrap_html_tree(selectolax_tree) if self.strip_tags: selectolax_tree = self._strip_html_tree(selectolax_tree) if self.strip_special_divs: selectolax_tree = self._strip_special_divs(selectolax_tree) if self.remove_dates: selectolax_tree = self._remove_dates(selectolax_tree) if self.remove_empty_leaves: selectolax_tree = self._remove_empty_leaves(selectolax_tree) if self.unnest_nodes: selectolax_tree = self._unnest_nodes(selectolax_tree) if self.remake_tree: selectolax_tree = self._remake_tree(selectolax_tree) if type_return == "str": return selectolax_tree.html elif type_return == "selectolax_tree": return selectolax_tree def _strip_multiple_linebreaks(self, html_str): html_str = re.sub(r"[\n]{2,}", "\n", html_str) return html_str def _strip_multiple_spaces(self, html_str): html_str = re.sub(r"[ ]{2,}", " ", html_str) return html_str def _remove_html_comments(self, html_str): html_str = re.sub(r"<!--(?s).*?-->", "", html_str) return html_str def _replace_line_break_tags(self, html_str): html_str = re.sub("<br>|<br/>|<br />|</br>", "#BR_TAG#", html_str) return html_str def _unwrap_html_tree(self, selectolax_tree): selectolax_tree.unwrap_tags(UNWRAP_TAGS) # `.unwrap_tags` won't unwrap+remove tags which are empty leaves (i.e. either with no children tags or no text). # For instance, `<a href="https://twitter.com/share"><img src="blo.png"></a>` will be unwrapped and mentions # of `a` will be removed, but `<a href="https://twitter.com/share"></a>` will stay as is. # As a consequence, we call `strip_tags` to remove these unwrap empty tags for good. selectolax_tree.strip_tags(UNWRAP_TAGS) return selectolax_tree def _remove_digits_string(self, string): string = re.sub(r"\d+", "", string) return string def _remove_nodes_matching_css_rules(self, selectolax_tree): modification = True while modification: found_a_node = False for css_rule in self.css_rules: for node in selectolax_tree.css(css_rule): if node.tag != "html": node.decompose() found_a_node = True break if not found_a_node: modification = False return selectolax_tree def _replace_nodes_matching_css_rules_with_text(self, selectolax_tree): for css_rule, text in self.css_rules_replace_with_text.items(): for node in selectolax_tree.css(css_rule): node.replace_with(text) return selectolax_tree def _strip_html_tree(self, selectolax_tree): strip_tags_l = [ node.tag for node in selectolax_tree.root.traverse() if self._remove_digits_string(node.tag) not in INTERESTING_TAGS_SET ] strip_tags_l = list(set(strip_tags_l)) selectolax_tree.strip_tags(strip_tags_l) return selectolax_tree def _strip_special_divs(self, selectolax_tree): special_div_ids = ["footer", "header", "navigation", "nav", "navbar", "menu"] modification = True while modification: # Traverse the tree to find one node to remove, and remove it right then # to avoid the recursivity problem with `decompose` found_a_node = False for node in selectolax_tree.root.traverse(): if node.tag == "div": attributes = node.attributes if ( ("id" in attributes and attributes["id"] in special_div_ids) or ("class" in attributes and attributes["class"] in special_div_ids) or ("title" in attributes and attributes["title"] in special_div_ids) ): node.decompose() found_a_node = True break if not found_a_node: modification = False return selectolax_tree def _remove_dates(self, selectolax_tree): nodes_to_remove = [] for node in selectolax_tree.root.traverse(): if node.tag == "div": if node.attributes: if "class" in node.attributes: if node.attributes["class"]: if "date" in node.attributes["class"]: nodes_to_remove += [ child for child in node.iter(include_text=True) if child.tag == "-text" ] for node in nodes_to_remove: node.decompose() return selectolax_tree def _remove_empty_leaves(self, selectolax_tree): """ Function used to remove empty leaves iteratively, so it also ends up also removing nodes that are higher up in the tree. """ modification = True while modification: nodes_to_remove = [ node for node in selectolax_tree.root.traverse() if ( (node.tag not in MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET) and (not [child for child in node.iter()]) and (not node.text().strip()) and (node.tag != "html") ) or ( (node.tag in MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET) and not get_media_src(node) ) ] if nodes_to_remove: for node in nodes_to_remove: node.decompose(recursive=False) else: modification = False return selectolax_tree def _unnest_nodes(self, selectolax_tree): modification = True while modification: modification = False for node in selectolax_tree.root.traverse(): children = [child for child in node.iter()] if len(children) == 1: child = children[0] if node.tag == child.tag: text = node.text(deep=False).strip() if not text: node.replace_with(child) modification = True break return selectolax_tree def _remake_tree(self, selectolax_tree): """It could be interesting to remake a tree after the simplifications since it can now merge some text nodes that couldn't be merged before""" return make_selectolax_tree(selectolax_tree.html)