obelics/processors/pre_extraction_simplificator.py (152 lines of code) (raw):
import re
from obelics.utils import (
MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET,
TAG_TO_SEP,
simplify_media_node,
)
class Node:
    """A single node of the simplified tree.

    The constructor stores its four arguments unchanged:

    - ``path_in_tree``: sequence of ``[tag, index]`` steps; the last step
      describes this node itself.
    - ``media_info``: media description attached to the node, or ``None``.
    - ``text``: text carried by the node (``""`` when it has none).
    - ``children``: list of child nodes.
    """

    def __init__(self, path_in_tree, media_info, text, children):
        self.path_in_tree, self.media_info = path_in_tree, media_info
        self.text, self.children = text, children

    @property
    def tag(self):
        """Tag name taken from the last step of ``path_in_tree``."""
        last_step = self.path_in_tree[-1]
        return last_step[0]

    @property
    def level(self):
        """Number of steps in ``path_in_tree``."""
        return len(self.path_in_tree)
class Tree:
    """Simplified copy of a selectolax DOM tree, made of ``Node`` objects.

    Attributes:
        num_nodes: Number of nodes created so far; also used as the running
            index stored in every ``[tag, index]`` path step.
        tree: Root ``Node`` built from ``selectolax_root_node``.
    """

    def __init__(
        self,
        selectolax_root_node,
        page_url,
    ):
        self.num_nodes = 0
        self.tree = self.make_tree(selectolax_root_node, page_url)

    def make_tree(self, selectolax_node, page_url, path_in_tree=None):
        """Build the ``Node`` subtree rooted at ``selectolax_node``.

        Nodes are numbered in depth-first pre-order using ``self.num_nodes``.
        Media tags (members of ``MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET``)
        and ``"-text"`` nodes become leaves; their children are not visited.

        Args:
            selectolax_node: Node to convert (must expose ``tag``, ``text``
                and ``iter(include_text=True)``).
            page_url: Forwarded to ``simplify_media_node`` for media nodes.
            path_in_tree: Path of the parent node as a list of
                ``[tag, index]`` steps. ``None`` (the default) means the
                node is a root and starts from an empty path.

        Returns:
            The ``Node`` corresponding to ``selectolax_node``.
        """
        # ``None`` instead of a mutable ``[]`` default: the default list would
        # otherwise be a single object shared by every call.
        root_parent_path = [] if path_in_tree is None else path_in_tree

        # Explicit stack rather than recursion so that very deeply nested
        # documents cannot hit Python's recursion limit. Each entry is
        # (selectolax node, path of its parent, list the built node joins).
        built_roots = []
        pending = [(selectolax_node, root_parent_path, built_roots)]
        while pending:
            current, parent_path, siblings = pending.pop()
            tag = current.tag
            node_path = parent_path + [[tag, self.num_nodes]]
            self.num_nodes += 1

            if tag in MEDIA_CONTAIN_INTERESTING_ATTRIBUTES_SET:
                node = Node(
                    path_in_tree=node_path,
                    media_info=simplify_media_node(current, page_url=page_url),
                    text="",
                    children=[],
                )
            elif tag == "-text":
                node = Node(
                    path_in_tree=node_path,
                    media_info=None,
                    text=current.text(deep=False, separator="", strip=False),
                    children=[],
                )
            else:
                # ``Node`` keeps a reference to this list, so children
                # appended later show up on the node.
                child_nodes = []
                node = Node(
                    path_in_tree=node_path,
                    media_info=None,
                    text="",
                    children=child_nodes,
                )
                # Push in reverse so the first child is popped (and therefore
                # numbered and appended) first, preserving pre-order.
                children = list(current.iter(include_text=True))
                pending.extend((child, node_path, child_nodes) for child in reversed(children))

            siblings.append(node)

        return built_roots[0]

    def traverse(self):
        """Return every node of the tree as a flat list in pre-order.

        A node always appears before its children, and children appear in
        the order of ``node.children``. Implemented iteratively so the depth
        of the tree is not bounded by the recursion limit.
        """
        list_nodes = []
        pending = [self.tree]
        while pending:
            node = pending.pop()
            list_nodes.append(node)
            # Reverse so that the first child is on top of the stack.
            pending.extend(reversed(node.children))
        return list_nodes
class PreExtractionSimplificator:
    """Turn a selectolax tree into a flat, cleaned-up list of nodes.

    Calling an instance builds a ``Tree`` from the selectolax root, flattens
    it, and then applies up to three optional passes, in this order:

    1. ``only_text_image_nodes``: keep only text, ``figure`` and ``img``
       nodes (``img`` only when it carries media info).
    2. ``format_texts``: normalise whitespace inside every node text.
    3. ``merge_consecutive_text_nodes``: fuse runs of adjacent text nodes.
    """

    def __init__(
        self,
        only_text_image_nodes=True,
        format_texts=True,
        merge_consecutive_text_nodes=True,
    ):
        self.only_text_image_nodes = only_text_image_nodes
        self.format_texts = format_texts
        self.merge_consecutive_text_nodes = merge_consecutive_text_nodes

    def __call__(self, selectolax_tree, page_url):
        """Return the simplified list of nodes for ``selectolax_tree``."""
        list_nodes = Tree(selectolax_tree.root, page_url=page_url).traverse()
        passes = (
            (self.only_text_image_nodes, self._only_text_image_nodes),
            (self.format_texts, self._format_texts),
            (self.merge_consecutive_text_nodes, self._merge_consecutive_text_nodes),
        )
        for enabled, run_pass in passes:
            if enabled:
                list_nodes = run_pass(list_nodes)
        return list_nodes

    def _only_text_image_nodes(self, list_nodes):
        """Keep ``-text`` and ``figure`` nodes, plus ``img`` nodes with media info."""

        def is_kept(node):
            tag = node.tag
            if tag in ("-text", "figure"):
                return True
            return tag == "img" and bool(node.media_info)

        return [node for node in list_nodes if is_kept(node)]

    def _format_texts(self, list_nodes):
        """Normalise the text of every node in place and drop empty text nodes.

        Newlines and tabs become spaces, runs of spaces collapse to one,
        and every ``#BR_TAG#`` marker becomes a newline with the whitespace
        around it removed. A single leading/trailing space is preserved.
        """

        def format_one_text(text):
            if text == "":
                return text
            flattened = text.replace("\n", " ").replace("\t", " ")
            flattened = re.sub(r"[ ]{2,}", " ", flattened)
            # Remember edge spaces before the per-line strip removes them;
            # a lone " " counts as a leading space only, not as both.
            leading = " " if flattened[0] == " " else ""
            trailing = " " if (len(flattened) > 1 and flattened[-1] == " ") else ""
            lines = [piece.strip() for piece in flattened.split("#BR_TAG#")]
            return leading + "\n".join(lines) + trailing

        for node in list_nodes:
            node.text = format_one_text(node.text)
        return [node for node in list_nodes if node.tag != "-text" or node.text]

    def _merge_consecutive_text_nodes(self, list_nodes):
        """Fuse each run of adjacent ``-text`` nodes into a single node.

        ``list_nodes`` and its nodes are modified in place. When two
        neighbours are merged, the separator between them is the strongest
        of: whitespace found at the touching ends of the two texts, and the
        ``TAG_TO_SEP`` entries of the tags on the parts of the two paths
        that differ (priority ``"\\n\\n"`` > ``"\\n"`` > ``" "`` > none).
        The last node of a run gets the path ``[["-text", 0]]`` and its text
        stripped. Text nodes that end up blank are removed from the result.
        """
        priority = ("\n\n", "\n", " ")
        pos = 0
        while pos < len(list_nodes):
            current = list_nodes[pos]
            if current.tag != "-text":
                pos += 1
                continue

            is_last = pos == len(list_nodes) - 1
            if is_last or list_nodes[pos + 1].tag != "-text":
                # End of a run: finalise this node and move on.
                current.path_in_tree = [["-text", 0]]
                current.text = current.text.strip()
                pos += 1
                continue

            following = list_nodes[pos + 1]
            found_seps = set()
            left_text, right_text = current.text, following.text
            for candidate in priority:
                if left_text.endswith(candidate):
                    found_seps.add(candidate)
                    left_text = left_text[: -len(candidate)]
                if right_text.startswith(candidate):
                    found_seps.add(candidate)
                    right_text = right_text[len(candidate) :]

            left_path, right_path = current.path_in_tree, following.path_in_tree
            # First depth at which the two paths differ (0 when none does).
            divergence = next(
                (depth for depth, (step_1, step_2) in enumerate(zip(left_path, right_path)) if step_1 != step_2),
                0,
            )
            for tag, _ in left_path[divergence:] + right_path[divergence:]:
                if tag in TAG_TO_SEP:
                    found_seps.add(TAG_TO_SEP[tag])

            sep = next((candidate for candidate in priority if candidate in found_seps), "")

            # Merge into the current node and stay on it: the node that now
            # follows may be a text node too.
            current.path_in_tree = right_path
            current.text = left_text + sep + right_text
            del list_nodes[pos + 1]

        return [node for node in list_nodes if node.tag != "-text" or node.text.strip()]