in vision/m4/sourcing/data_collection/processors/pre_extraction_simplificator.py [0:0]
def _merge_consecutive_text_nodes(self, list_nodes):
current_idx = 0
while current_idx <= len(list_nodes) - 1:
if list_nodes[current_idx].tag != "-text":
current_idx += 1
else:
if (current_idx == len(list_nodes) - 1) or (
(current_idx + 1 <= len(list_nodes) - 1) and (list_nodes[current_idx + 1].tag != "-text")
):
list_nodes[current_idx].path_in_tree = [["-text", 0]]
list_nodes[current_idx].text = list_nodes[current_idx].text.strip()
current_idx += 1
else:
seps = set()
text_1 = list_nodes[current_idx].text
text_2 = list_nodes[current_idx + 1].text
for char in ["\n\n", "\n", " "]:
if text_1.endswith(char):
seps.add(char)
text_1 = text_1[: -len(char)]
if text_2.startswith(char):
seps.add(char)
text_2 = text_2[len(char) :]
path_1 = list_nodes[current_idx].path_in_tree
path_2 = list_nodes[current_idx + 1].path_in_tree
start_diff_path = 0
for i in range(min(len(path_1), len(path_2))):
if path_1[i] != path_2[i]:
start_diff_path = i
break
for tag, _ in path_1[start_diff_path:] + path_2[start_diff_path:]:
if tag in TAG_TO_SEP:
seps.add(TAG_TO_SEP[tag])
if "\n\n" in seps:
sep = "\n\n"
elif "\n" in seps:
sep = "\n"
elif " " in seps:
sep = " "
else:
sep = ""
list_nodes[current_idx].path_in_tree = path_2
list_nodes[current_idx].text = text_1 + sep + text_2
del list_nodes[current_idx + 1]
list_nodes = [
node for node in list_nodes if (node.tag != "-text") or ((node.tag == "-text") and (node.text.strip()))
]
return list_nodes