def _merge_consecutive_text_nodes()

in obelics/processors/pre_extraction_simplificator.py [0:0]


    def _merge_consecutive_text_nodes(self, list_nodes):
        """Collapse every run of adjacent ``"-text"`` nodes into a single node.

        Two neighbouring text nodes are joined with the strongest separator
        (``"\\n\\n"`` > ``"\\n"`` > ``" "`` > ``""``) found among:

        * whitespace at the end of the first text / start of the second text
          (that whitespace is removed from the texts before joining), and
        * ``TAG_TO_SEP[tag]`` for every tag on the parts of the two
          ``path_in_tree`` values that lie after their common prefix.
          (``TAG_TO_SEP`` is defined outside this method; presumably it maps a
          tag name to a separator string -- confirm against its definition.)

        Once a text node has no text node right after it, its ``path_in_tree``
        is reset to ``[["-text", 0]]`` and its text is stripped.

        Args:
            list_nodes: List of node objects exposing ``tag``, ``text`` and
                ``path_in_tree`` (a list of ``[tag, index]`` pairs). The list
                and its text nodes are modified in place while merging.

        Returns:
            A new list with the merged nodes, from which text nodes whose text
            is empty or whitespace-only have been removed. The input list
            itself keeps such nodes.
        """
        current_idx = 0
        while current_idx < len(list_nodes):
            node = list_nodes[current_idx]
            if node.tag != "-text":
                current_idx += 1
                continue

            next_idx = current_idx + 1
            if next_idx >= len(list_nodes) or list_nodes[next_idx].tag != "-text":
                # Nothing left to merge into this text node: finalize it.
                node.path_in_tree = [["-text", 0]]
                node.text = node.text.strip()
                current_idx += 1
                continue

            following = list_nodes[next_idx]
            seps = set()

            text_1 = node.text
            text_2 = following.text

            # Peel boundary whitespace off both texts, remembering which kinds
            # were seen. Each kind is removed at most once per side, in this
            # fixed order.
            for char in ("\n\n", "\n", " "):
                if text_1.endswith(char):
                    seps.add(char)
                    text_1 = text_1[: -len(char)]
                if text_2.startswith(char):
                    seps.add(char)
                    text_2 = text_2[len(char) :]

            path_1 = node.path_in_tree
            path_2 = following.path_in_tree

            # Index of the first position where the paths diverge. If they
            # agree over their whole shared length (identical paths, or one is
            # a prefix of the other), the divergence point is that shared
            # length, so tags common to both paths never contribute a
            # separator.
            common_len = min(len(path_1), len(path_2))
            start_diff_path = next(
                (i for i in range(common_len) if path_1[i] != path_2[i]),
                common_len,
            )
            for tag, _ in path_1[start_diff_path:] + path_2[start_diff_path:]:
                if tag in TAG_TO_SEP:
                    seps.add(TAG_TO_SEP[tag])

            # Strongest collected separator wins; any other value in `seps`
            # is ignored.
            sep = next((cand for cand in ("\n\n", "\n", " ") if cand in seps), "")

            # The merged node takes the second node's path and stays at
            # `current_idx`, so it is compared with its new neighbour on the
            # next iteration.
            node.path_in_tree = path_2
            node.text = text_1 + sep + text_2
            del list_nodes[next_idx]

        return [node for node in list_nodes if node.tag != "-text" or node.text.strip()]