# def final_cleaning_node_level()
#
# in build_obelics/13_final_processing.py [0:0]


def final_cleaning_node_level(texts, images, metadata):
    """Drop empty nodes and merge runs of text nodes in a document.

    The three inputs are iterated in lockstep; position ``i`` of each one
    describes a single node, which is handled as follows:

    * Text node (``text`` is not None): ``image`` and ``meta`` must both be
      None. An empty-string text is discarded. Otherwise the text is joined
      with a blank line (``"\\n\\n"``) onto the previously kept node when
      that node is also a text, or kept as a new node.
    * Image node (``text`` is None, ``image`` is not None): ``meta`` must
      not be None. The node is kept unchanged.
    * Empty node (``text`` and ``image`` both None): discarded.

    Discarded nodes do not interrupt a run of texts, so the texts on either
    side of a discarded node end up merged together.

    Args:
        texts: Per-node text, or None for non-text nodes.
        images: Per-node image, or None for non-image nodes.
        metadata: Per-node metadata, or None.

    Returns:
        A tuple ``(new_texts, new_images, new_metadata)`` of three new lists
        of equal length describing the kept nodes.

    Raises:
        AssertionError: If a text node carries an image or metadata, or an
            image node has no metadata.
        ValueError: If a node has metadata but neither text nor image.
    """
    # One mutable [text, image, meta] row per node that survives cleaning.
    kept = []

    for text, image, meta in zip(texts, images, metadata):
        if text is None and image is None:
            # Nothing to keep here; stray metadata signals corrupted input.
            if meta is not None:
                raise ValueError("metadata cannot be != None if text and image are None")
            continue

        if text is None:
            # Image node: it must come with its metadata.
            assert meta is not None
            kept.append([None, image, meta])
            continue

        # Text node: it must not carry an image or metadata.
        assert image is None and meta is None
        if text == "":
            continue

        # The last kept row is a text exactly when its text slot is filled.
        if kept and kept[-1][0] is not None:
            kept[-1][0] = kept[-1][0] + "\n\n" + text
        else:
            kept.append([text, None, None])

    new_texts = [row[0] for row in kept]
    new_images = [row[1] for row in kept]
    new_metadata = [row[2] for row in kept]
    return new_texts, new_images, new_metadata