def merge_consecutive_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED()

in build_obelics/13_final_processing.py [0:0]


def merge_consecutive_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED(texts, images, metadata):
    """Collapse runs of adjacent end-of-document markers inside each text.

    Every non-``None`` entry of ``texts`` is split into paragraphs on
    ``"\\n\\n"``. Whenever several paragraphs in a row are equal (after
    stripping surrounding whitespace) to
    ``END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED``, only the first paragraph of
    the run is kept, exactly as it was written. The remaining paragraphs
    are re-joined with ``"\\n\\n"``.

    Merging happens within a single entry only; markers sitting in two
    different entries of ``texts`` are never merged with each other.

    Args:
        texts: Iterable of strings or ``None``; ``None`` entries are
            passed through unchanged.
        images: Returned as-is.
        metadata: Returned as-is.

    Returns:
        A tuple ``(new_texts, images, metadata)`` where ``new_texts`` is a
        new list with one entry per entry of ``texts``.
    """
    eod_marker = "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED"
    merged_texts = []
    for text in texts:
        if text is None:
            merged_texts.append(None)
            continue

        kept_chunks = []
        previous_was_marker = False
        for chunk in text.split("\n\n"):
            is_marker = chunk.strip() == eod_marker
            # Drop a marker only when it directly follows another marker.
            if not (is_marker and previous_was_marker):
                kept_chunks.append(chunk)
            previous_was_marker = is_marker
        merged_texts.append("\n\n".join(kept_chunks))
    return merged_texts, images, metadata