in build_obelics/13_final_processing.py [0:0]
def merge_consecutive_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED(texts, images, metadata):
    """Collapse runs of end-of-document marker paragraphs inside each text.

    Each non-``None`` entry of ``texts`` is split on the two-newline
    separator. A paragraph that, once stripped of surrounding whitespace,
    equals ``END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED`` is dropped when the
    paragraph immediately before it was also such a marker, so only the first
    marker of every run survives. Kept paragraphs are re-joined unchanged
    (their own whitespace is preserved).

    The "previous paragraph was a marker" state is reset for every entry, so
    markers located in two different entries are never merged together.

    Args:
        texts: Iterable of strings or ``None``; ``None`` entries are copied
            through as ``None``.
        images: Returned as-is, never inspected.
        metadata: Returned as-is, never inspected.

    Returns:
        A tuple ``(new_texts, images, metadata)`` where ``new_texts`` is a new
        list with one entry per element of ``texts``.
    """
    new_texts = []
    for text in texts:
        if text is None:
            new_texts.append(None)
            continue

        kept_paragraphs = []
        previous_was_marker = False
        for paragraph in text.split("\n\n"):
            current_is_marker = paragraph.strip() == "END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED"
            # Only a marker that directly follows another marker is redundant.
            if not (current_is_marker and previous_was_marker):
                kept_paragraphs.append(paragraph)
            previous_was_marker = current_is_marker

        new_texts.append("\n\n".join(kept_paragraphs))
    return new_texts, images, metadata