build_obelics/13_final_processing.py
import json

# Helper functions (remove_duplicated_images, remove_spam_paragraphs,
# final_cleaning_node_level, etc.) are defined elsewhere in this module.


def func_map_final_processing_node_level(example):
    """Apply the node-level final-processing steps to a single web document."""
    texts = example["texts"]
    images = example["images"]
    metadata = json.loads(example["metadata"])
    # Each document is stored as aligned lists of text nodes, image nodes, and metadata entries
    assert len(texts) == len(images) == len(metadata)
    # Remove duplicated images from the document
    new_texts, new_images, new_metadata = remove_duplicated_images(texts, images, metadata)
    # Remove paragraphs identified as spam
    new_texts, new_images, new_metadata = remove_spam_paragraphs(new_texts, new_images, new_metadata)
    # First node-level cleaning pass
    new_texts, new_images, new_metadata = final_cleaning_node_level(new_texts, new_images, new_metadata)
    # Merge consecutive occurrences of the end-of-document placeholder token
    new_texts, new_images, new_metadata = merge_consecutive_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED(
        new_texts, new_images, new_metadata
    )
    # Strip a trailing end-of-document placeholder token
    new_texts, new_images, new_metadata = remove_end_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED(
        new_texts, new_images, new_metadata
    )
    # Second cleaning pass, after the structural edits above
    new_texts, new_images, new_metadata = final_cleaning_node_level(new_texts, new_images, new_metadata)
    example["texts"] = new_texts
    example["images"] = new_images
    example["metadata"] = json.dumps(new_metadata)
    return example
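
The function is written as a per-example transform. A minimal usage sketch follows, assuming the documents live in a Hugging Face `datasets` Dataset with "texts", "images", and "metadata" columns; the load path and the `num_proc` value are hypothetical, not taken from the original script.

from datasets import load_from_disk

# Hypothetical path to the intermediate web-document dataset
ds = load_from_disk("path/to/web_document_dataset")
# Apply the node-level final processing to every document; num_proc is illustrative
ds = ds.map(func_map_final_processing_node_level, num_proc=8)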