def func_map_final_processing_node_level()

in build_obelics/13_final_processing.py [0:0]


def func_map_final_processing_node_level(example):
    """Run the node-level final-processing stages on a single example.

    The example's "texts", "images" and JSON-encoded "metadata" fields are
    read, checked to have the same length, and then passed through a fixed
    sequence of stage helpers (defined elsewhere in this file). Each stage
    takes the three lists and returns three new ones. The results are
    written back to the same keys, with the metadata serialized to JSON again.

    Args:
        example: Mapping with "texts", "images" and "metadata" entries;
            "metadata" must be a JSON string.

    Returns:
        The same `example` object, updated in place.
    """
    cur_texts = example["texts"]
    cur_images = example["images"]
    cur_metadata = json.loads(example["metadata"])
    assert len(cur_texts) == len(cur_images) == len(cur_metadata)

    # Order matters: final_cleaning_node_level runs once before the
    # END_OF_DOCUMENT token handling and once more after it.
    stages = (
        remove_duplicated_images,
        remove_spam_paragraphs,
        final_cleaning_node_level,
        merge_consecutive_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED,
        remove_end_END_OF_DOCUMENT_TOKEN_TO_BE_REPLACED,
        final_cleaning_node_level,
    )
    for stage in stages:
        cur_texts, cur_images, cur_metadata = stage(cur_texts, cur_images, cur_metadata)

    example["texts"] = cur_texts
    example["images"] = cur_images
    example["metadata"] = json.dumps(cur_metadata)

    return example