def func_map_final_cleaning_node_level()

in build_obelics/10_final_cleaning.py [0:0]


def func_map_final_cleaning_node_level(example):
    """Drop empty nodes and merge consecutive text nodes of one example.

    ``example["texts"]`` and ``example["images"]`` are parallel sequences and
    ``example["metadata"]`` is a JSON string that decodes to a sequence of the
    same length. Each position (a "node") is handled as follows:

    * text node (``text is not None``): ``image`` and ``meta`` must be
      ``None``. Empty strings are dropped. Non-empty texts that are not
      separated by an image node are merged into a single text node, joined
      with a blank line (``"\\n\\n"``).
    * image node (``text is None`` and ``image is not None``): ``meta`` must
      not be ``None``; the node is kept unchanged.
    * node where text, image and meta are all ``None``: dropped.

    Dropped nodes (empty texts, all-``None`` nodes) do not end a run of
    texts, so the texts on either side of them are still merged.

    Args:
        example: Mapping with the keys ``"texts"``, ``"images"`` and
            ``"metadata"``. It is updated in place, and only after every node
            has been processed successfully.

    Returns:
        The same ``example`` object, with ``"texts"`` and ``"images"``
        replaced by the cleaned lists and ``"metadata"`` replaced by the
        JSON dump of the cleaned metadata list.

    Raises:
        AssertionError: If the three sequences differ in length, if a text
            node also carries an image or metadata, or if an image node has
            no metadata.
        ValueError: If a node has metadata but neither text nor image.
    """
    texts = example["texts"]
    images = example["images"]
    metadata = json.loads(example["metadata"])
    assert len(texts) == len(images) == len(metadata)

    new_texts = []
    new_images = []
    new_metadata = []

    # Pieces of the text run currently being accumulated. They are joined
    # once when the run ends, instead of re-concatenating the growing string
    # for every piece (which is quadratic in the length of the run).
    pending_texts = []

    def flush_pending_texts():
        """Emit the buffered text run, if any, as a single text node."""
        if not pending_texts:
            return
        if len(pending_texts) == 1:
            merged = pending_texts[0]
        else:
            merged = "\n\n".join(pending_texts)
        new_texts.append(merged)
        new_images.append(None)
        new_metadata.append(None)
        pending_texts.clear()

    for text, image, meta in zip(texts, images, metadata):
        if text is not None:
            assert (image is None) and (meta is None)
            # Empty texts are skipped without closing the current run.
            if text != "":
                pending_texts.append(text)
        elif image is not None:
            assert meta is not None
            # An image ends the current text run; emit the run first so the
            # original node order is preserved.
            flush_pending_texts()
            new_texts.append(None)
            new_images.append(image)
            new_metadata.append(meta)
        elif meta is not None:
            raise ValueError("metadata cannot be != None if text and image are None")
        # Nodes that are entirely None are dropped and do not end a text run.

    # Emit a text run that reaches the end of the document.
    flush_pending_texts()

    assert len(new_texts) == len(new_images) == len(new_metadata)
    example["texts"] = new_texts
    example["images"] = new_images
    example["metadata"] = json.dumps(new_metadata)

    return example