in build_obelics/10_final_cleaning.py [0:0]
def func_map_final_cleaning_node_level(example):
    """Clean the parallel node lists of one example and return it.

    ``example["texts"]`` and ``example["images"]`` are read as parallel
    lists, and ``example["metadata"]`` as a JSON-encoded list of the same
    length. Position ``i`` across the three lists describes one node.

    Cleaning rules, applied in node order:

    * A text node (text is not None) whose text is ``""`` is dropped.
    * A text node that directly follows another kept text node is merged
      into it, the two texts being separated by a blank line (``"\\n\\n"``).
    * An image node (text is None, image is not None) is kept unchanged
      together with its metadata entry.
    * A node whose text, image and metadata are all None is dropped.

    Dropped nodes do not interrupt a run of text nodes: two texts separated
    only by dropped nodes are still merged.

    Args:
        example: Mapping with the keys ``"texts"``, ``"images"`` and
            ``"metadata"``. It is modified in place.

    Returns:
        The same ``example`` object, with the three keys replaced by the
        cleaned lists (``"metadata"`` re-encoded with ``json.dumps``).

    Raises:
        AssertionError: If the three lists differ in length, if a text node
            carries an image or metadata, or if an image node has no
            metadata.
        ValueError: If a node has metadata but neither text nor image.
    """
    raw_texts, raw_images = example["texts"], example["images"]
    raw_metadata = json.loads(example["metadata"])
    assert len(raw_texts) == len(raw_images) == len(raw_metadata)

    kept_texts, kept_images, kept_metadata = [], [], []
    # True while the most recently *kept* node is a text node; nodes that
    # are dropped leave this flag untouched.
    last_kept_is_text = False

    for txt, img, img_meta in zip(raw_texts, raw_images, raw_metadata):
        # Node with neither text nor image: only legal if it has no metadata.
        if txt is None and img is None:
            if img_meta is not None:
                raise ValueError("metadata cannot be != None if text and image are None")
            continue

        # Image node: passed through as-is, and it ends any running text run.
        if txt is None:
            assert img_meta is not None
            kept_texts.append(None)
            kept_images.append(img)
            kept_metadata.append(img_meta)
            last_kept_is_text = False
            continue

        # Text node from here on.
        assert (img is None) and (img_meta is None)
        if txt == "":
            continue

        if last_kept_is_text:
            # Extend the current text run instead of opening a new node.
            run_so_far = kept_texts[-1]
            kept_texts[-1] = run_so_far + "\n\n" + txt
            continue

        kept_texts.append(txt)
        kept_images.append(None)
        kept_metadata.append(None)
        last_kept_is_text = True

    assert len(kept_texts) == len(kept_images) == len(kept_metadata)

    example["texts"] = kept_texts
    example["images"] = kept_images
    example["metadata"] = json.dumps(kept_metadata)
    return example