in obelics/processors/web_document_extractor.py [0:0]
def func_html_to_web_documents(example):
    """Turn one HTML example into index-aligned text/image/metadata lists.

    The HTML stored under ``example[html_column_name]`` is run through
    ``dom_tree_simplificator`` and then ``pre_extraction_simplificator``; the
    returned nodes are flattened into three lists of equal length. The names
    ``html_column_name``, ``url_column_name``, ``dom_tree_simplificator`` and
    ``pre_extraction_simplificator`` are not defined here and must be provided
    by the surrounding scope.

    Args:
        example: Mapping holding the HTML string and the page URL. If it also
            contains all of ``url``, ``warc_filename``, ``warc_record_offset``
            and ``warc_record_length``, those values are copied into the
            general metadata.

    Returns:
        The same ``example`` object, mutated in place with four new entries:

        * ``texts``: node text for ``"-text"`` nodes, ``None`` for images.
        * ``images``: ``media_info["src"]`` for ``"img"`` nodes, ``""`` for
          text nodes.
        * ``metadata``: JSON string of a list holding ``media_info`` for
          images and ``null`` for text nodes.
        * ``general_metadata``: JSON string of the provenance dict (``{}``
          when any of the four provenance keys is missing).

        If either simplification step raises, ``texts`` and ``images`` are
        empty lists and ``metadata`` is the JSON string ``"[]"``.
    """
    html_str = example[html_column_name]
    page_url = example[url_column_name]

    # Provenance is copied only when every key is present: all or nothing.
    provenance_keys = ("url", "warc_filename", "warc_record_offset", "warc_record_length")
    general_metadata = {}
    if all(key in example for key in provenance_keys):
        general_metadata = {key: example[key] for key in provenance_keys}

    # Serialised once and written on both paths, so a failed page keeps its
    # provenance and the field always decodes to a JSON object (previously the
    # failure path wrote a JSON list, discarding the url/warc information).
    example["general_metadata"] = json.dumps(general_metadata)

    try:
        selectolax_tree = dom_tree_simplificator(html_str, type_return="selectolax_tree")
        list_nodes = pre_extraction_simplificator(selectolax_tree, page_url=page_url)
    except Exception:
        # Best-effort: a page that cannot be processed yields an empty
        # document instead of aborting the caller.
        print("EXCEPTION")
        example["texts"] = []
        example["images"] = []
        example["metadata"] = json.dumps([])
        return example

    texts = []
    images = []
    metadata = []
    for node in list_nodes:
        if node.tag == "-text":
            texts.append(node.text)
            images.append("")
            metadata.append(None)
        elif node.tag == "img":
            texts.append(None)
            images.append(node.media_info["src"])
            metadata.append(node.media_info)
        # Nodes with any other tag are skipped, keeping the lists aligned.

    example["texts"] = texts
    example["images"] = images
    example["metadata"] = json.dumps(metadata)
    return example