def func_html_to_web_documents()

in obelics/processors/web_document_extractor.py [0:0]


    def func_html_to_web_documents(example):
        """Turn one HTML example into parallel lists of texts and images.

        The HTML is simplified into a flat list of nodes, and each text or
        image node contributes one entry at the same index of three lists:

        * ``texts``: ``node.text`` for ``"-text"`` nodes, ``None`` for images.
        * ``images``: ``node.media_info["src"]`` for ``"img"`` nodes, ``""``
          for texts.
        * ``metadata``: ``node.media_info`` for images, ``None`` for texts.

        Nodes with any other tag are skipped.

        Args:
            example: Mapping holding the HTML under ``html_column_name`` and
                the page URL under ``url_column_name``. Both names are free
                variables not defined in this block (presumably supplied by
                the enclosing function -- confirm against the caller). The
                mapping is modified in place.

        Returns:
            The same ``example`` with the keys ``texts``, ``images``,
            ``metadata`` (JSON string) and ``general_metadata`` (JSON string)
            set. If simplification raises, ``texts`` and ``images`` are empty
            lists and both JSON fields hold the serialisation of an empty
            list.
        """
        html_str = example[html_column_name]
        page_url = example[url_column_name]

        # Provenance is recorded only when all four columns are present;
        # otherwise it stays an empty dict.
        warc_columns = ("url", "warc_filename", "warc_record_offset", "warc_record_length")
        general_metadata = {}
        if all(column_name in example for column_name in warc_columns):
            general_metadata = {column_name: example[column_name] for column_name in warc_columns}

        try:
            selectolax_tree = dom_tree_simplificator(html_str, type_return="selectolax_tree")
            list_nodes = pre_extraction_simplificator(selectolax_tree, page_url=page_url)
        except Exception as exc:
            # Deliberate best-effort: a page that cannot be simplified yields
            # an empty document instead of aborting the whole run. The message
            # keeps its "EXCEPTION" prefix and now names the page and the
            # error so the failure can be traced.
            print(f"EXCEPTION while simplifying {page_url!r}: {exc!r}")
            example["texts"] = []
            example["images"] = []
            example["metadata"] = json.dumps([])
            # NOTE(review): this is a JSON list, whereas the success path
            # stores a JSON object -- kept as-is so the output is unchanged;
            # confirm whether consumers rely on it.
            example["general_metadata"] = json.dumps([])
            return example

        texts = []
        images = []
        metadata = []
        for node in list_nodes:
            if node.tag == "-text":
                texts.append(node.text)
                # Text entries use "" (not None) in the images list.
                images.append("")
                metadata.append(None)
            elif node.tag == "img":
                texts.append(None)
                images.append(node.media_info["src"])
                metadata.append(node.media_info)

        example["texts"] = texts
        example["images"] = images
        example["metadata"] = json.dumps(metadata)
        example["general_metadata"] = json.dumps(general_metadata)

        return example