def _extract_images_not_in_simplified_dom_tree()

in vision/m4/sourcing/data_collection/processors/pair_extractor.py [0:0]


    def _extract_images_not_in_simplified_dom_tree(self, html_str, page_url, images_in_simplified_dom_tree):
        """Collect metadata for page images that are absent from the simplified DOM tree.

        Every ``<img>`` node of the parsed HTML is passed through
        ``simplify_media_node``; entries whose ``"src"`` already appears in
        ``images_in_simplified_dom_tree`` are skipped. Each remaining image is
        fetched and, when the fetch succeeds, its entry is enriched with size,
        format and (optionally) CLIP score information.

        Args:
            html_str: HTML of the page, handed to ``make_selectolax_tree``.
            page_url: URL of the page, forwarded to ``simplify_media_node``.
            images_in_simplified_dom_tree: Iterable of media-info mappings, each
                indexed by ``"src"``, describing the images already extracted.

        Returns:
            A list of media-info entries for the images that could be fetched.
            Each has ``"original_width"``, ``"original_height"`` and
            ``"image_in_simplified_dom_tree"`` (always ``False``) set, plus
            ``"format"`` when the fetched image reports one.
        """
        tree = make_selectolax_tree(html_str)
        img_nodes = [node for node in tree.root.traverse() if node.tag == "img"]

        # Simplify every <img> node and keep only the truthy results.
        simplified = (simplify_media_node(node, page_url=page_url) for node in img_nodes)
        page_images = [info for info in simplified if info]

        known_sources = {info["src"] for info in images_in_simplified_dom_tree}
        missing_images = [info for info in page_images if info["src"] not in known_sources]

        extracted = []
        for info in missing_images:
            image = fetch_single_image(info["src"], timeout=1)
            if image is None:
                # Images that could not be fetched are left out of the result.
                continue

            width, height = image.size
            info["original_width"] = width
            info["original_height"] = height
            if image.format:
                info["format"] = image.format.lower()
            if self.extract_clip_scores:
                # The helper's return value replaces the entry from here on.
                info = self._get_clip_scores(info, image)
            info["image_in_simplified_dom_tree"] = False
            extracted.append(info)

        return extracted