in vision/m4/sourcing/data_collection/processors/pair_extractor.py [0:0]
def _extract_images_not_in_simplified_dom_tree(self, html_str, page_url, images_in_simplified_dom_tree):
    """Collect the page's ``img`` elements that are absent from the simplified DOM tree.

    Every node tagged ``img`` in the parsed ``html_str`` is passed through
    ``simplify_media_node``; falsy results are discarded, as are results whose
    ``"src"`` already appears in ``images_in_simplified_dom_tree``. Each
    remaining image is fetched with ``fetch_single_image`` and, when the fetch
    yields an image, enriched with ``"original_width"``,
    ``"original_height"``, ``"format"`` (only if the fetched image reports
    one), whatever ``self._get_clip_scores`` adds (only if
    ``self.extract_clip_scores`` is truthy) and
    ``"image_in_simplified_dom_tree"`` set to ``False``.

    Args:
        html_str: HTML source handed to ``make_selectolax_tree``.
        page_url: Forwarded to ``simplify_media_node`` as ``page_url``
            (presumably used to resolve relative image URLs -- confirm in
            that helper).
        images_in_simplified_dom_tree: Iterable of media-info mappings, each
            exposing a ``"src"`` key, for the images already extracted.

    Returns:
        A list of media-info mappings, in traversal order, restricted to the
        images for which ``fetch_single_image`` did not return ``None``.
    """
    tree = make_selectolax_tree(html_str)
    img_nodes = [node for node in tree.root.traverse() if node.tag == "img"]

    # Simplify every <img> node, keeping only the non-empty results.
    simplified_images = []
    for node in img_nodes:
        simplified = simplify_media_node(node, page_url=page_url)
        if simplified:
            simplified_images.append(simplified)

    already_seen_srcs = {known["src"] for known in images_in_simplified_dom_tree}
    candidates = [info for info in simplified_images if info["src"] not in already_seen_srcs]

    fetched_images = []
    for media_info in candidates:
        image = fetch_single_image(media_info["src"], timeout=1)
        if image is None:
            # The image could not be fetched: leave it out of the result.
            continue

        width, height = image.size
        media_info["original_width"] = width
        media_info["original_height"] = height

        image_format = image.format
        if image_format:
            media_info["format"] = image_format.lower()

        if self.extract_clip_scores:
            media_info = self._get_clip_scores(media_info, image)

        media_info["image_in_simplified_dom_tree"] = False
        fetched_images.append(media_info)

    return fetched_images