in vision/m4/sourcing/data_collection/utils/simplification_utils.py [0:0]
def get_media_src(node):
    """Return the source URL of an ``img``, ``video`` or ``audio`` node.

    Args:
        node: An object exposing ``tag`` (the tag name), ``attributes`` (a
            mapping from attribute name to value, where a value may be
            falsy, e.g. ``None`` or ``""``) and ``iter()`` (an iterable of
            nodes that expose the same ``tag`` / ``attributes`` members).

    Returns:
        The selected source string, or ``None`` when the tag is not
        supported, no non-empty source attribute is found, or the selected
        value contains a comma or a space.
    """
    attributes = node.attributes
    tag = node.tag
    src = None

    if tag == "img":
        # Attribute names probed in priority order; the first non-empty value
        # that contains neither a comma nor a space wins.
        img_source_types = (
            "src",
            "data-src",
            "data-src-fg",
            "data-scroll-image",
            "srcset",
            "data-lateloadsrc",
            "data-img-src",
            "data-original",
            "data-gt-lazy-src",
            "data-lazy",
            "data-lazy-src",
            "src2",
        )
        for source_type in img_source_types:
            if source_type not in attributes:
                continue
            candidate = attributes[source_type]
            if candidate and "," not in candidate and " " not in candidate:
                src = candidate
                break
    elif tag in ("video", "audio"):
        # Both media tags follow the same rule: the node's own ``src`` takes
        # precedence, otherwise use the first ``source`` node yielded by
        # ``node.iter()`` that carries a non-empty ``src``.
        if "src" in attributes and attributes["src"]:
            src = attributes["src"]
        else:
            for child in node.iter():
                if child.tag != "source":
                    continue
                child_attributes = child.attributes
                if "src" in child_attributes and child_attributes["src"]:
                    src = child_attributes["src"]
                    # First match wins; no need to look at later nodes.
                    break
    else:
        return None  # TODO iframes

    # Commas and spaces are non-canonical in URLs, so such values are rejected.
    # For images this is redundant with the filter above; for video/audio it
    # is the only check (a rejected value does not fall back to a later
    # ``source`` node).
    # TODO: have checks on valid URLs
    if src is not None and ("," in src or " " in src):
        return None
    return src