in obelics/utils/simplification_utils.py [0:0]
def simplify_media_node(node, page_url):
    """Summarize an ``img``, ``video`` or ``audio`` node as a flat dict.

    Args:
        node: Parsed HTML node exposing ``tag`` and ``attributes``; its
            source is obtained through ``get_media_src``.
        page_url: URL of the document containing the node. It is stored
            under ``"document_url"`` and used to resolve sources that do not
            start with ``"http"``.

    Returns:
        ``None`` when the node has no source, when the resolved source is
        rejected by ``is_url_valid``, or when the tag is not one of
        ``img`` / ``video`` / ``audio``. Otherwise a dict with:

        * ``img``: ``document_url``, ``unformatted_src``, ``src`` and, when
          available, ``formatted_filename``, ``alt_text``,
          ``rendered_width`` and ``rendered_height``.
        * ``video``: ``document_url``, ``src`` and, when the attributes are
          present on the node, the raw ``width`` / ``height`` values.
        * ``audio``: ``document_url`` and ``src``.
    """
    raw_src = get_media_src(node)
    if not raw_src:
        return None

    # Anything that does not already start with "http" is treated as a path
    # relative to the page and resolved against it.
    if raw_src.startswith("http"):
        resolved_src = raw_src
    else:
        resolved_src = format_relative_to_absolute_path(page_url=page_url, relative_path=raw_src)
    if not is_url_valid(resolved_src):
        return None

    attrs = node.attributes
    tag = node.tag

    if tag == "audio":
        return {"document_url": page_url, "src": resolved_src}

    if tag == "video":
        video = {"document_url": page_url, "src": resolved_src}
        # Video dimensions are copied verbatim (no parsing, ``None`` kept).
        for dim in ("width", "height"):
            if dim in attrs:
                video[dim] = attrs[dim]
        return video

    if tag != "img":
        # Unsupported tag: nothing to simplify.
        return None

    image = {
        "document_url": page_url,
        "unformatted_src": raw_src,
        "src": resolved_src,
    }

    filename = format_filename(resolved_src)
    if filename:
        image["formatted_filename"] = filename

    # Only keep a non-empty alt text.
    if "alt" in attrs and attrs["alt"]:
        image["alt_text"] = attrs["alt"]

    # TODO: eventually, for image sizes we could parse cases like
    # `{'src': 'http://wellbeingteams.org/wp-content/uploads/2017/04/spread600300.jpg',
    # 'width': None, 'height': None, 'alt': None, 'title': 'spread600300', 'class': 'img-responsive wp-image-122',
    # 'srcset': 'https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300-200x100.jpg 200w, https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300-400x200.jpg 400w, https://wellbeingteams.org/wp-content/uploads/2017/04/spread600300.jpg 600w',
    # 'sizes': '(max-width: 800px) 100vw, 400px'}`
    for dim in ("width", "height"):
        if dim not in attrs or attrs[dim] is None:
            continue
        try:
            image[f"rendered_{dim}"] = format_image_size(attrs[dim])
        except ValueError:
            # Unrecognized size format, generally an error in the page:
            # the dimension is simply left out.
            pass
    return image