def get_media_src()

in obelics/utils/simplification_utils.py [0:0]


def get_media_src(node):
    """Return the source URL of an ``img``, ``video`` or ``audio`` node.

    Args:
        node: An object exposing ``tag``, ``attributes`` (supporting ``in``
            and ``[]`` lookups) and, for video/audio nodes, ``iter()``.

    Returns:
        The selected source string, or ``None`` when the tag is not one of
        the three handled media tags, when no non-empty source attribute is
        found, or when the selected value contains a comma or a space.
    """
    attrs = node.attributes
    tag = node.tag

    def has_separator(value):
        # Commas and spaces are non-canonical in URLs; such values are rejected.
        # TODO: have checks on valid URLs
        return ("," in value) or (" " in value)

    if tag == "img":
        # Attribute names are tried in this order; the first non-empty value
        # without a comma or space wins. Invalid candidates are skipped.
        candidate_names = (
            "src",
            "data-src",
            "data-src-fg",
            "data-scroll-image",
            "srcset",
            "data-lateloadsrc",
            "data-img-src",
            "data-original",
            "data-gt-lazy-src",
            "data-lazy",
            "data-lazy-src",
            "src2",
        )
        for name in candidate_names:
            if name not in attrs:
                continue
            value = attrs[name]
            if value and not has_separator(value):
                return value
        return None

    if tag not in ("video", "audio"):
        return None  # TODO iframes

    # Video and audio share the same lookup: the node's own "src" first,
    # otherwise the first <source> node from node.iter() with a non-empty "src".
    if ("src" in attrs) and attrs["src"]:
        found = attrs["src"]
    else:
        found = None
        for child in node.iter():
            if child.tag != "source":
                continue
            child_attrs = child.attributes
            if ("src" in child_attrs) and child_attrs["src"]:
                found = child_attrs["src"]
                break

    # Unlike the image path, a rejected value here is not replaced by a later
    # candidate: the first non-empty source found is the only one considered.
    if found is None or has_separator(found):
        return None
    return found