def text_cleanup()

in python/mhtml_to_json.py [0:0]


def text_cleanup(node):
    """Restrict ``node`` to an allow-list of text-related HTML tags.

    The allow-list is assembled below and handed, together with ``node``,
    to ``remove_all_but_text_nodes``; the same ``node`` object that was
    passed in is then returned.

    Tag names come from
    https://developer.mozilla.org/en-US/docs/Web/HTML/Element

    NOTE(review): ``remove_all_but_text_nodes`` is defined elsewhere;
    presumably it prunes ``node`` in place, since its return value is
    not used here -- confirm against its definition.
    """
    # Block-level text containers and list structures.
    block_tags = [
        "blockquote", "dd", "div", "dl", "dt", "figcaption",
        "hr", "li", "ol", "p", "pre", "ul",
    ]
    # Section headings.
    heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    # Inline text-level elements.
    inline_tags = [
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data",
        "dfn", "em", "i", "kbd", "mark", "q", "rb", "rp", "rt", "rtc",
        "ruby", "s", "samp", "small", "span", "strong", "sub", "sup",
        "time", "u", "var", "wbr",
    ]
    # Table structure.
    table_tags = [
        "caption", "col", "colgroup", "table", "tbody",
        "td", "tfoot", "th", "thead", "tr",
    ]

    # Concatenate in this order so the helper receives the same list
    # (same elements, same sequence) as a single flat literal would give.
    allowed_tags = block_tags + heading_tags + inline_tags + table_tags
    remove_all_but_text_nodes(node, allowed_tags)
    return node