def get_body()

in readability/htmls.py [0:0]


def get_body(doc):
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()
    # tostring() always return utf-8 encoded string
    # FIXME: isn't better to use tounicode?
    raw_html = str_(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
        return raw_html