in readability/htmls.py [0:0]
def get_body(doc):
    """Return the markup of the document body as a cleaned HTML string.

    Every ``<script>``, ``<link>`` and ``<style>`` element is dropped from
    ``doc`` in place before serialising, so the caller's tree is mutated.

    :param doc: parsed HTML tree exposing ``xpath()`` and a ``body``
        attribute (presumably an ``lxml.html`` element -- confirm against
        callers).
    :returns: the result of ``clean_attributes()`` applied to the serialised
        ``doc.body``, or to the serialised ``doc`` itself when ``doc.body``
        is ``None``.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # Compare against None explicitly instead of writing ``doc.body or doc``:
    # for lxml elements truthiness reflects the number of child elements, so
    # a <body> holding only text would be falsy and the whole document would
    # be serialised in its place.
    body = doc.body
    root = doc if body is None else body

    # tostring() output is passed through str_() before cleaning
    # (presumably to normalise it to text -- confirm against str_).
    # FIXME: isn't better to use tounicode?
    raw_html = str_(tostring(root))

    # The previous try/except wrapped nothing but ``return cleaned``, which
    # cannot raise, so its ``return raw_html`` fallback was unreachable.
    # Errors raised by clean_attributes() propagate exactly as before.
    return clean_attributes(raw_html)