in readability/readability.py [0:0]
def summary(self, html_partial=False):
    """
    Given a HTML file, extracts the text of the article.

    The document is processed in up to two passes. The first ("ruthless")
    pass calls ``remove_unlikely_candidates()`` before scoring. If that
    pass yields no best candidate, or the sanitized result is shorter
    than ``self.retry_length``, the document is re-parsed and processed
    again without that removal step. If the second pass also finds no
    best candidate, the ``body`` element (or the whole document when
    there is no ``body``) is sanitized and returned instead.

    :param html_partial: return only the div of the document, don't wrap
        in html and body tags (forwarded to ``get_article``).
    :return: whatever ``self.sanitize(article, candidates)`` produces for
        the selected article.
    :raises: any exception raised during processing is logged and handed
        to ``raise_with_traceback`` together with ``Unparseable`` and the
        original traceback.

    Warning: It mutates internal DOM representation of the HTML document,
    so it is better to call other API methods before this one.
    """
    try:
        aggressive = True
        while True:
            # Rebuild the document on every pass, then drop non-content
            # tags and tag each body element with a fixed id.
            self._html(True)
            for node in self.tags(self.html, "script", "style"):
                node.drop_tree()
            for body_node in self.tags(self.html, "body"):
                body_node.set("id", "readabilityBody")

            # Only the first pass prunes unlikely candidates.
            if aggressive:
                self.remove_unlikely_candidates()
            self.transform_misused_divs_into_paragraphs()

            candidates = self.score_paragraphs()
            winner = self.select_best_candidate(candidates)

            if winner:
                article = self.get_article(
                    candidates, winner, html_partial=html_partial
                )
            elif aggressive:
                log.info("ruthless removal did not work. ")
                aggressive = False
                log.debug(
                    "ended up stripping too much - "
                    "going for a safer _parse"
                )
                # Second attempt, this time without the pruning step.
                continue
            else:
                log.debug(
                    "Ruthless and lenient parsing did not work. "
                    "Returning raw html"
                )
                body = self.html.find("body")
                article = self.html if body is None else body

            cleaned = self.sanitize(article, candidates)
            long_enough = len(cleaned or "") >= self.retry_length

            # Accept the result unless this was the aggressive pass and it
            # came out too short; in that case fall through and retry.
            if long_enough or not aggressive:
                return cleaned
            aggressive = False
    except Exception as err:
        log.exception("error getting summary: ")
        # Pick the re-raise helper matching the running major version.
        if sys.version_info[0] == 2:
            from .compat.two import raise_with_traceback
        else:
            from .compat.three import raise_with_traceback
        raise_with_traceback(Unparseable, sys.exc_info()[2], str_(err))