in readability/readability.py [0:0]
def score_paragraphs(self):
    """Score the ancestors of text-bearing elements as content candidates.

    Iterates over ``self.tags(self._html(), "p", "pre", "td")``. Every
    element whose cleaned text is at least ``self.min_text_length``
    characters long adds its score to its parent (in full) and to its
    grandparent (halved). Once all elements are processed, each
    candidate's accumulated score is multiplied by ``1 - link_density``.

    Returns:
        dict: maps each candidate node to the entry created for it by
        ``self.score_node``, with its ``"content_score"`` value updated
        in place.
    """
    min_len = self.min_text_length
    candidates = {}
    # Candidates in the order they were first encountered; the
    # link-density pass below walks them in this order.
    ordered = []

    for elem in self.tags(self._html(), "p", "pre", "td"):
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Elements with less text than the configured minimum are ignored.
        if inner_text_len < min_len:
            continue

        # Register the parent first, then the grandparent (when there is
        # one), the first time either of them is seen.
        for node in (parent_node, grand_parent_node):
            if node is not None and node not in candidates:
                candidates[node] = self.score_node(node)
                ordered.append(node)

        # One base point, one point per comma-separated segment, and one
        # point per 100 characters of text, capped at three.
        content_score = 1
        content_score += len(inner_text.split(","))
        content_score += min(inner_text_len / 100, 3)

        # The parent receives the full score, the grandparent half of it.
        candidates[parent_node]["content_score"] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]["content_score"] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content
    # should have a relatively small link density (5% or less) and be
    # mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = self.get_link_density(elem)
        score = candidate["content_score"]
        # Hand the values to the logger as arguments so the message is
        # only formatted when debug output is actually emitted.
        log.debug(
            "Branch %6.3f %s link density %.3f -> %6.3f",
            score,
            describe(elem),
            ld,
            score * (1 - ld),
        )
        candidate["content_score"] *= 1 - ld

    return candidates