in modules/url_comparison.py [0:0]
def compare_two_soups(self, soup_1, soup_2):
    """
    Compare the content held by a pair of URLContentFetcher objects.

    Use difflib.SequenceMatcher to get the ratio of similar text between
    the two bodies and record whether the two titles are the same.
    difflib.SequenceMatcher ratio is equal to 2.0 * M/T, where M is the
    number of matches and T is the number of elements in both sequences.
    See https://docs.python.org/2/library/difflib.html.

    Parameters:
        soup_1: URLContentFetcher for the first URL.
        soup_2: URLContentFetcher for the second URL.

    Returns:
        pd.Series with the keys:
            "success"      - True if the comparison completed, else False.
            "message"      - None on success; on failure the exception
                             text followed by ", url: <soup_1.url>".
            "dl_ratio"     - SequenceMatcher ratio of the two bodies.
            "running_time" - seconds elapsed (time.time()) between
                             entering the try block and finishing the
                             comparison. NOTE(review): whether this also
                             covers fetching/parsing depends on whether
                             get_body()/get_title() do that work lazily;
                             confirm against URLContentFetcher.
            "body_length"  - len() of the first body only.
            "same_title"   - True if both titles compare equal.
        On failure every key except "success" and "message" is None.

    Raises:
        AssertionError: if either argument is not a URLContentFetcher
            (only while assertions are enabled). Every other exception
            is caught, logged and reported through the returned Series.
    """
    assert isinstance(soup_1, URLContentFetcher)
    assert isinstance(soup_2, URLContentFetcher)
    try:
        start_time = time.time()
        # Read each body exactly once and reuse it for logging, diffing
        # and the length, instead of calling get_body() repeatedly.
        body_1 = soup_1.get_body()
        body_2 = soup_2.get_body()
        if self.verbose:
            # Lazy %-style arguments: a non-str body can no longer make
            # the log call itself raise, so turning on verbose does not
            # change the outcome of the comparison.
            logging.info("soup1: %s", body_1)
            logging.info("soup2: %s", body_2)
        dl_ratio = difflib.SequenceMatcher(None, body_1, body_2).ratio()
        body_length = len(body_1)
        # Compare the first title against the SECOND one; comparing
        # soup_1 with itself made this flag unconditionally True.
        same_title = soup_1.get_title() == soup_2.get_title()
        running_time = time.time() - start_time
        return pd.Series({
            "success": True,
            "message": None,
            "dl_ratio": dl_ratio,
            "running_time": running_time,
            "body_length": body_length,
            "same_title": same_title,
        })
    except Exception as e:
        # Best-effort by design: report the failure as a result row
        # rather than propagating, so one bad pair does not abort a batch.
        message = str(e) + ", url: {0}".format(soup_1.url)
        logging.error(message)
        return pd.Series({
            "success": False,
            "message": message,
            "dl_ratio": None,
            "running_time": None,
            "body_length": None,
            "same_title": None,
        })