def compare_two_soups()

in modules/url_comparison.py [0:0]


    def compare_two_soups(self, soup_1, soup_2):
        """
        Compare the content held by a pair of URLContentFetcher objects.

        The two bodies are compared with difflib.SequenceMatcher and the
        titles are compared for equality. The SequenceMatcher ratio is equal
        to 2.0 * M / T, where M is the number of matches and T is the total
        number of elements in both sequences.
        See https://docs.python.org/3/library/difflib.html.

        Args:
            soup_1: URLContentFetcher for the first URL.
            soup_2: URLContentFetcher for the second URL.

        Returns:
            A pandas Series with the keys:
                success: True if the comparison completed, False otherwise.
                message: None on success; on failure the exception text
                    followed by the URL of soup_1.
                dl_ratio: SequenceMatcher ratio of the two bodies.
                running_time: wall-clock seconds spent inside this method
                    retrieving the bodies/titles and comparing them.
                body_length: length of the body of soup_1 only.
                same_title: whether the titles of soup_1 and soup_2 are equal.
            On failure every key other than success/message is None.

        Raises:
            AssertionError: if either argument is not a URLContentFetcher.
        """

        assert isinstance(soup_1, URLContentFetcher)
        assert isinstance(soup_2, URLContentFetcher)

        try:
            start_time = time.time()

            # Retrieve each body once and reuse it below instead of calling
            # get_body() repeatedly. Kept inside the try block so that a
            # failure here is still reported as an unsuccessful row.
            body_1 = soup_1.get_body()
            body_2 = soup_2.get_body()

            if self.verbose:
                # Lazy %-style arguments: enabling verbose output must not
                # change the outcome when a body is not a plain str.
                logging.info("soup1: %s", body_1)
                logging.info("soup2: %s", body_2)

            # First argument is `isjunk`; None means no element is junk.
            dl_ratio = difflib.SequenceMatcher(None, body_1, body_2).ratio()
            body_length = len(body_1)

            # Compare the first title against the *second* one (previously
            # soup_1 was compared with itself, which is always True).
            same_title = soup_1.get_title() == soup_2.get_title()

            running_time = time.time() - start_time
            return pd.Series({
                "success": True,
                "message": None,
                "dl_ratio": dl_ratio,
                "running_time": running_time,
                "body_length": body_length,
                "same_title": same_title
            })
        except Exception as e:
            # Best-effort: any failure is logged and turned into a row of
            # None values so a batch of comparisons can continue.
            message = str(e) + ", url: {0}".format(soup_1.url)
            logging.error(message)
            return pd.Series({
                "success": False,
                "message": message,
                "dl_ratio": None,
                "running_time": None,
                "body_length": None,
                "same_title": None
            })