def append_url_similarity()

in modules/url_parameters_removal.py [0:0]


    def append_url_similarity(self, url_info):
        """Join per-URL similarity measurements onto ``self.url_data``.

        ``self.url_data`` must contain a ``canonical_url`` column. If it
        lacks ``full_domain`` or ``url_id`` those columns are added to it
        in place (network location of the URL and ``hash()`` of the URL
        string respectively), so the attribute is modified as a side effect.

        Only the rows of ``url_info`` whose ``success`` value is true are
        used. From those rows the columns ``url``, ``key``, ``body_length``,
        ``dl_ratio`` and ``same_title`` are kept, with ``url`` renamed to
        ``canonical_url``, ``key`` to ``param`` and ``dl_ratio`` to ``qsim``.

        Args:
            url_info: DataFrame with at least the columns ``success``,
                ``url``, ``key``, ``body_length``, ``dl_ratio`` and
                ``same_title``.

        Returns:
            A new DataFrame: the inner merge of ``self.url_data`` with the
            filtered and renamed ``url_info`` rows.

        Raises:
            TypeError: if ``self.url_data`` or ``url_info`` is not a
                pandas DataFrame.
            ValueError: if ``self.url_data`` has no ``canonical_url`` column.
            KeyError: raised by pandas if ``url_info`` lacks one of the
                columns listed above.
        """
        # Explicit raises instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let bad input through silently.
        if not isinstance(self.url_data, pd.DataFrame):
            raise TypeError(
                'self.url_data must be a pandas DataFrame, got %s'
                % type(self.url_data).__name__)
        if not isinstance(url_info, pd.DataFrame):
            raise TypeError(
                'url_info must be a pandas DataFrame, got %s'
                % type(url_info).__name__)

        if 'canonical_url' not in self.url_data:
            raise ValueError('missing column canonical_url')

        # Derived columns are only computed when absent, so values that a
        # caller already supplied are never overwritten.
        if 'full_domain' not in self.url_data:
            self.url_data['full_domain'] = [
                urlparse.urlparse(url).netloc
                for url in self.url_data['canonical_url'].values
            ]
        if 'url_id' not in self.url_data:
            # NOTE(review): on Python 3 the built-in hash() of a str is
            # salted per process unless PYTHONHASHSEED is fixed, so these ids
            # would not be stable across runs -- confirm the target
            # interpreter before persisting them.
            self.url_data['url_id'] = [
                hash(url)
                for url in self.url_data['canonical_url']
            ]

        # Keep successful rows only, restricted to the measurement columns.
        wanted_columns = ['url', 'key', 'body_length', 'dl_ratio',
                          'same_title']
        similarity = url_info.loc[url_info['success'], wanted_columns]
        similarity = similarity.rename(columns={
            'url': 'canonical_url',
            'key': 'param',
            'dl_ratio': 'qsim'
        })

        # No ``on=`` is given, so pandas joins on every column name the two
        # frames share. ``canonical_url`` is always one of them; any other
        # shared name in self.url_data would also become part of the key.
        return self.url_data.merge(similarity, how='inner')