in modules/url_parameters_removal.py [0:0]
def append_url_similarity(self, url_info):
    """Join ``self.url_data`` with the successful rows of ``url_info``.

    Side effect: when ``self.url_data`` lacks the ``full_domain`` or
    ``url_id`` column, that column is added to ``self.url_data`` in place
    before the join (``full_domain`` is the netloc of ``canonical_url``,
    ``url_id`` is the builtin ``hash`` of ``canonical_url``).

    Args:
        url_info: DataFrame providing the columns ``success``, ``url``,
            ``key``, ``body_length``, ``dl_ratio`` and ``same_title``.
            Only rows selected by the ``success`` column are used.

    Returns:
        The inner merge of ``self.url_data`` with the selected rows, where
        ``url`` has been renamed to ``canonical_url``, ``key`` to ``param``
        and ``dl_ratio`` to ``qsim``.

    Raises:
        AssertionError: if ``self.url_data`` or ``url_info`` is not a
            pandas DataFrame.
        ValueError: if ``self.url_data`` has no ``canonical_url`` column.
    """
    assert isinstance(self.url_data, pd.DataFrame)
    assert isinstance(url_info, pd.DataFrame)

    frame = self.url_data
    if 'canonical_url' not in frame.columns:
        raise ValueError('missing column canonical_url')

    # Derived columns are only computed when absent, and are stored on
    # self.url_data itself so later calls can reuse them.
    if 'full_domain' not in frame.columns:
        parsed_urls = map(urlparse.urlparse, frame['canonical_url'].values)
        frame['full_domain'] = [parts.netloc for parts in parsed_urls]
    if 'url_id' not in frame.columns:
        # NOTE(review): builtin hash() of str is salted per process on
        # Python 3 unless PYTHONHASHSEED is fixed -- confirm url_id is
        # never persisted or compared across runs.
        frame['url_id'] = list(map(hash, frame['canonical_url']))

    kept_columns = ['url', 'key', 'body_length', 'dl_ratio', 'same_title']
    new_names = {
        'url': 'canonical_url',
        'key': 'param',
        'dl_ratio': 'qsim',
    }
    successful = url_info[url_info['success']]
    similarity = successful[kept_columns].rename(columns=new_names)

    # No explicit ``on``: pandas joins on every column name the two frames
    # have in common (at least canonical_url after the rename).
    return pd.merge(frame, similarity, how='inner')