in modules/url_parameters_removal.py [0:0]
def parse_urls_for_param(self):
    """Extract the query-parameter names of every URL in ``self.url_data``.

    Reads the ``url_id`` and ``canonical_url`` columns of ``self.url_data``
    and parses the query string of each canonical URL.

    Returns:
        A ``pandas.DataFrame`` with columns ``url_id`` and ``param`` holding
        one row per (URL, distinct parameter name) pair. URLs without any
        query parameter contribute no rows.

    Note:
        ``parse_qs`` is called with its defaults, so parameters whose value
        is blank (e.g. ``?foo=``) are not reported.
    """
    url_data = self.url_data
    total = url_data.shape[0]

    # Walk the two columns positionally rather than with ``.loc[i]``:
    # label-based lookup only works when the index is exactly 0..n-1 and
    # raises KeyError (or picks the wrong row) on any other index. It also
    # avoids materialising a full row Series on every iteration.
    url_ids = url_data['url_id'].tolist()
    canonical_urls = url_data['canonical_url'].tolist()

    urls_id_list = []
    urls_param_list = []
    for i, (url_id, canonical_url) in enumerate(zip(url_ids, canonical_urls)):
        query_string = urlparse.urlparse(canonical_url).query
        # Keys of the parse_qs mapping are the distinct parameter names.
        params = list(urlparse.parse_qs(query_string))
        # Repeat the id once per parameter so both lists stay aligned.
        urls_id_list.extend([url_id] * len(params))
        urls_param_list.extend(params)
        if i % 100000 == 0:
            # Coarse progress report on stderr for long-running inputs.
            print("progress: {} / {}".format(i, total),
                  file=sys.stderr)

    urls_with_param = pd.DataFrame({
        'url_id': urls_id_list,
        'param': urls_param_list
    })
    return urls_with_param