in modules/url_parameters_removal.py [0:0]
def remove_pii_params(urls, lower=None, upper=None):
    """
    Drop PII-bearing query parameters and mask e-mail addresses in URLs.

    The rows of ``urls`` are grouped by ``canonical_url`` and each group is
    handed to ``URLParametersRemoval.drop_query_params``. Progress is
    reported on stderr every 10000 groups. ``lower`` and ``upper`` select a
    half-open window ``[lower, upper)`` of group positions (in groupby
    iteration order) so that a large input can be processed in slices.

    Args:
        urls: DataFrame containing at least a ``canonical_url`` column.
        lower: Optional zero-based position of the first group to process;
            groups before it are skipped. ``None`` means start at 0.
        upper: Optional zero-based position at which processing stops
            (exclusive). ``None`` means run to the last group.

    Returns:
        DataFrame with columns ``index``, ``urlid``, ``canonical_url``,
        ``clean_url``, ``params_dropped`` and ``params_kept``, one row per
        processed group. E-mail addresses found in ``clean_url`` are
        replaced by the literal ``<EMAIL>``. When no group falls inside the
        requested window, an empty DataFrame with the same columns is
        returned.
    """
    column_names = ['index', 'urlid', 'canonical_url', 'clean_url',
                    'params_dropped', 'params_kept']

    urls_grouped = urls.groupby('canonical_url')
    total_groups = len(urls_grouped)

    results = []
    for i, (_key, url_group) in enumerate(urls_grouped):
        if i % 10000 == 0:
            print("progress: {} / {}".format(i, total_groups),
                  file=sys.stderr)
        if lower is not None and i < lower:
            continue
        if upper is not None and i >= upper:
            break
        # NOTE(review): presumably returns a 5-item record (urlid,
        # canonical_url, clean_url, params_dropped, params_kept); the column
        # assignment below requires exactly that width -- confirm.
        results.append(URLParametersRemoval.drop_query_params(url_group))

    # Nothing processed (empty input or an empty [lower, upper) window):
    # expanding an empty Series cannot produce the expected columns, so
    # return an explicitly shaped empty frame instead of failing.
    if not results:
        return pd.DataFrame(columns=column_names)

    # Expand each per-group record into its own set of columns.
    clean_urls = pd.Series(results).apply(pd.Series)
    clean_urls.reset_index(level=0, inplace=True)
    clean_urls.columns = column_names

    # Remove email addresses
    email_pattern = re.compile(
        (r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'))
    clean_urls['clean_url'] = clean_urls['clean_url'].replace(
        email_pattern, '<EMAIL>', regex=True)
    return clean_urls