def remove_pii_params()

in modules/url_parameters_removal.py [0:0]


    def remove_pii_params(urls, lower=None, upper=None):
        """
        Drop PII-bearing query parameters and e-mail addresses from URLs.

        The rows of ``urls`` are grouped by their ``canonical_url`` column and
        every group whose position (in group-iteration order) falls inside
        the half-open window ``[lower, upper)`` is handed to
        ``URLParametersRemoval.drop_query_params``. The actual parameter
        filtering happens in that helper, which is not shown here. Afterwards
        anything that looks like an e-mail address inside ``clean_url`` is
        replaced with the literal ``<EMAIL>``.

        This function does not persist intermediate results itself; the
        ``lower`` / ``upper`` window lets a caller process the groups in
        slices and store each slice separately.

        Args:
            urls: pandas DataFrame with (at least) a ``canonical_url`` column.
            lower: index of the first group to process, or ``None`` to start
                with the first group.
            upper: index one past the last group to process, or ``None`` to
                run to the final group.

        Returns:
            A DataFrame with the columns ``index``, ``urlid``,
            ``canonical_url``, ``clean_url``, ``params_dropped`` and
            ``params_kept``, one row per processed group. If no group was
            processed the frame is empty but still has these columns.
        """
        output_columns = ['index', 'urlid', 'canonical_url', 'clean_url',
                          'params_dropped', 'params_kept']

        urls_grouped = urls.groupby('canonical_url')
        # The group count never changes, so look it up once for the
        # progress messages instead of on every report.
        group_count = len(urls_grouped)

        results = []
        for i, (_key, url_group) in enumerate(urls_grouped):
            if i % 10000 == 0:
                print("progress: {} / {}".format(i, group_count),
                      file=sys.stderr)
            if lower is not None and i < lower:
                continue
            if upper is not None and i >= upper:
                break
            results.append(URLParametersRemoval.drop_query_params(url_group))

        if not results:
            # Nothing was processed (empty input or an empty window).
            # Expanding an empty Series yields no columns, so the column
            # assignment below would raise; return the empty schema instead.
            return pd.DataFrame(columns=output_columns)

        # Each result is expected to expand into five fields; together with
        # the positional index restored by reset_index that gives the six
        # named columns.
        clean_urls = pd.Series(results).apply(pd.Series)
        clean_urls = clean_urls.reset_index(level=0)
        clean_urls.columns = output_columns

        # Remove email addresses
        email_pattern = re.compile(
            (r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'))

        clean_urls['clean_url'] = \
            clean_urls['clean_url'].replace(
                email_pattern, '<EMAIL>', regex=True)
        return clean_urls