def drop_params_via_similarity()

in modules/url_parameters_removal.py [0:0]


    def drop_params_via_similarity(
            urls_with_param, param_domain,
            same_title_upper_bound=0.95,
            mean_diff_gsim_lower_bound=0.02,
            mean_diff_gsim_upper_bound=0.98,
            body_length_lower_bound=100):
        """Decide, per URL, whether each query parameter should be kept.

        A parameter is kept only when removing it meaningfully changes the
        page (a different title together with some content change, or a very
        large content change on its own) AND the parameter does not match the
        hard-coded PII/tracking drop list below.

        Args:
            urls_with_param: DataFrame with at least the columns
                'url_id', 'full_domain', 'canonical_url' and 'param'.
            param_domain: per-(full_domain, param) measurements, left-merged
                onto ``urls_with_param``; expected to carry 'same_title',
                'diff_gsim' and 'body_length' columns (NaN where the site
                could not be reached).
            same_title_upper_bound: 'same_title' scores at or above this are
                treated as "title unchanged".
            mean_diff_gsim_lower_bound: minimum content difference (beyond
                refresh noise) required for a title-changing param to be kept.
            mean_diff_gsim_upper_bound: content difference above which the
                param is kept regardless of title similarity.
            body_length_lower_bound: pages with bodies at or below this
                length are never kept (too little content to judge).

        Returns:
            DataFrame with one row per (url, param) and a boolean 'keep'
            column; False means the parameter should be stripped.
        """
        urls = urls_with_param[
            ['url_id', 'full_domain', 'canonical_url', 'param']]
        urls = pd.merge(
            urls, param_domain, how="left", on=["full_domain", "param"])
        urls['url'] = urls['canonical_url']

        # keep list of parameters to remove for each URL, defaults to False
        if 'keep' not in urls:
            urls['keep'] = False

        # THIS IS WHERE THE RUBBER MEETS THE ROAD.
        # Keep params that when removed result in a webpage with a different title or
        # a page whose content is very different. Note that urls['diff'] is
        # the difference above and beyond the change when the URL is refreshed.

        # convert same_title from bool to float before going further
        urls = urls.astype({'same_title': 'float'})
        keep_idx = (
            (((urls['same_title'] < same_title_upper_bound)
                & (abs(urls['diff_gsim']) > mean_diff_gsim_lower_bound))
                | (urls['diff_gsim'] > mean_diff_gsim_upper_bound))
            & (urls['body_length'] > body_length_lower_bound))

        # NaN metrics compare False above, so unreachable/unmeasured rows
        # never flip keep to True here.
        # keep_idx.mean() # 0.21376656596720206, old result
        urls['keep'] = np.where(keep_idx, True, urls['keep'])

        # ^^ THIS SHOULD EVENTUALLY BE DONE VIA ML
        # - Training outcome, something like:

        # Add these parameters + others that we obviously need to remove to
        # our drop list:
        drop = [
            # pii related parameters:
            'pw', 'pass', 'password', 'key', 'username', 'name', 'email',
            'address', 'account', 'ssn', 'dob', 'zipcode',
            'user_id', 'userid', 'accountid', 'account_id',
            # commonly occurring tracking/token-related parameters
            'utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'source',
            'utm_term', 'usp', 'edit_requested', 'ogsrc', 'fbclid',
            'entrypoint', 'redirect', 'platform', 'widgetTypeCall', 'logType',
            'uuid', 'app_id', 'campaign', 'src', 'caption', 'fbrefresh',
            'user', 'cp', 'desc', 'c_id', 'geo', 'cmpid', 'cHash', '_reff',
            'pk_campaign', 'ctype', 's_src', 'referrer',
            'channel', 'uc_param_str', 'fb-share-results', 'cpidfb',
            'content_type', 'tag', 'campaign_id', 'cID', 'channel_id',
            'NONCE_TOKEN', 'reco_id', 'promo_id',
        ]

        # Add params to a drop index if they match.
        # NOTE(review): str.contains does SUBSTRING matching, so short tokens
        # such as 'pw', 'cp' or 'tag' also flag benign params that merely
        # contain them — confirm this looseness is intended before tightening
        # to whole-token matching.
        drop_str = '|'.join(drop)
        # na=True: a null param means we couldn't reach the website to check
        # query params, so conservatively add it to the drop list.
        drop_idx = urls['param'].str.contains(drop_str, regex=True, na=True)

        # Note that our parameter keep-list should shrink a touch in response to
        # this filtering.
        urls['keep'] = np.where(drop_idx, False, urls['keep'])

        return urls