in modules/url_parameters_removal.py [0:0]
def drop_params_via_similarity(
        urls_with_param, param_domain,
        same_title_upper_bound=0.95,
        mean_diff_gsim_lower_bound=0.02,
        mean_diff_gsim_upper_bound=0.98,
        body_length_lower_bound=100):
    """Flag which query parameters should be kept for each URL.

    A parameter is kept only when removing it visibly changes the page:
    either the title differs (``same_title`` below
    ``same_title_upper_bound``) together with a content-similarity delta
    above ``mean_diff_gsim_lower_bound``, or the similarity delta alone
    exceeds ``mean_diff_gsim_upper_bound`` — and, in both cases, the page
    body is long enough (above ``body_length_lower_bound``) for the
    comparison to be meaningful.  Parameters matching a PII/tracking
    deny-list are always dropped, as are parameters for pages that could
    not be fetched (null measurements).

    Parameters
    ----------
    urls_with_param : pandas.DataFrame
        Must contain ``url_id``, ``full_domain``, ``canonical_url`` and
        ``param`` columns.
    param_domain : pandas.DataFrame
        Per ``(full_domain, param)`` measurements, expected to carry
        ``same_title``, ``diff_gsim`` and ``body_length`` columns
        (NaN where the site was unreachable).
    same_title_upper_bound : float, optional
        Keep a param only if ``same_title`` falls below this (title changed).
    mean_diff_gsim_lower_bound : float, optional
        Minimum content-similarity delta required alongside a title change.
    mean_diff_gsim_upper_bound : float, optional
        Content-similarity delta that alone justifies keeping the param.
    body_length_lower_bound : int, optional
        Minimum page-body length for the similarity signals to be trusted.

    Returns
    -------
    pandas.DataFrame
        The selected ``urls_with_param`` columns merged with the
        ``param_domain`` measurements, plus a ``url`` copy of
        ``canonical_url`` and a boolean ``keep`` column.
    """
    import re  # local import: keeps this module's top-level imports untouched

    urls = urls_with_param[
        ['url_id', 'full_domain', 'canonical_url', 'param']]
    urls = pd.merge(
        urls, param_domain, how="left", on=["full_domain", "param"])
    urls['url'] = urls['canonical_url']
    # Per-URL flag of which parameters to keep; defaults to False.
    if 'keep' not in urls:
        urls['keep'] = False
    # THIS IS WHERE THE RUBBER MEETS THE ROAD.
    # Keep params that when removed result in a webpage with a different
    # title or a page whose content is very different.  Note that
    # urls['diff_gsim'] is the difference above and beyond the change when
    # the URL is refreshed.
    # Convert same_title from bool to float before comparing to the bound;
    # NaN (unreachable page) compares False everywhere below, so such rows
    # never enter the keep set.
    urls = urls.astype({'same_title': 'float'})
    keep_idx = (
        (((urls['same_title'] < same_title_upper_bound)
          & (abs(urls['diff_gsim']) > mean_diff_gsim_lower_bound))
         | (urls['diff_gsim'] > mean_diff_gsim_upper_bound))
        & (urls['body_length'] > body_length_lower_bound))
    urls['keep'] = np.where(keep_idx, True, urls['keep'])
    # ^^ THIS SHOULD EVENTUALLY BE DONE VIA ML
    # - Training outcome, something like:
    # Add these parameters + others that we obviously need to remove to
    # our drop list:
    drop = [
        # pii related parameters:
        'pw', 'pass', 'password', 'key', 'username', 'name', 'email',
        'address', 'account', 'ssn', 'dob', 'zipcode',
        'user_id', 'userid', 'accountid', 'account_id',
        # commonly occurring tracking/token-related parameters
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'source',
        'utm_term', 'usp', 'edit_requested', 'ogsrc', 'fbclid',
        'entrypoint', 'redirect', 'platform', 'widgetTypeCall', 'logType',
        'uuid', 'app_id', 'campaign', 'src', 'caption', 'fbrefresh',
        'user', 'cp', 'desc', 'c_id', 'geo', 'cmpid', 'cHash', '_reff',
        'pk_campaign', 'ctype', 's_src', 'referrer',
        'channel', 'uc_param_str', 'fb-share-results', 'cpidfb',
        'content_type', 'tag', 'campaign_id', 'cID', 'channel_id',
        'NONCE_TOKEN', 'reco_id', 'promo_id',
    ]
    # Substring match (deliberately unanchored): each token is escaped so
    # that future additions containing regex metacharacters cannot corrupt
    # the alternation pattern.
    drop_str = '|'.join(re.escape(token) for token in drop)
    # na=True: a null param means we couldn't reach the website to check
    # query params, so such params go straight to the drop list.
    drop_idx = urls['param'].str.contains(drop_str, regex=True, na=True)
    # Note that our parameter keep-list should shrink a touch in response
    # to this filtering.
    urls['keep'] = np.where(drop_idx, False, urls['keep'])
    return urls