in modules/url_parameters_removal.py [0:0]
import numpy as np
import pandas as pd


def build_param_data(url_data_with_similarity):
"""
For each domain, take the average of query similarity, title similarity,
body length so we can create domain-specific rules
"""
    # Split into an AA set (no query parameter) and the parameterised set
    param_dat_aa = url_data_with_similarity[
        url_data_with_similarity['param'] == ''].copy()
    param_dat_ab = url_data_with_similarity[
        url_data_with_similarity['param'] != ''].copy()
    # Fix a few difficult but popular cases: YouTube's "v" and Google's "url"
    # parameters do change the page, so force same_title to False for them.
    youtube_rows = param_dat_ab[
        "full_domain"].str.contains("www.youtube.com", regex=False)
    param_dat_ab["same_title"] = np.where(
        youtube_rows & (param_dat_ab["param"] == "v"),
        False, param_dat_ab["same_title"])
    google_rows = param_dat_ab[
        "full_domain"].str.contains("www.google.com", regex=False)
    param_dat_ab["same_title"] = np.where(
        google_rows & (param_dat_ab["param"] == "url"),
        False, param_dat_ab["same_title"])
    # Take the average of query similarity, title similarity and body length
    # for each (domain, param) pair
    param_domain = param_dat_ab.groupby(["full_domain", "param"])[
        ["qsim", "same_title", "body_length"]
    ].mean()
    param_domain = param_domain.reset_index()
    # Per-domain AA-test baseline: mean similarity for URLs with no parameter,
    # which is subtracted from the per-parameter means below
    same_url_means = param_dat_aa.groupby(
        ["full_domain"])[["qsim", "same_title"]].mean()
    same_url_means = same_url_means.reset_index()
    same_url_means.columns = ["full_domain", "gsim_mean", "same_title_mean"]
    same_url_means = same_url_means.astype({
        "same_title_mean": "float64"})
    # Join the baseline back onto the per-parameter means and take the
    # difference from the AA baseline
    param_domain = pd.merge(param_domain, same_url_means, on="full_domain")
    param_domain["diff_gsim"] = \
        param_domain["gsim_mean"] - param_domain["qsim"]
    param_domain["diff_same_title"] = \
        param_domain["same_title_mean"] \
        - param_domain["same_title"].astype("float64")
    return param_domain
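

# A minimal usage sketch, not part of the original module: the toy rows below
# are made up purely to illustrate the input shape this function expects.
# One row per fetched URL, with the columns the function touches
# (full_domain, param, qsim, same_title, body_length).
if __name__ == "__main__":
    example = pd.DataFrame({
        "full_domain": ["www.youtube.com", "www.youtube.com",
                        "example.org", "example.org"],
        "param": ["v", "", "utm_source", ""],
        "qsim": [0.20, 0.95, 0.90, 0.92],
        "same_title": [False, True, True, True],
        "body_length": [1200, 1150, 800, 810],
    })
    # Rows with an empty param act as the AA baseline for their domain.
    print(build_param_data(example))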