def build_param_data()

in modules/url_parameters_removal.py [0:0]


    def build_param_data(url_data_with_similarity):
        """
        For each domain, take the average of query similarity, title similarity,
        body length so we can create domain-specific rules
        """
        param_dat_aa = url_data_with_similarity[
            url_data_with_similarity['param'] == ''].copy()
        param_dat_ab = url_data_with_similarity[
            url_data_with_similarity['param'] != ''].copy()

        # fix a few difficult but popular cases:
        youtube_rows = param_dat_ab[
            "full_domain"].str.contains("www.youtube.com")
        param_dat_ab["same_title"] = np.where(
            youtube_rows & (param_dat_ab["param"] == "v"),
            False, param_dat_ab["same_title"])

        google_rows = param_dat_ab[
            "full_domain"].str.contains("www.google.com")
        param_dat_ab["same_title"] = np.where(
            google_rows & (param_dat_ab.param == "url"),
            False, param_dat_ab['same_title'])

        # take the average of query similarity, title similarity, body length
        param_domain = param_dat_ab.groupby(["full_domain", "param"])[
            "qsim", "same_title", "body_length"
        ].mean()
        param_domain = param_domain.reset_index()

        # subtract AA test similarity from difference
        same_url_means = param_dat_aa.groupby(
            ["full_domain"])["qsim", "same_title"].mean()
        same_url_means = same_url_means.reset_index()
        same_url_means.columns = ["full_domain", "gsim_mean", "same_title_mean"]
        same_url_means = same_url_means.astype({
            "same_title_mean": "float64"})

        # Join it back together
        param_domain = pd.merge(param_domain, same_url_means)
        param_domain["diff_gsim"] = \
            param_domain["gsim_mean"] - param_domain["qsim"]
        param_domain["diff_same_title"] = \
            param_domain["same_title_mean"] \
            - param_domain["same_title"].astype('float64')
        return param_domain