def get_browse_group_from_history()

in src/jobs/util/labeled_data_utils.py [0:0]


def get_browse_group_from_history(history_df: pd.DataFrame) -> dict[str, int]:
    """
    Assign every page view in a browsing history to a 'browse_group'.

    A browse group is a chain of visits linked through ``from_visit``: a row
    starts a new group when its ``from_visit`` is 0 or does not match any
    ``id`` in the frame, and every other row inherits the group of the row
    it was reached from.

    Args:
        history_df: Browsing history with the columns ``id`` (id of the page
            view), ``from_visit`` (id of the page view it was reached from)
            and ``url``. A ``browse_group`` column is added to (or
            overwritten on) this frame in place.

    Returns:
        Mapping of url to browse group index. Group indices are handed out
        in row order, starting at 0. When several rows share a url, the
        group of the last such row is kept. Visits whose ``from_visit``
        chain never reaches a group-starting row (for example a cycle) get
        no group, so their value is missing rather than an int.
    """
    # Pull the two columns out once; iterating plain lists avoids building a
    # Series per row the way DataFrame.iterrows() does.
    visit_ids = history_df["id"].to_list()
    parent_ids = history_df["from_visit"].to_list()
    all_ids = set(visit_ids)

    # Pass 1: every row without a known parent starts a new group.
    cur_assignments = {}
    next_group = 0
    for visit_id, parent_id in zip(visit_ids, parent_ids):
        if parent_id == 0 or parent_id not in all_ids:
            cur_assignments[visit_id] = next_group
            next_group += 1

    # Pass 2..n: push group indices down the from_visit chains. Only rows that
    # are still unresolved are revisited, in their original order.
    pending = [
        (visit_id, parent_id)
        for visit_id, parent_id in zip(visit_ids, parent_ids)
        if visit_id not in cur_assignments
    ]
    while pending:
        unresolved = []
        for visit_id, parent_id in pending:
            if visit_id in cur_assignments:
                # Already resolved earlier in this pass by a row sharing the id.
                continue
            if parent_id in cur_assignments:
                cur_assignments[visit_id] = cur_assignments[parent_id]
            else:
                unresolved.append((visit_id, parent_id))
        if len(unresolved) == len(pending):
            # No progress: the remaining rows can never be resolved.
            break
        pending = unresolved

    history_df["browse_group"] = history_df["id"].apply(cur_assignments.get)
    return dict(zip(history_df["url"], history_df["browse_group"]))