in src/jobs/util/labeled_data_utils.py [0:0]
def get_browse_group_from_history(history_df: pd.DataFrame) -> dict[str, int]:
    """
    Group page views into 'browse groups' by following their provenance.

    Each row of ``history_df`` is one page view: ``id`` identifies the view,
    ``from_visit`` is the ``id`` of the view it was reached from, and ``url``
    is the page address.

    A view starts a new group when its ``from_visit`` is 0 or does not match
    any ``id`` in the frame; such views are numbered 0, 1, 2, ... in row
    order. Every other view inherits the group of the view it came from.

    Side effect: a ``browse_group`` column is added to (or overwritten on)
    ``history_df`` in place.

    Views whose provenance chain never reaches a group-starting view (for
    example ids that only reference each other) receive no group number and
    end up with a missing value.

    Returns a mapping from ``url`` to browse group. When a URL occurs in more
    than one row, the group of the last such row is kept.
    """
    # Pull the two columns out once; plain lists are far cheaper to walk
    # repeatedly than DataFrame.iterrows(), which builds a Series per row.
    visit_ids = history_df["id"].to_list()
    parent_ids = history_df["from_visit"].to_list()
    known_ids = set(visit_ids)

    group_of = {}
    next_group = 0
    for visit_id, parent_id in zip(visit_ids, parent_ids):
        # No recorded origin, or an origin outside this frame: new group.
        if parent_id == 0 or parent_id not in known_ids:
            group_of[visit_id] = next_group
            next_group += 1

    # Propagate group numbers down the provenance chains. Only rows that are
    # still unresolved are re-examined on each pass, in their original order.
    pending = [
        (visit_id, parent_id)
        for visit_id, parent_id in zip(visit_ids, parent_ids)
        if visit_id not in group_of
    ]
    progressed = bool(pending)
    while progressed:
        progressed = False
        still_pending = []
        for visit_id, parent_id in pending:
            if visit_id in group_of:
                # An earlier row with the same id was resolved in this pass;
                # the first assignment wins.
                continue
            if parent_id in group_of:
                group_of[visit_id] = group_of[parent_id]
                progressed = True
            else:
                still_pending.append((visit_id, parent_id))
        pending = still_pending

    history_df["browse_group"] = history_df["id"].apply(lambda x: group_of.get(x))
    return dict(zip(history_df["url"], history_df["browse_group"]))