src/gen_annotation_data.py (167 lines of code) (raw):
import argparse
import uuid
from dataclasses import dataclass
from random import randint, sample, choice
from typing import List
import json
import os
from datetime import datetime
import pandas as pd
from dotenv import load_dotenv
from util.language_model import LanguageModel
"""
Generate synthetic archetypes and actions for archetypes
"""
DATA_PATH = "./data/synthetic_data_tasks"
STREAMING = "Streaming Videos"
SOCIAL_MEDIA = "Browse Social Media"
SHOPPING = "Online Shopping"
EMAIL = "Online research"
READING = "Reading Articles"
WORK_APPS = "Using web based work applications"
FINANCE = "Online banking and Financial Management"
FORUMS = "Participating in online forums and communities"
TRAVEL = "Researching or Booking Travel"
HOBBY = "Exploring a hobby online"
TASKS = [
STREAMING,
SOCIAL_MEDIA,
SHOPPING,
EMAIL,
READING,
WORK_APPS,
FINANCE,
FORUMS,
TRAVEL,
]
SUBTASKS = {
STREAMING: ["news_subtopics.json"],
SOCIAL_MEDIA: [],
SHOPPING: ["shopping_subtopics.json"],
EMAIL: ["email_subtopics.json"],
READING: ["news_subtopics.json"],
WORK_APPS: [],
FINANCE: [],
FORUMS: ["news_subtopics.json"],
TRAVEL: ["travel_subtopics.json"],
HOBBY: []
}
OPEN_TABS_PER_TASK = {
"default": {
"min": 1,
"max": 10
}
}
TYPICAL_TASK_COUNTS = [1, 2, 3, 4, 5, 9, 10]
TAB_URL_KEY = "url"
TAB_TITLE_KEY = "title"
@dataclass
class UserArchetype:
user_description: str = None
task_list: str = None
class TabGroupingDataGenerator:
all_archetypes: List[UserArchetype] = []
def __init__(self):
self.lm = LanguageModel()
def load_subtopic_data(self):
with open(os.path.join(DATA_PATH, "news_subtopics.json")) as json_file:
self.news_subtopics = json.load(json_file)['subtopics']
with open(os.path.join(DATA_PATH, "shopping_subtopics.json")) as json_file:
self.shopping_subtopics = json.load(json_file)['subtopics']
with open(os.path.join(DATA_PATH, "email_subtopics.json")) as json_file:
self.email_subtopics = json.load(json_file)['subtopics']
with open(os.path.join(DATA_PATH, "travel_subtopics.json")) as json_file:
self.travel_subtopics = json.load(json_file)['subtopics']
with open(os.path.join(DATA_PATH, "user_locations.json")) as json_file:
self.user_locations = json.load(json_file)
def get_subtopics_with_prefixes(self, task):
if task in [STREAMING, READING, FORUMS]:
return self.news_subtopics, "about", 3
elif task in [SHOPPING]:
return self.shopping_subtopics, "for", 2
elif task in [EMAIL]:
return self.email_subtopics, "for", 1
elif task in [TRAVEL]:
return self.travel_subtopics, "for", 1
return [], None
def generate_tasks_with_subtopics(self, tasks):
generated_tasks = []
for task in tasks:
subtask_supported = len(SUBTASKS.get(task, [])) > 0
if subtask_supported:
data, prefix, sample_size = self.get_subtopics_with_prefixes(task)
sampled_subtasks = sample(data, sample_size)
new_tasks = [f"{task} {prefix} {l}" for l in sampled_subtasks]
generated_tasks += new_tasks
else:
generated_tasks += [task]
return list(set(generated_tasks))
def compute_tasks_for_user(self, archetype_name, task_list):
ls = self.generate_tasks_with_subtopics(task_list)
task_name = f"We have a web user of the following archetype. {archetype_name}. Indicate whether they would typically do the following task in a desktop web browser."
tasks_supported = self.lm.ask_list_boolean(ls.copy(), task_name)
tasks_df = pd.DataFrame({"tasks_supported": tasks_supported, "task": ls.copy()})
tasks_for_user = tasks_df[tasks_df.tasks_supported].task.to_list()
return tasks_for_user
def build_archetypes(self, n_archetypes=10, add_location=False):
self.load_subtopic_data()
archetypes = self.lm.get_list(
f"Create a list of {n_archetypes} of the user archetypes who use desktop web browsers. Include their gender, age, and other relevant info that might determine what sites they browse")
for arch in archetypes:
location = choice(self.user_locations)
arch = f"{arch} Located in {location}."
print(arch)
tasks_for_user = self.compute_tasks_for_user(arch, TASKS)
user = UserArchetype(user_description=arch, task_list=tasks_for_user)
self.all_archetypes.append(user)
def gen_data_for_archetype(self, user: UserArchetype, allow_same_task_twice=False, n_sample_pages=10, retarget_topic=True):
print(f"gen_data_for_archetype {n_sample_pages}")
data_list = []
task_count_index = randint(0, len(TYPICAL_TASK_COUNTS) - 1)
task_count = TYPICAL_TASK_COUNTS[task_count_index]
test_set_id = str(uuid.uuid4())
task_list = self.compute_tasks_for_user(user.user_description, TASKS)
cur_task_list = pd.Series(task_list).sample(n=min(task_count, len(task_list))).to_list()
for task in cur_task_list:
print(f"running task {task}")
# if randrange(2) or True:
# revised_task = self.lm.text_query(f"I'm trying to define a task for the following user archetype: {user.user_description}. Please update the following task to make it more specific for an imagined scenario that you dream up. Use just a few words ### {task}", retry_count=1)
# if revised_task is not None and len(revised_task) > 0:
# print(f"Made task more specific as {revised_task}")
# task = revised_task
browsing_data = self.lm.ask_df(
f"We are generating sample browsing data for a user of the following user. {user.user_description}"
f"Generate {n_sample_pages} sample page tiles and URLs for the user performing a specific instance task {task} in a single browser session",
[TAB_URL_KEY, TAB_TITLE_KEY])
open_tabs_info = OPEN_TABS_PER_TASK[task] if task in OPEN_TABS_PER_TASK else OPEN_TABS_PER_TASK["default"]
num_open_tabs = randint(open_tabs_info["min"], open_tabs_info["max"])
browsing_data = browsing_data.sample(n=num_open_tabs).reset_index(drop=True)
browsing_data["task"] = task
browsing_data["test_set_id"] = test_set_id
browsing_data["task_id"] = f"{test_set_id}_{str(uuid.uuid4())}"
browsing_data["user_description"] = user.user_description
data_list.append(browsing_data)
return pd.concat(data_list, axis=0)
def build_data(self, num_tests_per_user: int = 40, archetypes=None):
data_list = []
if archetypes is None:
archetypes = self.all_archetypes
for arch in archetypes:
print(f"working on archetype {arch.user_description}")
for i in range(num_tests_per_user):
try:
data_list.append(self.gen_data_for_archetype(arch))
except Exception as ex:
print(f"Failed instance {i} archetype {arch}: {ex}")
return pd.concat(data_list, axis=0)
if __name__ == "__main__":
load_dotenv()
parser = argparse.ArgumentParser(description="Generates synthetic data for tab grouping")
parser.add_argument('--num_archetypes', type=int, default=50, help='Number of archetypes')
parser.add_argument('--num_tests_per_user', type=int, default=40, help='Number of tests for a user archetype')
args = parser.parse_args()
gen = TabGroupingDataGenerator()
print(f"Generating {args.num_archetypes} archetypes")
gen.build_archetypes(n_archetypes=args.num_archetypes, add_location=True)
cur_data = pd.DataFrame()
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
out_filename = f"output/gen_data_{timestamp}.csv"
# we get about 5 clusters per user, so 20 arch * 50 users = 5000 clusters
for archetype in gen.all_archetypes:
print(f"Generating {args.num_tests_per_user} tests for {archetype.user_description}")
data = gen.build_data(num_tests_per_user=args.num_tests_per_user, archetypes=[archetype])
cur_data = pd.concat([cur_data, data], axis=0)
print(f"finished arch {archetype.user_description}. Saving in progress file")
cur_data.to_csv(out_filename, index=False)
print(f"All done. Final result in {out_filename}")