Purpose of the notebook:

Evaluate the current NER approach, which uses existing models supported by the Transformers.js library,
and see where it fails.
Working from the hypothesis that a classifier-based approach might work better, we prepare and label the data
(https://www.microsoft.com/en-us/download/details.aspx?id=58227)

with some improvements in labeling

In [None]:
## imports

from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from pprint import pprint
import random
import matplotlib.pyplot as plt

#### Some examples of where the NER based approach is failing

In [None]:
# Zero-shot classification pipeline backed by the typeform/mobilebert-uncased-mnli checkpoint, run on CPU.
classifier = pipeline("zero-shot-classification", model='typeform/mobilebert-uncased-mnli', device='cpu')


# Example queries that are scored against the candidate intent descriptions defined below.
texts = [
    "what is democracy",
    "restaurants in oakville",
    "buy iphone",
    "bank login",
    "temperature in San Jose",
    "wood floor buckling repair",
    "wood floor cost estimator",
    "panera bread menu price",
    "how much is hbo now subscription",
    "how much is a golden retriever puppy",
    "how much is nebraska's sales tax",
    "how much is donald trump jr worth",
    "how much is a liposuction",
    "does mushroom cause food allergy",
]



# Intent key -> natural-language description handed to the zero-shot classifier as a candidate label.
intent_labels_lkp = dict(
    yelp_intent="search for local service, food, home repair, maintenance, cost estimation excluding weather intents",
    # alternative wording tried for yelp_intent: "to discover, connect and transact with local businesses"
    information_intent="search for general knowledge what some concept is and not related to weather, services, or products",
    weather_intent="check weather conditions like forecast, temperature, radar, storms, or pollen",
    purchase_intent="make an online purchase",
    navigation_intent="navigate to a specific website",
)

# Reverse lookup (description -> intent key), used to map classifier labels back to short names.
intent_desc_lkp = dict(zip(intent_labels_lkp.values(), intent_labels_lkp.keys()))

# Refined intent labels, in the same order as the lookup above
intent_labels = list(intent_labels_lkp.values())

# Score every example text against each candidate intent description.
# Each entry of `result` is read downstream through its 'sequence', 'labels' and 'scores' keys.
result = classifier(texts, candidate_labels=intent_labels)
# pprint(result)


In [None]:
# result_df = pd.DataFrame(result)
def prepare_df_from_reesult(result, label_to_intent=None):
    """Flatten zero-shot classification output into one row per input sequence.

    Args:
        result: a list of dicts (or a single dict) carrying the keys
            'sequence', 'labels' and 'scores', where 'labels' and 'scores'
            are parallel lists.
        label_to_intent: optional mapping from a label description to the
            short intent key used as the column name. Defaults to the
            notebook-level `intent_desc_lkp`.

    Returns:
        pd.DataFrame with a 'sequence' column plus one score column per intent key.

    Raises:
        KeyError: if a returned label is missing from the lookup.
    """
    if label_to_intent is None:
        label_to_intent = intent_desc_lkp

    # The pipeline hands back a bare dict instead of a list when it scored a
    # single sequence; normalise so the loop below always sees a list.
    if isinstance(result, dict):
        result = [result]

    rows = []
    for res in result:
        row = {'sequence': res['sequence']}
        for label, score in zip(res['labels'], res['scores']):
            row[label_to_intent[label]] = score
        rows.append(row)

    return pd.DataFrame(rows)

updated_result_df = prepare_df_from_reesult(result)

In [None]:
updated_result_df

Some of the above results are a bit unclear. For example, `does mushroom cause food allergy` is more of an information intent than a yelp intent.
There were many other cases showing that NER alone may not be suitable for this problem; for this use case we need to solve the intent classification problem.

#### Marco data

This dataset can be downloaded from https://www.microsoft.com/en-us/download/details.aspx?id=58227

In [None]:
# Collect the unique, lower-cased queries found in the MARCO sessions file.
marco_text_queries = set()
# Explicit encoding so the parsed text does not depend on the platform default.
with open("../data/full_marco_sessions_ann_split.train.tsv", "r", encoding="utf-8") as f:
    marco_texts = f.read().split('\n')

for text in marco_texts:
    for query in text.split("\t"):
        # Skip fields carrying the "marco-gen-train" marker (presumably session
        # identifiers - TODO confirm) and fragments shorter than 3 characters.
        if "marco-gen-train" not in query and len(query) >= 3:
            marco_text_queries.add(query.lower())

# Sorted rather than list(set): set iteration order for strings changes between
# kernel restarts, which would make every downstream slice/sample irreproducible.
marco_text_queries_list = sorted(marco_text_queries)

In [None]:
len(marco_text_queries_list)

In [None]:
## some example queries

marco_text_queries_list[:50]

In [None]:
marco_df = pd.DataFrame({"sequence": marco_text_queries_list})

In [None]:
def labeling_stats(df):
    """Print how much of `df` has been labeled so far.

    Adds a 'target' column filled with None to `df` (in place) when the frame
    does not have one yet, then prints the dataset size, the labeled and
    unlabeled counts, and the label distribution.

    Args:
        df: DataFrame of examples; labels are read from its 'target' column.
    """
    # Check the frame that was passed in, not the notebook-level `marco_df`.
    if 'target' not in df.columns:
        df['target'] = None
    unlabeled = df['target'].isna()
    print(f"Size of the dataset = {len(df)}")
    print(f"Number of examples to be labeled = {unlabeled.sum()}")
    print(f"Number of examples labeled = {(~unlabeled).sum()}")
    print("Labels distributed as \n", df['target'].value_counts())


## Prints labeling stats
labeling_stats(marco_df)

#### Find potential ngram mappings for targets

In [None]:
from collections import Counter
from itertools import islice

# Generalize function to extract n-grams
def extract_ngrams(query, n):
    """Return every run of `n` consecutive whitespace-separated words in `query`.

    Each n-gram is returned as a single space-joined string, in order of
    appearance. The result is empty when `n` is not positive or when the query
    has fewer than `n` words.
    """
    if n <= 0:
        return []
    tokens = query.split()
    window_starts = range(len(tokens) - n + 1)
    return [' '.join(tokens[start:start + n]) for start in window_starts]

# Flatten the n-grams into a list and count them
def count_ngrams(queries_list, n):
    """Tally how often each word n-gram occurs across all queries.

    Returns a Counter keyed by the space-joined n-gram string.
    """
    tally = Counter()
    for query in queries_list:
        tally.update(extract_ngrams(query, n))
    return tally


In [None]:
def search_queries_by_words(search_text, to_be_labelled_sequence_list):
    """Lazily yield, in input order, each query that contains `search_text` as a substring."""
    yield from (
        candidate
        for candidate in to_be_labelled_sequence_list
        if search_text in candidate
    )

In [None]:
# Preview the first matches for a candidate phrase before adding it to the mapping.
MAX_PREVIEW_MATCHES = 100  # stop after this many results
preview_matches = islice(search_queries_by_words("24 hour", marco_text_queries_list), MAX_PREVIEW_MATCHES)
for cnt, query in enumerate(preview_matches, start=1):
    print(cnt, query)

In [None]:

# Phrase -> intent lookup used to weakly label queries.
# Keys are matched as plain substrings of the lower-cased query, so leading and
# trailing spaces inside a key are significant (' shop ' vs 'shopping').
# Every key appears exactly once: repeated keys in a dict literal are silently
# collapsed (first position, last value), so the earlier duplicates were dead entries.
# NOTE(review): overlapping keys with different intents (e.g. 'tom cruise' vs 'cruise')
# match the same query; confirm how downstream labeling resolves such conflicts.
target_mapping = {
    'how do': 'information_intent',
    'how to': 'information_intent',
    'weather in': 'weather_intent',
    'the weather': 'weather_intent',
    'hurricane': 'information_intent',
    # 'tornado': 'weather_intent',
    'current temperature': 'weather_intent',
    'current weather': 'weather_intent',
    'weather forecast in': 'weather_intent',
    'temperature in': 'weather_intent',
    # 'how much': 'purchase_intent',
    # 'cost to': 'purchase_intent',
    # 'where is': 'navigation_intent',
    'sign in ': 'navigation_intent',
    'signin ': 'navigation_intent',
    'login ': 'navigation_intent',
    'phone number': 'navigation_intent',
    'customer service': 'navigation_intent',
    'bank routing': 'navigation_intent',
    'phone banking': 'navigation_intent',
    'watch online': 'navigation_intent',
    'help desk': 'navigation_intent',
    'what are': 'information_intent',
    'what county is': 'information_intent',
    'what is a ': 'information_intent',
    # 'what is': 'information_intent',
    'what does': 'information_intent',
    'what do': 'information_intent',
    'definition of': 'information_intent',
    'meaning': 'information_intent',
    'symptoms': 'information_intent',
    'zip code': 'information_intent',
    'zipcode': 'information_intent',
    'postal code': 'information_intent',
    'postalcode': 'information_intent',
    'area code': 'information_intent',
    'areacode': 'information_intent',
    'definition': 'information_intent',
    'define': 'information_intent',
    'what is the difference between': 'information_intent',
    'what is the purpose of': 'information_intent',
    'what is the function of': 'information_intent',
    'how long does it take': 'information_intent',
    'what is the name of': 'information_intent',
    'what is the population of': 'information_intent',
    'what is an example of': 'information_intent',
    'which of the following': 'information_intent',
    'what is the purpose': 'information_intent',
    # 'what time zone is': 'information_intent',
    'what is the average': 'information_intent',
    'is in what county': 'information_intent',
    'calories in': 'information_intent',
    # 'how many calories in': 'information_intent',
    "causes of": 'information_intent',
    "tom cruise": 'information_intent',
    'visit': 'travel_intent',
    'travel to': 'travel_intent',
    'cruise': 'travel_intent',
    'tours': 'travel_intent',
    'mortgage rate': 'yelp_intent',
    'interest rate': 'yelp_intent',
    'price of': 'purchase_intent',
    'amazon price': 'purchase_intent',
    'cost of living': 'information_intent',
    'to eat': 'yelp_intent',
    'does it cost': 'yelp_intent',
    'dental': 'yelp_intent',
    'dentist': 'yelp_intent',
    # 'what is the current': ?
    'what is the largest': 'information_intent',
    'what is the currency': 'information_intent',
    'how old do you': 'information_intent',
    'how long does a': 'information_intent',
    # 'what time is it': 'information_intent',
    'what time': 'information_intent',
    'you have to be': 'information_intent',
    'do you need to': 'information_intent',
    'what is considered a': 'information_intent',
    'dialing code': 'information_intent',
    'side effects': 'information_intent',
    'stock market': 'information_intent',
    'how many calories': 'information_intent',
    'average salary for': 'information_intent',
    'how many grams': 'information_intent',
    'what foods are': 'information_intent',
    'how many ounces': 'information_intent',
    'how many carbs': 'information_intent',
    'what year was': 'information_intent',
    'how old is': 'information_intent',
    'how much is': 'information_intent',
    'what type of': 'information_intent',
    'how do i': 'information_intent',
    'what kind of': 'information_intent',
    'who is the': 'information_intent',
    'where is the': 'information_intent',
    # 'different types of': 'information_intent',
    'types': 'information_intent',
    'what is': 'information_intent',
    'how do you': 'information_intent',
    'what was the': 'information_intent',
    'in the world': 'information_intent',
    'how long is': 'information_intent',
    'when was': 'information_intent',
    'when did': 'information_intent',
    'how far is': 'information_intent',
    'how tall is': 'information_intent',
    'what to do': 'information_intent',
    'how long': 'information_intent',
    'types of': 'information_intent',
    'who is': 'information_intent',
    'where is': 'information_intent',
    'what causes': 'information_intent',
    'stock price': 'information_intent',
    'difference between': 'information_intent',
    'social security': 'information_intent',
    'who was': 'information_intent',
    'net worth': 'information_intent',
    'cast of': 'information_intent',
    'how many': 'information_intent',
    'how does': 'information_intent',
    'how is': 'information_intent',
    'what did': 'information_intent',
    'good for': 'information_intent',
    'population of': 'information_intent',
    'can you': 'information_intent',
    'what can': 'information_intent',
    'how big': 'information_intent',
    'what size': 'information_intent',
    'average salary of': 'information_intent',
    'what year': 'information_intent',
    'part of': 'information_intent',
    'another word': 'information_intent',
    'who invented': 'information_intent',
    'what can you': 'information_intent',
    'how much money': 'information_intent',
    'what state': 'information_intent',
    'what county': 'information_intent',
    'in the us': 'information_intent',
    'how old': 'information_intent',
    'icd code': 'information_intent',
    'what city': 'information_intent',
    'can i': 'information_intent',
    'when is': 'information_intent',
    'how did': 'information_intent',
    'what to': 'information_intent',
    'the same': 'information_intent',
    "cleaning ": 'yelp_intent',
    'restaurant': 'yelp_intent',
    'recommendation': 'yelp_intent',
    'repair': 'yelp_intent',
    'parking': 'yelp_intent',
    'oil change': 'yelp_intent',
    ' rental': 'yelp_intent',
    'auto ': 'yelp_intent',
    'dry clean': 'yelp_intent',
    'club': 'yelp_intent',
    'hotel': 'yelp_intent',
    'stores': 'yelp_intent',
    'shopping': 'yelp_intent',
    ' shop ': 'yelp_intent',
    ' shops ': 'yelp_intent',
    ' mall ': 'yelp_intent',
    'furniture': 'yelp_intent',
    'crafts': 'yelp_intent',
    'clothing': 'yelp_intent',
    # 'benefits of': 'yelp_intent',
    'average cost': 'yelp_intent',
    'cost to install': 'yelp_intent',
    'contact number': 'navigation_intent',
    'what airport': 'travel_intent',
    # 'flight': 'travel_intent',
    'cabins': 'travel_intent',
    'cost for': 'yelp_intent',
    'do you': 'information_intent',
    'when does': 'information_intent',
    'why is': 'information_intent',
    "what's the": 'information_intent',
    'what was': 'information_intent',
    'what language': 'information_intent',
    'should i': 'information_intent',
    'convert': 'information_intent',
    'medication': 'information_intent',
    'treatment': 'yelp_intent',
    'tv show': 'information_intent',
    'history': 'information_intent',
    'remedies': 'information_intent',
    'county is': 'information_intent',
    'synonym ': 'information_intent',
    'credit union number': 'navigation_intent',
    'credit union phone number': 'navigation_intent',
    'credit union hours': 'navigation_intent',
    'movie cast': 'information_intent',
    'average salary': 'information_intent',
    'example': 'information_intent',
    'blood pressure': 'information_intent',
    'credit card': 'navigation_intent',
    'time zone': 'information_intent',
    'time in': 'information_intent',
    'foods that': 'information_intent',
    'salary for': 'information_intent',
    "weather": 'weather_intent',
    "weather forecast": 'weather_intent',
    "windy": 'weather_intent',
    "humidity": 'weather_intent',
    "monsoon": 'weather_intent',
    "flooding": 'weather_intent',
    "rain in": 'weather_intent',
    "storms": 'weather_intent',
    "storm in": 'weather_intent',
    "forcast": 'weather_intent',
    "wether": 'weather_intent',
    "wather": 'weather_intent',
    "weahter": 'weather_intent',
    "weater": 'weather_intent',
    "weaher": 'weather_intent',
    " vindy ": 'weather_intent',
    " sunny ": 'weather_intent',
    " rain ": 'weather_intent',
    "cloudy": 'weather_intent',
    "air quality": 'weather_intent',
    "thunderstorm": 'weather_intent',
    "pollen": 'weather_intent',
    "snow": 'weather_intent',
    "blizzard": 'weather_intent',
    "radar": 'weather_intent',
    "tiempo": 'weather_intent',
    "clima": 'weather_intent',
    "doppler radar": 'weather_intent',
    "local radar": 'weather_intent',
    "local weather": 'weather_intent',
    # "map": 'weather_intent',
    "us weather radar": 'weather_intent',
    "weather radar near me": 'weather_intent',
    "radar near me": 'weather_intent',
    'salary': 'information_intent',
    'cost to build': 'yelp_intent',
    'icd ': 'information_intent',
    'how often': 'information_intent',
    'get rid of': 'information_intent',
    'university of': 'navigation_intent',
    'windows 10': 'navigation_intent',
    'causes for': 'information_intent',
    'calculat': 'information_intent',
    'which is ': 'information_intent',
    'where are ': 'information_intent',
    'kelvin': 'information_intent',
    'celsius': 'information_intent',
    'fahrenheit': 'information_intent',
    'when ': 'information_intent',
    'benefit of': 'information_intent',
    'most common': 'information_intent',
    'which ': 'information_intent',
    'refers ': 'information_intent',
    'where does ': 'information_intent',
    'synonym': 'information_intent',
    'salaries': 'information_intent',
    'function of': 'information_intent',
    'cause of': 'information_intent',
    'effects of': 'information_intent',
    'used for': 'information_intent',
    'what color is': 'information_intent',
    'weight loss': 'yelp_intent',
    'where do': 'information_intent',
    'what foods': 'information_intent',
    'why': 'information_intent',
    'age of': 'information_intent',
    'who wrote': 'information_intent',
    "what's a": 'information_intent',
    "how fast": 'information_intent',
    'most popular': 'information_intent',
    'where': 'information_intent',
    'is used': 'information_intent',
    'doctors': 'yelp_intent',
    'who ': 'information_intent',
    ' hours': 'navigation_intent',
    'schedule': 'information_intent',
    'what age': 'information_intent',
    'cheap': 'yelp_intent',
    'most expensive': 'information_intent',
    'size of': 'information_intent',
    'what exactly': 'information_intent',
    'ways to ': 'information_intent',
    'disorder': 'information_intent',
    'disease': 'information_intent',
    'felony': 'information_intent',
    'movie': 'information_intent',
    # 'cost of': 'yelp_intent',
    'what were': 'information_intent',
    'degree': 'information_intent',
    'what day': 'information_intent',
    'ways to': 'information_intent',
    'influen': 'information_intent',
    'importan': 'information_intent',
    'school': 'information_intent',
    'train': 'information_intent',
    'dimension': 'information_intent',
    'what makes': 'information_intent',
    'what food': 'information_intent',
    'normal range': 'information_intent',
    'requirements for': 'information_intent',
    'employment': 'information_intent',
    'support number': 'navigation_intent',
    ' support ': 'navigation_intent',
    'appointment': 'navigation_intent',
    'calculator': 'navigation_intent',
    ' application': 'navigation_intent',
    ' license': 'navigation_intent',
    'craigslist': 'navigation_intent',
    'fedex': 'navigation_intent',
    'forex': 'navigation_intent',
    ' ups ': 'navigation_intent',
    ' usps ': 'navigation_intent',
    'dhl': 'navigation_intent',
    'fax number': 'navigation_intent',
    'considered a': 'information_intent',
    'distance ': 'information_intent',
    'share price': 'information_intent',
    'stock': 'information_intent',
    'channel is': 'information_intent',
    'continent': 'information_intent',
    'what level': 'information_intent',
    'english to': 'translation_intent',
    'to english': 'translation_intent',
    'translat': 'translation_intent',
    'what currency': 'information_intent',
    'blood test': 'information_intent',
    'replacement cost': 'yelp_intent',
    'how tall': 'information_intent',
    'characteristics of': 'information_intent',
    'tracking number': 'navigation_intent',
    'tracking': 'navigation_intent',
    'to replace': 'yelp_intent',
    'pay for': 'information_intent',
    'calories': 'information_intent',
    'health': 'information_intent',
    'tax': 'information_intent',
    'deadline': 'information_intent',
    'insurance': 'information_intent',
    'cancel': 'navigation_intent',
    'address': 'navigation_intent',
    'healthy': 'yelp_intent',
    'diet': 'information_intent',
    'lyrics': 'information_intent',
    'cell phone': 'purchase_intent',
    'discount': 'purchase_intent',
    'coupon': 'purchase_intent',
    'promo code': 'purchase_intent',
    ' deal': 'purchase_intent',
    'where to buy': 'purchase_intent',
    ' buy': 'purchase_intent',
    'purchase': 'purchase_intent',
    'blackfriday': 'purchase_intent',
    'cybermonday': 'purchase_intent',
    'amazon prime': 'purchase_intent',
    'clearance': 'purchase_intent',
    'on sale': 'purchase_intent',
    'refurbished': 'purchase_intent',
    'warranty': 'purchase_intent',
    'compare price': 'purchase_intent',
    'cashback': 'purchase_intent',
    'in stock': 'purchase_intent',
    'lowest price': 'purchase_intent',
    'free shipping': 'purchase_intent',
    'android': 'information_intent',
    'protein': 'information_intent',
    '401k': 'information_intent',
    ' ira ': 'information_intent',
    'population': 'information_intent',
    'president': 'information_intent',
    'whats': 'information_intent',
    "what's": 'information_intent',
    'benefits': 'information_intent',
    ' pain ': 'yelp_intent',
    'installation cost': 'yelp_intent',
    'in spanish': 'translation_intent',
    'to spanish': 'translation_intent',
    'in french': 'translation_intent',
    'to french': 'translation_intent',
    'in japanese': 'translation_intent',
    'to japanese': 'translation_intent',
    'in chinese': 'translation_intent',
    'to chinese': 'translation_intent',
    'side effect': 'information_intent',
    'cost to live': 'information_intent',
    'cost to': 'yelp_intent',
    'cost per': 'information_intent',
    'disney world': 'navigation_intent',
    'surgery cost': 'yelp_intent',
    'album': 'information_intent',
    'genre': 'information_intent',
    'much water': 'information_intent',
    'job': 'navigation_intent',
    'netflix': 'information_intent',
    'nutrient': 'information_intent',
    'amazon stock': 'information_intent',
    'music': 'information_intent',
    'caffeine': 'information_intent',
    'adoption': 'yelp_intent',
    'dogs': 'yelp_intent',
    'cats': 'yelp_intent',
    'countries': 'information_intent',
    'number of': 'information_intent',
    'related to': 'information_intent',
    'foods with': 'information_intent',
    'cusine': 'yelp_intent',
    'italian': 'yelp_intent',
    'mediterranean': 'yelp_intent',
    'vietnamese': 'yelp_intent',
    'recipe': 'yelp_intent',
    'vegan': 'yelp_intent',
    ' vegeta': 'yelp_intent',
    ' meat': 'yelp_intent',
    ' spice': 'yelp_intent',
    ' beer': 'yelp_intent',
    ' wine': 'yelp_intent',
    ' fresh ': 'yelp_intent',
    'fruit': 'yelp_intent',
    'resort': 'travel_intent',
    'attraction': 'travel_intent',
    'installation': 'yelp_intent',
    'service': 'yelp_intent',
    'routing number': 'navigation_intent',
    'amazon': 'navigation_intent',
}

In [None]:
print("key", "#examples")
information_queries_set = set()
for key, val in target_mapping.items():
    if val != 'information_intent':
        continue
    # Materialise this key's substring matches so they can be both stored and counted.
    # The count is per key, so a query hit by several keys is counted once for each.
    matches = list(search_queries_by_words(key, marco_text_queries_list))
    information_queries_set.update(matches)
    cnt = len(matches)
    print(key, cnt)


In [None]:
information_queries_set

In [None]:
print("key", "#examples")
navigation_queries_set = set()
for key, val in target_mapping.items():
    if val != 'navigation_intent':
        continue
    # Materialise this key's substring matches so they can be both stored and counted.
    # The count is per key, so a query hit by several keys is counted once for each.
    matches = list(search_queries_by_words(key, marco_text_queries_list))
    navigation_queries_set.update(matches)
    cnt = len(matches)
    print(key, cnt)


In [None]:
print("key", "#examples")
purchase_queries_set = set()
for key, val in target_mapping.items():
    if val != 'purchase_intent':
        continue
    # Materialise this key's substring matches so they can be both stored and counted.
    # The count is per key, so a query hit by several keys is counted once for each.
    matches = list(search_queries_by_words(key, marco_text_queries_list))
    purchase_queries_set.update(matches)
    cnt = len(matches)
    print(key, cnt)


In [None]:
purchase_queries_set

In [None]:
print("key", "#examples")
yelp_queries_set = set()
for key, val in target_mapping.items():
    if val != 'yelp_intent':
        continue
    # Materialise this key's substring matches so they can be both stored and counted.
    # The count is per key, so a query hit by several keys is counted once for each.
    matches = list(search_queries_by_words(key, marco_text_queries_list))
    yelp_queries_set.update(matches)
    cnt = len(matches)
    print(key, cnt)


In [None]:
yelp_queries = list(yelp_queries_set)

# Count bigrams across the yelp-matched queries and keep the 100 most frequent.
# (The bare `yelp_queries[:5]` that used to sit here was not the cell's last
# expression, so it never displayed anything.)
yelp_ngram_counter = count_ngrams(yelp_queries, 2)
yelp_most_common_ngrams = yelp_ngram_counter.most_common(100)

# Display the yelp_most_common_ngrams
print(yelp_most_common_ngrams)

In [None]:
yelp_queries_set

In [None]:
print("key", "#examples")
weather_queries_set = set()
for key, val in target_mapping.items():
    if val != 'weather_intent':
        continue
    # Materialise this key's substring matches so they can be both stored and counted.
    # The count is per key, so a query hit by several keys is counted once for each.
    matches = list(search_queries_by_words(key, marco_text_queries_list))
    weather_queries_set.update(matches)
    cnt = len(matches)
    print(key, cnt)


In [None]:
weather_queries = list(weather_queries_set)

# Count bigrams across the weather-matched queries and keep the 100 most frequent.
# (The bare `weather_queries[:5]` that used to sit here was not the cell's last
# expression, so it never displayed anything.)
weather_ngram_counter = count_ngrams(weather_queries, 2)
weather_most_common_ngrams = weather_ngram_counter.most_common(100)

# Display the weather_most_common_ngrams
print(weather_most_common_ngrams)

In [None]:
# (template, relative weight) pairs used to synthesise weather queries.
# Each "{}" is a str.format placeholder. The weights are relative rather than
# probabilities: they do not sum to 1 here and are rescaled when the templates
# DataFrame is built. The list is not sorted by weight.
weather_templates = [
    # Original Patterns
    ("The weather in {}", 0.539),
    ("What is the weather in {}", 0.499),
    ("What's the weather in {}", 0.046),
    ("Weather forecast in {}", 0.039),
    ("What is the temperature in {}", 0.033),
    ("The weather forecast for {}", 0.034),
    ("Current weather in {}", 0.023),
    ("Average weather in {}", 0.022),
    ("What is the weather forecast for {}", 0.014),
    # The only two-placeholder template: the query generator fills it with a city and a month.
    ("Weather in {} in {}", 0.011),
    ("How is the weather in {}", 0.006),
    ("What is the climate of {}", 0.009),
    ("Is the weather forecast for {}", 0.005),
    ("Rain in {}", 0.002),
    ("What is the weather like in {}", 0.009),
    ("What is the climate in {}", 0.001),
    ("The weather today in {}", 0.001),
    ("What's the weather forecast for {}", 0.002),
    ("What is the best weather in {}", 0.001),
    ("Is the weather today in {}", 0.001),
    ("Current temperature in {}", 0.001),
    ("Storms in {}", 0.0007),
    ("Humidity in {}", 0.003),
    ("Windy in {}", 0.0005),
    ("Snow in {}", 0.009),
    ("Weather radar in {}", 0.005),
    ("The temperature in {}", 0.005),
    ("Weather like in {}", 0.006),
    ("What's the temperature in {}", 0.001),
    ("Is the weather like in {}", 0.006),

    # Additional keyword-only patterns (10% of the weight of the matching original pattern)
    ("weather {}", 0.10 * 0.539),
    ("{} weather", 0.10 * 0.539),
    ("temperature {}", 0.10 * 0.033),
    ("{} temperature", 0.10 * 0.033),
]

# Expanding the typo variants further to include the common misspellings for "weather", "temperature", and "forecast"
# Every typo variant carries 20% of the weight of the correctly spelled template it mimics.
_TYPO_SCALE = 0.20

# Sentence shapes (with the base weight of the correctly spelled template) that each
# "weather" misspelling is substituted into; {w} is the typo, {W} its capitalised form.
_WEATHER_TYPO_SHAPES = [
    ("The {w} in {{}}", 0.539),
    ("What is the {w} in {{}}", 0.499),
    ("What's the {w} in {{}}", 0.046),
    ("{W} forecast in {{}}", 0.039),
    ("What is the {w} like in {{}}", 0.009),
]
_WEATHER_TYPOS = ["weathr", "wether", "weater", "wather", "weahter", "weaher", "waether"]
_TEMPERATURE_TYPOS = [
    "temprature", "temperture", "tempreture", "tempratuer",
    "tempratue", "tempertuer", "tempretuer", "temprture",
]
_FORECAST_TYPOS = [
    "forcast", "forcst", "forescast", "forecats",
    "forcaste", "forecst", "forecase", "foercast",
]

extended_typo_variants = [
    # Misspellings for "weather" ("weathr" is the only one that also gets the keyword-only forms)
    ("weathr {}", _TYPO_SCALE * 0.539),
    ("{} weathr", _TYPO_SCALE * 0.539),
]
for typo in _WEATHER_TYPOS:
    extended_typo_variants.extend(
        (shape.format(w=typo, W=typo.capitalize()), _TYPO_SCALE * base_weight)
        for shape, base_weight in _WEATHER_TYPO_SHAPES
    )

# Misspellings for "temperature"
extended_typo_variants.extend(
    ("What is the {} in {{}}".format(typo), _TYPO_SCALE * 0.033)
    for typo in _TEMPERATURE_TYPOS
)

# Misspellings for "forecast"
for typo in _FORECAST_TYPOS:
    extended_typo_variants.append(("{} in {{}}".format(typo.capitalize()), _TYPO_SCALE * 0.039))
    extended_typo_variants.append(("What is the {} for {{}}".format(typo), _TYPO_SCALE * 0.034))

# Combine original templates and the expanded typo variants
weather_templates_extended = [*weather_templates, *extended_typo_variants]


# One row per template; rescale the relative weights so they sum to 1.
weather_templates_df = pd.DataFrame(weather_templates_extended, columns=['pattern', 'weight'])
weather_templates_df['weight'] /= weather_templates_df['weight'].sum()
weather_templates_df

In [None]:
weather_templates_df.head(50)

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [None]:
url = "https://en.m.wikipedia.org/wiki/List_of_television_stations_in_North_America_by_media_market"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error: skipping the parse silently would leave
# `dma_data` undefined and only surface as a NameError in a later cell.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')
dma_heading = soup.find('h4', string='DMAs')
if dma_heading is None:
    raise ValueError(f"Could not find the 'DMAs' heading on {url}; the page layout may have changed")
dma_list = dma_heading.find_next('ul')

# Place names extracted from the DMA list, in page order (duplicates are kept).
dma_data = []
if dma_list:
    for li in dma_list.find_all('li'):
        market_name = li.get_text(strip=True)

        # Split by dash (-) or en-dash (–) to handle cases like "Dallas-Fort Worth"
        # NOTE(review): this also splits any place name that itself contains a hyphen.
        split_names = re.split(r'–|-', market_name)

        # Process each split name
        for name in split_names:
            # Remove the (#NUM) part using regex
            name = re.sub(r'\s*\(#\d+\)', '', name).strip()

            # Check if there's a city in parentheses and split them
            match = re.match(r'(.+?)\s*\((.+?)\)', name)
            if match:
                main_city = match.group(1).strip()
                parenthetical_city = match.group(2).strip()
                dma_data.append(main_city)  # Add the main city
                dma_data.append(parenthetical_city)  # Add the city in parentheses
            else:
                dma_data.append(name)



In [None]:
len(dma_data)

In [None]:
print(dma_data)

In [None]:
from collections import Counter

# months
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# Function to generate random queries, a configurable share of them lowercased
def generate_queries_with_case(df, cities, months, num_queries=10, lower_case_prob=0.3, seed=None):
    """Generate `num_queries` unique synthetic queries from weighted templates.

    Args:
        df: DataFrame with a 'pattern' column of str.format templates and a
            'weight' column of sampling weights.
        cities: sequence of place names substituted into the templates.
        months: sequence of month names; only used for templates containing
            "{} in {}" (city first, month second).
        num_queries: number of unique queries to produce.
        lower_case_prob: probability that a generated query is lower-cased.
        seed: optional seed for a private random generator. When None the
            module-level `random` state is used, as before.

    Returns:
        (queries, pattern_counter): the unique queries in generation order and
        a Counter of how often each pattern was used. A pattern is skipped once
        its count exceeds num_queries // 10; draws that produced a duplicate
        query still count towards that limit.

    Raises:
        RuntimeError: if every pattern hit its usage limit before enough unique
            queries were produced (previously this looped forever).
    """
    rng = random if seed is None else random.Random(seed)

    # Plain lists: cheaper to sample from repeatedly than a pandas Series, and
    # independent of the frame's index labels.
    patterns = list(df['pattern'])
    weights = list(df['weight'])
    usage_limit = num_queries // 10
    # Patterns that can still be drawn and accepted.
    available = {pattern for pattern, weight in zip(patterns, weights) if weight > 0}

    queries = []
    seen = set()
    pattern_counter = Counter()
    while len(queries) < num_queries:
        if not available:
            raise RuntimeError(
                f"Only {len(queries)} of {num_queries} unique queries could be generated "
                "before every pattern reached its usage limit; add patterns or cities, "
                "or lower num_queries."
            )

        # Choose a pattern based on the weights
        pattern = rng.choices(patterns, weights=weights, k=1)[0]
        if pattern_counter[pattern] > usage_limit:
            continue

        # Replace placeholders in the pattern with a random city and/or month
        city = rng.choice(cities)
        if "{} in {}" in pattern:
            query = pattern.format(city, rng.choice(months))
        else:
            query = pattern.format(city)

        pattern_counter[pattern] += 1
        if pattern_counter[pattern] > usage_limit:
            available.discard(pattern)

        # Randomly convert the query to lowercase with the given probability
        if rng.random() < lower_case_prob:
            query = query.lower()

        if query not in seen:
            seen.add(query)
            queries.append(query)

    return queries, pattern_counter

# Generate 10,000 unique weather queries, each lowercased with probability 0.3.
# NOTE: `pattern_counter` is reassigned later by the Yelp generation cell.
sample_queries_with_case, pattern_counter = generate_queries_with_case(weather_templates_df, dma_data, months, num_queries=10000, lower_case_prob=0.3)

print(len(sample_queries_with_case))
sample_queries_with_case[:10]


In [None]:
# How often each weather pattern was drawn during generation.
pattern_counter

In [None]:
# sample_queries_with_case[1000:2000]

In [None]:
# Label every generated weather query with its target intent.
weather_examples = (
    pd.DataFrame(sample_queries_with_case, columns=['sequence'])
    .assign(target='weather_intent')
)
weather_examples

#### Yelp examples

In [None]:
# Original Yelp Intent Templates
# Each entry is (pattern, weight): `pattern` has a single "{}" slot that is
# filled with a city name, and `weight` is a relative sampling weight (the
# weights are normalized to sum to 1 when the DataFrame is built below).
# Commented-out entries are templates that were deliberately excluded.
yelp_intent_templates = [
    ("What are the best restaurants in {}", 0.12),
    ("Top-rated restaurants in {}", 0.10),
    ("Popular coffee shops in {}", 0.09),
    ("Best pizza places in {}", 0.08),
    ("Best sushi places in {}", 0.07),
    ("Cheap restaurants in {}", 0.06),
    ("Best places to eat in {}", 0.06),
    ("Restaurants near me in {}", 0.05),
    # ("What is the average cost of a meal in {}", 0.04),
    ("Best Italian restaurants in {}", 0.04),
    ("Best fast food restaurants in {}", 0.04),
    ("Mexican restaurants in {}", 0.03),
    ("Chinese food near me in {}", 0.03),
    ("Best hotels in {}", 0.03),
    ("Affordable hotels in {}", 0.03),
    ("Best parks to visit in {}", 0.02),
    ("Best attractions in {}", 0.02),
    ("Popular things to do in {}", 0.02),
    ("Best shopping centers in {}", 0.02),
    ("Best gyms in {}", 0.02),
    ("Top hair salons in {}", 0.02),
    ("What are the best-rated dentists in {}", 0.02),
    ("Local plumbers in {}", 0.02),
    ("Popular electricians in {}", 0.02),
    # ("What is the phone number for a restaurant in {}", 0.02),
    # ("Phone number for hotels in {}", 0.02),
    ("Top-rated cafes in {}", 0.02),
    ("Best massage spas in {}", 0.02),
    ("Grocery stores near me in {}", 0.02),
    ("Where can I buy clothes in {}", 0.01),
    ("Pharmacies near me in {}", 0.01),
    ("Best bars in {}", 0.01),
    ("Cocktail bars in {}", 0.01),
    ("Family-friendly restaurants in {}", 0.01),
    ("Kid-friendly restaurants in {}", 0.01),
    ("Pet-friendly restaurants in {}", 0.01),
    ("Vegan restaurants in {}", 0.01),
    ("Best rooftop bars in {}", 0.01),
    ("Top pizza delivery places in {}", 0.01),
    ("Where can I get sushi in {}", 0.01),
    ("Best food delivery services in {}", 0.01),
    ("Catering services in {}", 0.01),
    ("Top-rated bakeries in {}", 0.01),
    ("Where can I find a gym in {}", 0.01),
    ("Yoga studios near me in {}", 0.01),
    # ("What’s the cost of living in {}", 0.01),
    # ("How much does it cost to live in {}", 0.01),
    ("Best places for nightlife in {}", 0.01),
    ("Local car repair shops in {}", 0.01),
    ("Best car rental services in {}", 0.01),
    # Short keyword-style queries with the city first.
    ("{} restaurants", 0.02),
    ("{} hotels", 0.02),
    ("{} food", 0.02),
]

# Function to add typos to templates
def add_typos_to_template(template, typo_prob=0.1):
    """Return `template` with some known words replaced by a misspelling.

    The template is split on whitespace. Each token is lowercased and stripped
    of leading/trailing braces, then looked up in the typo table; a token that
    is found is replaced, with probability `typo_prob`, by one of its
    misspellings picked at random.

    Notes:
        * The lookup is an exact token match, so "bar" does not match "bars"
          and hyphenated tokens such as "Family-friendly" are never altered.
        * A replaced token is always lowercase, and the result is re-joined
          with single spaces.

    Args:
        template: the template string (may contain "{}" slots).
        typo_prob: per-word probability of introducing a typo.

    Returns:
        The template string, possibly with misspelled words.
    """
    # Every variant differs from its key: entries that mapped a word to itself
    # ("cheap" -> "cheap", "bar" -> "bar") were removed because they produced
    # "typo" variants identical to the correctly spelled word.
    typos = {
        "restaurants": ["restarants", "resturants", "restrants"],
        "best": ["bst", "besst", "bet"],
        "popular": ["populer", "ppular", "poplar"],
        "coffee": ["cofee", "cofffe", "cofee"],
        "pizza": ["piza", "pzza", "piza"],
        "hotels": ["hoetls", "hotls", "hoetls"],
        "places": ["plces", "place", "palces"],
        "attractions": ["attractons", "atrctions", "attractins"],
        "cheap": ["chep", "cheep"],
        "meal": ["mel", "meel", "male"],
        "cost": ["cst", "cots", "cot"],
        "living": ["lving", "livng", "livin"],
        "yoga": ["yga", "yoaga", "ygoa"],
        "food": ["fod", "fud", "fodd"],
        "parks": ["praks", "parcs", "paks"],
        "near": ["ner", "neer", "naer"],
        "bar": ["ber", "baer"],
        "family": ["famly", "famliy", "faimly"],
        "friendly": ["frindly", "frendly", "friendley"]
    }

    words = template.split()
    for i, word in enumerate(words):
        key = word.lower().strip("{}")
        if key in typos and random.random() < typo_prob:
            words[i] = random.choice(typos[key])
    return " ".join(words)

# Build the final template pool: each distinct template once, plus an
# occasional misspelled variant that carries a fraction of its weight.
extended_yelp_intent_templates = []
extended_yelp_intent_templates_set = set()

for template, weight in yelp_intent_templates:
    # Only the first occurrence of a pattern is kept.
    if template not in extended_yelp_intent_templates_set:
        extended_yelp_intent_templates_set.add(template)
        extended_yelp_intent_templates.append((template, weight))

        # About one template in five is also considered for a typo variant.
        if random.random() < 0.2:
            typo_template = add_typos_to_template(template)
            typo_weight = weight * 0.2  # Typos occur less frequently, so reduce weight
            # A variant that already exists (including an unchanged template)
            # is not added a second time.
            if typo_template not in extended_yelp_intent_templates_set:
                extended_yelp_intent_templates_set.add(typo_template)
                extended_yelp_intent_templates.append((typo_template, typo_weight))

# Tabulate the pool and rescale the weights so they sum to 1.
df_extended_yelp_intent_templates = pd.DataFrame(
    extended_yelp_intent_templates,
    columns=["pattern", "weight"],
)
df_extended_yelp_intent_templates['weight'] /= df_extended_yelp_intent_templates['weight'].sum()
df_extended_yelp_intent_templates

In [None]:
# All weather patterns followed by all Yelp patterns, for a side-by-side look.
list(weather_templates_df['pattern'].values) + list(df_extended_yelp_intent_templates['pattern'].values)

In [None]:
# Function to generate random Yelp-style queries, a share of them lowercased
def generate_yelp_queries_with_case(df, cities, num_queries=10, lower_case_prob=0.3):
    """Sample unique local-business queries from weighted templates.

    Args:
        df: DataFrame with a 'pattern' column (str.format templates with one
            '{}' slot) and a 'weight' column (sampling weights).
        cities: sequence of city names used to fill the slot.
        num_queries: number of unique queries requested.
        lower_case_prob: probability that a generated query is lowercased.

    Returns:
        (queries, pattern_counter): the list of unique queries and a Counter of
        how many times each pattern was drawn (duplicate queries included).

    Each pattern may be drawn at most ``num_queries // 10 + 1`` times so that a
    heavily weighted pattern cannot dominate the sample. When every pattern has
    used up that budget before ``num_queries`` unique queries exist, the
    function prints a notice and returns what it has instead of looping
    forever.
    """
    # Use plain lists: random.choices indexes its population with integers,
    # which on a pandas Series is a label lookup and breaks (or picks the wrong
    # row) whenever the index is not 0..n-1. Patterns without a positive
    # weight can never be drawn, so they are dropped up front.
    patterns = []
    weights = []
    for pattern, weight in zip(df['pattern'].tolist(), df['weight'].tolist()):
        if weight > 0:
            patterns.append(pattern)
            weights.append(weight)

    per_pattern_cap = num_queries // 10
    queries = set()
    pattern_counter = Counter()

    while len(queries) < num_queries and patterns:
        # Choose a pattern based on the weights
        pattern = random.choices(patterns, weights=weights, k=1)[0]

        # Replace the placeholder in the pattern with a random city
        query = pattern.format(random.choice(cities))

        # Randomly convert the query to lowercase with the given probability
        if random.random() < lower_case_prob:
            query = query.lower()
        queries.add(query)

        # Every draw counts against the pattern's budget, whether or not it
        # produced a new query; an exhausted pattern is retired from the pool.
        pattern_counter[pattern] += 1
        if pattern_counter[pattern] > per_pattern_cap:
            keep = [i for i, candidate in enumerate(patterns) if candidate != pattern]
            patterns = [patterns[i] for i in keep]
            weights = [weights[i] for i in keep]

    if len(queries) < num_queries:
        print(
            f"Only {len(queries)} of {num_queries} requested queries were generated: "
            f"every pattern reached its limit of {per_pattern_cap + 1} draws"
        )

    return list(queries), pattern_counter

In [None]:
# Generate 3,000 unique Yelp-style queries, each lowercased with probability 0.4.
# NOTE: this reassigns `pattern_counter`, replacing the weather-generation counts.
sample_yelp_queries_with_case, pattern_counter = generate_yelp_queries_with_case(df_extended_yelp_intent_templates, dma_data, num_queries=3000, lower_case_prob=0.4) # previously tried with num_queries=10000

print(len(sample_yelp_queries_with_case))
sample_yelp_queries_with_case[:10]

In [None]:
# sample_yelp_queries_with_case

In [None]:
# How often each Yelp pattern was drawn during generation.
pattern_counter

In [None]:
# Label every generated Yelp-style query with its target intent.
yelp_examples = (
    pd.DataFrame(sample_yelp_queries_with_case, columns=['sequence'])
    .assign(target='yelp_intent')
)
yelp_examples

#### Purchase intent data augmentation

In [None]:
# Vocabulary used to fill the purchase-intent templates. Each list holds the
# candidate values for one "{placeholder}" category; the lists are wired to
# their placeholders in `product_categories` further down.

# --- Branded products -------------------------------------------------------
electronics = [
    'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'AirPods', 'Xbox Series X', 'Canon DSLR', 
    'GoPro', 'Fitbit', 'Google Pixel', 'Bose headphones', 'Sony TV', 'Apple Watch', 'Nintendo Switch', 
    'Kindle', 'Sony WH-1000XM4', 'Microsoft Surface', 'DJI Drone', 'Logitech Webcam', 'HP Spectre x360', 
    'Dell XPS 13', 'Roku streaming stick', 'Apple iPad', 'Samsung QLED TV', 'LG OLED TV', 
    'JBL Bluetooth speaker', 'Amazon Echo', 'Nest Thermostat', 'Oculus Quest 2', 'Tile Tracker', 
    'Sony PlayStation VR', 'Huawei MateBook', 'Garmin watch', 'Bose Soundbar', 'Canon mirrorless camera',
    'Nikon Coolpix', 'WD external hard drive', 'Seagate backup drive', 'Razer gaming mouse', 'Corsair keyboard'
]

home_appliances = [
    'Dyson vacuum', 'Roomba', 'KitchenAid mixer', 'Ninja air fryer', 'Instant Pot', 'LG refrigerator', 
    'Samsung washing machine', 'Whirlpool dryer', 'Panasonic microwave', 'Breville toaster oven', 
    'Miele dishwasher', 'Cuisinart coffee maker', 'GE oven', 'Philips air purifier', 'Hoover carpet cleaner', 
    'Honeywell thermostat', 'LG air conditioner', 'Bosch induction cooktop', 'Crock-Pot', 'Frigidaire freezer',
    'Black+Decker blender', 'Sunbeam iron', 'KitchenAid food processor', 'Keurig coffee maker', 'NutriBullet blender',
    'Magic Bullet', 'Hamilton Beach rice cooker', 'DeLonghi espresso machine', 'Dyson fan', 'Electrolux washer',
    'Whirlpool stove', 'Bissell vacuum', 'Toshiba microwave', 'Vitamix blender', 'Smart humidifier'
]

furnitures = [
    'Ikea sofa', 'West Elm dining table', 'La-Z-Boy recliner', 'Ashley bed frame', 'Herman Miller chair', 
    'CB2 bookshelf', 'Pottery Barn desk', 'Crate & Barrel coffee table', 'Sealy mattress', 'Serta sectional sofa',
    'Wayfair sideboard', 'RH leather chair', 'Flexsteel armchair', 'Sauder TV stand', 'Modway bar stool',
    'Tempur-Pedic mattress', 'Ikea wardrobe', 'Zinus platform bed', 'Ashley loveseat', 'AllModern bench',
    'Bed Bath & Beyond dresser', 'Tuft & Needle mattress', 'Article couch', 'Living Spaces end table',
    'West Elm armchair', 'Burrow sectional', 'Bloomingdale’s accent chair', 'Castlery coffee table', 'Raymour & Flanigan bookcase',
    'Casper mattress', 'Simmons futon', 'Sleep Number adjustable bed', 'Havertys recliner', 'Anthropologie console'
]


fashion_and_clothing = [
    'Nike shoes', 'Adidas sneakers', 'Levi’s jeans', 'Gucci handbag', 'Rolex watch', 'Ray-Ban sunglasses', 
    'Patagonia jacket', 'H&M dress', 'Michael Kors purse', 'North Face parka', 'Calvin Klein suit', 'Under Armour hoodie', 
    'Puma sneakers', 'Tommy Hilfiger t-shirt', 'Lululemon leggings', 'Vans skate shoes', 'Coach wallet', 'Fossil watch', 
    'Zara coat', 'Birkenstock sandals', 'Uniqlo turtleneck', 'Balenciaga sneakers', 'Supreme hoodie', 'Carhartt work pants', 
    'Burberry trench coat', 'Lacoste polo', 'Forever 21 dress', 'Mango blouse', 'Gap denim jacket',
    'Old Navy shorts', 'Lands’ End swimwear', 'Diesel jeans', 'Victoria’s Secret lingerie', 'Ralph Lauren blazer'
]

beauty_and_personal_care = [
    'Dior perfume', 'Chanel foundation', 'Neutrogena moisturizer', 'MAC lipstick', 'Olay anti-aging cream', 
    'Pantene shampoo', 'Gillette razor', 'Oral-B electric toothbrush', 'Clarisonic face brush', 'Nivea body lotion',
    'L’Oreal conditioner', 'Revlon hair dryer', 'Estee Lauder serum', 'Clinique cleanser', 'Philips hair trimmer',
    'Remington hair straightener', 'Aveeno sunscreen', 'Aveda hair oil', 'La Roche-Posay sunscreen', 'Anastasia eyebrow pencil',
    'Biore face wash', 'Urban Decay eye shadow', 'Maybelline mascara', 'Cetaphil cleanser', 'TRESemme conditioner',
    'Garnier micellar water', 'Rimmel lip gloss', 'Kiehl’s toner', 'Moroccan Oil treatment', 'Huda Beauty contour palette',
    'Tom Ford lipstick', 'Charlotte Tilbury foundation', 'Tatcha face mask', 'Fenty Beauty highlighter', 'Paula’s Choice exfoliant'
]


automotives = [
    'Tesla Model S', 'Ford Mustang', 'Chevrolet Camaro', 'Toyota Corolla', 'Honda Civic', 'BMW X5', 
    'Mercedes-Benz GLC', 'Jeep Wrangler', 'Ford F-150', 'Hyundai Tucson', 'Mazda CX-5', 'Volkswagen Jetta', 
    'Nissan Altima', 'Dodge Ram', 'Chevrolet Tahoe', 'Lexus RX', 'Kia Sorento', 'Subaru Outback', 'Volvo XC90', 
    'Cadillac Escalade', 'Audi Q5', 'Porsche Cayenne', 'Land Rover Defender', 'Toyota Highlander', 'Jaguar F-Pace',
    'Acura MDX', 'Chrysler Pacifica', 'Honda CR-V', 'Ram 1500', 'GMC Sierra', 'Alfa Romeo Stelvio', 'Lincoln Navigator'
]

household_items = [
    'Tide laundry detergent', 'Scotch-Brite sponges', 'Bounty paper towels', 'Clorox bleach', 'Ziploc bags', 
    'Swiffer mop', 'Mr. Clean Magic Eraser', 'Glad trash bags', 'Febreze air freshener', 'Lysol disinfectant spray',
    'Dawn dish soap', 'Windex glass cleaner', 'Arm & Hammer baking soda', 'Tupperware', 'Brita water filter',
    'O-Cedar mop', 'Scrub Daddy', 'Bounce dryer sheets', 'Hefty storage containers', 'Method all-purpose cleaner',
    'Cascade dishwasher pods', 'Pledge furniture polish', 'Comet bathroom cleaner', 'Woolite laundry detergent',
    'Soft Scrub cleaner', 'Reynolds Wrap foil', 'Cling film wrap', 'Magic Zipper bags', 'Pine-Sol floor cleaner',
    'OxiClean stain remover', 'Scotch tape', 'Command hooks', 'Tide Pods', 'Microfiber cloths'
]

toys_and_games = [
    'LEGO sets', 'Barbie dolls', 'Hot Wheels cars', 'Nerf blasters', 'Fisher-Price playsets', 'Monopoly board game', 
    'Jenga', 'Uno card game', 'Crayola coloring kits', 'Play-Doh sets', 'Marvel action figures', 'RC cars', 
    'Beyblade', 'Transformers toys', 'Super Soaker water guns', 'Paw Patrol toys', 'My Little Pony dolls', 
    'Magic: The Gathering cards', 'Lego Mindstorms', 'Nintendo Switch games', 'Fortnite Nerf blasters', 'Scrabble', 
    'Guess Who?', 'Minecraft LEGOs', 'Funko Pop figures', 'Mega Bloks', 'Hasbro puzzles', 'FurReal Pets', 'LOL Surprise dolls',
    'Disney Princess dolls', 'Harry Potter LEGO', 'X-shot blasters', 'Playmobil sets', 'Star Wars LEGO sets'
]

books_and_media = [
    'Harry Potter books', 'The Lord of the Rings', 'The Great Gatsby', 'To Kill a Mockingbird', '1984 by George Orwell', 
    'The Catcher in the Rye', 'The Hunger Games', 'Game of Thrones', 'Twilight series', 'Sherlock Holmes novels',
    'The Da Vinci Code', 'The Alchemist', 'The Chronicles of Narnia', 'Percy Jackson series', 'The Maze Runner',
    'The Girl with the Dragon Tattoo', 'Moby Dick', 'Pride and Prejudice', 'The Handmaid’s Tale', 'The Witcher series',
    'Outlander series', 'Dracula', 'Little Women', 'Gone with the Wind', 'Dune', 'The Hobbit', 'Fifty Shades of Grey', 
    'The Shining', 'The Road', 'Jurassic Park', 'Catch-22', 'The Time Traveler’s Wife', 'The Giver', 'The Color Purple', 'Beloved'
]

sport_equipments = [
    'Nike soccer ball', 'Wilson tennis racket', 'Adidas football cleats', 'Spalding basketball', 'Under Armour workout gloves', 
    'Yonex badminton racket', 'Callaway golf clubs', 'Fitbit fitness tracker', 'Everlast boxing gloves', 'Wilson baseball glove',
    'Babolat tennis shoes', 'Reebok CrossFit gear', 'Nike running shoes', 'Speedo swim goggles', 'Bauer hockey skates',
    'Garmin GPS watch', 'Rawlings baseball bat', 'Easton batting gloves', 'Columbia hiking boots', 'Asics running shoes',
    'NordicTrack treadmill', 'ProForm elliptical', 'Bowflex dumbbells', 'Schwinn stationary bike', 'Theragun massager',
    'Rogue kettlebells', 'Concept2 rower', 'Under Armour mouthguard', 'Lululemon yoga mat', 'Sklz agility ladder'
]

# --- Generic (unbranded) products -------------------------------------------
gifts = [
    'custom gifts', 'gift cards', 'personalized mugs', 'engraved jewelry', 'photo frames', 'custom t-shirts', 
    'personalized blankets', 'engraved watches', 'photo books', 'custom calendars', 'digital photo frames', 
    'chocolate gift boxes', 'flower bouquets', 'monogrammed bags', 'engraved rings', 'handmade candles',
    'subscription boxes', 'custom puzzles', 'personalized stationery', 'custom keychains'
]

hunting_equipment = [
    'crossbow', 'compound bow', 'hunting knives', 'camouflage clothing',
    'deer stand', 'trail camera', 'hunting boots', 'binoculars', 
    'rangefinder', 'backpack for hunting'
]

eyewear = [
    'prescription glasses', 'sunglasses', 'blue light blocking glasses',
    'bifocals', 'transition lenses', 'polarized sunglasses',
    'contact lenses', 'eyeglass frames', 'sports glasses', 'reading glasses'
]

supplements = [
    'magnesium taurate', 'vitamin D', 'fish oil', 'multivitamins',
    'probiotics', 'protein powder', 'collagen', 'iron supplements',
    'zinc supplements', 'calcium supplements'
]

pet_supplies = [
    'dog food', 'cat food', 'pet beds', 'dog treats', 'pet grooming kits',
    'cat litter', 'dog toys', 'cat scratchers', 'pet carriers', 'pet feeders'
]

bedding = [
    'queen size bedspreads', 'king size comforter sets', 'sheets',
    'duvet covers', 'pillows', 'mattress protectors', 'weighted blankets',
    'electric blankets', 'bamboo sheets', 'silk pillowcases'
]
# Generic counterparts of some of the branded categories above.
kitchen_appliance = [
    'blender', 'air fryer', 'pressure cooker', 'food processor',
    'stand mixer', 'toaster oven', 'microwave', 'coffee maker',
    'deep fryer', 'slow cooker'
]

automotive_parts = [
    'tires', 'car batteries', 'carburetors', 'brake pads', 
    'windshield wipers', 'car mats', 'air filters', 'engine oil',
    'spark plugs', 'headlights'
]

tech_accessories = [
    'phone case', 'charging cable', 'laptop sleeve', 'wireless charger',
    'screen protector', 'portable battery', 'USB hub', 'headphone adapter',
    'keyboard cover', 'stylus pen'
]

fitness_equipment = [
    'treadmill', 'dumbbells', 'resistance bands', 'exercise bike',
    'yoga mat', 'pull-up bar', 'rowing machine', 'kettlebell',
    'weight bench', 'jump rope'
]

seasonal_products = [
    'Christmas tree', 'holiday lights', 'Halloween costumes',
    'summer outdoor furniture', 'Thanksgiving decorations',
    'winter jackets', 'snow blowers', 'Easter baskets', 'grills', 'pool accessories'
]

# --- Retailers, events and artists (used by the ticketing templates) ---------
platform = [
    'Amazon', 'eBay', 'Walmart', 'Best Buy', 'Target',
    'Apple Store', 'Google Store', 'Newegg', 'B&H', 'Costco',
]    


event = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Comic-Con', 'The Oscars',
    'Super Bowl', 'World Series', 'NBA Finals', 'Wimbledon', 'Grammy Awards',
]

# Note: the first three entries also appear in `event`.
festival = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Tomorrowland', 'SXSW',
    'Glastonbury', 'Oktoberfest', 'Mardi Gras', 'Cannes Film Festival', 'Sundance Film Festival',
    'Ultra Music Festival', 'New Orleans Jazz & Heritage Festival', 'Austin City Limits', 'Bonnaroo', 'Electric Daisy Carnival',
    'Stagecoach', 'Summerfest', 'Essence Festival', 'Rock in Rio', 'Woodstock',
]

artist = [
    'Taylor Swift', 'Beyoncé', 'Ed Sheeran', 'Drake', 'Ariana Grande',
    'Billie Eilish', 'The Weeknd', 'Justin Bieber', 'Kanye West', 'Rihanna',
    'Bruno Mars', 'Shawn Mendes', 'Dua Lipa', 'Travis Scott', 'Lady Gaga',
    'Post Malone', 'Harry Styles', 'Adele', 'Coldplay', 'Imagine Dragons',
]


In [None]:
# Purchase-intent query templates. Every "{name}" placeholder must be a key of
# `product_categories` (e.g. "{electronics}"), otherwise it cannot be filled
# and would end up verbatim in the generated queries. Two templates used to
# contain such unfillable placeholders ("{device}" and "{season}"); they now
# use "{electronics}" and a fixed wording instead.
purchase_intent_templates = [
    # Electronics Purchase Intent
    "{electronics} price",
    "Where to buy {electronics} online?",
    "Best deals on {electronics} this year",
    "Is {electronics} worth buying in 2024?",
    "Discounts available for {electronics}?",
    # "How to repair {electronics} at home?",
    "Which is better: {electronics} or {electronics}?",
    "Where to buy used {electronics}?",
    "Is {electronics} in stock near me?",
    "What {electronics} make the best gifts?",
    "Top-rated {electronics} to buy as a gift",
    "Is {electronics} available with free shipping?",
    "What stores sell {electronics} with warranties?",
    "Where to find refurbished {electronics}?",
    "How long does {electronics} last?",
    "Can I get extended warranty for {electronics}?",

    # Home Appliances Purchase Intent
    "{home_appliance} price",
    "Where to buy {home_appliance} at the best price?",
    "How to repair a {home_appliance}?",
    "Best deals on {home_appliance} right now",
    "What are the reviews for {home_appliance}?",
    "Should I upgrade my {home_appliance} this year?",
    "Compare {home_appliance} with {home_appliance} for best value",
    "Are refurbished {home_appliance} worth buying?",
    "Where can I find {home_appliance} available now?",
    "Most energy-efficient {home_appliance} in 2024",
    "Best {home_appliance} for small spaces",
    "How to maintain a {home_appliance}?",
    "Top stores to buy {home_appliance} with discounts",

    # Furniture Purchase Intent
    "{furniture} price",
    "What is the best {furniture} for a small space?",
    "Where to buy affordable {furniture} online?",
    "Best places to buy {furniture} for my home",
    "Top-rated {furniture} on sale this weekend",
    "How to assemble {furniture} yourself",
    "Best {furniture} to buy as gifts for new homeowners",
    "Are {furniture} available for same-day delivery?",
    "What {furniture} brands have the best quality?",
    "Where to buy modern {furniture} for a living room?",
    "What’s trending in {furniture} for 2024?",
    "Top-rated stores for {furniture} deals",
    "How to clean and maintain {furniture}?",

    # Fashion and Clothing Purchase Intent
    "{fashion_and_clothing} price",
    "What are the latest deals on {fashion_and_clothing}?",
    "Where to buy {fashion_and_clothing} online?",
    "Top-rated {fashion_and_clothing} for this season",
    "Best styles of {fashion_and_clothing} in 2024",
    "Is {fashion_and_clothing} worth the price?",
    "What {fashion_and_clothing} brands are best for longevity?",
    "Where can I buy sustainable {fashion_and_clothing}?",
    "Is {fashion_and_clothing} available in plus sizes?",
    "Can I return {fashion_and_clothing} if it doesn’t fit?",
    "Where to find discounts on designer {fashion_and_clothing}?",

    # Beauty and Personal Care Purchase Intent
    "{beauty_and_personal_care} price",
    "Best {beauty_and_personal_care} products to buy this year",
    "Where to buy {beauty_and_personal_care} online?",
    "Top reviews for {beauty_and_personal_care} products",
    "Are {beauty_and_personal_care} products worth it?",
    "What are the best deals for {beauty_and_personal_care}?",
    "How to get a subscription for {beauty_and_personal_care} products?",
    "What stores sell natural {beauty_and_personal_care} products?",
    "Are {beauty_and_personal_care} available for sensitive skin?",
    "Top-rated {beauty_and_personal_care} for aging skin",
    "What are the best organic {beauty_and_personal_care} products?",

    # Automotive Purchase Intent
    "{automotive} price",
    "Is {automotive} a good car to buy?",
    "Best deals on {automotive} this year",
    "Where to buy {automotive} accessories?",
    "How to finance a new {automotive}?",
    "Top-rated {automotive} models in 2024",
    "What’s the lifespan of {automotive}?",
    "Where to find certified pre-owned {automotive}?",
    "Is {automotive} reliable for long drives?",
    "Can I test drive {automotive} near me?",
    "What’s the fuel efficiency of {automotive}?",

    # Household Items Purchase Intent
    "{household_item} price",
    "What are the top-rated {household_item} this year?",
    "Where to buy {household_item} online?",
    "How to get discounts on {household_item}?",
    "Are {household_item} worth buying?",
    "Top stores for {household_item} deals",
    "Eco-friendly {household_item} options available",
    "How to clean {household_item} properly?",
    "Top-rated {household_item} for allergies",
    "Where to find bulk deals on {household_item}?",

    # Toys and Games Purchase Intent
    "{toys_and_games} price",
    "Where to buy {toys_and_games} for kids?",
    "Best reviews for {toys_and_games}",
    "What are the best prices for {toys_and_games}?",
    "What are the top {toys_and_games} for Christmas?",
    "Top-rated {toys_and_games} on sale",
    "Best {toys_and_games} for educational purposes",
    "Where to find interactive {toys_and_games} for learning?",
    "Are {toys_and_games} safe for children under 5?",
    "What {toys_and_games} are best for birthdays?",

    # Books and Media Purchase Intent
    "{books_and_media} price",
    "Best places to buy {books_and_media} online",
    "What are the reviews for {books_and_media}?",
    "Is {books_and_media} worth buying?",
    "Top-rated {books_and_media} for this year",
    "What are the best deals on {books_and_media}?",
    "Where to buy a subscription for {books_and_media}?",
    "Are {books_and_media} available in eBook format?",
    "Is there an audiobook version of {books_and_media}?",
    "Can I find {books_and_media} in my local library?",

    # Sports Equipment Purchase Intent
    "{sport_equipments} price",
    "What are the best {sport_equipments} to buy?",
    "Where can I find discounts on {sport_equipments}?",
    "Top-rated {sport_equipments} for 2024",
    "Where to buy {sport_equipments} online?",
    "Are {sport_equipments} worth the price?",
    "What are the best deals on {sport_equipments}?",
    "Reviews of {sport_equipments} from users",
    "Where to buy {sport_equipments} for beginners?",
    "What are the must-have {sport_equipments} for athletes?",
    "Top stores offering deals on {sport_equipments}",
    "Can I rent {sport_equipments} instead of buying?",
    "What are the best {sport_equipments} for home training?",

    # Gift-Related Queries
    "{gifts} price",
    "Where to buy {gifts} online?",
    "Best deals on {gifts} this holiday season",
    "What {gifts} make the best presents?",
    "How to personalize {gifts} for special occasions?",
    "Are {gifts} available for same-day delivery?",
    "Top personalized {gifts} for anniversaries",
    "Unique {gifts} ideas for birthdays",
    "Affordable {gifts} for holidays",
    "Can I gift-wrap {gifts} at checkout?",
    "How to create custom {gifts} online?",

    # Hunting Equipment Queries
    "{hunting_equipment} price",
    "Best {hunting_equipment} for deer hunting",
    "Where to buy {hunting_equipment} online?",
    "What are the top {hunting_equipment} brands?",
    "How to maintain {hunting_equipment}?",
    "Where can I rent {hunting_equipment}?",
    "Top safety tips for using {hunting_equipment}",
    "What’s the best {hunting_equipment} for beginners?",

    # Eyewear Queries
    "{eyewear} price",
    "Where to order {eyewear} online?",
    "Best prices for {eyewear}",
    "Are {eyewear} available with insurance coverage?",
    "Top-rated {eyewear} for outdoor sports",
    "Where to get prescription {eyewear}?",
    "Which brands make the most durable {eyewear}?",
    "What are the best {eyewear} for UV protection?",

    # Supplements Queries
    "{supplements} price",
    "Where to buy {supplements} for health?",
    "Top reviews for {supplements}",
    # "What are the benefits of {supplements}?",
    "How to find discounts on {supplements}?",
    "Can I get a subscription for {supplements}?",
    "What are the best {supplements} for immunity?",
    "Are {supplements} safe for daily use?",
    "Top stores offering deals on {supplements}",

    # Pet Supplies Queries
    "{pet_supplies} price",
    "Where to buy {pet_supplies} online?",
    "What are the best {pet_supplies} for dogs?",
    "Top-rated {pet_supplies} for cats",
    "How to get discounts on {pet_supplies}?",
    "Are {pet_supplies} available for same-day delivery?",
    "What are the best eco-friendly {pet_supplies}?",
    "Can I subscribe to auto-delivery for {pet_supplies}?",

    # Bedding Queries
    "{bedding} price",
    "Where to buy {bedding} on sale?",
    "Best {bedding} for a comfortable night's sleep",
    "How to choose {bedding} for different seasons?",
    "Are {bedding} available for delivery today?",
    "What are the most luxurious {bedding} brands?",
    "Best {bedding} for people with allergies",
    "What’s the best material for {bedding}?",
    "How to wash {bedding} properly?",

    # Kitchen Appliance Queries
    "{kitchen_appliance} price",
    "Best deals on {kitchen_appliance} this year",
    "Where to buy {kitchen_appliance} online?",
    "How to repair {kitchen_appliance} at home?",
    "Are {kitchen_appliance} worth buying refurbished?",
    "Most energy-efficient {kitchen_appliance}",
    "How to clean and maintain {kitchen_appliance}?",
    "Is {kitchen_appliance} available for same-day delivery?",

    # Automotive Parts Queries
    "{automotive_parts} price",
    "Where to buy {automotive_parts} for my car?",
    "Best deals on {automotive_parts} this year",
    "How to install {automotive_parts}?",
    "Is {automotive_parts} in stock near me?",
    "What {automotive_parts} are compatible with my car?",
    "How to maintain {automotive_parts} for longevity?",
    "Top-rated {automotive_parts} for safety",

    # Tech Accessories Queries
    "{tech_accessories} price",
    "Where to buy {tech_accessories} online?",
    "Best {tech_accessories} for my {electronics}",
    "What are the reviews for {tech_accessories}?",
    # was "{device}", which has no product list; devices are the electronics
    "Are {tech_accessories} compatible with {electronics}?",
    "How to find durable {tech_accessories}?",
    "What are the best {tech_accessories} for travel?",
    "Are {tech_accessories} available in stores near me?",

    # Fitness Equipment Queries
    "{fitness_equipment} price",
    "Best {fitness_equipment} to buy for home gym",
    "Where to find {fitness_equipment} deals?",
    "Top-rated {fitness_equipment} for 2024",
    "What are the must-have {fitness_equipment}?",
    "What {fitness_equipment} are best for beginners?",
    "How to maintain {fitness_equipment} at home?",

    # Seasonal Products Queries
    "{seasonal_products} price",
    "Where to buy {seasonal_products} during the holiday season?",
    # was "for {season}", which has no value list to draw from
    "Best {seasonal_products} for this season",
    "Are {seasonal_products} available for same-day delivery?",
    "Top-rated {seasonal_products} for this year",
    "Where to find discounts on {seasonal_products}?",
    "How to store {seasonal_products} for next season?",
    "Are there any eco-friendly {seasonal_products}?",

    # Events & Ticketing
    "{artist} price",
    "find concert tickets for {artist} on {platform}",
    "how to book tickets for {event}?",
    # "find the best seats for concert",
    "how to get discounts for {festival} tickets?",
    "ticket refund policy for {platform}",
]


In [None]:
# Number of purchase-intent templates available for sampling.
len(purchase_intent_templates)

In [None]:
# Placeholder strings (braces included) exactly as they appear in
# `purchase_intent_templates`.
ELECTRONICS_PURCHASE = "{electronics}"
HOME_APPLIANCES_PURCHASE = "{home_appliance}"
FURNITURES_PURCHASE = "{furniture}"
FASHION_CLOTHING_PURCHASE = "{fashion_and_clothing}"
BEAUTY_AND_PERSONAL_CARE_PURCHASE = "{beauty_and_personal_care}"
AUTOMOTIVE_PURCHASE = "{automotive}"
HOUSEHOLD_ITEMS_PURCHASE = "{household_item}"
TOYS_AND_GAMES_PURCHASE = "{toys_and_games}"
BOOKS_AND_MEDIA_PURCHASE = "{books_and_media}"
SPORTS_EQUIPMENT_PURCHASE = "{sport_equipments}"
GIFTS_PURCHASE = "{gifts}"
HUNTING_EQUIPMENT_PURCHASE = "{hunting_equipment}"
EYEWEAR_PURCHASE = "{eyewear}"
SUPPLEMENTS_PURCHASE = "{supplements}"
PET_SUPPLIES_PURCHASE = "{pet_supplies}"
BEDDING_PURCHASE = "{bedding}"
KITCHEN_APPLIANCE_PURCHASE = "{kitchen_appliance}"
AUTOMOTIVE_PARTS_PURCHASE = "{automotive_parts}"
TECH_ACCESSORIES_PURCHASE = "{tech_accessories}"
FITNESS_EQUIPMENT_PURCHASE = "{fitness_equipment}"
SEASONAL_PRODUCTS_PURCHASE = "{seasonal_products}"
PLATFORM_PURCHASE = "{platform}"
EVENT_PURCHASE = "{event}"
FESTIVAL_PURCHASE = "{festival}"
ARTIST_PURCHASE = "{artist}"

# Maps each placeholder to the list of values that may replace it.
# The insertion order matters: `detect_product` returns the first key found in
# a template, so for a template with two different placeholders the key listed
# earlier here wins (e.g. "{platform}" is listed before "{artist}").
product_categories = {
    ELECTRONICS_PURCHASE: electronics,
    HOME_APPLIANCES_PURCHASE: home_appliances,
    FURNITURES_PURCHASE: furnitures,
    FASHION_CLOTHING_PURCHASE: fashion_and_clothing,
    BEAUTY_AND_PERSONAL_CARE_PURCHASE: beauty_and_personal_care,
    AUTOMOTIVE_PURCHASE: automotives,
    HOUSEHOLD_ITEMS_PURCHASE: household_items,
    TOYS_AND_GAMES_PURCHASE: toys_and_games,
    BOOKS_AND_MEDIA_PURCHASE: books_and_media,
    SPORTS_EQUIPMENT_PURCHASE: sport_equipments,
    GIFTS_PURCHASE: gifts,
    HUNTING_EQUIPMENT_PURCHASE: hunting_equipment,
    EYEWEAR_PURCHASE: eyewear,
    SUPPLEMENTS_PURCHASE: supplements,
    PET_SUPPLIES_PURCHASE: pet_supplies,
    BEDDING_PURCHASE: bedding,
    KITCHEN_APPLIANCE_PURCHASE: kitchen_appliance,
    AUTOMOTIVE_PARTS_PURCHASE: automotive_parts,
    TECH_ACCESSORIES_PURCHASE: tech_accessories,
    FITNESS_EQUIPMENT_PURCHASE: fitness_equipment,
    SEASONAL_PRODUCTS_PURCHASE: seasonal_products,
    PLATFORM_PURCHASE: platform,
    EVENT_PURCHASE: event,
    FESTIVAL_PURCHASE: festival,
    ARTIST_PURCHASE: artist,
}

def detect_product(product_categories, template):
    """Return the first placeholder key of ``product_categories`` found in ``template``.

    Args:
        product_categories: Mapping whose keys are placeholder tokens such as
            ``"{electronics}"``.
        template: Template string to inspect.

    Returns:
        The first key (in mapping iteration order) that occurs as a substring
        of ``template``, or ``None`` when the template contains none of them.
    """
    for category in product_categories:
        if category in template:
            return category
    return None


def generate_queries(templates, n_queries=1000, categories=None, max_attempts=None):
    """Generate up to ``n_queries`` unique queries by filling template placeholders.

    Each attempt picks a random template, finds the first known placeholder in
    it and replaces every occurrence of that placeholder with one randomly
    chosen value. Only the first detected placeholder is substituted, so a
    template holding several different placeholders keeps the others verbatim.

    Args:
        templates: Non-empty sequence of template strings.
        n_queries: Number of distinct queries wanted.
        categories: Mapping ``placeholder -> sequence of values``. Defaults to
            the notebook-level ``product_categories``.
        max_attempts: Upper bound on random draws. Defaults to
            ``max(1000, 100 * n_queries)``. The bound guarantees termination
            when fewer than ``n_queries`` unique queries can be produced.

    Returns:
        List of unique query strings in generation order. It may be shorter
        than ``n_queries`` if ``max_attempts`` is exhausted first; a message is
        printed in that case.
    """
    if categories is None:
        # Resolved at call time so the notebook default keeps working.
        categories = product_categories
    if max_attempts is None:
        max_attempts = max(1000, 100 * n_queries)

    queries = []
    query_set = set()
    attempts = 0
    while len(queries) < n_queries and attempts < max_attempts:
        attempts += 1
        template = random.choice(templates)
        category = detect_product(categories, template)
        if category is None:
            # No known placeholder: the template is already a complete query.
            query = template
        else:
            product = random.choice(categories[category])
            query = template.replace(category, product)
        if query in query_set:
            continue
        queries.append(query)
        query_set.add(query)
        # Report progress only when a new query was actually added.
        if len(queries) % 500 == 0:
            print(f"{len(queries)} examples added")

    if len(queries) < n_queries:
        print(
            f"Stopped after {attempts} attempts with {len(queries)} unique queries "
            f"(requested {n_queries})"
        )
    return queries

In [None]:
# Build 5,100 distinct purchase-intent queries from the purchase templates.
purchase_intent_queries = generate_queries(purchase_intent_templates, n_queries=5100)

In [None]:
# Sanity check: how many purchase-intent queries were generated.
len(purchase_intent_queries)

In [None]:
# One row per generated query, all labelled with the 'purchase_intent' target.
purchase_intent_examples = pd.DataFrame({'sequence': purchase_intent_queries}).assign(
    target='purchase_intent'
)
purchase_intent_examples

In [None]:
import json 

def get_geonames_city_state_data(geonames_file="../data/geonames-cities-states.json"):
    """Load the GeoNames city/state JSON and return one row per city with its state.

    Args:
        geonames_file: Path to the JSON file. The file must contain a
            ``"cities"`` list of records with at least ``id``, ``admin1_code``,
            ``name``, ``population`` and ``alternate_names``, and a
            ``"states_by_abbr"`` mapping whose values are records with at
            least ``admin1_code`` and ``name``.

    Returns:
        DataFrame with columns ``id``, ``state_code``, ``city_name``,
        ``city_popln``, ``alternate_names``, ``state_name`` and
        ``city_weight``. ``city_weight`` is the city's share of the total
        population in the file. Cities whose ``state_code`` has no match keep
        a missing ``state_name`` because the merge is a left join.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(geonames_file, 'r', encoding='utf-8') as f:
        geonames_dict = json.load(f)

    cities_data = pd.DataFrame(geonames_dict['cities']).rename(
        columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'}
    )
    cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]

    states_data = pd.DataFrame(list(geonames_dict['states_by_abbr'].values())).rename(
        columns={'admin1_code': 'state_code', 'name': 'state_name'}
    )
    states_data = states_data[['state_code', 'state_name']]

    city_states_data = cities_data.merge(states_data, how='left', on='state_code')
    city_states_data['city_weight'] = city_states_data['city_popln'] / city_states_data['city_popln'].sum()
    return city_states_data

In [None]:
# Load the merged city/state table once, then derive the lookups used for sampling.
city_states_data = get_geonames_city_state_data()
# city name -> population share
city_weights = city_states_data.set_index('city_name')['city_weight'].to_dict()
# Separate frames for the "City, ST" and "City, State name" output formats.
city_state_code_info = city_states_data.loc[:, ['city_name', 'state_code', 'city_weight']].copy()
city_state_name_info = city_states_data.loc[:, ['city_name', 'state_name', 'city_weight']].copy()

In [None]:
def get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8):
    """Draw one weighted "City, State" string.

    A uniform random number decides the format: when it is at most
    ``state_code_threshold`` the state is taken from the ``state_code`` column
    of ``city_state_code_info``, otherwise from the ``state_name`` column of
    ``city_state_name_info``. The row itself is sampled with the frame's
    ``city_weight`` column as sampling weight.

    Returns:
        ``"<city_name>, <state_code>"`` or ``"<city_name>, <state_name>"``.
    """
    use_state_code = random.random() <= state_code_threshold
    if use_state_code:
        frame, columns = city_state_code_info, ['city_name', 'state_code']
    else:
        frame, columns = city_state_name_info, ['city_name', 'state_name']
    picked = frame.sample(1, weights='city_weight', replace=True)
    first_row = picked[columns].values.tolist()[0]
    return ', '.join(first_row)

# Smoke test: a single draw (a string). Note that `city_state` is rebound to a list in a later cell.
city_state=get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)

In [None]:
# Display the sampled "City, State" string.
city_state

In [None]:
from collections import Counter

# Draw 10,000 weighted samples and tally how often each "City, State" string appears.
city_states_counter = Counter(
    get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)
    for _ in range(10000)
)

# Keep only the 200 most frequently drawn strings as the substitution pool.
city_state = [label for label, _count in city_states_counter.most_common(200)]

In [None]:
# Display the most frequently sampled "City, State" strings.
city_state


#### Additional augmented Yelp-intent queries

In [None]:
# Value pools for the Yelp-intent templates: one list per service category.
# Each list is later registered in `service_categories` under the placeholder
# of the same name (e.g. "{home_maintenance_repair}").

# Home maintenance and repair services / trades.
home_maintenance_repair = [
    'roofing', 'flooring', 'plumbing', 'house painting', 'carpet installation',
    'hardwood floor refinishing', 'drywall repair', 'electrical services', 'window installation',
    'HVAC services', 'fencing', 'roof replacement', 'gutter repair', 'kitchen renovation',
    'bathroom remodeling', 'exterior painting', 'interior painting', 'concrete repair',
    'driveway paving', 'deck repair', 'plumber', 'electrician',
]

# Moving and storage services.
moving_storage = [
    'local movers', 'long-distance movers', 'packing services', 'furniture movers',
    'storage solutions', 'cross-country movers', 'apartment moving', 'pool table moving',
    'interstate movers', 'moving truck rental', 'moving labor', 'junk removal',
    'relocation services', 'packing supplies', 'office movers', 'vehicle shipping',
    'moving container rental', 'small move services', 'senior moving services', 'pet relocation'
]

# Dishes, cuisines and restaurant types.
restaurants_food = [
    'sushi', 'tacos', 'pizza', 'burgers', 'ramen', 'pasta', 'salads', 'barbecue', 'ice cream',
    'acai bowls', 'vegan food', 'steakhouses', 'buffet restaurants', 'seafood restaurants',
    'brunch spots', 'food trucks', 'fast food', 'diner', 'Mexican restaurants', 'Italian restaurants',
    'fried chicken', 'hot dogs', 'donuts', 'bagels', 'barbecue chicken',
    'buffalo wings', 'grilled cheese', 'cheesesteak', 'poutine', 'meatloaf',
    'fried fish', 'soul food', 'dim sum', 'dumplings', 'tapas',
    'Indian restaurants', 'Mediterranean food', 'Korean BBQ', 'Vietnamese pho', 'crepes',
    'cupcakes'

]

# Health and wellness providers (shares several entries with beauty_personal_care).
health_wellness = [
    'dermatologists', 'dentists', 'optometrists', 'pediatricians', 'pharmacies',
    'acupuncture', 'chiropractors', 'physical therapy', 'massage therapy', 'eyebrow threading',
    'laser hair removal', 'facials', 'hair salons', 'nail salons', 'spas', 'mental health counseling',
    'cosmetic surgery', 'nutritionists', 'fitness trainers', 'wellness centers', 'yoga', 'pilates',
]

# Car repair and other automotive services.
car_repair_automotive_services = [
    'oil change', 'tire replacement', 'brake repair', 'car inspection', 'car detailing',
    'transmission repair', 'engine diagnostics', 'battery replacement', 'alignment services',
    'auto body repair', 'windshield replacement', 'car wash', 'wheel alignment', 'car painting',
    'exhaust system repair', 'auto glass repair', 'AC repair', 'tune-up services', 'car rental', 'car towing'
]

# Cleaning services.
cleaning_services = [
    'house cleaning', 'deep cleaning', 'maid services', 'disinfection and sanitization', 'carpet cleaning',
    'window cleaning', 'move-out cleaning', 'office cleaning', 'apartment cleaning', 'laundry services',
    'floor cleaning', 'pressure washing', 'garage cleaning', 'post-construction cleaning',
    'air duct cleaning', 'roof cleaning', 'tile and grout cleaning', 'upholstery cleaning',
    'yard cleaning', 'organizing services'
]

# Entertainment venues and activities (singular and plural variants are both listed).
entertainment_activities = [
    'bowling', 'karaoke', 'movie theaters', 'mini-golf', 'amusement parks', 'live music venues', 'escape rooms',
    'arcade', 'arcades', 'zoos', 'aquariums', 'water parks', 'comedy clubs', 'museums', 'laser tag', 'go-karts', 
    'roller skating', 'trampoline parks', 'horseback riding', 'batting cages', 'rock climbing gyms', 'sports bar',
    'sports bars','karaoke bar',
]

# Beauty and personal-care services.
beauty_personal_care = [
    'hair salons', 'barbershops', 'nail salons', 'spas', 'eyebrow threading', 'facials', 'microblading',
    'laser hair removal', 'brow lamination', 'spray tanning', 'makeup artists', 'cosmetic surgery',
    'waxing services', 'beauty salons', 'eyelash extensions', 'massage therapy', 'piercing', 'acne treatments',
    'dermatology', 'body sculpting'
]

# Specialty shops and miscellaneous local services.
specialty_shops_services = [
    'embroidery services', 'custom painting', 'interior design', 'furniture restoration', 'florists', 
    'tailors', 'wedding planners', 'personal chefs', 'home organizers', 'antique shops', 'handyman services',
    'custom cabinet makers', 'fence installation', 'deck building', 'security system installation', 'pest control',
    'landscaping services', 'pet grooming', 'art restoration', 'personal trainers'
]

# Informal city references: airport-style codes / nicknames followed by the
# plain lowercase city name. Multi-word names are written with spaces
# ("san francisco", "san antonio", "salt lake city"); "san diego" previously
# lacked its space ("sandiego"), which is fixed here.
city_short = ["sf", "sfo", "san francisco",
              "nyc", "new york",
              "la", "lax",
              "chi", "chicago",
              "hou", "houston",
              "mia", "miami",
              "vegas", "lv",
              "bos", "boston",
              "sea", "seattle",
              "atl", "atlanta",
              "dfw", "dallas",
              "dc", "washington",
              "philly", "philadelphia",
              "phx", "phoenix",
              "sd", "san diego",
              "den", "denver",
              "orl", "orlando",
              "atx", "austin",
              "nash", "nashville",
              "pdx", "portland",
              "nola", "new orleans",
              "sat", "san antonio",
              "clt", "charlotte",
              "det", "detroit",
              "tpa", "tampa",
              "balt", "baltimore",
              "cle", "cleveland",
              "mpls", "minneapolis",
              "slc", "salt lake city",
              "indy", "indianapolis",
              "kc", "kansas city",
]


In [None]:
# Download the quicksuggest attachment from the Firefox settings CDN (needs network access)
# and keep the 'subjects' entry of its first record.
yelp_keywords_data = pd.read_json("https://firefox-settings-attachments.cdn.mozilla.net/main-workspace/quicksuggest/33987d71-9e87-4b7e-86d3-6f292b89e8bf.json")['subjects'].values[0]

In [None]:
# Full-slice of the downloaded keywords (a shallow copy if this is a list — TODO confirm the type).
general_yelp_keyword = yelp_keywords_data[::]

In [None]:
# Number of general Yelp keywords available for substitution.
len(general_yelp_keyword)

In [None]:
# Query templates for the Yelp intent. Every "{name}" placeholder is expected to
# be a key of `service_categories`; `generate_service_queries` replaces each one
# with a randomly chosen value. The commented-out entries are the bare
# "<city> <service>" / "<service> <city>" variants, currently disabled.
yelp_intent_additional_templates = [
    # Home Maintenance & Repair
    "Find {home_maintenance_repair} near me",
    "Best {home_maintenance_repair} providers in {city_state}",
    "Affordable {home_maintenance_repair} in {city_state}",
    "Top-rated {home_maintenance_repair} companies",
    "How much does {home_maintenance_repair} cost?",
    "Compare reviews of {home_maintenance_repair} providers",
    # "{city_short} {home_maintenance_repair}",
    # "{home_maintenance_repair} {city_short}",
    # "{city_state} {home_maintenance_repair}",
    # "{home_maintenance_repair} {city_state}",
    "{home_maintenance_repair}",
    "{home_maintenance_repair} in {city_state}",
    

    # Moving & Storage
    "Best {moving_storage} for long distance moving",
    "Find a local {moving_storage} company",
    "Compare prices for {moving_storage} in {city_state}",
    "Reviews of {moving_storage} companies near me",
    "How to hire {moving_storage} for a small move",
    "Best-rated {moving_storage} companies",
    # "{city_short} {moving_storage}",
    # "{moving_storage} {city_short}",
    # "{city_state} {moving_storage}",
    # "{moving_storage} {city_state}",
    "{moving_storage}",

    # Restaurants & Food
    "Best {restaurants_food} near me",
    "Top {restaurants_food} reviews in {city_state}",
    "Affordable {restaurants_food} options near me",
    "Where to find the best {restaurants_food} in {city_state}?",
    "Compare {restaurants_food} reviews in {city_state}",
    "5-star {restaurants_food} recommendations",
    # "{city_short} {restaurants_food}",
    # "{restaurants_food} {city_short}",
    # "{city_state} {restaurants_food}",
    # "{restaurants_food} {city_state}",
    "{restaurants_food}",

    # Health & Wellness
    "Find {health_wellness} near me",
    "Best-rated {health_wellness} in {city_state}",
    "Affordable {health_wellness} options near me",
    "Compare reviews for {health_wellness} providers",
    "Top doctors and clinics for {health_wellness}",
    "How much does {health_wellness} cost?",
    # "{city_short} {health_wellness}",
    # "{health_wellness} {city_short}",
    # "{city_state} {health_wellness}",
    # "{health_wellness} {city_state}",
    "{health_wellness}",

    # Car Repair & Automotive Services
    "Find {car_repair_automotive_services} near me",
    "Best {car_repair_automotive_services} providers in {city_state}",
    "Top car repair shops for {car_repair_automotive_services}",
    "Compare prices for {car_repair_automotive_services}",
    "Affordable {car_repair_automotive_services} options near me",
    "Top-rated {car_repair_automotive_services} providers",
    # "{city_short} {car_repair_automotive_services}",
    # "{car_repair_automotive_services} {city_short}",
    # "{city_state} {car_repair_automotive_services}",
    # "{car_repair_automotive_services} {city_state}",
    "{car_repair_automotive_services}",

    # Cleaning Services
    "Find a {cleaning_services} in {city_state}",
    "Affordable {cleaning_services} options near me",
    "Compare reviews for {cleaning_services} providers",
    "Get a {cleaning_services} quote near me",
    "Best {cleaning_services} providers in {city_state}",
    "How much does {cleaning_services} cost?",
    # "{city_short} {cleaning_services}",
    # "{cleaning_services} {city_short}",
    # "{city_state} {cleaning_services}",
    # "{cleaning_services} {city_state}",
    "{cleaning_services}",

    # Entertainment & Activities
    "Best {entertainment_activities} near me",
    "Top-rated {entertainment_activities} in {city_state}",
    "Where to find {entertainment_activities} options in {city_state}?",
    "Affordable {entertainment_activities} activities near me",
    "Compare reviews for {entertainment_activities} venues",
    "Top places for {entertainment_activities} this weekend",
    # "{city_short} {entertainment_activities}",
    # "{entertainment_activities} {city_short}",
    # "{city_state} {entertainment_activities}",
    # "{entertainment_activities} {city_state}",
    "{entertainment_activities}",
    "{entertainment_activities} place",
    "{entertainment_activities} for beginners",

    # Beauty & Personal Care
    "Find {beauty_personal_care} near me",
    "Top-rated {beauty_personal_care} salons in {city_state}",
    "Compare reviews for {beauty_personal_care} providers",
    "Affordable {beauty_personal_care} services near me",
    "Best {beauty_personal_care} options in {city_state}",
    "How much does {beauty_personal_care} cost?",
    # "{city_short} {beauty_personal_care}",
    # "{beauty_personal_care} {city_short}",
    # "{city_state} {beauty_personal_care}",
    # "{beauty_personal_care} {city_state}",
    "{beauty_personal_care}",

    # Specialty Shops & Services
    "Where to find {specialty_shops_services} in {city_state}?",
    "Best reviews for {specialty_shops_services} near me",
    "Affordable {specialty_shops_services} options near me",
    "Compare {specialty_shops_services} providers in {city_state}",
    "How to hire {specialty_shops_services} professionals",
    "Top-rated {specialty_shops_services} in {city_state}",
    # "{city_short} {specialty_shops_services}",
    # "{specialty_shops_services} {city_short}",
    # "{city_state} {specialty_shops_services}",
    # "{specialty_shops_services} {city_state}",
    "{specialty_shops_services}",

    # General Yelp keywords (values come from the downloaded keyword list)
    "{general_yelp_keyword}",
    "{general_yelp_keyword} near me",
    "{general_yelp_keyword} in {city_state}"
]


In [None]:
# Number of active (non-commented) Yelp-intent templates.
len(yelp_intent_additional_templates)

In [None]:
def detect_service(service_categories, template):
    """Return every placeholder key of ``service_categories`` found in ``template``.

    Args:
        service_categories: Mapping whose keys are placeholder tokens such as
            ``"{city_state}"``.
        template: Template string to inspect.

    Returns:
        List of matching keys in mapping iteration order (empty when the
        template contains no known placeholder).
    """
    return [category for category in service_categories if category in template]


def generate_service_queries(service_categories, templates, n_queries=1000, max_attempts=None):
    """Generate up to ``n_queries`` unique queries by filling every known placeholder.

    Each attempt picks a random template and replaces each detected placeholder
    with one randomly chosen value from ``service_categories``. A candidate is
    kept only if it is new and contains no ``"{"`` afterwards, i.e. templates
    with a placeholder that has no entry in ``service_categories`` are dropped.
    Templates without any placeholder are emitted verbatim. Previously they
    were silently never produced, because the append check only ran inside the
    per-placeholder loop.

    Args:
        service_categories: Mapping ``placeholder -> sequence of values``.
        templates: Non-empty sequence of template strings.
        n_queries: Number of distinct queries wanted.
        max_attempts: Upper bound on random draws. Defaults to
            ``max(1000, 100 * n_queries)``. The bound guarantees termination
            when fewer than ``n_queries`` unique queries can be produced.

    Returns:
        List of unique query strings in generation order. It may be shorter
        than ``n_queries`` if ``max_attempts`` is exhausted first; a message is
        printed in that case.
    """
    if max_attempts is None:
        max_attempts = max(1000, 100 * n_queries)

    queries = []
    query_set = set()
    attempts = 0
    while len(queries) < n_queries and attempts < max_attempts:
        attempts += 1
        template = random.choice(templates)
        query = template
        for category in detect_service(service_categories, template):
            service = random.choice(service_categories[category])
            query = query.replace(category, service)
        # Reject unresolved placeholders and duplicates once all substitutions are done.
        if "{" in query or query in query_set:
            continue
        queries.append(query)
        query_set.add(query)
        # Report progress only when a new query was actually added.
        if len(queries) % 1000 == 0:
            print(f"{len(queries)} examples added")

    if len(queries) < n_queries:
        print(
            f"Stopped after {attempts} attempts with {len(queries)} unique queries "
            f"(requested {n_queries})"
        )
    return queries

In [None]:
# Placeholder tokens used in the Yelp-intent templates; each is the literal
# "{...}" marker and serves as a key of `service_categories`.
HOME_MAINTENANCE_REPAIR = "{home_maintenance_repair}"
MOVING_STORAGE = "{moving_storage}"
RESTAURANT_FOOD = "{restaurants_food}"
HEALTH_WELLNESS = "{health_wellness}"
CAR_REPAIR_AUTOMOTIVE_SERVICES = "{car_repair_automotive_services}"
CLEANING_SERVICES = "{cleaning_services}"
ENTERTAINMENT_ACTIVITIES = "{entertainment_activities}"
BEAUTY_PERSONAL_CARE = "{beauty_personal_care}"
SPECIALITY_SHOPS_SERVICES = "{specialty_shops_services}"
CITY_STATES = "{city_state}"
CITY_SHORT = "{city_short}"
GENERAL_YELP_KEYWORD = "{general_yelp_keyword}"


# Maps each Yelp-template placeholder to its pool of substitution values.
# NOTE: `city_state` must be the list of "City, State" strings built from the
# sampling counter, not the single sampled string assigned earlier under the
# same name; otherwise `random.choice` would pick individual characters.
service_categories = {
    HOME_MAINTENANCE_REPAIR: home_maintenance_repair,
    MOVING_STORAGE: moving_storage,
    RESTAURANT_FOOD: restaurants_food,
    HEALTH_WELLNESS: health_wellness,
    CAR_REPAIR_AUTOMOTIVE_SERVICES: car_repair_automotive_services,
    CLEANING_SERVICES: cleaning_services,
    ENTERTAINMENT_ACTIVITIES: entertainment_activities,
    BEAUTY_PERSONAL_CARE: beauty_personal_care,
    SPECIALITY_SHOPS_SERVICES: specialty_shops_services,
    CITY_STATES: city_state,
    CITY_SHORT: city_short,
    GENERAL_YELP_KEYWORD: general_yelp_keyword,
}


In [None]:
# Generate 5,000 distinct Yelp-intent queries (15000 is noted as an alternative size) and report the count.
yelp_intent_additional_queries = generate_service_queries(service_categories, yelp_intent_additional_templates, n_queries=5000) #15000
print(len(yelp_intent_additional_queries))

In [None]:
# One row per generated query, all labelled with the 'yelp_intent' target.
yelp_intent_additional_queries_df = pd.DataFrame({'sequence': yelp_intent_additional_queries}).assign(
    target='yelp_intent'
)
yelp_intent_additional_queries_df

#### Navigation intent additional queries

In [None]:
# Query templates for the navigation intent. "{name}" placeholders are meant to
# be replaced with values from a list of the same name.
# NOTE(review): {business}, {store}, {movie}, {course}, {tech_company}, {topic}
# and {task} have no matching list in the cells reviewed here — confirm they
# are defined elsewhere, otherwise templates using them cannot be fully filled.
# NOTE(review): a few templates contain no placeholder at all (e.g. the post
# office / passport / IRS ones) — confirm the generator emits them as intended.
navigation_intent_templates = [
    # Routing Numbers & Bank Information
    "routing number for {bank}",
    "address of {bank}",
    "what is the routing number for {bank}",
    "verify routing number for {bank}",
    "contact number for {bank} customer service",
    "find routing number of {bank} in {location}",
    "routing number for {credit_union}",

    # Company & Service Support
    "support number for {service}",
    "how to contact {service} support",
    "customer support number for {service}",
    "cancel {service} account",
    "what is the {service} customer care number",
    "fax number for {service}",
    "call {service} customer service",

    # Login or Account Information
    "login to {service} account",
    "how to login to {service} on my computer",
    "account management for {service}",
    "reset password for {service}",
    "forgot login details for {service}",
    "how to access {service} account",

    # Addresses & Locations
    "address for {location}",
    "find address of {business} in {location}",
    "location of {business}",
    "where is {place} located",
    "directions to {place}",
    "address of {store} in {location}",

    # Cancellation or Service Changes
    "cancel {service} subscription",
    "change address for {service}",
    "cancellation fee for {service}",
    "cancellation policy for {service}",
    "how to cancel {service} account",
    "cancel {service} membership",

    "features of {product}",

    # TV Shows, Movies, and Streaming Services
    "is {show} on {streaming_service}?",
    "is {movie} available on {platform}?",
    "does {device} support {service}?",
    "can I watch {show} on {device}?",
    "how to stream {show} on {platform}",
    "is {show} canceled?",

    # Educational Resources & Information
    "tuition fee for {university} in 2024",
    "how to apply for {course} on {learning_platform}",
    "contact {university} admissions office",
    "academic calendar for {university}",
    "what is the login for {learning_platform}?",

    # Shipping & Tracking
    "track my package on {shipping_service}",
    "shipping cost for {product} on {platform}",
    "what is the tracking number for {courier}?",
    "how to track {courier} delivery?",
    "where is my {shipping_service} package?",

    # Government Services & Documents
    "how to renew my driver’s license with {state_dmv}",
    "where is the closest post office?",
    "how to apply for a passport in the US",
    "IRS contact number for tax queries",
    "how to change address with {state_dmv}",

    # Finance & Banking
    "how to increase my credit limit with {bank}",
    "where to find {bank} ATM near me",
    "credit score needed for {credit_card}",
    # "what are the benefits of {credit_card}?",
    "how to apply for a mortgage with {bank}",

    # Tech Support & Troubleshooting
    "how to fix {device} screen issue",
    "support number for {tech_company}",
    "how to update {software} on {device}",
    "what to do if {device} won’t start?",
    "reset password for {account} on {device}",

    # Employment & Career
    "find job openings at {company}",
    "what are the job duties for {position}?",
    "how to apply for {job_role} at {company}",
    "contact HR at {company}",
    "career advice for {industry}",

    # Public Services & Utilities
    "pay my electricity bill with {utility_company}",
    "find waste management services near me",
    "report a power outage with {utility_company}",
    "how to sign up for {utility_service}?",
    "how to contact {utility_company} support?",

    # # Events & Ticketing
    # "find concert tickets for {artist} on {platform}",
    # "how to book tickets for {event}?",
    # "find the best seats for {concert}",
    # "how to get discounts for {festival} tickets?",
    # "ticket refund policy for {platform}",

    # Email & Account Access
    "login to {email_provider}",
    "access {email_provider} on my computer",
    "forgot password for {login_service}",
    "how to reset password for {login_service} account",
    "access my {email_provider} inbox",
    
    # Government Services
    "how to track my refund on {government_service} website",
    "get support from {government_service} for {topic}",
    "how to check my status with {government_service}",
    "apply for services through {government_service}",
    
    # Financial Services & Bank Support
    "login to {financial_service} account",
    "how to check balance on {financial_service}",
    "support number for {financial_service} customer service",
    "pay my bill with {financial_service}",
    
    # Software & Device Support
    "fix {device} issues with {support_service} support",
    "how to troubleshoot {software} problems",
    "download {software} for {device}",
    "check updates for {software}",
    
    # Other Services & General Queries
    "how to download {software} for {task}",
    "install {software} on {device}",
    "find customer support number for {support_service}",
    "how to change account details for {login_service}",

    # General Navigation Queries
    "how do I sign in to {domain}",
    "login to {domain} account",
    "where is the sign-in page on {domain}",
    "reset my password on {domain}",
    "authenticate my account on {domain}",
    "how to sign up for {domain} account",

    # Registration & Account Creation
    "create an account on {domain}",
    "how to register on {domain}",
    "where can I sign up for {domain}",
    "register for a new account on {domain}",
    "sign up for {domain} services",
    
    # Login, Sign-in, Authentication
    "how do I log into {domain}",
    "sign into {domain} with email",
    "can I sign in to {domain} with my phone number",
    "how do I recover my password on {domain}",
    "log out of {domain} account",
    
    # Forms & Document Submission
    "where to submit forms on {domain}",
    "download forms from {domain}",
    "upload documents to {domain}",
    "how do I submit a form on {domain}",
    "find registration forms on {domain}",
    
    # Contact & Customer Support
    "how do I contact support on {domain}",
    "where is the customer service number on {domain}",
    "how do I get help on {domain}",
    "contact support on {domain} for issues",
    "find contact info on {domain}",
    
    # Tracking & Status Updates
    "track my package on {domain}",
    "check my order status on {domain}",
    "how do I track a shipment on {domain}",
    "where is the tracking page on {domain}",
    "track delivery updates on {domain}",

    # Bare domain + path / keyword forms, as typed directly into the address bar
    "{domain}/jobs",
    "{domain}/careers",
    "{domain}/login",
    "{domain}/signin",
    "{domain}/sign in",
    "{domain} jobs",
    "{domain} careers",
    "{domain} login",
    "{domain} signin",
    "{domain} sign in",
]


In [None]:
# Number of active navigation-intent templates.
len(navigation_intent_templates)

In [None]:
# Value pools for the navigation-intent templates: each list is named after the
# "{placeholder}" it is meant to fill.
# NOTE(review): `platform`, `event`, `festival` and `artist` are also the names
# referenced by `product_categories`; re-running that cell after this one would
# make it pick up the lists assigned here — confirm that is intended.

# Retail banks.
bank = [
    'Wells Fargo', 'Bank of America', 'Chase', 'TD Bank', 'PNC',
    'Citibank', 'US Bank', 'Capital One', 'HSBC', 'Fifth Third Bank',
    'Regions Bank', 'Ally Bank', 'SunTrust', 'KeyBank', 'M&T Bank',
]

# Credit card products.
credit_card = [
    'Chase Sapphire Preferred', 'Capital One Venture Rewards', 'American Express Platinum', 
    'Citi Double Cash', 'Discover It Cash Back', 
    'Wells Fargo Active Cash', 'Bank of America Travel Rewards', 
    'Chase Freedom Unlimited', 'Capital One Quicksilver', 'U.S. Bank Visa Platinum',
    'American Express Gold', 'Citi Premier Card', 'Discover It Miles',
    'Barclays AAdvantage Aviator Red', 'Amazon Prime Rewards Visa Signature',
    'Delta SkyMiles Platinum American Express', 'Hilton Honors American Express Surpass',
    'Southwest Rapid Rewards Plus', 'Marriott Bonvoy Boundless', 'United Explorer Card',
]

# US cities used as locations.
location = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Miami',
    'Dallas', 'San Francisco', 'Atlanta', 'Seattle', 'Boston',
    'Phoenix', 'Orlando', 'Philadelphia', 'Denver', 'Las Vegas',
]

# Credit unions.
credit_union = [
    'Navy Federal Credit Union', 'Alliant Credit Union', 'Golden 1 Credit Union',
    'First Tech Federal Credit Union', 'America First Credit Union', 'Pentagon Federal Credit Union',
    'San Diego County Credit Union', 'Suncoast Credit Union', 'BECU', 'Teachers Federal Credit Union',
    'Keesler Federal Credit Union', 'Valley First Credit Union', 'River Region Credit Union',
    'Champion Credit Union', 'Mountain America Credit Union',
]

# Online services / subscriptions.
service = [
    'Netflix', 'Spotify', 'Amazon Prime', 'Google Drive', 'Uber',
    'Disney+', 'YouTube Premium', 'Dropbox', 'Zoom', 'Venmo',
    'Lyft', 'Twitch', 'Slack', 'LinkedIn', 'DoorDash',
]

# Consumer products.
product = [
    'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'AirPods',
    'Sony TV', 'Apple Watch', 'Bose headphones', 'Canon DSLR', 'GoPro',
    'Microsoft Surface', 'Google Pixel', 'Fitbit', 'Nintendo Switch', 'Xbox Series X',
]

# Retail platforms / stores.
platform = [
    'Amazon', 'eBay', 'Walmart', 'Best Buy', 'Target',
    'Apple Store', 'Google Store', 'Newegg', 'B&H', 'Costco',
]

# Video streaming services.
streaming_service = [
    'Netflix', 'Hulu', 'Amazon Prime', 'Disney+', 'HBO Max',
    'Apple TV+', 'Peacock', 'Paramount+', 'YouTube TV', 'Sling TV',
]

# TV shows.
show = [
    'Breaking Bad', 'Stranger Things', 'Game of Thrones', 'Friends', 'The Office',
    'The Mandalorian', 'The Crown', 'WandaVision', 'Loki', 'The Boys',
]

# Online learning platforms.
learning_platform = [
    'Coursera', 'Udemy', 'edX', 'Khan Academy', 'LinkedIn Learning',
    'Pluralsight', 'Skillshare', 'Codecademy', 'Udacity', 'FutureLearn',
]

# Shipping services.
shipping_service = [
    'FedEx', 'UPS', 'USPS', 'DHL', 'Amazon Logistics',
    'Yanwen', 'Aramex', 'Canada Post', 'Royal Mail', 'Hermes',
]

# Couriers (largely overlaps with shipping_service).
courier = [
    'FedEx', 'UPS', 'DHL', 'USPS', 'Aramex',
    'Yanwen', 'Canada Post', 'Royal Mail', 'Hermes', 'TNT',
]

# Universities.
university = [
    'Harvard University', 'Stanford University', 'Massachusetts Institute of Technology', 
    'University of California, Berkeley', 'Princeton University',
    'Yale University', 'Columbia University', 'University of Chicago', 'New York University', 'University of Michigan',
]

# State motor-vehicle agencies.
state_dmv = [
    'California DMV', 'New York DMV', 'Texas DMV', 'Florida DMV', 'Illinois DMV',
    'Pennsylvania DMV', 'Ohio BMV', 'Georgia DDS', 'Virginia DMV', 'New Jersey MVC',
]

# Utility companies.
utility_company = [
    'Pacific Gas & Electric', 'Duke Energy', 'Con Edison', 'Southern California Edison', 'National Grid',
    'Xcel Energy', 'Florida Power & Light', 'PSEG', 'Dominion Energy', 'Consumers Energy',
]

# Utility service types.
utility_service = [
    'electricity', 'water supply', 'natural gas', 'internet', 'cable TV',
    'trash collection', 'sewage service', 'recycling pickup', 'phone service',
    'solar power', 'wind energy', 'fiber internet', 'home security system', 
    'smart meter installation', 'smart thermostat installation',
    'geothermal heating', 'propane service', 'stormwater management',
    'emergency power backup', 'district heating',
]

# Events.
event = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Comic-Con', 'The Oscars',
    'Super Bowl', 'World Series', 'NBA Finals', 'Wimbledon', 'Grammy Awards',
]

# Companies (as employers).
company = [
    'Google', 'Apple', 'Microsoft', 'Facebook', 'Amazon',
    'Tesla', 'Twitter', 'Netflix', 'Airbnb', 'Spotify',
]

# Devices.
device = [
    'iPhone', 'MacBook', 'Samsung Galaxy', 'iPad', 'PlayStation 5',
    'Xbox Series X', 'Apple Watch', 'Fitbit', 'Surface Pro', 'Nintendo Switch',
]

# Festivals.
festival = [
    'Coachella', 'Lollapalooza', 'Burning Man', 'Tomorrowland', 'SXSW',
    'Glastonbury', 'Oktoberfest', 'Mardi Gras', 'Cannes Film Festival', 'Sundance Film Festival',
    'Ultra Music Festival', 'New Orleans Jazz & Heritage Festival', 'Austin City Limits', 'Bonnaroo', 'Electric Daisy Carnival',
    'Stagecoach', 'Summerfest', 'Essence Festival', 'Rock in Rio', 'Woodstock',
]

# Music artists.
artist = [
    'Taylor Swift', 'Beyoncé', 'Ed Sheeran', 'Drake', 'Ariana Grande',
    'Billie Eilish', 'The Weeknd', 'Justin Bieber', 'Kanye West', 'Rihanna',
    'Bruno Mars', 'Shawn Mendes', 'Dua Lipa', 'Travis Scott', 'Lady Gaga',
    'Post Malone', 'Harry Styles', 'Adele', 'Coldplay', 'Imagine Dragons',
]

# Job roles (used with "how to apply for {job_role} at {company}").
job_role = [
    'Software Engineer', 'Data Scientist', 'Marketing Manager', 'Graphic Designer', 'Project Manager',
    'Sales Representative', 'Accountant', 'Nurse', 'Mechanical Engineer', 'Product Manager',
    'Business Analyst', 'Consultant', 'UX/UI Designer', 'Customer Support Specialist', 'Operations Manager',
    'Human Resources Manager', 'Financial Analyst', 'Social Media Manager', 'Content Writer', 'DevOps Engineer',
]

# Positions (used with "what are the job duties for {position}?").
position = [
    'Software Developer', 'Senior Manager', 'Account Executive', 'Nurse Practitioner', 'Mechanical Technician',
    'Business Consultant', 'Marketing Director', 'Sales Engineer', 'Systems Analyst', 'Financial Consultant',
    'HR Specialist', 'Executive Assistant', 'Data Engineer', 'Legal Advisor', 'Product Owner',
    'Operations Director', 'IT Administrator', 'Brand Manager', 'Customer Service Representative', 'Medical Assistant',
]

# Industries.
industry = [
    'technology', 'finance', 'healthcare', 'manufacturing', 'education',
    'real estate', 'marketing', 'media', 'retail', 'automotive',
    'hospitality', 'construction', 'pharmaceutical', 'telecommunications', 'energy',
    'transportation', 'insurance', 'consulting', 'legal', 'entertainment',
]

# Account types.
account = [
    'Google account', 'Facebook account', 'Apple account', 'Amazon account', 'Netflix account',
    'Spotify account', 'Microsoft account', 'Instagram account', 'Twitter account', 'Uber account',
    'Dropbox account', 'LinkedIn account', 'Slack account', 'Zoom account', 'Venmo account',
    'PayPal account', 'eBay account', 'Airbnb account', 'Twitch account', 'Pinterest account',
]

# Software products.
software = [
    'Windows 10', 'macOS', 'Microsoft Office', 'Adobe Photoshop', 'Slack',
    'Zoom', 'Google Chrome', 'Firefox', 'Visual Studio Code', 'Python',
    'Java', 'Salesforce', 'WordPress', 'AutoCAD', 'Tableau',
    'SQL Server', 'GitHub', 'IntelliJ IDEA', 'Figma', 'Trello',
]

# Landmarks / places.
place = [
    'Disneyland', 'Eiffel Tower', 'Statue of Liberty', 'The Grand Canyon', 'The Colosseum',
    'Empire State Building', 'Golden Gate Bridge', 'Mount Rushmore', 'Niagara Falls', 'The Louvre',
    'Big Ben', 'The Vatican', 'Great Wall of China', 'Times Square', 'Central Park',
    'Sydney Opera House', 'Stonehenge', 'Machu Picchu', 'Christ the Redeemer', 'The Pyramids of Giza',
]

# Email providers and services users log in to.
email_provider = ['Gmail', 'Yahoo Mail', 'Outlook', 'iCloud', 'ProtonMail',]
login_service = [
    'Netflix', 'Amazon', 'Spotify', 'Facebook', 'Instagram', 'PayPal', 'Gmail',
    'LinkedIn', 'Twitter', 'Zoom', 'Dropbox', 'Uber', 'Venmo',
]
# Government agencies, financial services and device-support vendors.
government_service = ['IRS', 'DMV', 'SSA', 'FBI', 'DHS', 'CDC']
financial_service = [
    'Bank of America', 'Wells Fargo', 'Chase', 'Citibank', 'Capital One',
    'Discover', 'American Express', 'PayPal', 'Venmo',
]
support_service = ['Dell', 'Apple', 'Samsung', 'HP', 'Lenovo', 'Microsoft',]

# Website domains substituted for the ``{domain}`` placeholder.
# Domain names are case-insensitive and every other entry here is lowercase,
# so the two formerly mixed-case entries (livingsocial.com, bananarepublic.com)
# are stored lowercase as well.
domain = [
    # Popular sites
    'google.com', 'facebook.com', 'amazon.com', 'youtube.com', 'wikipedia.org',
    'twitter.com', 'reddit.com', 'netflix.com', 'ebay.com', 'linkedin.com',
    'pinterest.com', 'instagram.com', 'craigslist.org', 'yahoo.com', 'hulu.com',

    # News & Media
    'espn.com', 'foxnews.com', 'cnn.com', 'nytimes.com', 'washingtonpost.com',
    'bbc.com', 'msnbc.com', 'theguardian.com', 'buzzfeednews.com', 'nbcnews.com',

    # Shopping & E-commerce
    'walmart.com', 'apple.com', 'target.com', 'costco.com', 'bestbuy.com',
    'homedepot.com', 'lowes.com', 'etsy.com', 'kohls.com', 'macys.com',

    # Government Services
    'irs.gov', 'dmv.org', 'ssa.gov', 'healthcare.gov', 'fbi.gov',
    'usps.com', 'medicaid.gov', 'va.gov', 'uscis.gov', 'cdc.gov',

    # Entertainment & Streaming
    'spotify.com', 'disneyplus.com', 'peacocktv.com', 'hbomax.com', 'paramountplus.com',
    'twitch.tv', 'sling.com', 'primevideo.com', 'tv.apple.com',

    # Travel & Booking
    'expedia.com', 'tripadvisor.com', 'booking.com', 'airbnb.com',
    'priceline.com', 'southwest.com', 'aa.com', 'delta.com',

    # Financial Services & Payments
    'paypal.com', 'venmo.com', 'chase.com', 'bankofamerica.com', 'wellsfargo.com',
    'capitalone.com', 'americanexpress.com', 'discover.com', 'stripe.com',

    # Utility Services
    'comcast.com', 'xfinity.com', 'att.com', 'verizon.com', 'spectrum.com',
    'duke-energy.com', 'coned.com', 'pseg.com', 'nationalgridus.com', 'fpl.com',

    # Health & Fitness
    'webmd.com', 'myfitnesspal.com', 'mayoclinic.org', 'healthline.com',
    'bcbs.com', 'uhc.com', 'walgreens.com', 'cvs.com',

    ## Additional domains
    'tiktok.com', 'whatsapp.com', 'messenger.com', 'snapchat.com', 'slack.com',
    'forbes.com', 'bloomberg.com', 'reuters.com', 'usatoday.com', 'aljazeera.com',
    'newegg.com', 'wayfair.com', 'zillow.com', 'chewy.com', 'sephora.com',
    'coursera.org', 'udemy.com', 'khanacademy.org', 'edx.org', 'duolingo.com',
    'nih.gov', 'clevelandclinic.org', 'robinhood.com', 'sofi.com', 'dropbox.com',
    'weebly.com', 'shopify.com', 'wordpress.com', 'turbotax.com', 'creditkarma.com',
    'intuit.com', 'geico.com', 'progressive.com', 'statefarm.com', 'allstate.com',
    'esurance.com', 'pnc.com', 'td.com', 'citibank.com', 'suntrust.com',
    'huntington.com', 'ally.com', 'navyfed.org', 'fidelity.com', 'vanguard.com',
    'etrade.com', 'schwab.com', 'ameritrade.com', 'coinmarketcap.com', 'yelp.com',
    'opentable.com', 'groupon.com', 'livingsocial.com', 'kayak.com', 'hotels.com',
    'orbitz.com', 'cheapoair.com', 'travelocity.com', 'skyscanner.com', 'jetblue.com',
    'alaskaair.com', 'spirit.com', 'nordstrom.com', 'gap.com', 'oldnavy.com',
    'bananarepublic.com', 'hottopic.com', 'uniqlo.com', 'jcpenney.com', 'sears.com',
    'footlocker.com', 'victoriassecret.com', 'adidas.com', 'nike.com', 'underarmour.com',
]


In [None]:
# Show how many navigation-intent templates are available.
len(navigation_intent_templates)

In [None]:
# Placeholder tokens for the navigation-intent templates. Each constant holds
# the literal ``{name}`` text used as a key of ``navigation_categories``.
BANK_NAVIGATION = "{bank}"
CREDIT_CARD_NAVIGATION = "{credit_card}"
LOCATION_NAVIGATION = "{location}"
CREDIT_UNION_NAVIGATION = "{credit_union}"
SERVICE_NAVIGATION = "{service}"
PRODUCT_NAVIGATION = "{product}"
PLATFORM_NAVIGATION = "{platform}"
STREAMING_SERVICE_NAVIGATION = "{streaming_service}"
SHOW_NAVIGATION = "{show}"
LEARNING_PLATFORM_NAVIGATION = "{learning_platform}"
SHIPPING_SERVICE_NAVIGATION = "{shipping_service}"
COURIER_NAVIGATION = "{courier}"
UNIVERSITY_NAVIGATION = "{university}"
STATE_DMV_NAVIGATION = "{state_dmv}"
UTILITY_COMPANY_NAVIGATION = "{utility_company}"
UTILITY_SERVICE_NAVIGATION = "{utility_service}"
EVENT_NAVIGATION = "{event}"
COMPANY_NAVIGATION = "{company}"
DEVICE_NAVIGATION = "{device}"
FESTIVAL_NAVIGATION = "{festival}"
ARTIST_NAVIGATION = "{artist}"
JOBROLE_NAVIGATION = "{job_role}"
POSITION_NAVIGATION = "{position}"
INDUSTRY_NAVIGATION = "{industry}"
ACCOUNT_NAVIGATION = "{account}"
SOFTWARE_NAVIGATION = "{software}"
PLACE_NAVIGATION = "{place}"
EMAIL_PROVIDER_NAVIGATION = "{email_provider}"
LOGIN_SERVICE_NAVIGATION = "{login_service}"
GOVERNMENT_SERVICE_NAVIGATION = "{government_service}"
FINANCIAL_SERVICE_NAVIGATION = "{financial_service}"
SUPPORT_SERVICE_NAVIGATION = "{support_service}"
DOMAIN_NAVIGATION = "{domain}"

# Backward-compatible aliases for the two constant names that used to be
# misspelled, so any cell still referring to them keeps working.
GOVERNMENT_SRVICE_NAVIGATION = GOVERNMENT_SERVICE_NAVIGATION
SUPPORT_SREVICE_NAVIGATION = SUPPORT_SERVICE_NAVIGATION

# Placeholder token -> list of candidate values; passed to
# ``generate_service_queries`` together with the navigation templates.
navigation_categories = {
    BANK_NAVIGATION: bank,
    CREDIT_CARD_NAVIGATION: credit_card,
    LOCATION_NAVIGATION: location,
    CREDIT_UNION_NAVIGATION: credit_union,
    SERVICE_NAVIGATION: service,
    PRODUCT_NAVIGATION: product,
    PLATFORM_NAVIGATION: platform,
    STREAMING_SERVICE_NAVIGATION: streaming_service,
    SHOW_NAVIGATION: show,
    LEARNING_PLATFORM_NAVIGATION: learning_platform,
    SHIPPING_SERVICE_NAVIGATION: shipping_service,
    COURIER_NAVIGATION: courier,
    UNIVERSITY_NAVIGATION: university,
    STATE_DMV_NAVIGATION: state_dmv,
    UTILITY_COMPANY_NAVIGATION: utility_company,
    UTILITY_SERVICE_NAVIGATION: utility_service,
    EVENT_NAVIGATION: event,
    COMPANY_NAVIGATION: company,
    DEVICE_NAVIGATION: device,
    FESTIVAL_NAVIGATION: festival,
    ARTIST_NAVIGATION: artist,
    JOBROLE_NAVIGATION: job_role,
    POSITION_NAVIGATION: position,
    INDUSTRY_NAVIGATION: industry,
    ACCOUNT_NAVIGATION: account,
    SOFTWARE_NAVIGATION: software,
    PLACE_NAVIGATION: place,
    EMAIL_PROVIDER_NAVIGATION: email_provider,
    LOGIN_SERVICE_NAVIGATION: login_service,
    GOVERNMENT_SERVICE_NAVIGATION: government_service,
    FINANCIAL_SERVICE_NAVIGATION: financial_service,
    SUPPORT_SERVICE_NAVIGATION: support_service,
    DOMAIN_NAVIGATION: domain,
}



In [None]:
# Generate synthetic navigation-intent queries from the templates and
# placeholder values, then report how many were produced.
navigation_intent_additional_queries = generate_service_queries(
    navigation_categories,
    navigation_intent_templates,
    n_queries=9000,
)
print(len(navigation_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the navigation intent.
navigation_intent_additional_queries_df = pd.DataFrame(
    navigation_intent_additional_queries,
    columns=['sequence'],
).assign(target='navigation_intent')
navigation_intent_additional_queries_df

#### Travel intent additional queries generation

In [None]:
# Query templates for the travel intent. ``{name}`` tokens are placeholders.
# Exact duplicate templates ("{country} travel", "{destination} hotels") are
# listed only once so that no template is over-represented.
travel_intent_templates = [
    # Visa Information & Requirements
    "What is the US cost for {country} visitor visa?",
    "Do I need a visa to visit {country}?",
    "How long can I stay in {country} with a visa?",
    "Requirements for {country} tourist visa",
    "What is the visa fee for {country} visitors?",
    "Can I extend my visa stay in {country}?",
    "What documents are needed for a {country} work visa?",
    "How to apply for a visa for {country}?",
    "What is the processing time for a {country} visa?",
    "Is a transit visa required for {country}?",

    # Cruise Information & Pricing
    "What are the prices for cruises to {destination}?",
    "Does {cruise_line} offer {service}?",
    "Which cruise lines sail from {location}?",
    "Best time to book cruises to {destination}",
    "What is the cost of a {cruise_line} cruise to {destination}?",
    "Do cruises from {location} go to {destination}?",
    "What are the cancellation policies for {cruise_line}?",
    "Is there an all-inclusive option for {cruise_line}?",
    "What’s included in a {cruise_line} package?",
    "What are the best-rated {cruise_line} destinations?",

    # Airport & Flight Information
    "What airport is closest to {location}?",
    "What airport code is {airport_code}?",
    "Which airlines travel to {destination}?",
    "What airport is near {tourist_attraction}?",
    "What is the best airport for {city_state}?",
    "Direct flights from {location} to {destination}",
    "What are the best budget airlines to {destination}?",
    "Is there an airport lounge at {airport_code}?",
    "How early should I arrive at {airport_code}?",
    "What are the baggage policies for flights to {destination}?",

    # Best Time to Visit
    "When is the best time to visit {destination}?",
    "What is the best season to visit {destination}?",
    "What month should I visit {tourist_attraction}?",
    "When should I travel to {country} for good weather?",
    "Best time to visit {tourist_destination} in {country}",
    "What are the off-peak months for {destination}?",
    "Is it worth visiting {destination} during winter?",
    "What is the rainy season in {country}?",
    "When can I avoid crowds in {tourist_attraction}?",
    "What is the tourist season for {destination}?",

    # Tourist Attractions & Tours
    "Top tourist attractions in {destination}",
    "Best tours of {destination}",
    "Guided tours to {country}",
    "What are the must-visit attractions in {location}?",
    "What are the most popular tours in {destination}?",
    "What tours are available in {region}?",
    "Are there family-friendly tours in {destination}?",
    "How to book a tour of {tourist_attraction}?",
    "Is {tourist_attraction} open year-round?",
    "What is the admission fee for {tourist_attraction}?",

    # Resorts & Hotels
    "What are the best resorts in {destination}?",
    "Is {resort} all-inclusive?",
    "Does {resort} charge a resort fee?",
    "Where is the nearest resort to {location}?",
    "What resorts in {destination} offer all-inclusive packages?",
    "Are there kid-friendly resorts in {destination}?",
    "Is {resort} pet-friendly?",
    "What are the spa services available at {resort}?",
    "Can I book a suite at {resort}?",
    "Does {resort} offer transportation from {airport_code}?",

    # Weather Information
    "Best weather for visiting {destination}",
    "What is the weather like in {country} during {month}?",
    "What is the average temperature in {destination} in {season}?",
    "How does the weather in {destination} change by season?",
    "What is the weather forecast for {destination} next week?",
    "What is the humidity level in {destination} during {month}?",
    "Is it rainy in {destination} in {season}?",
    "What are the sunniest months in {destination}?",
    "Is it cold in {destination} in {month}?",
    "What’s the UV index in {destination} this time of year?",

    # Travel Costs & Pricing
    "What is the cost of a vacation to {destination}?",
    "How much does it cost to visit {tourist_attraction}?",
    "What is the average cost of a flight to {destination}?",
    "How much do guided tours in {destination} cost?",
    "How much money should I bring for a trip to {country}?",
    "What’s the average hotel rate in {destination}?",
    "How expensive is dining in {destination}?",
    "Are there budget travel options for {destination}?",
    "What’s the cheapest month to travel to {destination}?",
    "Can I travel to {destination} on a low budget?",

    # Passports & Travel Documentation
    "Do I need a passport to travel to {destination}?",
    "What documents are required to visit {country}?",
    "How to apply for a visa to visit {destination}?",
    "Can US citizens travel to {country} without a passport?",
    "Where to apply for a passport to travel to {destination}?",
    "How long is my passport valid for traveling to {country}?",
    "Can I use a digital visa for {country}?",
    "How to renew my passport before traveling?",
    "Are vaccinations required for {country} travel?",
    "Do I need travel insurance to visit {destination}?",

    # Travel Destinations
    "Most visited places in {country}",
    "Top travel destinations in {destination}",
    "What are the top places to visit in {country}?",
    "What are the most popular tourist attractions in {city}?",
    "What are the best destinations in {region} for vacations?",
    "Best adventure travel spots in {region}",
    "Underrated places to visit in {country}",
    "Top beach destinations in {country}",
    "What are the top historical sites in {destination}?",
    "Best romantic getaways in {destination}",

    ## short queries
    # Country or City Searches
    "{country} travel",
    "{destination} flights",
    "{country} visa",
    "{destination} hotels",
    "{country} tourism",
    "{city_state} guide",
    "{region} cruises",

    # Tourist Attractions
    "{tourist_attraction}",
    "visit {tourist_attraction}",
    "explore {tourist_attraction}",

    # Flights and Airports
    "{airport_code} flights",
    "{city_state} flights",
    "{destination} airport",
    "{airport_code} airport",
    "{destination} fares",

    # Travel Essentials ("{country} travel" already appears above)
    "{country} passport",
    "{country} documents",
    "{destination} costs",
    "{country} insurance",

    # Tours and Cruises
    "{destination} tours",
    "{cruise_line} cruise",
    "tours {destination}",
    "cruise {destination}",

    # Resorts and Hotels ("{destination} hotels" already appears above)
    "{resort} stay",
    "{resort} booking",
    "{destination} resort",
    "stay {destination}",

    # Travel Costs and Budgets
    "{destination} prices",
    "{destination} budget",
    "{country} costs",
    "{destination} expense",
    "{country} currency",
]

print(len(travel_intent_templates))

In [None]:
# Value lists for the travel-intent placeholders.

# Countries for ``{country}``.
country = [
    "Australia", "Israel", "Dominican Republic", "Mexico", "Canada", "United Kingdom",
    "France", "Spain", "Italy", "Japan", "Germany", "Brazil",
    "Argentina", "China", "South Korea", "Thailand", "India", "Greece",
    "Egypt", "New Zealand", "Sweden", "Norway", "Portugal", "Switzerland",
    "South Africa", "Iceland", "Russia", "Peru", "Morocco", "Vietnam",
]

# Travel destinations for ``{destination}``.
destination = [
    "Hawaii", "Las Vegas", "Disney World", "Grand Canyon", "New Zealand", "Singapore",
    "Bahamas", "Switzerland", "Ireland", "Rome", "Maui", "Bora Bora",
    "Dubai", "Bali", "Maldives", "Machu Picchu", "Reykjavik", "Iceland",
    "Paris", "London", "Bangkok", "Vienna", "Amsterdam", "Bruges",
    "Santorini", "Phuket", "Cairo", "Cape Town", "Prague", "Sydney",
]

# Cruise operators for ``{cruise_line}``.
cruise_line = [
    "Carnival Cruise", "Royal Caribbean", "Disney Cruise Line",
    "Norwegian Cruise Line", "Celebrity Cruises", "Princess Cruises",
    "Holland America Line", "MSC Cruises", "Viking Cruises",
    "Azamara Club Cruises", "Costa Cruises", "Silversea Cruises",
    "Seabourn Cruise Line", "Oceania Cruises", "Regent Seven Seas Cruises",
]

# US cities for ``{location}``.
location = [
    "Miami", "Los Angeles", "Orlando", "Seattle", "Galveston", "New York City",
    "San Francisco", "Tucson", "Las Vegas", "Phoenix", "Austin", "Boston",
    "Chicago", "Houston", "Denver", "Portland", "Salt Lake City", "Atlanta",
    "Dallas", "Nashville", "Philadelphia", "Baltimore", "Detroit", "Indianapolis",
    "Charlotte",
]

# Landmarks for ``{tourist_attraction}``.
tourist_attraction = [
    "White House", "Niagara Falls", "Yosemite National Park",
    "Tower of London", "Vatican Museum", "Eiffel Tower",
    "Mount Rushmore", "Disneyland", "Air Force Academy",
    "The Colosseum", "Statue of Liberty", "Golden Gate Bridge",
    "Stonehenge", "Machu Picchu", "The Great Wall of China",
    "Taj Mahal", "Petra", "Christ the Redeemer",
    "Angkor Wat", "Sagrada Familia", "Mount Everest",
    "Victoria Falls", "Banff National Park", "Kremlin",
    "Acropolis", "Sydney Opera House", "Buckingham Palace",
    "Temple of the Emerald Buddha", "Grand Bazaar", "Meiji Shrine",
]

# Three-letter airport codes for ``{airport_code}``; no code contains
# whitespace, so splitting the string reproduces the list exactly.
airport_code = (
    "JFK LAX IAD ORD ATL MCO PHL SFO SEA PHX "
    "DFW MIA DEN BOS DTW LGA CLT MSP FLL LAS "
    "IAH HNL SAN BWI TPA YVR YYZ DCA CDG FRA"
).split()

# Resort names for ``{resort}``.
resort = [
    "Port Orleans Resort", "Westgate Resort", "Kona Coast Resort", "Bahia Luxury Resort",
    "Elara by Hilton", "Nizuc Resort", "Grand Lakes Resort", "Ashford Castle",
    "Vienna Resort", "Koh Samui Resort", "Four Seasons Resort Maui", "Ritz-Carlton Kapalua",
    "Waldorf Astoria Los Cabos", "Atlantis Paradise Island", "Le Blanc Spa Resort", "Bora Bora Lagoon Resort",
    "Amangiri", "Jade Mountain Resort", "Shangri-La Resort", "Amanpuri",
]

# World regions for ``{region}``.
region = [
    "South East Asia", "Caribbean", "Mediterranean", "Pacific Islands",
    "Western Europe", "East Africa", "Middle East", "South America",
    "Southern Africa", "Western US", "Northern Europe", "Central America",
    "Eastern Europe", "Indian Ocean", "Arctic Circle", "Scandinavia",
    "Baltic States", "North Africa", "Andes Mountains", "French Polynesia",
]

# "City, ST" strings for ``{city_state}``.
city_state = [
    "Washington, DC", "Orlando, FL", "Las Vegas, NV", "San Diego, CA",
    "New York, NY", "Los Angeles, CA", "Miami, FL", "Jacksonville, NC",
    "Galveston, TX", "Williamsburg, VA", "Austin, TX", "Boston, MA",
    "Phoenix, AZ", "Dallas, TX", "Nashville, TN", "San Antonio, TX",
    "San Jose, CA", "Sacramento, CA", "Portland, OR", "St. Louis, MO",
]



In [None]:
# Placeholder tokens used by the travel-intent templates.
COUNTRY_TRAVEL = "{country}"
DESTINATION_TRAVEL = "{destination}"
CRUISE_LINE_TRAVEL = "{cruise_line}"
LOCATION_TRAVEL = "{location}"
TOURIST_ATTRACTION_TRAVEL = "{tourist_attraction}"
AIRPORT_CODE_TRAVEL = "{airport_code}"
RESORT_TRAVEL = "{resort}"
REGION_TRAVEL = "{region}"
CITY_STATE_TRAVEL = "{city_state}"

# Tokens that occur in ``travel_intent_templates`` but previously had no
# entry in ``travel_categories``.
# NOTE(review): ``generate_service_queries`` is defined elsewhere; presumably
# a token without a category entry is left unfilled in the generated query
# (or raises) - confirm against that function.
SERVICE_TRAVEL = "{service}"
TOURIST_DESTINATION_TRAVEL = "{tourist_destination}"
MONTH_TRAVEL = "{month}"
SEASON_TRAVEL = "{season}"
CITY_TRAVEL = "{city}"

# Values for ``{service}``, which the templates only use in
# "Does {cruise_line} offer {service}?". Deliberately not named ``service``
# so the list used by the navigation categories is not overwritten.
cruise_service = [
    'Wi-Fi', 'drink packages', 'shore excursions', 'room service',
    'childcare', 'spa treatments', 'specialty dining', 'travel insurance',
]

# Values for ``{month}``.
travel_month = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Values for ``{season}``.
travel_season = ['spring', 'summer', 'fall', 'autumn', 'winter']


# Placeholder token -> list of candidate values.
travel_categories = {
    COUNTRY_TRAVEL: country,
    DESTINATION_TRAVEL: destination,
    CRUISE_LINE_TRAVEL: cruise_line,
    LOCATION_TRAVEL: location,
    TOURIST_ATTRACTION_TRAVEL: tourist_attraction,
    AIRPORT_CODE_TRAVEL: airport_code,
    RESORT_TRAVEL: resort,
    REGION_TRAVEL: region,
    CITY_STATE_TRAVEL: city_state,
    SERVICE_TRAVEL: cruise_service,
    # No dedicated lists exist for these two tokens; reuse the closest ones.
    TOURIST_DESTINATION_TRAVEL: tourist_attraction,
    CITY_TRAVEL: location,
    MONTH_TRAVEL: travel_month,
    SEASON_TRAVEL: travel_season,
}



In [None]:
# Generate synthetic travel-intent queries and report how many were produced.
travel_intent_additional_queries = generate_service_queries(
    travel_categories,
    travel_intent_templates,
    n_queries=5000,
)
print(len(travel_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the travel intent.
travel_intent_additional_queries_df = pd.DataFrame(
    travel_intent_additional_queries,
    columns=['sequence'],
).assign(target='travel_intent')
travel_intent_additional_queries_df

#### Additional examples for Translation intent

In [None]:
# Query templates for the translation intent. ``{word}``, ``{phrase}`` and
# ``{language}`` are placeholders. "What does {word} mean in {language}?" used
# to be listed in two sections; it is kept once so that it is not
# over-represented.
translation_intent_templates = [
    # Basic Translations (Word or Phrase)
    "What is the translation for {word} in {language}?",
    "How do you say {phrase} in {language}?",
    "What does {word} mean in {language}?",
    "Translate {word} to {language}",
    "What is {phrase} in {language}?",
    "Translate {phrase} to {language}",

    # Meaning of Words in a Language
    "What is the meaning of {word} in {language}?",
    "Explain the meaning of {phrase} in {language}",
    "What is the translation of {phrase} in {language}?",
    "How do you express {word} in {language}?",

    # Pronunciations & Spellings
    "How do you pronounce {word} in {language}?",
    "What is the correct spelling of {word} in {language}?",
    "What is the phonetic spelling for {word} in {language}?",
    "How to spell {word} in {language}?",
    "How do you pronounce {phrase} in {language}?",
]


In [None]:
# Value lists for the translation-intent placeholders.

# Entries for ``{word}``.
word = [
    "beautiful", "friend", "hello", "thank you",
    "family", "happy", "love", "music",
    "freedom", "peace", "home", "work",
    "future", "goodbye", "success", "health",
    "school", "truth", "happiness", "strength",
]

# Entries for ``{phrase}``.
phrase = [
    "how are you", "good morning", "I love you", "what’s your name",
    "where is the bathroom", "see you later", "happy birthday", "congratulations",
    "good night", "I miss you", "nice to meet you", "have a great day",
    "thank you very much", "how old are you", "take care", "good afternoon",
    "can you help me", "I don’t understand", "excuse me", "I am sorry",
]

# Target languages for ``{language}``; every name is a single token, so a
# whitespace split reproduces the list exactly.
language = (
    "Spanish French German Japanese Chinese "
    "Russian Italian Portuguese Korean Hindi "
    "Arabic Dutch Greek Hebrew Swedish "
    "Turkish Vietnamese Polish Thai Bengali"
).split()



In [None]:
# Placeholder tokens used by the translation-intent templates.
WORD_TRANSLATE, PHRASE_TRANSLATE, LANGUAGE_TRANSLATE = "{word}", "{phrase}", "{language}"

# Placeholder token -> list of candidate values (same key order as before).
translate_categories = dict(
    zip(
        (WORD_TRANSLATE, PHRASE_TRANSLATE, LANGUAGE_TRANSLATE),
        (word, phrase, language),
    )
)



In [None]:
# Generate synthetic translation-intent queries and report how many were produced.
translate_intent_additional_queries = generate_service_queries(
    translate_categories,
    translation_intent_templates,
    n_queries=2000,
)
print(len(translate_intent_additional_queries))

In [None]:
# One row per generated query, labelled with the translation intent.
translate_intent_additional_queries_df = pd.DataFrame(
    translate_intent_additional_queries,
    columns=['sequence'],
).assign(target='translation_intent')
translate_intent_additional_queries_df

In [None]:
# Templates for the "unknown" class: a single filler word, a pair of filler
# words, or a city-like name.
unknown_intent_templates = ["{unknown1}", "{unknown1} {unknown2}", "{rand_city}"]


In [None]:
# Filler words for ``{unknown1}``.
# NOTE(review): "proposa" looks like a deliberately truncated word - confirm
# it is intentional rather than a typo.
unknown1 = [
    "next",
    "there",
    "proposal",
    "proposa",
    "banana",
    "mango",
    "pineapple",
    "apple",
    "grapes",
    "orange",
]

# Independent shallow copy, so ``{unknown2}`` draws from the same words.
unknown2 = list(unknown1)

# City-like names for ``{rand_city}``.
rand_city = ["Big City", "Silver City", "Golden City", "Mystic City"]

In [None]:
# Placeholder tokens used by the unknown-intent templates.
UNKNOWN1_CATEGORY, UNKNOWN2_CATEGORY, RAND_CITY_CATEGORY = "{unknown1}", "{unknown2}", "{rand_city}"

# Placeholder token -> list of candidate values (same key order as before).
unknown_categories = dict(
    zip(
        (UNKNOWN1_CATEGORY, UNKNOWN2_CATEGORY, RAND_CITY_CATEGORY),
        (unknown1, unknown2, rand_city),
    )
)

In [None]:
# Generate synthetic "unknown" queries and report how many were produced.
unknown_intent_additional_queries = generate_service_queries(
    unknown_categories,
    unknown_intent_templates,
    n_queries=100,
)
print(len(unknown_intent_additional_queries))

In [None]:
# Display the generated "unknown" queries for a manual look.
unknown_intent_additional_queries

In [None]:
# One row per generated query, labelled with the 'unknown' target.
unknown_intent_additional_queries_df = pd.DataFrame(
    unknown_intent_additional_queries,
    columns=['sequence'],
).assign(target='unknown')
unknown_intent_additional_queries_df

#### Adding some Information intent examples

In [None]:
# Movie titles, grouped by the year given in each section comment.
# Every title appears once: the repeated "Frozen II", "Frozen" and
# "The Hunger Games" entries were removed, "The Whale (Wide Release)" was
# dropped because "The Whale" is already listed, and "TÁR (Wide Release)" is
# stored under its plain title.
movie = [
    # 2019 Movies
    "Avengers: Endgame", "The Lion King (2019)", "Frozen II", "Toy Story 4",
    "Star Wars: The Rise of Skywalker", "Joker", "Spider-Man: Far From Home",
    "Captain Marvel", "Aladdin (2019)", "Knives Out", "Us", "Once Upon a Time in Hollywood",
    "1917", "Ford v Ferrari", "It Chapter Two", "Parasite", "Shazam!",
    "How to Train Your Dragon: The Hidden World", "Jumanji: The Next Level", "Little Women",
    "Marriage Story", "Jojo Rabbit", "The Irishman", "Rocketman",
    "John Wick: Chapter 3 – Parabellum", "Glass", "Hustlers", "The Lego Movie 2: The Second Part",
    "Dumbo", "Alita: Battle Angel", "Doctor Sleep", "Ad Astra", "The Lighthouse",
    "Zombieland: Double Tap", "Midsommar", "Good Boys", "A Beautiful Day in the Neighborhood",

    # 2020 Movies
    "Tenet", "Sonic the Hedgehog", "Wonder Woman 1984", "Birds of Prey",
    "The Invisible Man", "Soul", "Onward", "The Croods: A New Age", "Mulan (2020)",
    "Bad Boys for Life", "The Trial of the Chicago 7", "Palm Springs",
    "Hamilton", "Ma Rainey's Black Bottom", "Borat Subsequent Moviefilm",
    "The Old Guard", "Enola Holmes", "The Midnight Sky", "Extraction",
    "The Call of the Wild", "Greyhound", "The Way Back", "Da 5 Bloods",
    "One Night in Miami...", "Sound of Metal", "Promising Young Woman",
    "The Devil All the Time", "News of the World", "Over the Moon",
    "A Shaun the Sheep Movie: Farmageddon", "My Spy", "The Personal History of David Copperfield",
    "The Half of It", "Pieces of a Woman", "The King of Staten Island",
    "The Lovebirds", "The Secret Garden", "Let Him Go", "Kajillionaire", "The Witches (2020)",

    # 2021 Movies
    "Spider-Man: No Way Home", "Shang-Chi and the Legend of the Ten Rings",
    "Black Widow", "Eternals", "Dune (2021)", "The Suicide Squad",
    "Free Guy", "Encanto", "Cruella", "No Time to Die", "The Mitchells vs. the Machines",
    "Luca", "Raya and the Last Dragon", "The Green Knight", "In the Heights",
    "A Quiet Place Part II", "Don't Look Up", "House of Gucci",
    "West Side Story (2021)", "The French Dispatch", "Jungle Cruise",
    "King Richard", "Belfast", "The Last Duel", "CODA", "Tick, Tick... Boom!",
    "Nightmare Alley", "The Power of the Dog", "Venom: Let There Be Carnage",
    "Ghostbusters: Afterlife", "The Forever Purge", "The Eyes of Tammy Faye",
    "Malcolm & Marie", "Spencer", "Antlers", "The Many Saints of Newark",
    "Fear Street Part One: 1994", "The Tomorrow War", "Bo Burnham: Inside",

    # 2022 Movies
    "Top Gun Maverick", "The Batman", "Black Panther: Wakanda Forever",
    "Doctor Strange in the Multiverse of Madness", "Avatar: The Way of Water",
    "Minions: The Rise of Gru", "Jurassic World Dominion", "Thor: Love and Thunder",
    "Everything Everywhere All at Once", "Nope", "The Woman King", "Smile",
    "The Menu", "Turning Red", "Glass Onion: A Knives Out Mystery", "Prey",
    "The Fabelmans", "Puss in Boots: The Last Wish", "Lightyear", "Pinocchio (2022)",
    "The Whale", "All Quiet on the Western Front", "Bullet Train", "Elvis",
    "The Banshees of Inisherin", "Barbarian", "Babylon", "Don't Worry Darling",
    "Amsterdam", "Marcel the Shell with Shoes On", "Hocus Pocus 2", "Bodies Bodies Bodies",
    "Bones and All", "The Northman", "RRR", "Emancipation", "Thirteen Lives",
    "The Adam Project", "Apollo 10½", "The Lost City", "Deep Water",
    "Where the Crawdads Sing", "No Exit", "Scream (2022)", "Women Talking",

    # 2023 Movies
    "Barbie", "Oppenheimer", "Spider-Man: Across the Spider-Verse",
    "Guardians of the Galaxy Vol. 3", "The Super Mario Bros. Movie", "The Little Mermaid (2023)",
    "Mission: Impossible – Dead Reckoning Part One", "Fast X", "John Wick: Chapter 4",
    "The Flash", "Elemental", "Indiana Jones and the Dial of Destiny",
    "Dungeons & Dragons: Honor Among Thieves", "Creed III", "The Marvels",
    "Ant-Man and the Wasp: Quantumania", "Evil Dead Rise", "The Hunger Games: The Ballad of Songbirds and Snakes",
    "Killers of the Flower Moon", "The Equalizer 3", "A Haunting in Venice",
    "Napoleon", "Wish", "The Nun II", "The Boogeyman", "Talk to Me",
    "Blue Beetle", "Teenage Mutant Ninja Turtles: Mutant Mayhem",
    "The Creator", "Transformers: Rise of the Beasts", "Asteroid City",
    "Saw X", "The Exorcist: Believer", "Five Nights at Freddy's",
    "Shazam! Fury of the Gods",
    "Air", "Joy Ride", "The Pale Blue Eye", "Polite Society",
    "Are You There God? It’s Me, Margaret.", "Beau Is Afraid", "Gran Turismo",
    "Past Lives", "Next Goal Wins", "Maestro", "The Holdovers", "Poor Things",
    "The Killer", "TÁR", "Foe", "Saltburn", "Knox Goes Away",
    "Wonka", "Flamin' Hot", "One Piece Film: Red",

    # 2010 Movies
    "Inception", "Toy Story 3", "The Social Network", "Harry Potter and the Deathly Hallows: Part 1",
    "Shutter Island", "Black Swan", "Iron Man 2", "The King's Speech", "Tangled", "Despicable Me",
    "How to Train Your Dragon", "The Twilight Saga: Eclipse", "Alice in Wonderland (2010)",
    "True Grit", "The Fighter", "Kick-Ass", "127 Hours", "Scott Pilgrim vs. The World", "Easy A",
    "The Town", "The Other Guys", "Buried", "The Expendables", "The Book of Eli", "Salt",
    "Clash of the Titans", "Robin Hood (2010)", "Percy Jackson & the Olympians: The Lightning Thief",
    "Tron: Legacy", "The Karate Kid (2010)", "Grown Ups", "Date Night", "Due Date",

    # 2011 Movies
    "Harry Potter and the Deathly Hallows: Part 2", "The Help", "Thor", "Captain America: The First Avenger",
    "The Twilight Saga: Breaking Dawn – Part 1", "The Girl with the Dragon Tattoo (2011)", "Rise of the Planet of the Apes",
    "Bridesmaids", "X-Men: First Class", "Drive", "Moneyball", "War Horse",
    "The Artist", "Midnight in Paris", "Horrible Bosses", "Crazy, Stupid, Love", "The Descendants",
    "Super 8", "Tinker Tailor Soldier Spy", "Rango", "Hugo", "Kung Fu Panda 2", "Cars 2",
    "Fast Five", "The Adjustment Bureau", "Contagion", "Sherlock Holmes: A Game of Shadows", "Real Steel",
    "Paranormal Activity 3", "Puss in Boots", "The Smurfs", "Sucker Punch", "The Tree of Life",

    # 2012 Movies
    "The Avengers", "The Dark Knight Rises", "The Hunger Games", "Skyfall", "The Amazing Spider-Man",
    "The Twilight Saga: Breaking Dawn – Part 2", "Django Unchained", "Life of Pi", "The Hobbit: An Unexpected Journey",
    "Les Misérables", "Brave", "Wreck-It Ralph", "Silver Linings Playbook", "Argo", "Zero Dark Thirty",
    "Prometheus", "21 Jump Street", "Looper", "Magic Mike", "Ted", "Hotel Transylvania", "The Bourne Legacy",
    "Lincoln", "The Master", "Pitch Perfect", "The Perks of Being a Wallflower", "Beasts of the Southern Wild",
    "Flight", "Rise of the Guardians", "Cloud Atlas", "The Cabin in the Woods", "Chronicle",

    # 2013 Movies
    "Frozen", "Iron Man 3", "Despicable Me 2", "The Hunger Games: Catching Fire", "Man of Steel",
    "Gravity", "The Wolf of Wall Street", "American Hustle", "Thor: The Dark World", "The Great Gatsby (2013)",
    "The Hobbit: The Desolation of Smaug", "Monsters University", "12 Years a Slave", "The Conjuring",
    "World War Z", "Pacific Rim", "Captain Phillips", "Now You See Me", "The Heat",
    "Blue Jasmine", "Dallas Buyers Club", "Prisoners", "Saving Mr. Banks", "Her", "Rush", "This Is the End",
    "The Croods", "Elysium", "The Secret Life of Walter Mitty", "Inside Llewyn Davis", "The Wolverine",

    # 2014 Movies
    "Guardians of the Galaxy", "The LEGO Movie", "Captain America: The Winter Soldier", "Interstellar",
    "The Hunger Games: Mockingjay – Part 1", "Gone Girl", "The Hobbit: The Battle of the Five Armies",
    "Big Hero 6", "The Fault in Our Stars", "X-Men: Days of Future Past", "Maleficent", "Divergent",
    "The Grand Budapest Hotel", "How to Train Your Dragon 2", "The Imitation Game", "Birdman",
    "Whiplash", "American Sniper", "The Maze Runner", "Edge of Tomorrow", "Nightcrawler",
    "Unbroken", "The Theory of Everything", "The Equalizer", "Fury", "Godzilla (2014)",
    "22 Jump Street", "The Babadook", "A Most Violent Year", "Selma", "Boyhood",

    # 2015 Movies
    "Star Wars: The Force Awakens", "Avengers: Age of Ultron", "Jurassic World", "Inside Out",
    "The Martian", "Mad Max: Fury Road", "The Revenant", "Furious 7", "The Hunger Games: Mockingjay – Part 2",
    "Cinderella (2015)", "The Peanuts Movie", "Ant-Man", "Minions", "Spectre", "Pitch Perfect 2",
    "Creed", "The Big Short", "Ex Machina", "Room", "Spotlight", "Bridge of Spies", "Sicario",
    "Straight Outta Compton", "The Danish Girl", "Trainwreck", "The Good Dinosaur",
    "Shaun the Sheep Movie", "Spy", "The Man from U.N.C.L.E.", "Paper Towns", "Paddington",

    # 2016 Movies
    "Captain America: Civil War", "Rogue One: A Star Wars Story", "Finding Dory", "Zootopia",
    "The Jungle Book (2016)", "Moana", "Doctor Strange", "Fantastic Beasts and Where to Find Them",
    "Deadpool", "Batman v Superman: Dawn of Justice", "Suicide Squad", "La La Land", "Hacksaw Ridge",
    "Hidden Figures", "Arrival", "Manchester by the Sea", "Lion", "Moonlight", "Hell or High Water",
    "The Nice Guys", "Passengers", "The Secret Life of Pets", "Sing", "Trolls",
    "Kubo and the Two Strings", "10 Cloverfield Lane", "The Legend of Tarzan",
    "The Magnificent Seven (2016)", "The Shallows", "War Dogs", "Deepwater Horizon",

    # 2017 Movies
    "Wonder Woman", "Star Wars: The Last Jedi", "Beauty and the Beast (2017)", "Thor: Ragnarok",
    "Guardians of the Galaxy Vol. 2", "Spider-Man: Homecoming", "Justice League", "It (2017)",
    "Logan", "Coco", "Get Out", "Dunkirk", "The Shape of Water", "Blade Runner 2049", "Lady Bird",
    "Three Billboards Outside Ebbing, Missouri", "Call Me by Your Name", "Baby Driver", "The Disaster Artist",
    "The Post", "Darkest Hour", "I, Tonya", "Phantom Thread", "Paddington 2", "The Greatest Showman",
    "Jumanji: Welcome to the Jungle", "The Lego Batman Movie", "War for the Planet of the Apes",
    "The Boss Baby", "Ferdinand", "Split", "John Wick: Chapter 2", "Atomic Blonde",
]

# Celebrity names mapped to the "{celebrity}" template placeholder via
# INFORMATION_CATEGORIES.
#
# The thematic sections below overlap (for example "Harrison Ford" and
# "Trevor Noah" are listed more than once). dict.fromkeys() drops the repeats
# while keeping first-occurrence order, so len(celebrity) counts distinct names
# and no name gets extra weight just because it appears in several sections.
celebrity = list(dict.fromkeys([
    "Leonardo DiCaprio", "Tom Cruise", "Dwayne Johnson", "Zendaya",
    "Timothée Chalamet", "Florence Pugh", "Margot Robbie", "Chris Hemsworth",
    "Robert Downey Jr.", "Scarlett Johansson", "Tom Holland", "Ryan Reynolds",
    "Gal Gadot", "Pedro Pascal", "Elizabeth Olsen", "Jenna Ortega",
    "Millie Bobby Brown", "Finn Wolfhard", "Anya Taylor-Joy", "Jason Momoa",
    "Chris Evans", "Natalie Portman", "Henry Cavill", "Daniel Radcliffe",
    "Emma Watson", "Rupert Grint", "Michael B. Jordan", "Anne Hathaway",
    "Brad Pitt", "Angelina Jolie", "Keanu Reeves", "Sandra Bullock",
    "Jake Gyllenhaal", "Christian Bale", "Cate Blanchett", "Hugh Jackman",
    "Jennifer Lawrence", "Will Smith", "Jada Pinkett Smith", "Viola Davis",
    "Austin Butler", "Jamie Lee Curtis", "Paul Mescal", "Tobey Maguire",
    "Andrew Garfield", "Harrison Ford", "Helen Mirren", "Brendan Fraser",

    # Classic Hollywood Legends
    "Marlon Brando", "James Dean", "Audrey Hepburn", "Marilyn Monroe",
    "Humphrey Bogart", "Clark Gable", "Bette Davis", "Elizabeth Taylor",
    "Fred Astaire", "Ginger Rogers", "Ingrid Bergman", "Greta Garbo",
    "Katharine Hepburn", "Cary Grant", "Spencer Tracy", "Rita Hayworth",
    "Grace Kelly", "Vivien Leigh", "Judy Garland", "Henry Fonda",
    "Lauren Bacall", "Paul Newman", "Charlton Heston", "Joan Crawford",

    # Modern Hollywood Icons
    "Meryl Streep", "Tom Hanks", "Denzel Washington", "Robert De Niro",
    "Al Pacino", "Jack Nicholson", "Julia Roberts", "Leonardo DiCaprio",
    "Brad Pitt", "Angelina Jolie", "George Clooney", "Cate Blanchett",
    "Johnny Depp", "Tom Cruise", "Sandra Bullock", "Nicole Kidman",
    "Halle Berry", "Harrison Ford", "Sigourney Weaver", "Morgan Freeman",
    "Michelle Pfeiffer", "Dustin Hoffman", "Robin Williams", "Will Smith",

    # Franchise and Action-Adventure Stars
    "Orlando Bloom", "Viggo Mortensen", "Ian McKellen", "Elijah Wood",
    "Sean Astin", "Dominic Monaghan", "Billy Boyd", "Liv Tyler",
    "Hugo Weaving", "Andy Serkis", "Keira Knightley", "Geoffrey Rush",
    "Johnny Depp", "Daniel Radcliffe", "Emma Watson", "Rupert Grint",
    "Helena Bonham Carter", "Ralph Fiennes", "Alan Rickman", "Michael Gambon",
    "Ewan McGregor", "Liam Neeson", "Natalie Portman", "Hayden Christensen",
    "Mark Hamill", "Carrie Fisher", "Harrison Ford", "Daisy Ridley",
    "Adam Driver", "John Boyega", "Oscar Isaac", "Diego Luna",
    "Felicity Jones", "Pedro Pascal", "Chris Hemsworth", "Chris Evans",
    "Scarlett Johansson", "Robert Downey Jr.", "Mark Ruffalo", "Chris Pratt",
    "Tom Holland", "Zendaya", "Benedict Cumberbatch", "Tobey Maguire",
    "Andrew Garfield", "Hugh Jackman", "Patrick Stewart", "Ian McKellen",
    "Ryan Reynolds", "Gal Gadot", "Henry Cavill", "Jason Momoa",
    "Ben Affleck", "Zoe Saldaña", "Dave Bautista", "Karen Gillan",

    # Versatile and Popular Contemporary Actors
    "Christian Bale", "Amy Adams", "Ryan Gosling", "Emma Stone",
    "Anne Hathaway", "Jennifer Lawrence", "Joaquin Phoenix", "Margot Robbie",
    "Adam Driver", "Michael B. Jordan", "Florence Pugh", "Timothée Chalamet",
    "Austin Butler", "Jessica Chastain", "Mahershala Ali", "Viola Davis",
    "Octavia Spencer", "Toni Collette", "Rami Malek", "Lakeith Stanfield",
    "Cillian Murphy", "Matt Damon", "Ben Affleck", "Jeremy Renner",

    # Young Rising Stars
    "Millie Bobby Brown", "Finn Wolfhard", "Sadie Sink", "Noah Schnapp",
    "Anya Taylor-Joy", "Jenna Ortega", "Hunter Schafer", "Hailee Steinfeld",
    "Lucas Hedges", "Elle Fanning", "Dakota Fanning", "Jacob Elordi",
    "Sydney Sweeney", "Joey King", "Sophie Turner", "Maisie Williams",

    # Comedy and Character Actors
    "Steve Carell", "Tina Fey", "Amy Poehler", "Melissa McCarthy",
    "Kristen Wiig", "Seth Rogen", "Will Ferrell", "Paul Rudd",
    "Bill Hader", "Jason Bateman", "Jonah Hill", "Michael Cera",
    "Ken Jeong", "Kevin Hart", "Maya Rudolph", "Chris Rock",

    # Iconic Action and Adventure Stars
    "Dwayne Johnson", "Arnold Schwarzenegger", "Sylvester Stallone",
    "Bruce Willis", "Jason Statham", "Keanu Reeves", "Vin Diesel",
    "Charlize Theron", "Emily Blunt", "John Cena", "Liam Neeson",
    "Daniel Craig", "Idris Elba", "Pierce Brosnan", "Angelina Jolie",
    "Kate Beckinsale", "Milla Jovovich",

    # Supporting Actors and Other Notables
    "John Goodman", "Jeff Goldblum", "J.K. Simmons", "Stanley Tucci",
    "Frances McDormand", "Allison Janney", "Angela Bassett", "Regina King",
    "Jessica Lange", "Bryan Cranston", "Aaron Paul", "Bob Odenkirk",
    "Giancarlo Esposito", "David Harbour", "Winona Ryder",

    # Diverse and Internationally Acclaimed Actors
    "Salma Hayek", "Antonio Banderas", "Diego Luna", "Oscar Isaac",
    "Gael García Bernal", "Eva Longoria", "Jessica Alba",
    "Awkwafina", "Sandra Oh", "Steven Yeun", "Simu Liu",
    "Lucy Liu", "Gemma Chan", "Mindy Kaling", "Ali Wong",
    "Lupita Nyong'o", "Chadwick Boseman", "Daniel Kaluuya", "Letitia Wright",
    "Dev Patel", "Riz Ahmed", "Zazie Beetz", "Mahershala Ali",

    # Sports
    "Lionel Messi", "Cristiano Ronaldo", "Neymar Jr.", "Kylian Mbappé",
    "LeBron James", "Serena Williams", "Roger Federer", "Novak Djokovic",
    "Rafael Nadal", "Simone Biles", "Naomi Osaka", "Stephen Curry",
    "Kevin Durant", "Tom Brady", "Patrick Mahomes", "Virat Kohli",
    "Rohit Sharma", "Shaquille O'Neal", "Tiger Woods", "Lewis Hamilton",
    "Max Verstappen", "Charles Leclerc", "Usain Bolt", "Megan Rapinoe",
    "Alex Morgan", "Katie Ledecky", "Michael Phelps", "Giannis Antetokounmpo",
    "Damian Lillard", "Anthony Davis", "Zlatan Ibrahimović", "Harry Kane",
    "Sadio Mané", "Karim Benzema", "Gareth Bale", "Robert Lewandowski",
    "Erling Haaland", "Venus Williams", "Iga Świątek", "Aryna Sabalenka",

    # Politics and Leaders
    "Joe Biden", "Kamala Harris", "Barack Obama", "Michelle Obama",
    "Donald Trump", "Melania Trump", "Emmanuel Macron", "Olaf Scholz",
    "Volodymyr Zelenskyy", "Rishi Sunak", "Narendra Modi", "Jacinda Ardern",
    "Justin Trudeau", "Xi Jinping", "Vladimir Putin", "Angela Merkel",
    "Elizabeth II", "King Charles III", "Prince William", "Prince Harry",
    "Meghan Markle", "Queen Letizia", "Pope Francis", "Dalai Lama",
    "Greta Thunberg", "Alexandria Ocasio-Cortez", "Bernie Sanders",
    "Nicolas Maduro", "Jair Bolsonaro", "Fumio Kishida", "Yoon Suk-yeol",

    # Business and Technology
    "Elon Musk", "Jeff Bezos", "Mark Zuckerberg", "Bill Gates", "Tim Cook",
    "Sundar Pichai", "Satya Nadella", "Warren Buffett", "Bernard Arnault",
    "Larry Page", "Sergey Brin", "Steve Wozniak", "Reed Hastings", "Susan Wojcicki",
    "Jack Ma", "Daniel Ek", "Evan Spiegel", "Andrew Ng", "Sam Altman",
    "Sheryl Sandberg", "Peter Thiel", "Marc Benioff", "Richard Branson",
    "Oprah Winfrey", "Howard Schultz", "Larry Ellison", "David Baszucki",
    "Parag Agrawal", "Adam Neumann", "Kylie Jenner", "Kim Kardashian",
    "Khloé Kardashian", "Kris Jenner", "Robert Kiyosaki", "Barbara Corcoran",

    # Science and Innovation
    # NOTE(review): "Jennifer Hudson" is listed in this section — confirm this
    # is the intended person for a science/innovation grouping.
    "Jane Goodall", "Neil deGrasse Tyson", "Brian Cox", "Michio Kaku",
    "Katherine Johnson", "Jennifer Doudna", "Emmanuelle Charpentier", "Tim Berners-Lee",
    "Mae Jemison", "Katie Bouman", "Brian Greene", "James Lovelock",
    "Roger Penrose", "Dmitry Muratov", "Frances Arnold", "Venki Ramakrishnan",
    "Paul Nurse", "Elizabeth Blackburn", "Carol Greider", "David Julius",
    "Abhijit Banerjee", "Esther Duflo", "Michael Kremer", "Andrea Ghez",
    "Reinhard Genzel", "Jennifer Hudson", "Ashoke Sen", "Subrahmanyan Chandrasekhar",

    # Others
    "Ellen DeGeneres", "Oprah Winfrey", "Trevor Noah", "Jimmy Fallon",
    "Stephen Colbert", "John Oliver", "James Corden", "Conan O'Brien",
    "Dolly Parton", "Gordon Ramsay", "David Beckham", "Victoria Beckham",
    "RuPaul", "Chris Rock", "Dave Chappelle", "Trevor Noah", "Hasan Minhaj",
    "Ali Wong", "Bo Burnham", "Jo Koy", "Kevin Hart", "Sarah Silverman",
    "Tiffany Haddish", "Joe Rogan", "Logan Paul", "MrBeast", "PewDiePie",
    "Emma Chamberlain", "Charli D'Amelio", "Addison Rae", "Bella Poarch",
]))

# TV show titles mapped to the "{show}" template placeholder via INFORMATION_CATEGORIES.
# NOTE: "The Handmaid’s Tale" is spelled with a typographic apostrophe (U+2019),
# whereas the other name lists in this notebook use the straight ASCII apostrophe.
show = [
    "Breaking Bad", "Stranger Things", "Game of Thrones",
    "Friends", "The Office", "The Mandalorian",
    "The Crown", "WandaVision", "Loki",
    "The Boys", "Better Call Saul", "The Witcher",
    "House of the Dragon", "Severance", "The Last of Us",
    "The White Lotus", "Succession", "Ted Lasso",
    "Squid Game", "The Marvelous Mrs. Maisel", "Euphoria",
    "Ozark", "The Handmaid’s Tale", "Westworld",
    "Only Murders in the Building", "The Umbrella Academy", "Black Mirror",
    "Peaky Blinders", "Sherlock", "Brooklyn Nine-Nine",
    "Parks and Recreation", "Fargo", "Mindhunter",
    "Dark", "Arcane", "The Sandman",
    "Yellowstone", "The Walking Dead", "American Horror Story",
    "The Sopranos", "Mad Men", "Arrested Development",
    "Rick and Morty", "BoJack Horseman", "Fleabag",
]

In [None]:
# Report the size of each entity list, one line per list.
for _label, _entries in (("movie", movie), ("celebrity", celebrity), ("show", show)):
    print(f"len({_label}) = {len(_entries)}")

In [None]:
# Query templates for synthetic "information_intent" examples.
# Each template contains exactly one placeholder ({movie}, {celebrity} or {show});
# the placeholders match the keys of INFORMATION_CATEGORIES.
# Keep a trailing comma after every entry: adjacent string literals without a
# comma are silently concatenated by Python into a single (malformed) template.
information_intent_templates = [
    "{movie}",
    "{movie} cast",
    "{movie} budget",
    "{movie} director",
    "{movie} collection",
    "{celebrity}",
    "{celebrity} age",
    "is {celebrity} married",
    "{celebrity} net worth",
    "{show}",
    "{show} cast",
    "is {show} available on Netflix/Disney+/Amazon Prime?",
    "what is {show} about?",
    "how many seasons of {show}?",
    "who are the main characters in {show}?",
    "{show} reviews",
]


# Placeholder tokens as they appear inside the information-intent templates.
MOVIE_CATEGORY, CELEBRITY_CATEGORY, SHOW_CATEGORY = "{movie}", "{celebrity}", "{show}"

# Placeholder token -> list of candidate values for that placeholder.
INFORMATION_CATEGORIES = dict(
    [
        (MOVIE_CATEGORY, movie),
        (CELEBRITY_CATEGORY, celebrity),
        (SHOW_CATEGORY, show),
    ]
)


In [None]:
# Generate extra information-intent queries from the category lists and templates
# (generate_service_queries is defined earlier in the notebook), then report how
# many were produced.
information_intent_additional_queries = generate_service_queries(
    INFORMATION_CATEGORIES,
    information_intent_templates,
    n_queries=1200,
)
print(len(information_intent_additional_queries))

In [None]:
# Put the generated queries in a single 'sequence' column and label every row
# with the constant target 'information_intent'; the last line displays the frame.
information_intent_additional_queries_df = pd.DataFrame(
    information_intent_additional_queries, columns=["sequence"]
).assign(target="information_intent")
information_intent_additional_queries_df

In [None]:
# len(partial_queries_with_intents)

# yelp_keywords_data = pd.read_json("https://firefox-settings-attachments.cdn.mozilla.net/main-workspace/quicksuggest/33987d71-9e87-4b7e-86d3-6f292b89e8bf.json")['subjects'].values[0]

In [None]:
# Print how many entries yelp_keywords_data holds, then display its first ten.
# NOTE(review): in the visible cells the only assignment to `yelp_keywords_data`
# is the commented-out pd.read_json(...) line in the previous cell — confirm it is
# defined earlier in the notebook, otherwise this cell raises NameError on a
# fresh kernel (Restart & Run All).
print(len(yelp_keywords_data))
yelp_keywords_data[:10]

In [None]:
import random

# Expanded base examples for each intent
information_examples_partial = [
    "capital of", "history of", "current news", "population of", "how to make",
    "meaning of", "invention of", "who discovered", "history of", "what is",
    "symptoms of", "definition of", "why is", "facts about", "signs of", 
    "how does", "global warming", "causes of", "influence of", "effects of",
    "famous quotes", "important events", "founder of", "principles of", 
    "basics of", "impact of", "recent studies", "causes of", "research about",
    "inventions by", "works of", "origin of", "foundation of",
    "world war", "founding fathers", "civil rights", "ancient civilizations", 
    "DNA structure", "gravity", "nobel prize", "space exploration",
    "vaccination history", "internet development", "economic recession", 
    "major religions", "greenhouse gases", "solar system", "industrial revolution", 
    "history of technology", "evolution of species", "brain functions", "famous personalities",
    "meaning of", "definition", "age of", "side effects", "how to ", "explain me",
    "largest country", "oldest language", "famous battles", "biggest animal", 
    "smallest planet", "tallest mountain", "fastest car", "deepest ocean", 
    "heaviest element", "largest desert", "how electricity works", 
    "origin of music", "who is the richest", "what is AI", "how planes fly", 
    "why we dream", "why the sky is blue", "meaning of life", "origin of pizza", 
    "discovery of fire", "who invented the wheel", "benefits of exercise", 
    "causes of obesity", "why we sleep", "origin of the internet", 
    "who built the pyramids", "what is democracy", "founder of apple", 
    "who created bitcoin", "why people yawn", "first computer", 
    "how rain forms", "who invented cars", "why water is wet", 
    "how plants grow", "why the moon shines", "origin of coffee", 
    "who is the president", "what is inflation", "how atoms work", 
    "what is a black hole", "first man on moon", "fastest animal", 
    "oldest tree", "why oceans are salty", "who built the Taj Mahal", 
    "origin of Christmas", "causes of earthquakes", "who discovered penicillin", 
    "what is time", "meaning of happiness", "history of jazz", "largest volcano", 
    "who founded America", "how tides work", "what is evolution", 
    "causes of climate change", "who created Google", "why stars twinkle", 
    "origin of yoga", "what is philosophy", "largest mammal", 
    "how computers work", "meaning of justice", "history of cinema", 
    "famous painters", "when did dinosaurs exist", "how magnets work", 
    "famous philosophers", "largest river", "origin of beer", 
    "who is Cleopatra", "why do we laugh", "how do vaccines work", 
    "founder of Amazon", "what is meditation", "first smartphone", 
    "history of money", "first human", "what is photosynthesis", 
    "biggest galaxy", "origin of tea", "first civilization", 
    "causes of addiction", "largest waterfall", "oldest book", 
    "who is Shakespeare", "how phones work", "founder of Tesla", 
    "largest continent", "origin of chocolate", "what is gravity", 
    "meaning of friendship", "how cells work", "what is quantum physics", 
    "why people lie", "who is Buddha", "origin of math", 
    "what is linguistics", "famous explorers",
    "who is Albert Einstein", "life of Mahatma Gandhi", "legacy of Martin Luther King", 
    "biography of Leonardo da Vinci", "about Cleopatra", "teachings of Confucius", 
    "philosophy of Socrates", "contributions of Marie Curie", "ideas of Karl Marx", 
    "what did Nikola Tesla invent", "about Winston Churchill", "works of Vincent van Gogh", 
    "principles of Sigmund Freud", "achievements of Isaac Newton", "writings of J.K. Rowling", 
    "who is Oprah Winfrey", "about Steve Jobs", "success of Jeff Bezos", 
    "what is relativity", "theory of evolution", "who is Elon Musk", 
    "biography of Pablo Picasso", "life of Bruce Lee", "teachings of Buddha", 
    "works of Ernest Hemingway", "about Charles Darwin", "philosophy of Friedrich Nietzsche", 
    "impact of Beethoven", "life of Shakespeare", "contributions of Thomas Edison", 
    "discoveries of Galileo", "teachings of Jesus", "who is Beyoncé", 
    "about Muhammad Ali", "contributions of Ada Lovelace", "achievements of Michael Jackson", 
    "works of Mark Twain", "discoveries of Alexander Fleming", "who is Rihanna", 
    "about Walt Disney", "inventions of Alexander Graham Bell", "what is the Big Bang theory", 
    "legacy of Mother Teresa", "ideas of Aristotle", "works of Jane Austen", 
    "achievements of Usain Bolt", "who is Michael Jordan", "philosophy of John Locke", 
    "life of Malcolm X", "discoveries of Louis Pasteur", "about Frida Kahlo", 
    "impact of Mozart", "biography of Bob Marley", "contributions of Carl Jung", 
    "who is Stephen Hawking", "legacy of Rosa Parks", "who is Billie Eilish", 
    "writings of Franz Kafka", "philosophy of Jean-Jacques Rousseau", "about Coco Chanel", 
    "works of Michelangelo", "who is Vladimir Putin", "discoveries of Dmitri Mendeleev", 
    "about Salvador Dalí", "theory of Alfred Wegener", "contributions of Alan Turing", 
    "what is existentialism", "about Greta Thunberg", "philosophy of Immanuel Kant", 
    "who is Mahatma Buddha", "ideas of Max Weber", "discoveries of Gregor Mendel", 
    "who is Prince", "impact of The Beatles", "biography of Neil Armstrong", 
    "about Amelia Earhart", "contributions of Enrico Fermi", "discoveries of Henry Ford", 
    "who is Malala Yousafzai", "philosophy of Plato", "works of Andy Warhol", 
    "life of Florence Nightingale", "impact of Jimi Hendrix", "who is Serena Williams", 
    "about Desmond Tutu", "legacy of Malcolm Gladwell", "contributions of James Watson", 
    "achievements of Charles Dickens", "ideas of Noam Chomsky", "biography of Stephen King", 
    "teachings of Mahavira", "about Maya Angelou", "who is Alfred Nobel", 
    "philosophy of Bertrand Russell", "discoveries of Hans Christian Andersen", 
    "legacy of Marie Antoinette", "life of Helen Keller", "who is Ludwig van Beethoven", 
    "about Genghis Khan", "teachings of Laozi",
    "latest news", "who won the game", "movie releases", "upcoming TV shows", 
    "best new movies", "celebrity gossip", "who is dating", "sports highlights", 
    "current stock prices", "what is inflation", "latest trends", "who is the president", 
    "climate change updates", "new technology trends", "upcoming elections", 
    "Olympic results", "famous actors", "blockbuster movies", "NBA scores", 
    "trending music", "celebrity net worth", "economic growth", "who won the Grammy", 
    "what is cryptocurrency", "latest from Hollywood", "box office hits", 
    "who won the World Cup", "new Marvel movie", "political scandals", 
    "Oscars winners", "top Netflix shows", "popular TV series", "current affairs", 
    "who won the award", "crypto news", "popular streaming shows", "best books of the year", 
    "housing market trends", "what is GDP", "who won the election", "new iPhone release", 
    "what is recession", "latest in fashion", "upcoming concerts", "celebrity marriages", 
    "sports schedules", "movie box office", "biggest TV show", "current unemployment rate", 
    "what is TikTok", "latest science news", "trending diets", "popular YouTubers", 
    "what is Metaverse", "who is the governor", "latest in medicine", "NFL scores", 
    "latest political news", "updates on the economy", "best restaurants near me", 
    "new space discoveries", "streaming top charts", "celebrity breakups", "football scores", 
    "Oscars nominations", "recent Nobel winners", "what is AI", "who won the Superbowl", 
    "what is quantum computing", "latest health news", "viral social media trends", 
    "global warming news", "best-selling albums", "political debates", 
    "new vaccine updates", "stock market predictions", "who won the Emmys", 
    "top-rated series", "economic policies", "fashion week highlights", 
    "best reality shows", "latest space missions", "cryptocurrency trends", 
    "sports team rankings", "who won the Golden Globes", "who is the prime minister", 
    "what is net neutrality", "climate change effects", "famous sports players", 
    "who won Wimbledon", "top YouTube videos", "election results", "biggest IPOs", 
    "celebrity endorsements", "new movie trailers", "global economic trends", 
    "biggest tech companies", "influential leaders", "what is NATO", 
    "top tourist destinations", "world leaders summit", "upcoming sporting events",
    "Taylor Swift songs", "Leonardo DiCaprio movies", "Beyoncé albums", 
    "Friends TV show", "Breaking Bad episodes", "Game of Thrones cast", 
    "Marvel Cinematic Universe", "Star Wars movies", "Inception plot", 
    "lyrics to Bohemian Rhapsody", "Stranger Things characters", "The Beatles discography", 
    "Ariana Grande latest album", "Elvis Presley hits", "Black Mirror episodes", 
    "Westworld show", "Lady Gaga songs", "Drake top hits", "Harry Potter movies", 
    "The Godfather actors", "Taylor Swift tour", "Stranger Things season 4", 
    "Dwayne Johnson movies", "Selena Gomez songs", "Friends reunion", 
    "Taylor Swift Eras Tour", "Top Gun Maverick", "The Office episodes", 
    "Rihanna Fenty Beauty", "Adele 30 album", "Star Trek series", 
    "Billie Eilish new album", "Fast and Furious cast", "James Bond movies", 
    "SpongeBob SquarePants characters", "House of the Dragon plot", "Super Bowl halftime show", 
    "Coldplay discography", "Jennifer Aniston roles", "Avatar movie", 
    "The Avengers actors", "Michael Jackson hits", "Narcos series", 
    "Justin Bieber songs", "Taylor Swift lyrics", "The Mandalorian show", 
    "Ed Sheeran albums", "Kanye West controversies", "The Crown Netflix", 
    "BTS Butter lyrics", "Game of Thrones finale", "Spider-Man No Way Home", 
    "Joker movie plot", "Kim Kardashian fashion", "Avengers Endgame cast", 
    "Bridgerton Netflix", "Post Malone songs", "Friends theme song", 
    "Star Wars characters", "Euphoria cast", "Frozen movie", "Ariana Grande discography", 
    "Megan Thee Stallion hits", "Kendrick Lamar albums", "Lizzo music", 
    "The Lion King soundtrack", "Selena Gomez movies", "Will Smith Oscars", 
    "Wonder Woman cast", "Dua Lipa songs", "The Simpsons episodes", 
    "Stranger Things season 5", "Queen band members", "Michael Jordan career", 
    "Madonna songs", "The Twilight Saga movies", "Shawn Mendes latest album", 
    "The Weeknd discography", "Friends Thanksgiving episodes", "Billie Eilish Grammy wins", 
    "The Sopranos cast", "Matrix movies", "Taylor Swift Red album", 
    "Drake collaborations", "The Witcher Netflix", "Peaky Blinders characters", 
    "Bohemian Rhapsody movie", "Adele Hello lyrics", "Rocky Balboa movies", 
    "The Walking Dead cast", "Nirvana band members", "A Star is Born soundtrack", 
    "Tom Hanks movies", "Fleetwood Mac hits", "Meryl Streep roles", 
    "Dune movie cast", "Friends Ross and Rachel", "Miley Cyrus top songs", 
    "Ariana Grande and The Weeknd", "Shrek movies", "Backstreet Boys songs", 
    "The Great Gatsby movie", "The Beatles Yellow Submarine",
    # Entertainment and Music
    "Taylor Swift", "Billie Eilish", "Drake", "Ariana Grande", "Rihanna", 
    "Beyoncé", "Ed Sheeran", "Harry Styles", "Shakira", "Kanye West", 
    "Lady Gaga", "Post Malone", "The Weeknd", "Justin Bieber", "Selena Gomez", 
    "Dua Lipa", "Bruno Mars", "Katy Perry", "Shawn Mendes", "Camila Cabello", 
    "Bad Bunny", "J Balvin", "Olivia Rodrigo", "Lil Nas X", "Doja Cat", 
    "Cardi B", "Lizzo", "Sam Smith", "SZA", "Chris Brown", 
    "BLACKPINK", "BTS", "Jungkook", "Lisa Manoban", "Rosé", 
    "Megan Thee Stallion", "Karol G", "Anitta", "Jack Harlow", "Nicki Minaj", 
    "Kendrick Lamar", "Imagine Dragons", "Coldplay", "John Legend", "Adele", 
    "Miley Cyrus", "Zayn Malik", "Charlie Puth", "Halsey", "Marshmello", 
    "Jared Leto", "Adam Levine", "Snoop Dogg", "Pharrell Williams", "Travis Scott",

    # Film and TV
    "Leonardo DiCaprio", "Tom Cruise", "Dwayne Johnson", "Zendaya", 
    "Timothée Chalamet", "Florence Pugh", "Margot Robbie", "Chris Hemsworth", 
    "Robert Downey Jr.", "Scarlett Johansson", "Tom Holland", "Ryan Reynolds", 
    "Gal Gadot", "Pedro Pascal", "Elizabeth Olsen", "Jenna Ortega", 
    "Millie Bobby Brown", "Finn Wolfhard", "Anya Taylor-Joy", "Jason Momoa", 
    "Chris Evans", "Natalie Portman", "Henry Cavill", "Daniel Radcliffe", 
    "Emma Watson", "Rupert Grint", "Michael B. Jordan", "Anne Hathaway", 
    "Brad Pitt", "Angelina Jolie", "Keanu Reeves", "Sandra Bullock", 
    "Jake Gyllenhaal", "Christian Bale", "Cate Blanchett", "Hugh Jackman", 
    "Jennifer Lawrence", "Will Smith", "Jada Pinkett Smith", "Viola Davis", 
    "Austin Butler", "Jamie Lee Curtis", "Paul Mescal", "Tobey Maguire", 
    "Andrew Garfield", "Harrison Ford", "Helen Mirren", "Brendan Fraser", 

    # Classic Hollywood Legends
    "Marlon Brando", "James Dean", "Audrey Hepburn", "Marilyn Monroe", 
    "Humphrey Bogart", "Clark Gable", "Bette Davis", "Elizabeth Taylor",
    "Fred Astaire", "Ginger Rogers", "Ingrid Bergman", "Greta Garbo", 
    "Katharine Hepburn", "Cary Grant", "Spencer Tracy", "Rita Hayworth",
    "Grace Kelly", "Vivien Leigh", "Judy Garland", "Henry Fonda",
    "Lauren Bacall", "Paul Newman", "Charlton Heston", "Joan Crawford",

    # Modern Hollywood Icons
    "Meryl Streep", "Tom Hanks", "Denzel Washington", "Robert De Niro", 
    "Al Pacino", "Jack Nicholson", "Julia Roberts", "Leonardo DiCaprio",
    "Brad Pitt", "Angelina Jolie", "George Clooney", "Cate Blanchett",
    "Johnny Depp", "Tom Cruise", "Sandra Bullock", "Nicole Kidman", 
    "Halle Berry", "Harrison Ford", "Sigourney Weaver", "Morgan Freeman", 
    "Michelle Pfeiffer", "Dustin Hoffman", "Robin Williams", "Will Smith",

    # Franchise and Action-Adventure Stars
    "Orlando Bloom", "Viggo Mortensen", "Ian McKellen", "Elijah Wood",
    "Sean Astin", "Dominic Monaghan", "Billy Boyd", "Liv Tyler", 
    "Hugo Weaving", "Andy Serkis", "Keira Knightley", "Geoffrey Rush",
    "Johnny Depp", "Daniel Radcliffe", "Emma Watson", "Rupert Grint",
    "Helena Bonham Carter", "Ralph Fiennes", "Alan Rickman", "Michael Gambon",
    "Ewan McGregor", "Liam Neeson", "Natalie Portman", "Hayden Christensen",
    "Mark Hamill", "Carrie Fisher", "Harrison Ford", "Daisy Ridley",
    "Adam Driver", "John Boyega", "Oscar Isaac", "Diego Luna", 
    "Felicity Jones", "Pedro Pascal", "Chris Hemsworth", "Chris Evans", 
    "Scarlett Johansson", "Robert Downey Jr.", "Mark Ruffalo", "Chris Pratt",
    "Tom Holland", "Zendaya", "Benedict Cumberbatch", "Tobey Maguire", 
    "Andrew Garfield", "Hugh Jackman", "Patrick Stewart", "Ian McKellen", 
    "Ryan Reynolds", "Gal Gadot", "Henry Cavill", "Jason Momoa", 
    "Ben Affleck", "Zoe Saldaña", "Dave Bautista", "Karen Gillan",

    # Versatile and Popular Contemporary Actors
    "Christian Bale", "Amy Adams", "Ryan Gosling", "Emma Stone",
    "Anne Hathaway", "Jennifer Lawrence", "Joaquin Phoenix", "Margot Robbie",
    "Adam Driver", "Michael B. Jordan", "Florence Pugh", "Timothée Chalamet",
    "Austin Butler", "Jessica Chastain", "Mahershala Ali", "Viola Davis", 
    "Octavia Spencer", "Toni Collette", "Rami Malek", "Lakeith Stanfield",
    "Cillian Murphy", "Matt Damon", "Ben Affleck", "Jeremy Renner", 

    # Young Rising Stars
    "Millie Bobby Brown", "Finn Wolfhard", "Sadie Sink", "Noah Schnapp", 
    "Anya Taylor-Joy", "Jenna Ortega", "Hunter Schafer", "Hailee Steinfeld", 
    "Lucas Hedges", "Elle Fanning", "Dakota Fanning", "Jacob Elordi", 
    "Sydney Sweeney", "Joey King", "Sophie Turner", "Maisie Williams",

    # Comedy and Character Actors
    "Steve Carell", "Tina Fey", "Amy Poehler", "Melissa McCarthy", 
    "Kristen Wiig", "Seth Rogen", "Will Ferrell", "Paul Rudd", 
    "Bill Hader", "Jason Bateman", "Jonah Hill", "Michael Cera",
    "Ken Jeong", "Kevin Hart", "Maya Rudolph", "Chris Rock", 

    # Iconic Action and Adventure Stars
    "Dwayne Johnson", "Arnold Schwarzenegger", "Sylvester Stallone", 
    "Bruce Willis", "Jason Statham", "Keanu Reeves", "Vin Diesel", 
    "Charlize Theron", "Emily Blunt", "John Cena", "Liam Neeson", 
    "Daniel Craig", "Idris Elba", "Pierce Brosnan", "Angelina Jolie", 
    "Kate Beckinsale", "Milla Jovovich",

    # Supporting Actors and Other Notables
    "John Goodman", "Jeff Goldblum", "J.K. Simmons", "Stanley Tucci",
    "Frances McDormand", "Allison Janney", "Angela Bassett", "Regina King",
    "Jessica Lange", "Bryan Cranston", "Aaron Paul", "Bob Odenkirk", 
    "Giancarlo Esposito", "David Harbour", "Winona Ryder", 

    # Diverse and Internationally Acclaimed Actors
    "Salma Hayek", "Antonio Banderas", "Diego Luna", "Oscar Isaac", 
    "Gael García Bernal", "Eva Longoria", "Jessica Alba", 
    "Awkwafina", "Sandra Oh", "Steven Yeun", "Simu Liu", 
    "Lucy Liu", "Gemma Chan", "Mindy Kaling", "Ali Wong", 
    "Lupita Nyong'o", "Chadwick Boseman", "Daniel Kaluuya", "Letitia Wright",
    "Dev Patel", "Riz Ahmed", "Zazie Beetz", "Mahershala Ali",

    # Sports
    "Lionel Messi", "Cristiano Ronaldo", "Neymar Jr.", "Kylian Mbappé", 
    "LeBron James", "Serena Williams", "Roger Federer", "Novak Djokovic", 
    "Rafael Nadal", "Simone Biles", "Naomi Osaka", "Stephen Curry", 
    "Kevin Durant", "Tom Brady", "Patrick Mahomes", "Virat Kohli", 
    "Rohit Sharma", "Shaquille O'Neal", "Tiger Woods", "Lewis Hamilton", 
    "Max Verstappen", "Charles Leclerc", "Usain Bolt", "Megan Rapinoe", 
    "Alex Morgan", "Katie Ledecky", "Michael Phelps", "Giannis Antetokounmpo", 
    "Damian Lillard", "Anthony Davis", "Zlatan Ibrahimović", "Harry Kane", 
    "Sadio Mané", "Karim Benzema", "Gareth Bale", "Robert Lewandowski", 
    "Erling Haaland", "Venus Williams", "Iga Świątek", "Aryna Sabalenka", 

    # Politics and Leaders
    "Joe Biden", "Kamala Harris", "Barack Obama", "Michelle Obama", 
    "Donald Trump", "Melania Trump", "Emmanuel Macron", "Olaf Scholz", 
    "Volodymyr Zelenskyy", "Rishi Sunak", "Narendra Modi", "Jacinda Ardern", 
    "Justin Trudeau", "Xi Jinping", "Vladimir Putin", "Angela Merkel", 
    "Elizabeth II", "King Charles III", "Prince William", "Prince Harry", 
    "Meghan Markle", "Queen Letizia", "Pope Francis", "Dalai Lama", 
    "Greta Thunberg", "Alexandria Ocasio-Cortez", "Bernie Sanders", 
    "Nicolas Maduro", "Jair Bolsonaro", "Fumio Kishida", "Yoon Suk-yeol",

    # Business and Technology
    "Elon Musk", "Jeff Bezos", "Mark Zuckerberg", "Bill Gates", "Tim Cook", 
    "Sundar Pichai", "Satya Nadella", "Warren Buffett", "Bernard Arnault", 
    "Larry Page", "Sergey Brin", "Steve Wozniak", "Reed Hastings", "Susan Wojcicki", 
    "Jack Ma", "Daniel Ek", "Evan Spiegel", "Andrew Ng", "Sam Altman", 
    "Sheryl Sandberg", "Peter Thiel", "Marc Benioff", "Richard Branson", 
    "Oprah Winfrey", "Howard Schultz", "Larry Ellison", "David Baszucki", 
    "Parag Agrawal", "Adam Neumann", "Kylie Jenner", "Kim Kardashian", 
    "Khloé Kardashian", "Kris Jenner", "Robert Kiyosaki", "Barbara Corcoran", 

    # Science and Innovation
    "Jane Goodall", "Neil deGrasse Tyson", "Brian Cox", "Michio Kaku", 
    "Katherine Johnson", "Jennifer Doudna", "Emmanuelle Charpentier", "Tim Berners-Lee", 
    "Mae Jemison", "Katie Bouman", "Brian Greene", "James Lovelock", 
    "Roger Penrose", "Dmitry Muratov", "Frances Arnold", "Venki Ramakrishnan", 
    "Paul Nurse", "Elizabeth Blackburn", "Carol Greider", "David Julius", 
    "Abhijit Banerjee", "Esther Duflo", "Michael Kremer", "Andrea Ghez", 
    "Reinhard Genzel", "Jennifer Hudson", "Ashoke Sen", "Subrahmanyan Chandrasekhar", 

    # Others
    "Ellen DeGeneres", "Oprah Winfrey", "Trevor Noah", "Jimmy Fallon", 
    "Stephen Colbert", "John Oliver", "James Corden", "Conan O'Brien", 
    "Dolly Parton", "Gordon Ramsay", "David Beckham", "Victoria Beckham", 
    "RuPaul", "Chris Rock", "Dave Chappelle", "Trevor Noah", "Hasan Minhaj", 
    "Ali Wong", "Bo Burnham", "Jo Koy", "Kevin Hart", "Sarah Silverman", 
    "Tiffany Haddish", "Joe Rogan", "Logan Paul", "MrBeast", "PewDiePie", 
    "Emma Chamberlain", "Charli D'Amelio", "Addison Rae", "Bella Poarch",

    # 2019 Movies
    "Avengers: Endgame", "The Lion King (2019)", "Frozen II", "Toy Story 4", 
    "Star Wars: The Rise of Skywalker", "Joker", "Spider-Man: Far From Home", 
    "Captain Marvel", "Aladdin (2019)", "Knives Out", "Us", "Once Upon a Time in Hollywood", 
    "1917", "Ford v Ferrari", "It Chapter Two", "Parasite", "Shazam!", 
    "How to Train Your Dragon: The Hidden World", "Jumanji: The Next Level", "Little Women", 
    "Marriage Story", "Jojo Rabbit", "The Irishman", "Rocketman", 
    "John Wick: Chapter 3 – Parabellum", "Glass", "Hustlers", "The Lego Movie 2: The Second Part", 
    "Dumbo", "Alita: Battle Angel", "Doctor Sleep", "Ad Astra", "The Lighthouse", 
    "Frozen II", "Zombieland: Double Tap", "Midsommar", "Good Boys", "A Beautiful Day in the Neighborhood",

    # 2020 Movies
    "Tenet", "Sonic the Hedgehog", "Wonder Woman 1984", "Birds of Prey", 
    "The Invisible Man", "Soul", "Onward", "The Croods: A New Age", "Mulan (2020)", 
    "Bad Boys for Life", "The Trial of the Chicago 7", "Palm Springs", 
    "Hamilton", "Ma Rainey's Black Bottom", "Borat Subsequent Moviefilm", 
    "The Old Guard", "Enola Holmes", "The Midnight Sky", "Extraction", 
    "The Call of the Wild", "Greyhound", "The Way Back", "Da 5 Bloods", 
    "One Night in Miami...", "Sound of Metal", "Promising Young Woman", 
    "The Devil All the Time", "News of the World", "Over the Moon", 
    "A Shaun the Sheep Movie: Farmageddon", "My Spy", "The Personal History of David Copperfield", 
    "The Half of It", "Pieces of a Woman", "The King of Staten Island", 
    "The Lovebirds", "The Secret Garden", "Let Him Go", "Kajillionaire", "The Witches (2020)",

    # 2021 Movies
    "Spider-Man: No Way Home", "Shang-Chi and the Legend of the Ten Rings", 
    "Black Widow", "Eternals", "Dune (2021)", "The Suicide Squad", 
    "Free Guy", "Encanto", "Cruella", "No Time to Die", "The Mitchells vs. the Machines", 
    "Luca", "Raya and the Last Dragon", "The Green Knight", "In the Heights", 
    "A Quiet Place Part II", "Don't Look Up", "House of Gucci", 
    "West Side Story (2021)", "The French Dispatch", "Jungle Cruise", 
    "King Richard", "Belfast", "The Last Duel", "CODA", "Tick, Tick... Boom!", 
    "Nightmare Alley", "The Power of the Dog", "Venom: Let There Be Carnage", 
    "Ghostbusters: Afterlife", "The Forever Purge", "The Eyes of Tammy Faye", 
    "Malcolm & Marie", "Spencer", "Antlers", "The Many Saints of Newark", 
    "Fear Street Part One: 1994", "The Tomorrow War", "Bo Burnham: Inside",

    # 2022 Movies
    "Top Gun Maverick", "The Batman", "Black Panther: Wakanda Forever", 
    "Doctor Strange in the Multiverse of Madness", "Avatar: The Way of Water", 
    "Minions: The Rise of Gru", "Jurassic World Dominion", "Thor: Love and Thunder", 
    "Everything Everywhere All at Once", "Nope", "The Woman King", "Smile", 
    "The Menu", "Turning Red", "Glass Onion: A Knives Out Mystery", "Prey", 
    "The Fabelmans", "Puss in Boots: The Last Wish", "Lightyear", "Pinocchio (2022)", 
    "The Whale", "All Quiet on the Western Front", "Bullet Train", "Elvis", 
    "The Banshees of Inisherin", "Barbarian", "Babylon", "Don't Worry Darling", 
    "Amsterdam", "Marcel the Shell with Shoes On", "Hocus Pocus 2", "Bodies Bodies Bodies", 
    "Bones and All", "The Northman", "RRR", "Emancipation", "Thirteen Lives", 
    "The Adam Project", "Apollo 10½", "The Lost City", "Deep Water", 
    "Where the Crawdads Sing", "No Exit", "Scream (2022)", "Women Talking",

    # 2023 Movies
    "Barbie", "Oppenheimer", "Spider-Man: Across the Spider-Verse", 
    "Guardians of the Galaxy Vol. 3", "The Super Mario Bros. Movie", "The Little Mermaid (2023)", 
    "Mission: Impossible – Dead Reckoning Part One", "Fast X", "John Wick: Chapter 4", 
    "The Flash", "Elemental", "Indiana Jones and the Dial of Destiny", 
    "Dungeons & Dragons: Honor Among Thieves", "Creed III", "The Marvels", 
    "Ant-Man and the Wasp: Quantumania", "Evil Dead Rise", "The Hunger Games: The Ballad of Songbirds and Snakes", 
    "Killers of the Flower Moon", "The Equalizer 3", "A Haunting in Venice", 
    "Napoleon", "Wish", "The Nun II", "The Boogeyman", "Talk to Me", 
    "Blue Beetle", "Teenage Mutant Ninja Turtles: Mutant Mayhem", 
    "The Creator", "Transformers: Rise of the Beasts", "Asteroid City", 
    "Saw X", "The Exorcist: Believer", "Five Nights at Freddy's", 
    "Shazam! Fury of the Gods", "The Whale (Wide Release)", 
    "Air", "Joy Ride", "The Pale Blue Eye", "Polite Society", 
    "Are You There God? It’s Me, Margaret.", "Beau Is Afraid", "Gran Turismo", 
    "Past Lives", "Next Goal Wins", "Maestro", "The Holdovers", "Poor Things", 
    "The Killer", "TÁR (Wide Release)", "Foe", "Saltburn", "Knox Goes Away", 
    "Wonka", "Flamin' Hot", "One Piece Film: Red",

    # 2010 Movies
    "Inception", "Toy Story 3", "The Social Network", "Harry Potter and the Deathly Hallows: Part 1",
    "Shutter Island", "Black Swan", "Iron Man 2", "The King's Speech", "Tangled", "Despicable Me",
    "How to Train Your Dragon", "The Twilight Saga: Eclipse", "Alice in Wonderland (2010)", 
    "True Grit", "The Fighter", "Kick-Ass", "127 Hours", "Scott Pilgrim vs. The World", "Easy A", 
    "The Town", "The Other Guys", "Buried", "The Expendables", "The Book of Eli", "Salt", 
    "Clash of the Titans", "Robin Hood (2010)", "Percy Jackson & the Olympians: The Lightning Thief", 
    "Tron: Legacy", "The Karate Kid (2010)", "Grown Ups", "Date Night", "Due Date",

    # 2011 Movies
    "Harry Potter and the Deathly Hallows: Part 2", "The Help", "Thor", "Captain America: The First Avenger", 
    "The Twilight Saga: Breaking Dawn – Part 1", "The Girl with the Dragon Tattoo (2011)", "Rise of the Planet of the Apes",
    "Bridesmaids", "X-Men: First Class", "The Hunger Games", "Drive", "Moneyball", "War Horse",
    "The Artist", "Midnight in Paris", "Horrible Bosses", "Crazy, Stupid, Love", "The Descendants", 
    "Super 8", "Tinker Tailor Soldier Spy", "Rango", "Hugo", "Kung Fu Panda 2", "Cars 2", 
    "Fast Five", "The Adjustment Bureau", "Contagion", "Sherlock Holmes: A Game of Shadows", "Real Steel", 
    "Paranormal Activity 3", "Puss in Boots", "The Smurfs", "Sucker Punch", "The Tree of Life",

    # 2012 Movies
    "The Avengers", "The Dark Knight Rises", "The Hunger Games", "Skyfall", "The Amazing Spider-Man",
    "The Twilight Saga: Breaking Dawn – Part 2", "Django Unchained", "Life of Pi", "The Hobbit: An Unexpected Journey",
    "Les Misérables", "Brave", "Wreck-It Ralph", "Silver Linings Playbook", "Argo", "Zero Dark Thirty", 
    "Prometheus", "21 Jump Street", "Looper", "Magic Mike", "Ted", "Hotel Transylvania", "The Bourne Legacy", 
    "Lincoln", "The Master", "Pitch Perfect", "The Perks of Being a Wallflower", "Beasts of the Southern Wild", 
    "Flight", "Rise of the Guardians", "Cloud Atlas", "The Cabin in the Woods", "Chronicle",

    # 2013 Movies
    "Frozen", "Iron Man 3", "Despicable Me 2", "The Hunger Games: Catching Fire", "Man of Steel", 
    "Gravity", "The Wolf of Wall Street", "American Hustle", "Thor: The Dark World", "The Great Gatsby (2013)", 
    "The Hobbit: The Desolation of Smaug", "Monsters University", "12 Years a Slave", "The Conjuring", 
    "Frozen", "World War Z", "Pacific Rim", "Captain Phillips", "Now You See Me", "The Heat", 
    "Blue Jasmine", "Dallas Buyers Club", "Prisoners", "Saving Mr. Banks", "Her", "Rush", "This Is the End", 
    "The Croods", "Elysium", "The Secret Life of Walter Mitty", "Inside Llewyn Davis", "The Wolverine",

    # 2014 Movies
    "Guardians of the Galaxy", "The LEGO Movie", "Captain America: The Winter Soldier", "Interstellar", 
    "The Hunger Games: Mockingjay – Part 1", "Gone Girl", "The Hobbit: The Battle of the Five Armies", 
    "Big Hero 6", "The Fault in Our Stars", "X-Men: Days of Future Past", "Maleficent", "Divergent", 
    "The Grand Budapest Hotel", "How to Train Your Dragon 2", "The Imitation Game", "Birdman", 
    "Whiplash", "American Sniper", "The Maze Runner", "Edge of Tomorrow", "Nightcrawler", 
    "Unbroken", "The Theory of Everything", "The Equalizer", "Fury", "Godzilla (2014)", 
    "22 Jump Street", "The Babadook", "A Most Violent Year", "Selma", "Boyhood",

    # 2015 Movies
    "Star Wars: The Force Awakens", "Avengers: Age of Ultron", "Jurassic World", "Inside Out", 
    "The Martian", "Mad Max: Fury Road", "The Revenant", "Furious 7", "The Hunger Games: Mockingjay – Part 2", 
    "Cinderella (2015)", "The Peanuts Movie", "Ant-Man", "Minions", "Spectre", "Pitch Perfect 2", 
    "Creed", "The Big Short", "Ex Machina", "Room", "Spotlight", "Bridge of Spies", "Sicario", 
    "Straight Outta Compton", "The Danish Girl", "Trainwreck", "The Good Dinosaur", 
    "Shaun the Sheep Movie", "Spy", "The Man from U.N.C.L.E.", "Paper Towns", "Paddington",

    # 2016 Movies
    "Captain America: Civil War", "Rogue One: A Star Wars Story", "Finding Dory", "Zootopia", 
    "The Jungle Book (2016)", "Moana", "Doctor Strange", "Fantastic Beasts and Where to Find Them", 
    "Deadpool", "Batman v Superman: Dawn of Justice", "Suicide Squad", "La La Land", "Hacksaw Ridge", 
    "Hidden Figures", "Arrival", "Manchester by the Sea", "Lion", "Moonlight", "Hell or High Water", 
    "The Nice Guys", "Passengers", "The Secret Life of Pets", "Sing", "Trolls", 
    "Kubo and the Two Strings", "10 Cloverfield Lane", "The Legend of Tarzan", 
    "The Magnificent Seven (2016)", "The Shallows", "War Dogs", "Deepwater Horizon",

    # 2017 Movies
    "Wonder Woman", "Star Wars: The Last Jedi", "Beauty and the Beast (2017)", "Thor: Ragnarok", 
    "Guardians of the Galaxy Vol. 2", "Spider-Man: Homecoming", "Justice League", "It (2017)", 
    "Logan", "Coco", "Get Out", "Dunkirk", "The Shape of Water", "Blade Runner 2049", "Lady Bird", 
    "Three Billboards Outside Ebbing, Missouri", "Call Me by Your Name", "Baby Driver", "The Disaster Artist", 
    "The Post", "Darkest Hour", "I, Tonya", "Phantom Thread", "Paddington 2", "The Greatest Showman", 
    "Jumanji: Welcome to the Jungle", "The Lego Batman Movie", "War for the Planet of the Apes", 
    "The Boss Baby", "Ferdinand", "Split", "John Wick: Chapter 2", "Atomic Blonde",

    # Hit Songs (Various Years)
    "Blinding Lights - The Weeknd", "Shape of You - Ed Sheeran", "Uptown Funk - Mark Ronson ft. Bruno Mars", 
    "Old Town Road - Lil Nas X ft. Billy Ray Cyrus", "Bad Guy - Billie Eilish", 
    "Dynamite - BTS", "Watermelon Sugar - Harry Styles", "Stay - The Kid LAROI & Justin Bieber", 
    "Levitating - Dua Lipa ft. DaBaby", "Rockstar - Post Malone ft. 21 Savage", 
    "Savage Love - Jawsh 685 & Jason Derulo", "Circles - Post Malone", "Drivers License - Olivia Rodrigo", 
    "Sunflower - Post Malone & Swae Lee", "All of Me - John Legend", "Someone Like You - Adele", 
    "Rolling in the Deep - Adele", "Hello - Adele", "As It Was - Harry Styles", "Bad Habits - Ed Sheeran", 
    "Industry Baby - Lil Nas X & Jack Harlow", "Happier - Marshmello ft. Bastille", 
    "Good 4 U - Olivia Rodrigo", "Montero (Call Me By Your Name) - Lil Nas X", 
    "Peaches - Justin Bieber ft. Daniel Caesar & Giveon", "Butter - BTS", 
    "Permission to Dance - BTS", "We Don't Talk About Bruno - Encanto Cast", 
    "Shallow - Lady Gaga & Bradley Cooper", "Havana - Camila Cabello ft. Young Thug", 
    "Senorita - Shawn Mendes & Camila Cabello", "Perfect - Ed Sheeran", "Love Yourself - Justin Bieber", 
    "Despacito - Luis Fonsi & Daddy Yankee ft. Justin Bieber", "Sorry - Justin Bieber", 
    "Closer - The Chainsmokers ft. Halsey", "Something Just Like This - The Chainsmokers & Coldplay", 
    "Don't Start Now - Dua Lipa", "Break My Heart - Dua Lipa", "Titanium - David Guetta ft. Sia", 
    "Chandelier - Sia", "Elastic Heart - Sia", "Alive - Sia", "Cheap Thrills - Sia ft. Sean Paul", 
    "Toxic - Britney Spears", "Oops!... I Did It Again - Britney Spears", "Shake It Off - Taylor Swift", 
    "Blank Space - Taylor Swift", "Bad Blood - Taylor Swift ft. Kendrick Lamar", 
    "Anti-Hero - Taylor Swift", "You Belong With Me - Taylor Swift", "Delicate - Taylor Swift", 
    "All Too Well - Taylor Swift", "I Knew You Were Trouble - Taylor Swift", 
    "Midnight Rain - Taylor Swift", "Lavender Haze - Taylor Swift", "22 - Taylor Swift", 
    "Me! - Taylor Swift ft. Brendon Urie", "Cruel Summer - Taylor Swift", "Style - Taylor Swift", 
    "Wildest Dreams - Taylor Swift", "Look What You Made Me Do - Taylor Swift", 
    "Anti-Hero - Taylor Swift", "Tim McGraw - Taylor Swift", "Cardigan - Taylor Swift", 
    "Willow - Taylor Swift", "Bejeweled - Taylor Swift", "We Are Never Ever Getting Back Together - Taylor Swift", 
    "This Love - Taylor Swift", "Gold Rush - Taylor Swift", "My Tears Ricochet - Taylor Swift", 
    "Shake It Off - Taylor Swift", "Maroon - Taylor Swift", "Wildest Dreams - Taylor Swift", 
    "You're On Your Own, Kid - Taylor Swift", "The 1 - Taylor Swift", "Invisible String - Taylor Swift", 
    "Enchanted - Taylor Swift", "Welcome to New York - Taylor Swift", "Dear John - Taylor Swift", 
    "Evermore - Taylor Swift ft. Bon Iver", "Exile - Taylor Swift ft. Bon Iver", 
    "Champagne Problems - Taylor Swift", "Betty - Taylor Swift", "Paper Rings - Taylor Swift", 
    "The Archer - Taylor Swift", "Getaway Car - Taylor Swift", "August - Taylor Swift", 
    "Cornelia Street - Taylor Swift", "King of My Heart - Taylor Swift", 
    "Love Story - Taylor Swift", "You Belong With Me - Taylor Swift", "White Horse - Taylor Swift", 
    "You Should Be Here - Taylor Swift", "End Game - Taylor Swift ft. Future & Ed Sheeran", 
    "Call It What You Want - Taylor Swift", "We Are Never Ever Getting Back Together - Taylor Swift", 
    "Lover - Taylor Swift", "You Need to Calm Down - Taylor Swift", "Daylight - Taylor Swift", 
    "I Forgot That You Existed - Taylor Swift", "Cruel Summer - Taylor Swift", 
    "All Too Well - Taylor Swift", "Enchanted - Taylor Swift", 
    "Style - Taylor Swift", "Wildest Dreams - Taylor Swift", "Gold Rush - Taylor Swift", 
    "The Archer - Taylor Swift", "Getaway Car - Taylor Swift", "Cornelia Street - Taylor Swift", 
    "Betty - Taylor Swift", "Exile - Taylor Swift ft. Bon Iver", 
    "Champagne Problems - Taylor Swift", "The 1 - Taylor Swift", "Invisible String - Taylor Swift", 
    "You're On Your Own, Kid - Taylor Swift", "Enchanted - Taylor Swift",

    # Taylor Swift Albums
    "Midnights - Taylor Swift", "Folklore - Taylor Swift", "Evermore - Taylor Swift",
    "1989 - Taylor Swift", "Reputation - Taylor Swift", "Fearless (Taylor’s Version) - Taylor Swift",
    "Speak Now (Taylor’s Version) - Taylor Swift", "Red (Taylor’s Version) - Taylor Swift",
    "Lover - Taylor Swift", "Fearless - Taylor Swift", "Speak Now - Taylor Swift",

    # Beyoncé Albums
    "Renaissance - Beyoncé", "Lemonade - Beyoncé", "4 - Beyoncé", 
    "B’Day - Beyoncé", "Dangerously in Love - Beyoncé", "The Gift - Beyoncé",

    # Drake Albums
    "Certified Lover Boy - Drake", "Scorpion - Drake", "Views - Drake",
    "Take Care - Drake", "Nothing Was the Same - Drake", "Her Loss - Drake & 21 Savage",

    # Adele Albums
    "30 - Adele", "25 - Adele", "21 - Adele", "19 - Adele",

    # The Weeknd Albums
    "Dawn FM - The Weeknd", "After Hours - The Weeknd", "Starboy - The Weeknd",
    "Beauty Behind the Madness - The Weeknd", "Kiss Land - The Weeknd",

    # Billie Eilish Albums
    "Happier Than Ever - Billie Eilish", "When We All Fall Asleep, Where Do We Go? - Billie Eilish",
    "Don’t Smile at Me - Billie Eilish",

    # Harry Styles Albums
    "Harry’s House - Harry Styles", "Fine Line - Harry Styles", "Harry Styles - Harry Styles",

    # Olivia Rodrigo Albums
    "SOUR - Olivia Rodrigo", "GUTS - Olivia Rodrigo",

    # BTS Albums
    "Map of the Soul: 7 - BTS", "BE - BTS", "Proof - BTS",
    "Love Yourself: Tear - BTS", "Love Yourself: Answer - BTS", "Wings - BTS",

    # Bad Bunny Albums
    "Un Verano Sin Ti - Bad Bunny", "YHLQMDLG - Bad Bunny", "El Último Tour Del Mundo - Bad Bunny",

    # Taylor Swift Albums (expanded)
    "Red - Taylor Swift", "Fearless - Taylor Swift", "Lover - Taylor Swift",
    "Reputation - Taylor Swift", "Speak Now - Taylor Swift", "Midnights - Taylor Swift",

    # Hip-Hop Albums
    "DAMN. - Kendrick Lamar", "Mr. Morale & the Big Steppers - Kendrick Lamar", 
    "Good Kid, M.A.A.D City - Kendrick Lamar", "The Off-Season - J. Cole", 
    "4 Your Eyez Only - J. Cole", "2014 Forest Hills Drive - J. Cole", 
    "Astroworld - Travis Scott", "Birds in the Trap Sing McKnight - Travis Scott", 
    "Rodeo - Travis Scott", "Montero - Lil Nas X", "The Pinkprint - Nicki Minaj",
    "Queen - Nicki Minaj", "The Marshall Mathers LP - Eminem", 
    "The Eminem Show - Eminem", "Kamikaze - Eminem", "Encore - Eminem",
    "My Turn - Lil Baby", "It's Only Me - Lil Baby",

    # Rock Albums
    "AM - Arctic Monkeys", "Humbug - Arctic Monkeys", "Tranquility Base Hotel & Casino - Arctic Monkeys",
    "When We Were Young - The Killers", "Wonderful Wonderful - The Killers", 
    "American Idiot - Green Day", "Dookie - Green Day", "Father of All... - Green Day",
    "Rumours - Fleetwood Mac", "Tusk - Fleetwood Mac", "The Dark Side of the Moon - Pink Floyd", 
    "The Wall - Pink Floyd", "Wish You Were Here - Pink Floyd",

    # Pop Albums
    "Future Nostalgia - Dua Lipa", "Levitating - Dua Lipa", "Don’t Start Now - Dua Lipa",
    "Chromatica - Lady Gaga", "Born This Way - Lady Gaga", "The Fame Monster - Lady Gaga",
    "Lemonade - Beyoncé", "4 - Beyoncé",

    # Country Albums
    "Golden Hour - Kacey Musgraves", "Same Trailer Different Park - Kacey Musgraves", 
    "Star-Crossed - Kacey Musgraves", "Traveller - Chris Stapleton", 
    "From A Room: Volume 1 - Chris Stapleton", "What You See Is What You Get - Luke Combs", 
    "Growin’ Up - Luke Combs", "Dangerous: The Double Album - Morgan Wallen",

    # More Pop Albums
    "Fine Line - Harry Styles", "When We All Fall Asleep, Where Do We Go? - Billie Eilish",
    "Harry’s House - Harry Styles", "SOUR - Olivia Rodrigo",

    # Albums from the Last Decade
    "Random Access Memories - Daft Punk", "25 - Adele", "Anti - Rihanna", 
    "Lemonade - Beyoncé", "Currents - Tame Impala", "Melodrama - Lorde", 
    "Pure Heroine - Lorde", "Invasion of Privacy - Cardi B", 
    "Blonde - Frank Ocean", "Channel Orange - Frank Ocean", 
    "CTRL - SZA", "SOS - SZA", "Heard It in a Past Life - Maggie Rogers", 
    "The Slow Rush - Tame Impala", "Manic - Halsey", "Hopeless Fountain Kingdom - Halsey", 
    "Badlands - Halsey", "Human - Rag'n'Bone Man", "After Laughter - Paramore",
    "Brand New Eyes - Paramore", "Riot! - Paramore", "This Is Why - Paramore",
    "Divide - Ed Sheeran", "Multiply - Ed Sheeran", "Equals - Ed Sheeran",

    # Top Albums of All Time
    "Abbey Road - The Beatles", "Revolver - The Beatles", "Sgt. Pepper’s Lonely Hearts Club Band - The Beatles",
    "Rumours - Fleetwood Mac", "Thriller - Michael Jackson", "Off the Wall - Michael Jackson",
    "Purple Rain - Prince", "1999 - Prince", "The Joshua Tree - U2", 
    "Achtung Baby - U2", "OK Computer - Radiohead", "Kid A - Radiohead", 
    "Nevermind - Nirvana", "In Utero - Nirvana", "Led Zeppelin IV - Led Zeppelin",
    "Physical Graffiti - Led Zeppelin", "Hotel California - Eagles", 
    "Back in Black - AC/DC", "Highway to Hell - AC/DC", "The Suburbs - Arcade Fire",
    "Funeral - Arcade Fire", "Born to Run - Bruce Springsteen", 
    "The River - Bruce Springsteen", "Darkness on the Edge of Town - Bruce Springsteen",
    "21 - Adele", "25 - Adele", "30 - Adele", "Midnights - Taylor Swift",
    "1989 - Taylor Swift", "Fearless (Taylor's Version) - Taylor Swift",
    "Red (Taylor's Version) - Taylor Swift", "Speak Now (Taylor's Version) - Taylor Swift",
    "Lover - Taylor Swift", "Folklore - Taylor Swift", "Evermore - Taylor Swift",

    # More Hip-Hop Albums
    "Illmatic - Nas", "It Was Written - Nas", "The Blueprint - Jay-Z", 
    "Reasonable Doubt - Jay-Z", "The Black Album - Jay-Z", "Watch the Throne - Jay-Z & Kanye West",
    "Graduation - Kanye West", "My Beautiful Dark Twisted Fantasy - Kanye West", 
    "The College Dropout - Kanye West", "Late Registration - Kanye West",
    "Life of Pablo - Kanye West", "Astroworld - Travis Scott", 
    "Rodeo - Travis Scott", "DAMN. - Kendrick Lamar", "To Pimp a Butterfly - Kendrick Lamar",
    "Good Kid, M.A.A.D City - Kendrick Lamar", "The Eminem Show - Eminem",
    "The Marshall Mathers LP - Eminem", "Kamikaze - Eminem", "Encore - Eminem",

    # Electronic & Indie Albums
    "Discovery - Daft Punk", "Random Access Memories - Daft Punk", 
    "Currents - Tame Impala", "The Slow Rush - Tame Impala", 
    "An Awesome Wave - Alt-J", "This Is All Yours - Alt-J", 
    "Relaxer - Alt-J", "AM - Arctic Monkeys", "Whatever People Say I Am, That’s What I’m Not - Arctic Monkeys",
    "Humbug - Arctic Monkeys", "Suck It and See - Arctic Monkeys", 
    "Tranquility Base Hotel & Casino - Arctic Monkeys",

    # Netflix Original Series
    "Stranger Things", "The Crown", "The Witcher", "Money Heist", "Wednesday", 
    "Bridgerton", "The Umbrella Academy", "Ozark", "Mindhunter", "The Queen's Gambit", 
    "You", "Narcos", "13 Reasons Why", "BoJack Horseman", "Big Mouth", 
    "The Sandman", "Locke & Key", "Cobra Kai", "Russian Doll", "Sweet Tooth", 
    "Heartstopper", "Sex Education", "Shadow and Bone", "The Lincoln Lawyer", 
    "Outer Banks", "Lucifer", "The Last Kingdom", "Elite", "The Dark Crystal: Age of Resistance", 
    "Arcane", "Daredevil", "Jessica Jones", "Luke Cage", "Iron Fist", "The Punisher", 
    "The Defenders", "Unbreakable Kimmy Schmidt", "Grace and Frankie", "Glow", 
    "The Kominsky Method", "Dead to Me", "Queen Charlotte: A Bridgerton Story", 
    "Maniac", "Altered Carbon", "Black Mirror", "House of Cards", "F Is for Family", 
    "American Vandal", "Bloodline", "Hemlock Grove", "The Chilling Adventures of Sabrina", 
    "Emily in Paris", "The Haunting of Hill House", "The Haunting of Bly Manor",

    # Disney+ Original Series
    "The Mandalorian", "WandaVision", "Loki", "The Falcon and the Winter Soldier", 
    "Hawkeye", "What If...?", "She-Hulk: Attorney at Law", "Ms. Marvel", 
    "Moon Knight", "Andor", "The Book of Boba Fett", "Obi-Wan Kenobi", 
    "Star Wars: The Bad Batch", "Star Wars: Visions", "The Imagineering Story", 
    "Marvel's 616", "High School Musical: The Musical: The Series", "Diary of a Future President", 
    "Big Shot", "The Mighty Ducks: Game Changers", "Doogie Kameāloha, M.D.", "The World According to Jeff Goldblum",

    # Amazon Prime Video Original Series
    "The Boys", "The Marvelous Mrs. Maisel", "Fleabag", "Jack Ryan", "The Wheel of Time", 
    "Good Omens", "Reacher", "The Lord of the Rings: The Rings of Power", 
    "Invincible", "The Expanse", "Upload", "The Man in the High Castle", 
    "Carnival Row", "Mozart in the Jungle", "Transparent", "Patriot", "Goliath", 
    "Bosch", "Tales from the Loop", "The Underground Railroad", "Outer Range", 
    "A League of Their Own", "The Wilds", "Red Oaks", "Hanna", "Utopia",

    # HBO and HBO Max Original Series
    "Game of Thrones", "House of the Dragon", "Succession", "Euphoria", "The Last of Us", 
    "Barry", "The White Lotus", "Westworld", "Big Little Lies", "Chernobyl", 
    "Veep", "True Detective", "Silicon Valley", "The Flight Attendant", 
    "Insecure", "Raised by Wolves", "The Righteous Gemstones", "Station Eleven", 
    "The Undoing", "Winning Time: The Rise of the Lakers Dynasty", "Industry", 
    "Perry Mason", "Love Life", "Peacemaker", "Mare of Easttown", "The Outsider", 
    "Doom Patrol", "Titans", "His Dark Materials", "The Nevers", "Watchmen",

    # Hulu Original Series
    "The Handmaid's Tale", "Only Murders in the Building", "The Bear", "Castle Rock", 
    "Nine Perfect Strangers", "Dopesick", "The Act", "Little Fires Everywhere", 
    "Shrill", "Ramy", "Solar Opposites", "Animaniacs (Reboot)", "Chance", 
    "The Great", "Woke", "Future Man", "Love, Victor", "Pen15", "The Looming Tower", 
    "Difficult People", "Casual", "Harlots", "The Path", "11.22.63", 
    "The Dropout", "Pam & Tommy", "How I Met Your Father", "Victoria’s Secret: Angels and Demons",

    # Apple TV+ Original Series
    "Ted Lasso", "Severance", "The Morning Show", "For All Mankind", "Foundation", 
    "Pachinko", "Servant", "See", "Shrinking", "Mythic Quest", "Truth Be Told", 
    "The Afterparty", "Dickinson", "Tehran", "Defending Jacob", "Slow Horses", 
    "Invasion", "Black Bird", "Lisey’s Story", "Snoopy in Space", "Fraggle Rock: Back to the Rock",

    # Other Platforms and Networks (FX, Showtime, AMC, Peacock, Paramount+, broadcast/cable TV)
    "The Walking Dead", "Breaking Bad", "Better Call Saul", "Mad Men", 
    "Fargo", "The Americans", "Sons of Anarchy", "Dexter", "Dexter: New Blood", 
    "Billions", "Yellowjackets", "Shameless", "Ray Donovan", "The Chi", 
    "Penny Dreadful", "Penny Dreadful: City of Angels", "Twin Peaks (2017)", 
    "Bates Motel", "Mr. Robot", "Hannibal", "Preacher", "Orphan Black", 
    "Killing Eve", "The Killing", "Broadchurch", "Sherlock", "Downton Abbey", 
    "Dr. Death", "Girls5eva", "Yellowstone", "1883", "Mayor of Kingstown", 
    "Star Trek: Discovery", "Star Trek: Picard", "Star Trek: Strange New Worlds", 
    "Halo", "The Good Fight", "Evil", "A Discovery of Witches", "The Morning Show", 
    "Glee", "New Girl", "How I Met Your Mother", "Parks and Recreation", 
    "The Office (US)", "Brooklyn Nine-Nine", "30 Rock", "Community", 
    "Archer", "Rick and Morty", "Family Guy", "Bob’s Burgers", "The Simpsons", 
    "South Park", "Big Bang Theory", "Young Sheldon", "Two and a Half Men", 
    "Friends", "How I Met Your Mother", "The Middle", "Supernatural", "Gotham", 
    "Smallville", "Arrow", "The Flash", "Supergirl", "Legends of Tomorrow", 
    "Batwoman", "Glee", "Futurama", "Scrubs", "Charmed", "The Vampire Diaries", 
    "The Originals", "Legacies", "True Blood", "Once Upon a Time", "Grimm",

    # Fiction Authors
    "Colleen Hoover", "Taylor Jenkins Reid", "Emily Henry", "Lisa Jewell", 
    "Kristin Hannah", "Alice Feeney", "Fredrik Backman", "Jodi Picoult", 
    "Elin Hilderbrand", "Jennifer Weiner", "Brit Bennett", "Liane Moriarty", 
    "Sally Rooney", "Celeste Ng", "Jojo Moyes", "Madeline Miller", "Delia Owens", 
    "Barbara Kingsolver", "Ruth Ware", "Karen McManus", "Nicholas Sparks", 
    "Christina Lauren", "Khaled Hosseini", "Chloe Gong", "Amor Towles", 
    "Ann Patchett", "Jamie Ford", "Colson Whitehead", "Hanya Yanagihara", 
    "Téa Obreht", "Douglas Stuart", "Min Jin Lee", "Eleanor Catton", 
    "Anthony Doerr", "Kazuo Ishiguro", "Jeanine Cummins", "Rebecca Serle", 
    "Katherine Center", "Laura Dave", "Elizabeth Strout", "Maggie O'Farrell", 
    "Erin Morgenstern", "Tara Westover", "Alex Michaelides", "Lisa See", 
    "Taylor Adams", "Mary Kubica", "Shari Lapena", "Rachel Hawkins", 
    "Veronica Roth", "Erin A. Craig", "Cynthia Hand", "Karen White", 
    "Mary Beth Keane", "Abbi Waxman", "Frances Cha", "Carley Fortune", 

    # Fantasy and Sci-Fi Authors
    "Sarah J. Maas", "Leigh Bardugo", "V.E. Schwab", "Brandon Sanderson", 
    "Marissa Meyer", "Sabaa Tahir", "Victoria Aveyard", "Jay Kristoff", 
    "Tahereh Mafi", "Cassandra Clare", "R.F. Kuang", "Naomi Novik", 
    "T.J. Klune", "Patrick Rothfuss", "Holly Black", "Laini Taylor", 
    "Adrienne Young", "N.K. Jemisin", "Tracy Deonn", "Samantha Shannon", 
    "Tomi Adeyemi", "Pierce Brown", "A.G. Slatter", "Rebecca Yarros", 
    "Martha Wells", "P. Djèlí Clark", "Christopher Paolini", 
    "Daniel Abraham", "James S.A. Corey", "Joe Abercrombie", "Neal Shusterman", 

    # Mystery, Thriller, and Suspense Authors
    "Stephen King", "Riley Sager", "Harlan Coben", "Gillian Flynn", 
    "Tana French", "Lisa Gardner", "Karin Slaughter", "Paula Hawkins", 
    "Louise Penny", "Michael Connelly", "David Baldacci", "John Grisham", 
    "Dan Brown", "Greer Hendricks", "Sarah Pekkanen", "Lucy Foley", 
    "C.J. Box", "Don Winslow", "James Patterson", "Stieg Larsson", 
    "Robert Galbraith", "Dean Koontz", "Clive Cussler", "Mark Greaney", 
    "A.J. Finn", "B.A. Paris", "Tess Gerritsen", "Peter Swanson", "Megan Miranda", 
    "Sophie Hannah", "Sandra Brown", "Alyssa Cole", "Rachel Caine", 
    "Fiona Barton", "Shari Lapena", "Lisa Scottoline", 

    # Young Adult and Children's Authors
    "Rick Riordan", "Angie Thomas", "Adam Silvera", "Jenny Han", 
    "Marie Lu", "Rainbow Rowell", "Becky Albertalli", "John Green", 
    "Karen M. McManus", "Kiera Cass", "Nicola Yoon", "Leigh Bardugo", 
    "Sarah Dessen", "Victoria Schwab", "Brigid Kemmerer", "Neal Shusterman", 
    "E. Lockhart", "Kami Garcia", "Tomi Adeyemi", "Melissa de la Cruz", 
    "Roshani Chokshi", "Tahereh Mafi", "Julie Murphy", "Marissa Meyer", 
    "Jennifer L. Armentrout", "Stephanie Garber", "Ashley Poston", 
    "Jasmine Warga", "Kalynn Bayron", "Amie Kaufman", "Holly Jackson", 

    # Non-Fiction and Memoir Authors
    "Michelle Obama", "Barack Obama", "Trevor Noah", "Matthew Perry", 
    "Viola Davis", "David Sedaris", "Tara Westover", "Chanel Miller", 
    "Glennon Doyle", "Brené Brown", "Elizabeth Gilbert", "Malcolm Gladwell", 
    "Yuval Noah Harari", "Atul Gawande", "Michael Pollan", "Chimamanda Ngozi Adichie", 
    "Isabel Wilkerson", "Jon Krakauer", "Ta-Nehisi Coates", "Ijeoma Oluo", 
    "Robin DiAngelo", "Michelle Alexander", "Daniel Kahneman", 
    "Adam Grant", "Robert Greene", "Marie Forleo", "James Clear", 
    "Mark Manson", "Cal Newport", "David Goggins", "Simon Sinek", 

    # Romance Authors
    "Colleen Hoover", "Emily Henry", "Christina Lauren", "Helen Hoang", 
    "Ali Hazelwood", "Tessa Bailey", "Abby Jimenez", "Jasmine Guillory", 
    "Sally Thorne", "Casey McQuiston", "Lyssa Kay Adams", "Lauren Layne", 
    "R.S. Grey", "Katherine Center", "Penelope Douglas", "Mariana Zapata", 
    "E.L. James", "Elle Kennedy", "Vi Keeland", "Sarina Bowen", 
    "Cora Reilly", "Meghan Quinn", "Lucy Score", "K.A. Tucker", 

    # Historical Fiction Authors
    "Kristin Hannah", "Kate Quinn", "Madeline Miller", "Anthony Doerr", 
    "Ruta Sepetys", "Pam Jenoff", "Heather Morris", "Lisa Wingate", 
    "Christina Baker Kline", "Martha Hall Kelly", "Paula McLain", 
    "Alyssa Cole", "Jennifer Robson", "Imogen Kealey", "Natasha Lester", 

    # Business and Self-Help Authors
    "Tim Ferriss", "Tony Robbins", "Dale Carnegie", "Napoleon Hill", 
    "Stephen R. Covey", "James Clear", "Mark Manson", "Ryan Holiday", 
    "Simon Sinek", "Jen Sincero", "Rachel Hollis", "Marie Kondo", 
    "Robin Sharma", "Hal Elrod", "Mel Robbins", "John Maxwell", 
    "David Allen", "Gary Vaynerchuk", "Seth Godin", "Grant Cardone", 

    # Poetry and Literary Fiction Authors
    "Rupi Kaur", "Amanda Gorman", "Ocean Vuong", "Maggie Nelson", 
    "Danez Smith", "Mary Oliver", "Margaret Atwood", "Louise Glück", 
    "Jhumpa Lahiri", "Kazuo Ishiguro", "Zadie Smith", "Chimamanda Ngozi Adichie", 
    "Arundhati Roy", "Ann Patchett", "Jonathan Franzen", "Colson Whitehead", 
    "George Saunders", "Elizabeth Strout", "Alice Walker", "Toni Morrison",

    # Basketball (NBA/WNBA)
    "LeBron James", "Stephen Curry", "Kevin Durant", "Kobe Bryant", "Giannis Antetokounmpo", 
    "Shaquille O'Neal", "Michael Jordan", "Tim Duncan", "Magic Johnson", "Larry Bird", 
    "Wilt Chamberlain", "Kareem Abdul-Jabbar", "Russell Westbrook", "Anthony Davis", 
    "Jayson Tatum", "Luka Dončić", "Damian Lillard", "Chris Paul", "James Harden", 
    "Kyrie Irving", "Devin Booker", "Zion Williamson", "Ja Morant", "Draymond Green", 
    "Klay Thompson", "Nikola Jokić", "Joel Embiid", "Jimmy Butler", "Paul George", 
    "Diana Taurasi", "Sue Bird", "Candace Parker", "Breanna Stewart", "A'ja Wilson", 
    "Elena Delle Donne", "Sabrina Ionescu", "Skylar Diggins-Smith", "Lisa Leslie", 
    "Maya Moore", "Sylvia Fowles",

    # Football (NFL)
    "Tom Brady", "Patrick Mahomes", "Aaron Rodgers", "Josh Allen", "Lamar Jackson", 
    "Jalen Hurts", "Russell Wilson", "Joe Burrow", "Justin Herbert", "Derrick Henry", 
    "Christian McCaffrey", "Cooper Kupp", "Davante Adams", "Deebo Samuel", "Stefon Diggs", 
    "Travis Kelce", "George Kittle", "Aaron Donald", "Von Miller", "J.J. Watt", 
    "T.J. Watt", "Nick Bosa", "Myles Garrett", "Micah Parsons", "Tyreek Hill", 
    "Ja'Marr Chase", "Ezekiel Elliott", "Saquon Barkley", "Jonathan Taylor", 
    "Justin Jefferson", "Kirk Cousins", "Dak Prescott", "Kyler Murray", 
    "DeAndre Hopkins", "Chris Godwin", "A.J. Brown", "Mark Andrews", "Dalvin Cook",

    # Soccer (MLS/USMNT/USWNT)
    "Christian Pulisic", "Weston McKennie", "Tyler Adams", "Gio Reyna", "Matt Turner", 
    "Brenden Aaronson", "Tim Weah", "Walker Zimmerman", "Antonee Robinson", "Sergiño Dest", 
    "Landon Donovan", "Clint Dempsey", "Alexi Lalas", "Jozy Altidore", "Brad Guzan", 
    "Megan Rapinoe", "Alex Morgan", "Carli Lloyd", "Abby Wambach", "Tobin Heath", 
    "Crystal Dunn", "Julie Ertz", "Kelley O'Hara", "Mallory Swanson", "Sophia Smith", 
    "Rose Lavelle", "Becky Sauerbrunn", "Lindsey Horan", "Ashlyn Harris", 
    "Christen Press", "Hope Solo",

    # Tennis
    "Serena Williams", "Venus Williams", "Coco Gauff", "Sloane Stephens", "Jessica Pegula", 
    "Madison Keys", "Danielle Collins", "Jennifer Brady", "Taylor Townsend", "Alison Riske", 
    "Sofia Kenin", "Frances Tiafoe", "Taylor Fritz", "John Isner", "Jack Sock", 
    "Reilly Opelka", "Tommy Paul", "Sebastian Korda", "Ben Shelton", "Brandon Nakashima", 
    "Michael Chang", "Pete Sampras", "Andre Agassi", "Andy Roddick", 

    # Golf
    "Tiger Woods", "Phil Mickelson", "Jordan Spieth", "Brooks Koepka", "Bryson DeChambeau", 
    "Dustin Johnson", "Justin Thomas", "Xander Schauffele", "Collin Morikawa", "Patrick Cantlay", 
    "Scottie Scheffler", "Rickie Fowler", "Matt Kuchar", "Tony Finau", "Zach Johnson", 
    "Lexi Thompson", "Nelly Korda", "Jessica Korda", "Michelle Wie", "Stacy Lewis", 
    "Danielle Kang", "Paula Creamer", "Natalie Gulbis", 

    # Track and Field
    "Allyson Felix", "Carl Lewis", "Florence Griffith Joyner", "Usain Bolt (Caribbean Legend)", 
    "Sydney McLaughlin", "Noah Lyles", "Michael Norman", "Dalilah Muhammad", "Sha'Carri Richardson", 
    "Athing Mu", "Donavan Brazier", "Fred Kerley", "Erriyon Knighton", "Trayvon Bromell", 
    "Christian Coleman", "Grant Holloway", "DeAnna Price", "Emma Coburn", "Kara Winger", 

    # Swimming
    "Michael Phelps", "Katie Ledecky", "Caeleb Dressel", "Ryan Lochte", "Missy Franklin", 
    "Simone Manuel", "Nathan Adrian", "Lilly King", "Regan Smith", "Chase Kalisz", 
    "Maggie Steffens", "Abbey Weitzeil", 

    # Boxing and MMA
    "Floyd Mayweather Jr.", "Mike Tyson", "Deontay Wilder", "Terence Crawford", 
    "Errol Spence Jr.", "Gervonta Davis", "Shakur Stevenson", "Claressa Shields", 
    "Holly Holm", "Ronda Rousey", "Conor McGregor", "Dustin Poirier", "Israel Adesanya", 
    "Jon Jones", "Kamaru Usman", "Amanda Nunes", "Julianna Peña", 

    # Baseball (MLB)
    "Mike Trout", "Shohei Ohtani", "Aaron Judge", "Bryce Harper", "Mookie Betts", 
    "Jacob deGrom", "Clayton Kershaw", "Juan Soto", "Ronald Acuña Jr.", "Trea Turner", 
    "Nolan Arenado", "Freddie Freeman", "Max Scherzer", "Justin Verlander", 
    "Gerrit Cole", "Francisco Lindor", "Corey Seager", "Manny Machado", "Yadier Molina", 

    # Hockey (NHL)
    "Wayne Gretzky", "Sidney Crosby", "Alex Ovechkin", "Connor McDavid", 
    "Nathan MacKinnon", "Auston Matthews", "Patrick Kane", "Jonathan Toews", 
    "Carey Price", "Igor Shesterkin", "Cale Makar", "Steven Stamkos", 
    "Victor Hedman", "Leon Draisaitl", "Jack Eichel", "Matthew Tkachuk", 

    # Gymnastics
    "Simone Biles", "Gabby Douglas", "Aly Raisman", "Jade Carey", 
    "Sunisa Lee", "Laurie Hernandez", "McKayla Maroney", "Katelyn Ohashi", 
    "Nastia Liukin", "Shawn Johnson", 

    # Other Sports
    "Chloe Kim (Snowboarding)", "Shaun White (Snowboarding)", "Lindsey Vonn (Skiing)", 
    "Mikaela Shiffrin (Skiing)", "Nathan Chen (Figure Skating)", "Evan Lysacek (Figure Skating)", 
    "Sydney McLaughlin (Track)", "Katie Zaferes (Triathlon)", "Maggie Nichols (Gymnastics)", 
    "April Ross (Beach Volleyball)", "Kerri Walsh Jennings (Beach Volleyball)",

    # Lifestyle and Fashion Influencers
    "Emma Chamberlain", "Chiara Ferragni", "Camila Coelho", "Negin Mirsalehi", "Julie Sariñana", 
    "Danielle Bernstein", "Aimee Song", "Lauren Conrad", "Nicole Guerriero", "Chriselle Lim", 
    "Raven Elyse", "Amber Fillerup Clark", "Jenn Im", "Caitlin Covington", "Tess Christine", 
    "Sivan Ayla", "Arielle Charnas", "Julia Engel", "Kelsey Simone", "Jackie Aina", 
    "Marianna Hewitt", "Rachel Parcell", "Jacey Duprie", "Brittany Xavier", "Claire Marshall", 
    "Tezza Barton", "Sarah Ashcroft", "Alisha Marie", "Remi Cruz", "Ashley Brooke",

    # Beauty Influencers
    "James Charles", "Nikkie de Jager (NikkieTutorials)", "Huda Kattan", "Manny MUA", 
    "Jeffree Star", "Patrick Starrr", "Jackie Aina", "Tati Westbrook", "Kathleen Lights", 
    "Carli Bybel", "Bretman Rock", "Desi Perkins", "Iluvsarahii", "Christen Dominique", 
    "RawBeautyKristi", "Jkissa", "RCL Beauty101", "Chloe Morello", "Sophia Esperanza", 
    "Michelle Phan", "Cindy Kimberly", "Abby Roberts", "Eleanor Barnes (Snitchery)", 
    "Melissa Alatorre", "Aliss Bonython", "Amrezy", "Kaitlyn Bristowe", "Pony Syndrome", 
    "Leah Halton", "Soph Does Nails", "Nabela Noor",

    # Fitness Influencers
    "Chloe Ting", "Whitney Simmons", "Cassey Ho (Blogilates)", "Joe Wicks", 
    "Kayla Itsines", "Pamela Reif", "Emily Skye", "Kelsey Wells", "Brittany Dawn", 
    "Massy Arias", "Nikki Blackketter", "Sami Clarke", "Cydney Gillon", "Courtney King", 
    "Jeff Nippard", "Chris Heria", "Amanda Bisk", "Anna Victoria", "Rachel Brathen (Yoga Girl)", 
    "Stefanie Cohen", "Steve Cook", "Alex Toussaint", "Robin Arzón", "Ally Love", 
    "Tunde Oyeneyin", "Remi Ishizuka", "Lauren Simpson", "Hannah Bower", "Ashleigh Jordan",

    # Food Influencers
    "Gordon Ramsay", "Molly Yeh", "Rosanna Pansino", "Andrew Rea (Binging with Babish)", 
    "Tasty", "Half Baked Harvest (Tieghan Gerard)", "Joshua Weissman", "Delish", 
    "The Pioneer Woman (Ree Drummond)", "Yumna Jawad (Feel Good Foodie)", "Munchies", 
    "Claire Saffitz", "Erwan Heussaff", "Laura Vitale", "Rachael Ray", "Sohla El-Waylly", 
    "Damn Delicious (Chungah Rhee)", "Eitan Bernath", "The Woks of Life", "Binging with Babish", 
    "Preppy Kitchen (John Kanell)", "Honeysuckle (Dzung Lewis)", "Brothers Green Eats", 
    "Minimalist Baker (Dana Shultz)", "Tasty", "My Healthy Dish", "Skinnytaste (Gina Homolka)", 
    "Jessica in the Kitchen", "Deliciously Ella", "Jocelyn Delk Adams (Grandbaby Cakes)", 
    "Fit Men Cook (Kevin Curry)",

    # Tech Influencers
    "Marques Brownlee (MKBHD)", "Linus Sebastian (Linus Tech Tips)", "Austin Evans", 
    "Unbox Therapy (Lewis George Hilsenteger)", "Dave Lee (Dave2D)", "iJustine", 
    "Jonathan Morrison (TLD)", "Sara Dietschy", "Marques Brownlee", "TechMeOut", 
    "TechnoBuffalo", "Linus Tech Tips", "Mrwhosetheboss", "Casey Neistat", "Justine Ezarik", 
    "JerryRigEverything", "Dave Lee", "Joanna Stern", "Michael Fisher (MrMobile)", 
    "Jon Rettinger", "DetroitBORG", "Ali Abdaal", "Andru Edwards", "Snazzy Labs", 
    "TechLinked", "Brandon Butch", "Kevin Stratvert", "Supersaf", "UACrew", "Rene Ritchie",

    # Comedy Influencers
    "Lilly Singh", "Lele Pons", "King Bach", "Amanda Cerny", "Rudy Mancuso", 
    "David Dobrik", "Logan Paul", "Jake Paul", "Anwar Jibawi", "Zach King", 
    "Gabriel Iglesias", "Kevin Hart", "Chris D'Elia", "Bo Burnham", "Trevor Wallace", 
    "Sarah Cooper", "Shane Dawson", "Ryan Higa (Nigahiga)", "Ian Hecox (Smosh)", 
    "Jenna Marbles", "Liza Koshy", "Markiplier", "PewDiePie", "MatPat (Game Theory)", 
    "Vsauce", "Danny Gonzalez", "Drew Gooden", "Cody Ko", "Noel Miller", 
    "Kurtis Conner", "Nikita Dragun",

    # TikTok Stars
    "Charli D'Amelio", "Addison Rae", "Bella Poarch", "Loren Gray", "Dixie D'Amelio", 
    "Bryce Hall", "Avani Gregg", "Noah Beck", "Josh Richards", "Nikita Dragun", 
    "Michael Le (JustMaiko)", "Zoe LaVerne", "Griffin Johnson", "Vinnie Hacker", 
    "Chase Hudson (Lil Huddy)", "Nessa Barrett", "Anwar Jibawi", "Madi Monroe", 
    "Anna Shumate", "Quinton Griggs", "Spencer X", "Khaby Lame", "Nick Austin", 
    "Ryland Storms", "Brooklyn and Bailey", "Montana Tucker", "Tayler Holder", 
    "Brent Rivera", "Lexi Rivera", "Pierson Wodzynski", 

    # YouTube Stars
    "MrBeast", "Dude Perfect", "Markiplier", "Nikita Dragun", "Emma Chamberlain", 
    "Casey Neistat", "Logan Paul", "Jake Paul", "David Dobrik", "Shane Dawson", 
    "Lilly Singh", "Smosh", "PewDiePie", "Zoella", "NikkieTutorials", 
    "James Charles", "Jenna Marbles", "Ryan Higa", "Tana Mongeau", "Colleen Ballinger", 
    "Liza Koshy", "Tyler Oakley", "Superwoman", "Anthony Padilla", "H3H3 Productions", 
    "Philip DeFranco", "Vsauce", "Gabbie Hanna", "Manny MUA", "Bretman Rock",

    # Miscellaneous
    "Trevor Noah", "Bretman Rock", "Huda Kattan", "Casey Neistat", "JoJo Siwa", 
    "Amanda Steele", "PewDiePie", "Emma Chamberlain", "Marques Brownlee", "Jeffree Star",

    # Science and Technology
    "What is quantum physics", "How do black holes form", "What is the theory of relativity", 
    "How does Wi-Fi work", "What is artificial intelligence", "What is machine learning", 
    "How does 5G technology work", "What is the Internet of Things", "What is blockchain technology", 
    "How does cryptocurrency work", "What is virtual reality", "What is augmented reality", 
    "How do electric cars work", "What is renewable energy", "What is nuclear fusion", 
    "How do solar panels work", "What is genetic engineering", "What is CRISPR", 
    "How does GPS work", "What is cloud computing", "What is a quantum computer", 
    "How do vaccines work", "What is herd immunity", "What is the immune system", 
    "How does DNA work", "What are stem cells", "What causes earthquakes", 
    "How do volcanoes form", "What is plate tectonics", "How does the water cycle work", 
    "What is global warming", "What are greenhouse gases", "How do hurricanes form", 
    "What is the Big Bang theory", "How does gravity work", "What are exoplanets", 
    "What is dark matter", "What is dark energy", "What are gravitational waves",

    # History and Geography
    "What caused World War I", "What caused World War II", "Who discovered America", 
    "What is the history of the Roman Empire", "Who were the founding fathers", 
    "What caused the Great Depression", "What was the Cold War", "What is the Industrial Revolution", 
    "What is the Renaissance", "What is the history of the United States", 
    "What caused the Civil War", "What is the history of slavery", "What is the history of democracy", 
    "What are the Seven Wonders of the World", "What is the tallest mountain", 
    "What is the longest river", "What is the largest desert", "What are the continents", 
    "What is the smallest country", "What is the largest ocean", "What are the major oceans", 
    "What are the Great Lakes", "What is the history of Europe", "What is the history of Asia", 
    "What is the history of Africa", "What is the history of South America", 
    "What is the history of Australia", "What is the history of Antarctica", 
    "What is the history of space exploration", "What is the history of the moon landing",

    # Health and Medicine
    "What are the symptoms of diabetes", "What are the symptoms of heart disease", 
    "What is depression", "What are the symptoms of anxiety", "What are the symptoms of ADHD", 
    "What are the benefits of exercise", "What is a balanced diet", "What is cholesterol", 
    "What are the effects of smoking", "What are the effects of alcohol", 
    "What is mental health", "What is physical health", "What is holistic health", 
    "What is mindfulness", "What is meditation", "What is yoga", 
    "What are the benefits of sleep", "What causes insomnia", "What are the stages of sleep", 
    "What are the benefits of drinking water", "What is intermittent fasting", 
    "What are the symptoms of COVID-19", "What are the benefits of vaccines", 
    "What is the difference between a virus and bacteria", "What is the common cold", 
    "What is the flu", "What is cancer", "What is chemotherapy", 
    "What are the symptoms of stroke", "What causes high blood pressure", 
    "What is the treatment for migraines", "What is a food allergy",

    # General Knowledge and Curiosities
    "What is the meaning of life", "What is philosophy", "What is ethics", 
    "What is morality", "What are the laws of physics", "What are the principles of mathematics", 
    "What is the history of language", "What is the history of writing", 
    "What is the history of art", "What is the history of music", 
    "What are the major religions", "What is Christianity", "What is Islam", 
    "What is Buddhism", "What is Hinduism", "What is Judaism", 
    "What is atheism", "What is agnosticism", "What is astrology", 
    "What is astronomy", "What is biology", "What is chemistry", 
    "What is physics", "What is geology", "What is meteorology", 
    "What is oceanography", "What is paleontology", "What is archaeology", 
    "What is anthropology", "What is sociology", "What is psychology", 
    "What is economics", "What is political science", "What is law", 
    "What is business", "What is marketing", "What is management",

    # Technology and Innovation
    "What is social media", "What is Facebook", "What is Instagram", 
    "What is Twitter", "What is TikTok", "What is YouTube", 
    "What is a smartphone", "What is an operating system", 
    "What is a computer virus", "What is cybersecurity", "What is artificial intelligence", 
    "What is a chatbot", "What is a search engine", "What is e-commerce", 
    "What is online banking", "What is mobile payment", "What is a cryptocurrency wallet", 
    "What is a self-driving car", "What is a smart home", "What is a 3D printer", 
    "What is a drone", "What is a robot", "What is a space telescope", 
    "What is a satellite", "What is the International Space Station", 
    "What is Mars exploration", "What is a lunar rover",

    # Arts and Culture
    "What is classical music", "What is jazz", "What is rock music", 
    "What is hip hop", "What is pop music", "What is country music", 
    "What is electronic music", "What is reggae", "What is blues", 
    "What is opera", "What is ballet", "What is contemporary dance", 
    "What is theater", "What is literature", "What are the major literary genres", 
    "What is poetry", "What is prose", "What is drama", "What is a novel", 
    "What is a short story", "What is a play", "What is a musical", 
    "What are the major art movements", "What is surrealism", "What is cubism", 
    "What is impressionism", "What is modernism", "What is postmodernism",

    # Nature and Environment
    "What are ecosystems", "What are biomes", "What is a rainforest", 
    "What is a desert", "What is a tundra", "What is an ocean", 
    "What is a coral reef", "What is a mountain range", "What is a river basin", 
    "What are wetlands", "What is biodiversity", "What are endangered species", 
    "What is conservation", "What is deforestation", "What is reforestation", 
    "What are national parks", "What is the greenhouse effect", 
    "What is climate change", "What is global warming", "What is sustainable development",

    # Common nouns
    "city", "country", "river", "mountain", "lake", "ocean",
    "history", "population", "language", "philosophy", "justice", 
    "idea", "belief", "thought", "sky", "star", "volcano", 
    "tree", "animal", "bird", "book", "clock", "computer", 
    "school", "student", "teacher", "scientist",   

    # general medical terms
    "symptoms", "diagnosis", "treatment", "prevention", "prognosis",
    "risk factors", "complications", "chronic", "acute", "infection",
    "inflammation", "allergy", "immunity", "vaccine", "side effects",
    "therapy", "surgery", "rehabilitation", "recovery", "anesthesia",
    "specialist", "primary care", "emergency", "ICU", "outpatient",
    "inpatient", "referral", "second opinion", "medical history",
    "prescription", "over-the-counter", "placebo", "clinical trial",
    "medical imaging", "MRI", "CT scan", "ultrasound", "x-ray",

    # common symptoms
    "fever", "cough", "cold", "headache", "nausea", "vomiting",
    "diarrhea", "dizziness", "fatigue", "chills", "rash",
    "pain", "swelling", "shortness of breath", "chest pain",
    "itching", "sore throat", "joint pain", "muscle pain", "cramps",
    "loss of taste", "loss of smell", "blurred vision", "hearing loss",
    "bleeding", "weakness", "tingling", "numbness", "weight loss",
    "weight gain", "insomnia", "night sweats", "abdominal pain",
    "constipation", "heartburn", "bloating", "confusion",

    # common conditions
    "diabetes", "hypertension", "asthma", "arthritis", "allergies",
    "obesity", "depression", "anxiety", "migraine", "epilepsy",
    "insomnia", "anemia", "thyroid disorder", "acid reflux",
    "COPD", "osteoporosis", "gout", "eczema", "psoriasis",
    "UTI", "high cholesterol", "vitamin deficiency", "heart disease",
    "stroke", "heart attack", "liver disease", "kidney stones",
    "gallstones", "irritable bowel syndrome (IBS)", "GERD",
    "celiac disease", "autoimmune disease", "sepsis", "seizures",
    "pneumonia", "bronchitis", "sinusitis", "chronic pain",

    # infectious diseases
    "COVID-19", "flu", "common cold", "HIV/AIDS", "tuberculosis",
    "hepatitis A", "hepatitis B", "hepatitis C", "dengue fever",
    "malaria", "measles", "mumps", "rubella", "chickenpox",
    "shingles", "herpes", "HPV", "mononucleosis", "strep throat",
    "E. coli", "salmonella", "Zika virus", "Ebola virus",
    "RSV (respiratory syncytial virus)", "Lyme disease",
    "rabies", "H1N1", "norovirus", "rotavirus", "pertussis (whooping cough)",

    # chronic diseases
    "diabetes", "hypertension", "chronic kidney disease", "COPD",
    "heart failure", "rheumatoid arthritis", "Parkinson's disease",
    "Alzheimer's disease", "multiple sclerosis", "Crohn's disease",
    "ulcerative colitis", "chronic fatigue syndrome", "fibromyalgia",
    "chronic migraines", "cystic fibrosis", "sickle cell anemia",

    # common cancers
    "breast cancer", "lung cancer", "prostate cancer", "colorectal cancer",
    "skin cancer", "melanoma", "leukemia", "lymphoma", "brain cancer",
    "pancreatic cancer", "ovarian cancer", "cervical cancer",
    "testicular cancer", "thyroid cancer", "liver cancer", "esophageal cancer",
    "stomach cancer", "bone cancer", "sarcoma", "oral cancer",

    # mental health conditions
    "depression", "anxiety", "bipolar disorder", "schizophrenia",
    "OCD (obsessive-compulsive disorder)", "PTSD (post-traumatic stress disorder)",
    "ADHD", "autism spectrum disorder", "eating disorders", "anorexia nervosa",
    "bulimia nervosa", "binge eating disorder", "panic disorder",
    "social anxiety disorder", "phobias", "dissociative identity disorder",
    "borderline personality disorder",

    # rare diseases
    "ALS (amyotrophic lateral sclerosis)", "Huntington's disease",
    "Lupus", "scleroderma", "Marfan syndrome", "Ehlers-Danlos syndrome",
    "Tay-Sachs disease", "Gaucher disease", "Fabry disease",
    "Duchenne muscular dystrophy", "myasthenia gravis", "Prader-Willi syndrome",
    "Angelman syndrome", "Charcot-Marie-Tooth disease",

    # women's health
    "menstruation", "pregnancy", "fertility", "menopause", "PCOS (polycystic ovary syndrome)",
    "endometriosis", "gestational diabetes", "pre-eclampsia", "miscarriage",
    "postpartum depression", "breastfeeding", "fibroids", "ovarian cysts",
    "cervical dysplasia", "HPV infection", "pelvic inflammatory disease (PID)",

    # childhood illnesses
    "chickenpox", "measles", "mumps", "rubella", "RSV (respiratory syncytial virus)",
    "whooping cough", "hand-foot-and-mouth disease", "ear infections",
    "scarlet fever", "croup", "strep throat", "fifth disease",
    "Kawasaki disease", "bronchiolitis", "teething issues",

    # emergency conditions
    "heart attack", "stroke", "sepsis", "anaphylaxis", "asthma attack",
    "heat stroke", "hypothermia", "poisoning", "drowning", "seizures",
    "burns", "fractures", "traumatic brain injury", "cardiac arrest",
    "shock", "severe dehydration",
    
    # general technology terms
    "artificial intelligence", "machine learning", "cloud computing", 
    "blockchain", "cryptocurrency", "virtual reality", "augmented reality", 
    "Internet of Things", "big data", "cybersecurity", "data privacy", 
    "5G technology", "VPN", "cloud storage", "edge computing", 
    "DevOps", "microservices", "API", "operating system", "open source",
    "data science", "automation", "autonomous vehicles", "quantum computing",
    "deep learning", "software as a service (SaaS)",
    
    # software applications
    "Microsoft Office", "Zoom", "Slack", "Google Drive", 
    "Adobe Photoshop", "Figma", "Canva", "AutoCAD", "Final Cut Pro",
    "QuickBooks", "Notion", "Trello", "Asana", "Spotify", 
    "Netflix", "YouTube", "TikTok", "Instagram", "Facebook", 
    "Twitter", "Snapchat", "LinkedIn", "WhatsApp", "Discord",
    "VS Code", "IntelliJ IDEA", "Eclipse", "Microsoft Teams", 
    "iMovie", "GarageBand", "GIMP",

    # programming terms
    "Python", "JavaScript", "Java", "C++", "C#", "Swift", 
    "Ruby", "Kotlin", "Rust", "PHP", "SQL", "HTML", "CSS", 
    "React", "Angular", "Vue.js", "Node.js", "Django", 
    "Flask", "TensorFlow", "PyTorch", "Docker", "Kubernetes", 
    "AWS", "Azure", "Google Cloud", "GitHub", "GitLab", 
    "API development", "REST API", "GraphQL", "CI/CD pipelines",

    # cyber security terms
    "password manager", "firewall", "antivirus software", 
    "multi-factor authentication", "ransomware", "phishing attacks", 
    "data breach", "malware", "spyware", "DDoS attack", 
    "encryption", "endpoint security", "zero trust architecture",
    "dark web", "ethical hacking", "cybersecurity certifications", 
    "identity theft", "dark web monitoring", "social engineering", 
    "VPN services", "privacy laws", "GDPR", "CCPA",

    # trends and innovations
    "ChatGPT", "Generative AI", "Tesla autopilot", 
    "Metaverse", "NFTs", "self-driving cars", 
    "electric vehicles", "AR glasses", "robotics", 
    "3D printing", "smart cities", "renewable energy tech", 
    "biometric authentication", "wearable tech", 
    "voice recognition", "personalized medicine technology", 
    "quantum computing breakthroughs", "fusion energy", 
    "cloud gaming", "AI ethics", "neural networks",

    # gaming and entertainment
    "Fortnite", "Minecraft", "Call of Duty", "Valorant", 
    "League of Legends", "Elden Ring", "Cyberpunk 2077", 
    "Overwatch", "Roblox", "Among Us", "GTA V", 
    "The Legend of Zelda", "Animal Crossing", "FIFA", "Madden NFL", 
    "eSports", "streaming platforms", "Twitch", "YouTube Gaming", 
    "game development tools", "Unreal Engine", "Unity", 
    "VR gaming", "Steam", "Epic Games Store",

    # cloud and data
    "AWS", "Google Cloud", "Microsoft Azure", 
    "cloud storage", "cloud migration", "data lake", 
    "data warehouse", "data analytics", "machine learning models", 
    "serverless architecture", "edge computing", 
    "data visualization tools", "Tableau", "Power BI", 
    "BigQuery", "Snowflake", "data pipeline", 
    "real-time data processing", "ETL process",

    # tech giants
    "Apple", "Google", "Amazon", "Microsoft", "Meta", 
    "Tesla", "Netflix", "Nvidia", "Intel", "AMD", 
    "SpaceX", "Blue Origin", "Samsung", "Sony", "Dell", 
    "HP", "IBM", "Cisco", "Salesforce", "Oracle",
    
    # science and space
    "NASA", "SpaceX", "astronomy", "astrophysics", "black holes", 
    "quantum physics", "the Big Bang theory", "Mars exploration", 
    "moon landing", "James Webb Telescope", "Hubble Telescope", 
    "gravitational waves", "dark matter", "dark energy", 
    "International Space Station", "space tourism", "solar system", 
    "exoplanets", "Milky Way galaxy", "cosmic microwave background", 
    "physics laws", "DNA structure", "CRISPR", "genetic engineering", 
    "stem cells", "evolution", "paleontology", "geology", 
    "oceanography", "meteorology", "climate change", "global warming",

    # history and events
    "American Revolution", "Civil War", "World War I", 
    "World War II", "Cold War", "Great Depression", 
    "Industrial Revolution", "9/11 attacks", "Pearl Harbor", 
    "Civil Rights Movement", "Boston Tea Party", "the New Deal", 
    "Vietnam War", "Korean War", "Apollo 11", "Watergate scandal", 
    "Trail of Tears", "Women’s suffrage movement", 
    "Prohibition era", "Manhattan Project", "Space Race", 
    "Gold Rush", "founding of America", "Declaration of Independence", 
    "Emancipation Proclamation", "Louisiana Purchase", 
    "March on Washington", "Famous assassinations (JFK, MLK Jr.)",

    # education and academics
    "SAT preparation", "ACT study guides", "AP exams", 
    "college application process", "scholarship opportunities", 
    "student loans", "FAFSA", "Ivy League universities", 
    "community colleges", "trade schools", "STEM education", 
    "liberal arts programs", "online degrees", "vocational training", 
    "education technology", "early childhood education", 
    "special education programs", "school rankings", 
    "teacher certifications", "homeschooling", 
    "MOOCs (Massive Open Online Courses)", "edtech platforms", 
    "study abroad programs", "education reforms", 
    "standardized testing", "extracurricular activities",

    # finance and economy
    "stock market", "cryptocurrency", "Bitcoin", "Ethereum", 
    "personal finance", "budgeting", "retirement planning", 
    "401(k)", "Roth IRA", "investment strategies", 
    "mutual funds", "ETFs", "real estate market", 
    "credit scores", "credit cards", "debt consolidation", 
    "student loans", "mortgages", "auto loans", 
    "inflation", "recession", "unemployment rates", 
    "economic policies", "Federal Reserve", "GDP", 
    "consumer spending", "tax filing", "tax deductions", 
    "small business loans", "venture capital", 
    "financial literacy resources",

    # # health & fitness
    # "nutrition", "exercise routines", "cardio workouts", 
    # "strength training", "yoga", "pilates", "HIIT workouts", 
    # "mental health", "stress management", "mindfulness", 
    # "sleep hygiene", "weight loss programs", "diet plans", 
    # "vegan diet", "keto diet", "intermittent fasting", 
    # "hydration tips", "health apps", "calorie tracking", 
    # "fitness trackers", "step challenges", "home gym equipment", 
    # "physical therapy", "posture correction", "rehabilitation exercises",

    # law, policies and legal
    "constitutional rights", "Bill of Rights", "Supreme Court cases", 
    "immigration laws", "tax laws", "labor laws", 
    "intellectual property", "copyright laws", "patent filing", 
    "civil rights", "criminal justice system", "gun control laws", 
    "marriage laws", "environmental laws", "housing policies", 
    "healthcare policies", "education reforms", "voting rights", 
    "prison reform", "whistleblower protections", "consumer rights",

    # social issues
    "gender equality", "racial equality", "LGBTQ+ rights", 
    "climate justice", "income inequality", "mental health awareness", 
    "poverty alleviation", "access to education", "homelessness", 
    "human trafficking", "substance abuse", "voter suppression", 
    "child welfare", "domestic violence", "animal rights", 
    "cyberbullying", "freedom of speech", "healthcare access", 
    "workplace harassment", "data privacy concerns",

    
    
]

# Discard very short entries: only items whose length exceeds 4 characters are
# kept. The result is rebound to the same name, and re-running is a no-op.
information_examples_partial = list(
    filter(lambda example: len(example) > 4, information_examples_partial)
)

# Hand-written seed phrases for local-business style searches, followed by every
# entry of yelp_keywords_data (defined elsewhere in the notebook).
# Element order is significant only in that it is preserved exactly.
yelp_examples_partial = [
    # short search phrases
    "best pizza", "top restaurant", "gym near",
    "spa open", "mexican food", "coffee shop",
    "laundry near", "good places to eat", "sushi near",
    "hair salon", "nearby cafes", "local barbers",
    "laundry nearby", "hotel deals", "gym membership",
    "top dining spots", "top nightlife", "bakery nearby",
    "famous bars", "nearest grocery", "parking nearby",
    "family restaurants", "dog parks", "organic shops",
    "delis open", "live music bars", "seafood places",
    "bbq joints", "vegan options", "pet stores",
    "hardware stores", "movie theaters", "car wash",
    "home improvement", "paint stores", "dance studios",
    "music shops", "wine stores", "health stores",
    "barbecue spots", "dim sum places", "italian restaurants",
    "beach resorts", "karaoke bars", "juice bars",
    "top takeout", "electricians near", "plumbers",
    "rooftop bars",
    # bare single-word nouns (places and professions)
    "restaurant", "hotel", "mall", "theater", "park",
    "market", "museum", "hospital", "office", "factory",
    "dentist", "doctor", "chef",
] + yelp_keywords_data
    
    
# Seed phrases and keywords for weather-related queries.
# Presumably these feed the weather_intent labelling step -- confirm against the
# cell that consumes this list.
#
# Every entry is unique: a second "temperature" that used to sit among the
# single-word terms at the bottom was a duplicate of the one in the first row
# and has been removed. Order of first occurrence is otherwise unchanged.
#
# NOTE(review): "today’s temperature" uses a typographic apostrophe (U+2019),
# unlike "today's climate" / "today's rain probability" -- confirm it is intended.
# NOTE(review): several generic nouns at the bottom ("sky", "star", "ocean",
# "river", "lake", "volcano") also appear under "# Common nouns" in the list
# literal that precedes this cell -- confirm the overlap is intended.
weather_examples_partial = [
    # multi-word weather phrases
    "weather tomorrow", "rain in", "temperature", "forecast for", "sunrise time",
    "storm warning", "rainy season", "hurricane update", "snow tomorrow",
    "today's climate", "UV index", "coldest day", "wind conditions",
    "humidity level", "current weather", "today’s temperature",
    "hourly forecast", "weather updates", "snowfall predictions", "wind speed",
    "high temperature", "freezing temperatures", "sunny days", "storm chances",
    "next week forecast", "weather map", "air quality", "heatwave warnings",
    "drought warnings", "fog advisory", "visibility levels", "temperature fluctuations",
    "sunset times", "tornado warning", "heat index", "lightning storms",
    "tropical storm", "hail prediction", "UV forecast", "rainfall accumulation",
    "barometric pressure", "seasonal forecast", "dew point", "morning mist",
    "typhoon forecast", "cyclone warnings", "climate trends", "polar vortex",
    "weekly weather outlook", "temperature drop", "clear skies forecast", "storm tracker",
    "today's rain probability", "overnight temperatures", "winter forecast", "heat advisory",
    "real feel temperature", "weather near me", "sunny spells", "cloud cover", "pollen count",
    "flood warnings", "weather tomorrow morning", "weather alert", "rain chance this evening",
    "cold front update", "frost warnings", "spring forecast", "evening temperature",
    "next 10 days weather", "weather radar", "weather history", "long-range forecast",
    "weekly temperature highs", "low visibility", "wind advisory", "morning frost",
    "real-time weather", "humidity forecast", "ice storm warning", "rain forecast today",
    "temperature variation", "fall forecast", "hourly rain chances", "summer heat predictions",
    "wind chill factor", "sunrise and sunset times", "gale warnings", "arctic blast",
    "light rain or showers", "severe weather alerts", "regional forecast", "tornado risk",
    "wind gust forecast", "chilly mornings", "monsoon season update", "muggy conditions",
    "foggy days", "weather conditions", "storm risk", "current climate conditions",
    "precipitation levels", "UV alert", "heatwave duration", "snow accumulation", "cold wave warning",
    "snow depth", "drizzle forecast", "evening showers", "freezing rain warning", "hot and humid",
    "dry spell", "local weather news", "snow squall warning", "storm outlook", "weather watch",
    "sunshine hours", "weather patterns", "damp conditions", "extreme heat forecast",
    "temperature records", "record high temperatures", "unseasonably warm", "wind direction",
    "monthly climate outlook", "extreme cold forecast", "falling temperatures", "temperature swings",
    "cyclone path", "current weather radar", "gusty winds", "cold temperatures tonight", "weather now",
    "latest snow forecast", "frost formation", "air quality levels", "seasonal weather trends",
    "afternoon thunderstorms", "nighttime temperatures", "freezing point", "global warming impact",
    "hail forecast", "humidity today", "wildfire weather conditions", "barometric trend", "snowfall totals",
    # single-word generic nature / season terms
    "rain", "wind", "sky", "cloud", "sun", "moon", "star",
    "ocean", "river", "lake", "sand", "volcano", "forest",
    "desert", "storm", "hurricane", "snow",
    "climate", "season", "spring", "summer", "autumn", "winter",
]

navigation_examples_partial = [
    "login to my bank account", "open Facebook", "Amazon sign in", "Twitter homepage", 
    "navigate to YouTube", "check Gmail login", "open Instagram", "eBay account login",
    "find Netflix homepage", "sign in to LinkedIn", "Pinterest account", 
    "Reddit homepage", "Spotify login page", "access Google Drive", "open Zoom meeting", 
    "Yahoo Mail login", "Hotmail account access", "Slack workspace sign in", 
    "open Microsoft Teams", "navigate to Dropbox", "sign on to Salesforce", 
    "WordPress admin login", "WhatsApp Web access", "navigate to Hulu", "Apple ID sign in", 
    "sign into PayPal", "open Skype", "open Trello board", "find Evernote account", 
    "Quora homepage", "sign in to Snapchat", "access to Reddit inbox", 
    "sign into iCloud", "sign on to Asana", "Notion account access", "navigate to Medium", 
    "Uber Eats sign in", "Grubhub login page", "Google Analytics login", 
    "open Shopify store", "navigate to Etsy seller account", "Figma login", 
    "open Venmo", "Twitch homepage", "access Outlook", "open Steam account", 
    "navigate to Amazon Prime", "sign into Dropbox Paper", "Canvas student login", 
    "sign on to Coursera", "Pluralsight login page", "open Basecamp", "open GitHub", 
    "access my T-Mobile account", "find Verizon login", "navigate to AT&T website", 
    "Walmart homepage", "open Best Buy", "sign into Chegg", "navigate to Khan Academy", 
    "find Zillow homepage", "Redfin sign in", "Spotify Web access", 
    "navigate to Robinhood", "Coinbase login", "Crypto.com sign on", "Etsy login", 
    "find Airbnb login page", "navigate to Discord", "access Slack messages", 
    "open Google Photos", "navigate to iTunes store", "Yelp homepage", "Craigslist login", 
    "Home Depot account", "open Lowe's account", "sign in to Target", 
    "access Dropbox Business", "find Udemy login", "Skillshare homepage", 
    "Medium sign on", "navigate to Tumblr", "sign into TikTok", "GitLab login page", 
    "open Binance account", "sign in to Adobe", "open Shopify admin", 
    "navigate to Bank of America", "Chase bank login", "Wells Fargo online banking", 
    "access Capital One account", "sign in to Square", "Google My Business login", 
    "navigate to Fidelity", "sign into Vanguard", "sign on to TD Ameritrade", 
    "open E*TRADE account", "American Express login", "Bank of America sign in", 
    "H&R Block login", "TurboTax homepage", "QuickBooks sign in", "Dropbox team account", 
    "Yahoo homepage", "open DuckDuckGo", "Wikipedia main page", "navigate to IMDb", 
    "open BBC News", "CNN live access", "ESPN homepage", "navigate to WebMD", 
    "access LinkedIn Learning", "open CNN Business", "Google Calendar login", 
    "find Google News", "navigate to Reddit homepage",
    "Amazon customer support", "Netflix help center", "contact PayPal support", 
    "Spotify FAQs", "manage subscription on YouTube", "update profile settings on Facebook", 
    "change payment method on Etsy", "privacy settings on Instagram", 
    "explore new movies on Netflix", "latest tech deals on Amazon", 
    "bestselling books on Kindle", "shop new arrivals on Zara", 
    "track my order on Amazon", "eBay order status", "Uber Eats delivery status", 
    "check FedEx shipment", "book tickets on Eventbrite", "movie showtimes on AMC", 
    "upcoming events on Meetup", "openTable reservations", "live TV on Hulu", 
    "watchlist on Disney+", "browse podcasts on Spotify", "explore documentaries on Prime Video", 
    "my courses on Coursera", "learn Python on Udemy", "training portal on LinkedIn Learning", 
    "online classes on Khan Academy", "shared documents on Google Drive", 
    "files on OneDrive", "Dropbox shared folders", "recent uploads on Box", 
    "view portfolio on Fidelity", "bill payment on PayPal", "credit score on Credit Karma", 
    "investment dashboard on Robinhood", "browse topics on Reddit", 
    "community forum on Stack Overflow", "support group on Facebook Groups", 
    "Q&A on Quora", "manage trip on Expedia", "flight details on Delta", 
    "vacation rentals on Airbnb", "car rental on Hertz", "find doctors on Zocdoc", 
    "health articles on WebMD", "online appointment on Walgreens", 
    "fitness tracker on Fitbit",
    "view Amazon Prime movies", "Google Photos backup access", "Facebook privacy settings",
    "manage HBO Max account", "Instagram explore page", "Reddit trending posts",
    "explore Apple Music playlists", "Twitter trending topics", "best deals on Walmart",
    "view account balance on Chase", "Prime Video watchlist", "Amazon Music account access",
    "Pinterest saved boards", "LinkedIn job postings", "Reddit community posts",
    "My Verizon account overview", "edit LinkedIn profile", "Facebook marketplace",
    "CNN top stories", "Spotify podcasts", "shop Walmart grocery", "Etsy order history",
    "check Apple iCloud storage", "Google One storage management", "manage Google subscriptions",
    "Disney Plus movie categories", "Uber driver login", "DoorDash customer service",
    "latest news on NPR", "Google Workspace admin login", "Facebook group discussions",
    "Pinterest trending pins", "GitHub repositories", "Google Maps recent searches",
    "YouTube playlist access", "Google Photos shared albums", "edit Amazon profile",
    "explore Fitbit dashboard", "open Google Keep notes", "Venmo transaction history",
    "Slack channel notifications", "Redfin housing market", "Google Assistant settings",
    "Microsoft Office online", "Facebook account activity log", "Reddit saved posts",
    "edit Spotify playlists", "latest releases on SoundCloud", "TikTok discover page",
    "Facebook memories", "edit Apple Music library", "Zillow property search",
    "check Grubhub rewards", "Netflix kids mode", "Instagram stories archive",
    "view Lyft ride history", "edit profile on Coursera", "Amazon gift card balance",
    "Dropbox folder sharing", "Eventbrite event discovery", "Google account security check",
    "Best Buy rewards access", "Google My Business analytics", "Twitter notifications",
    "manage Reddit subscriptions", "PayPal transaction history", "Google Books my library",
    "Walmart order tracker", "Craigslist free items section", "Microsoft Teams meetings",
    "Instagram saved posts", "Netflix account settings", "YouTube comment notifications",
    "Pinterest saved recipes", "Google Pay payment history", "Bing rewards dashboard",
    "My T-Mobile data usage", "Etsy store dashboard", "Microsoft 365 subscriptions",
    "YouTube upload page", "Google account backup options", "edit Amazon address book",
    "Audible library access", "Apple Wallet settings", "CNN live news updates",
    "Google Play Music playlists", "Reddit account preferences", "Uber trip receipts",
    "Notion workspace settings", "Pinterest boards management", "Dropbox recent activity",
    "Google Meet history", "find Hulu profile settings", "Google Analytics reports",
    "Quora inbox", "Twitter direct messages", "Slack user profiles", "LinkedIn news feed",
    "Google News trending", "Instagram explore reels", "BBC World News live",
    "access Google Authenticator", "Google Translate history", "manage Amazon wish list",
    "Apple Podcasts browse", "view Google Calendar invites", "edit Zoom profile picture",
    "login to Wells Fargo", "open Chase QuickPay", "access Capital One credit card",
    "find Fidelity 401k account", "view Merrill Lynch portfolio", "sign into Schwab account",
    "navigate to SoFi dashboard", "open Ally Bank login", "sign into Discover card account",
    "open CitiBank online", "access my US Bank account", "find Navy Federal login",
    "navigate to Truist Bank", "login to Regions Bank", "PNC online banking access",
    "open Robinhood app", "access my Vanguard funds", "open Acorns dashboard",
    "sign into Betterment", "login to Mint.com", "PayPal business account access",
    "navigate to Venmo transaction history", "login to Zelle", "access Cash App",
    "find Square dashboard", "open Stripe account", "login to QuickBooks Online",
    "access Xero accounting software", "navigate to Shopify admin", 
    "login to Google Workspace", "find Google Admin console", "open Microsoft Azure portal",
    "access AWS Management Console", "navigate to Heroku dashboard",
    "login to GitHub Enterprise", "find Atlassian Jira login", "open Trello workspace",
    "access Asana tasks", "open Monday.com dashboard", "login to Basecamp",
    "navigate to Slack channels", "access Zoom recordings", "find Microsoft Teams workspace",
    "open Google Meet settings", "access WebEx meetings", "login to Dropbox Paper",
    "navigate to OneDrive Business", "find Google Drive shared folders", "open Notion workspace",
    "login to Evernote", "access Airtable base", "open Coda docs",
    "navigate to Box cloud storage", "find Adobe Creative Cloud", "login to Canva",
    "access Figma files", "open Sketch workspace", "navigate to InVision",
    "find Behance projects", "login to Dribbble profile", "access my Pinterest boards",
    "find Etsy store dashboard", "navigate to Amazon Seller Central", "open Walmart Marketplace",
    "access Target Circle rewards", "login to Home Depot Pro Xtra", "open Lowe's for Pros",
    "access Best Buy Totaltech", "navigate to Costco membership portal", "find Sam's Club login",
    "open Staples rewards", "access Office Depot account", "navigate to FedEx Delivery Manager",
    "login to UPS My Choice", "open USPS informed delivery", "access DHL tracking portal",
    "find Uber driver portal", "navigate to Lyft driver login", "open DoorDash merchant dashboard",
    "access Grubhub for Restaurants", "find Postmates Fleet login", "open Instacart Shopper app",
    "login to Shipt Shopper portal", "navigate to Rover pet sitters", "find Wag walker account",
    "open Care.com dashboard", "access TaskRabbit tasks", "find Fiverr seller login",
    "navigate to Upwork profile", "login to Freelancer.com", "access 99designs workspace",
    "open Toptal freelancer portal", "navigate to Indeed employer login", "find Glassdoor employer account",
    "open LinkedIn Recruiter", "login to AngelList Talent", "access Crunchbase profile",
    "navigate to Google Ads", "find Facebook Business Manager", "open Instagram Insights",
    "access TikTok for Business", "login to Twitter Ads Manager", "navigate to Pinterest Analytics",
    "open Snapchat Ads Manager", "find Reddit Ads dashboard", "access Amazon Advertising",
    "navigate to eBay Seller Hub", "open Etsy Ads Manager", "find Walmart Connect login",
    "access Shopify Marketing", "navigate to HubSpot CRM", "find Salesforce Marketing Cloud",
    "open Zoho CRM dashboard", "login to Pipedrive", "access Freshworks CRM",
    "navigate to Mailchimp campaigns", "find Constant Contact login", "open Campaign Monitor dashboard",
    "access Klaviyo account", "navigate to Drip marketing", "find ActiveCampaign login",
    "open SendGrid dashboard", "login to Twilio account", "access WhatsApp Business API",
    "navigate to Telegram channels", "open Discord server", "access Reddit community",
    "login to Twitch streamer portal", "navigate to YouTube Studio", "open Vimeo dashboard",
    "find Dailymotion login", "access Hulu Live TV", "navigate to Peacock homepage",
    "open Paramount Plus", "login to Discovery Plus", "access HBO Max profiles",
    "find Disney Plus Kids Mode", "navigate to Apple TV+", "open Netflix family account",
    "access Google TV settings", "find Roku Channel Store", "navigate to Amazon Fire TV",
    "login to Plex Media Server", "open Kodi settings", "access Sling TV lineup",
    "navigate to ESPN+ Live Sports", "open Fox Sports account", "access CBS All Access",
    "find NBC Sports login", "navigate to MLS Season Pass", "open NFL Sunday Ticket",
    "access NBA League Pass", "find MLB.tv homepage", "navigate to NHL.tv",
    "login to Peacock Sports", "access Spotify Premium", "navigate to SoundCloud Go+",
    "open Amazon Music Unlimited", "find Apple Music settings", "access Tidal HiFi",
    "navigate to Pandora Plus", "open iHeartRadio All Access", "access Audible library",
    "find OverDrive eBooks", "navigate to Libby app", "open Google Books",
    "login to Kindle Unlimited", "access Barnes & Noble Nook", "find Kobo eReader settings",
    "navigate to Chegg eTextbooks", "open Pearson MyLab", "access Coursera for Business",
    "navigate to edX Professional Certificate", "find LinkedIn Learning Paths",
    "open Skillshare Premium", "access Khan Academy Teacher Dashboard",
    "navigate to Duolingo Classroom", "find Babbel for Business login", "open Rosetta Stone settings",
    "access Berlitz Virtual Classroom", "navigate to Codecademy Pro", "find Udemy Business login",
    "open Pluralsight Skills", "access DataCamp for Teams", "navigate to Tableau eLearning",
    "find Power BI tutorials", "open Salesforce Trailhead", "access Google Cloud Training",
    "navigate to AWS Certification Hub", "find Microsoft Learn homepage",
    "go to Netflix homepage", "jump to Gmail inbox", "land on Facebook profile", 
    "reach my bank dashboard", "launch Spotify Web Player", "head to Instagram DMs",
    "access my LinkedIn jobs", "dive into Dropbox files", "log back into Zoom account",
    "resume Slack messages", "continue Microsoft Teams call", "check Google Meet invites",
    "start streaming on Hulu", "fire up Disney Plus", "direct me to Apple TV settings",
    "relocate to Amazon login", "get to eBay watchlist", "retrieve Pinterest saved pins",
    "step into Reddit comments", "locate TikTok notifications", "teleport to Trello boards",
    "jumpstart Basecamp projects", "fetch Notion workspaces", "fast track to Coursera lessons",
    "open my GitHub repositories", "tap into Spotify playlist settings", 
    "activate SoundCloud premium", "pull up my Google Calendar events", 
    "snap back to Snapchat messages", "rewind to Netflix episode list",
    "unlock my Apple ID", "show me my Amazon orders", "review Gmail sent folder", 
    "trace Etsy order tracking", "queue up Twitch live streams", 
    "jump into Google Drive shared docs", "link to my PayPal wallet", 
    "shortcut to TikTok discover feed", "bridge to Microsoft Outlook mail", 
    "dock at Dropbox team account", "step into Evernote notebooks", 
    "trace my Venmo transaction history", "pick up Spotify wrapped list", 
    "shoot over to Reddit trending posts", "open Google Photos albums", 
    "redirect to Shopify dashboard", "revive WordPress editor", 
    "bookmark my Goodreads reading list", "restart Trello daily tasks", 
    "return to Zillow saved homes", "visit my Lyft ride history", 
    "map out Uber trip logs", "toggle to Fitbit fitness stats", 
    "retrieve OneDrive shared links", "examine Pinterest analytics", 
    "switch back to Slack channels", "restore GitLab issues", 
    "cross into CNN live headlines", "pop into ESPN game scores", 
    "load Instagram reels", "reach back to WhatsApp chat", 
    "switch on Google Authenticator codes", "flip through Google Docs recent edits", 
    "hitch onto Dropbox team space", "toggle Netflix parental controls", 
    "stick to Amazon cart", "hover over Walmart pickup info", 
    "skip back to PayPal balance", "circle back to Google Keep notes", 
    "browse Amazon Prime video categories", "trace Spotify queue", 
    "line up LinkedIn messages", "jog back to Facebook memories", 
    "dock into YouTube history", "reconnect to Zoom call recordings", 
    "swap into GitHub pull requests", "route back to Google Maps timeline", 
    "set off Netflix subtitles page", "track LinkedIn news feed", 
    "navigate home on Etsy dashboard", "run back to Twitch follower list", 
    "lock onto my Airbnb bookings", "reroute to DoorDash account", 
    "shuttle to Slack notification center", "switch over to Dropbox shared folder",
    "touch base on Hulu profiles", "arrive at Microsoft Azure portal", 
    "send me back to Trello boards", "connect to T-Mobile billing portal", 
    "rewind to Spotify premium benefits", "point to Reddit private messages", 
    "spot my Fitbit sleep logs", "line up Zillow new listings", 
    "push me to Apple Wallet settings", "cue up Google Meet recordings", 
    "head back to Disney Plus movie categories", "restore Google Play account info", 
    "revisit Audible audiobook library", "streamline Dropbox business files", 
    "find my iPhone via iCloud", "adjust Venmo privacy settings", 
    "follow Google Calendar reminders", "unlock Uber Eats saved locations", 
    "queue Twitch subscription details", "resume Twitter trending stories", 
    "cycle back to Target order tracker", "refresh Etsy payment methods", 
    "access DoorDash saved addresses", "park at Dropbox upload history", 
    "dive back into Amazon Music", "unlock Capital One credit history", 
    "expand Slack sidebar", "load TikTok shared videos", 
    "toggle between Spotify family accounts", "locate Google Workspace apps", 
    "check Apple ID device list", "snap to WhatsApp voice messages", 
    "ping Dropbox business folders", "start Gmail calendar integration", 
    "zone into Zoom virtual backgrounds", "line up Netflix continue-watching list", 
    "fast-track LinkedIn learning progress", "shuffle into Spotify podcast library", 
    "take me to PayPal login portal", "boost Google One settings", 
    "power on Disney Plus kids profiles", "make a shortcut to Quora drafts", 
    "link to Instagram saved stories", "reset to Facebook ad manager", 
    "retrieve my YouTube channel analytics", "cross over to Amazon delivery details", 
    "dock on Hulu recent episodes", "pop up Reddit moderator tools", 
    "trace Apple ID subscription details", "jumpstart LinkedIn recruiter tools", 
    "switch on Twitter direct mentions", "bookmark HBO Max recent streams", 
    "pin Evernote shared notes", "fast-forward Spotify discover playlists", 
    "recover Zoom chat history", "zone into Fitbit dashboard", 
    "cut to Google Maps saved routes", "track down Reddit profile info", 
    "trace Netflix payment details", "trigger Hulu parental settings", 
    "restore Etsy transaction history", "find Disney Plus sign-in page", 
    "jump back to Airbnb saved stays", "pinpoint Target Circle rewards", 
    "lock back to Zillow real estate trends", "link me to Amazon Alexa skills",
    "bring up Twitch drops page", "quick pull up Spotify equalizer settings", 
    "nudge into eBay seller dashboard", "roll over to Hulu account details", 
    "pick up Facebook notifications page", "route through Lyft shared rides", 
    "dock into Canva project drafts", "rewind Reddit comment history", 
    "fetch Microsoft Office online apps", "reopen Google Pay transaction records", 
    "cue into Pinterest shopping pins", "bounce back to SoundCloud followers", 
    "knit to Walmart mobile checkout", "open the Dropbox collaboration panel", 
    "gear into Shopify fulfillment options", "step into my GitHub issues", 
    "shift to Reddit personal chats", "spin up TikTok creator tools", 
    "turn on my Disney Plus profiles", "land me on Hulu billing info", 
    "jump into Snapchat recent snaps", "reopen Airbnb host settings", 
    "pin onto my Lyft rider details", "align my Google Photos albums", 
    "mount Dropbox file explorer", "flag Evernote pinned notes", 
    "warp to Twitch streamer highlights", "shift Spotify private session mode", 
    "browse Netflix family movies", "kick-start Google Calendar syncs", 
    "funnel into Amazon wishlist tracker", "dig into Reddit poll results", 
    "swipe Slack pinned messages", "flip Spotify favorites", 
    "cross Netflix series updates", "sync to Fitbit daily activity logs", 
    "retune my YouTube uploads", "set off Venmo shared expenses tracker", 
    "realign Hulu group watch rooms", "dock into Google Workspace tools",
    "renew my driver's license online", "DMV appointment scheduler", "California DMV forms",
    "check New York DMV status", "apply for REAL ID online", "Texas driver's license renewal",
    "update vehicle registration", "replace lost license", "pay traffic ticket online",
    "check speeding ticket status", "Florida DMV login", "DMV learner's permit application",
    "schedule road test appointment", "find DMV office near me", "Illinois DMV online services",
    "submit emissions test results", "update license address", "vehicle title transfer form",
    "access DMV practice tests", "find my voter registration form", "register to vote online",
    "update voter registration", "find polling location", "track my absentee ballot",
    "apply for US passport online", "renew passport application", "lost passport replacement",
    "check passport appointment status", "DS-11 application form", "visa application form DS-160",
    "apply for US citizenship", "check green card application status", "track immigration case online",
    "find USCIS forms", "schedule USCIS biometrics appointment", "renew permanent resident card",
    "apply for asylum online", "check ESTA status", "pay USCIS fees online", "file I-130 petition",
    "file N-400 citizenship application", "access Social Security statement", "apply for SSN replacement",
    "check Medicare enrollment", "file unemployment benefits claim", "update unemployment claim status",
    "access state disability insurance", "apply for SNAP benefits", "check food stamps balance",
    "renew Medicaid application", "find Affordable Care Act plans", "apply for TANF benefits",
    "track child support payments", "file for child custody modification", "apply for Section 8 housing",
    "find federal housing programs", "check HUD foreclosure listings", "submit FAFSA application",
    "track FAFSA status", "find Pell Grant eligibility", "apply for student loan forgiveness",
    "check student loan repayment status", "access Parent PLUS loan application",
    "find IRS forms online", "file federal tax return", "track tax refund status",
    "find tax transcripts", "submit W-9 form", "apply for an EIN number", "file 1099 form online",
    "file state tax return", "check estimated tax payment status", "apply for tax extension",
    "access IRS identity verification", "find ITIN application form", "update my tax withholding",
    "apply for property tax exemption", "check state sales tax rates", "file use tax online",
    "find tax relief programs", "apply for utility bill assistance", "pay electricity bill online",
    "check gas bill statement", "find water bill payment portal", "apply for energy assistance",
    "submit meter reading online", "update utility account information", "report power outage",
    "schedule service reconnection", "find low-income energy programs", "file insurance claim online",
    "check car insurance policy", "access home insurance documents", "apply for life insurance benefits",
    "file health insurance appeal", "renew renters insurance policy", "update beneficiary information",
    "find accident claim status", "check flood insurance eligibility", "apply for business insurance",
    "pay mortgage online", "access loan modification forms", "file property lien release",
    "apply for mortgage pre-approval", "check credit report online", "dispute credit report errors",
    "find credit score report", "freeze my credit file", "apply for personal loan online",
    "submit small business loan application", "track SBA loan status", "apply for PPP loan forgiveness",
    "check bank account balance", "transfer funds between accounts", "report lost debit card",
    "order new checks online", "update account beneficiaries", "open a business checking account",
    "apply for home equity loan", "file fraud dispute online", "find FDIC insured banks",
    "apply for school enrollment", "track school application status", "find FAFSA application deadlines",
    "submit college transcripts", "find school district boundaries", "schedule parent-teacher conferences",
    "check school lunch menus", "access student attendance records", "file private school applications",
    "apply for IEP services", "track college admission status", "find SAT test dates",
    "register for ACT test", "find Common App login", "submit application fee waiver",
    "schedule college campus tour", "find dorm assignment details", "apply for work-study programs",
    "access my college financial aid portal", "find student portal login", "file residency reclassification form",
    "register for university classes", "find professor office hours", "submit academic appeal form",
    "pay parking ticket online", "file public records request", "submit FOIA request online",
    "apply for concealed carry permit", "renew hunting license", "register for fishing license",
    "find state parks pass application", "apply for veterans benefits", "track VA disability claim",
    "find VA forms online", "apply for GI Bill benefits", "access military records",
    "check Selective Service registration", "register for disaster assistance",
    "apply for FEMA benefits", "track FEMA application status", "submit disaster loan application",
    "find local FEMA offices", "apply for business permits online", "renew professional license",
    "file workplace discrimination claim", "check workers' compensation status",
    "find OSHA complaint form", "file wage theft complaint", "find labor law posters",
    "check WIC eligibility", "apply for Head Start programs", "find child care subsidies",
    "schedule doctor appointment online", "access telehealth portal", "renew driver's medical card",
    "submit disability parking application", "check ADA accommodation status",
    "apply for court records online", "file restraining order petition", "track divorce case status",
    "find small claims court forms", "apply for legal aid assistance", "find pro bono attorneys",
    "submit jury duty questionnaire", "check court hearing schedule", "access public defender services",
    "find state attorney general forms", "submit police report online", "find criminal background check forms",
    "renew firearm registration", "file firearm transfer forms", "check TSA PreCheck application status",
    "apply for Global Entry", "renew TSA Known Traveler Number", "find CLEAR enrollment status",
    "apply for Amtrak Guest Rewards", "check flight cancellation policies", "find airport TSA hours",
    "apply for local library card", "find library eBook portal", "access library digital resources",
    "track interlibrary loan status", "submit book purchase suggestion", "apply for volunteer positions",
    "find senior center activities", "register for community classes", "apply for homeless assistance",
    "find affordable childcare resources", "access neighborhood watch program",
    # Tax-related forms and actions
    "file IRS Form 1040", "download Form W-2", "apply for EIN online", 
    "submit 1099-NEC form", "track federal tax refund", "file state tax return", 
    "check estimated tax payments", "apply for tax extension online", 
    "update tax withholding on IRS website", "access Form 8962 for premium tax credits", 
    "correct filed tax return online", "request IRS tax transcripts", 
    "report tax fraud to IRS", "apply for small business tax relief", 
    "find IRS instructions for Schedule C", "pay quarterly estimated taxes online",

    # Legal-related actions
    "file a civil lawsuit online", "submit FOIA request form", "apply for restraining order online", 
    "check court hearing schedule", "access small claims court forms", 
    "apply for public defender services", "file a discrimination complaint", 
    "report workplace harassment to EEOC", "submit police report online", 
    "check criminal background online", "apply for power of attorney forms", 
    "find legal aid for low-income families", "track immigration case status", 
    "submit appeal for denied claims", "file for child custody modification", 
    "apply for tenant's rights assistance", "submit consumer complaint to FTC", 

    # Anti-abuse and reporting
    "report cyberbullying online", "file a complaint with IC3 for cybercrime", 
    "report identity theft to FTC", "submit spam calls complaint to FCC", 
    "file a claim for bank fraud online", "report phishing emails to banks", 
    "report financial scams to SEC", "access National Center for Missing and Exploited Children website", 
    "file domestic violence report online", "apply for stalking protection order", 
    "report elder abuse in my state", "file a human trafficking report online", 
    "submit internet safety tip to FBI", "report fake job scams online",

    # Bank and financial fraud
    "report lost or stolen credit card", "file bank fraud report online", 
    "dispute credit card charges", "check FDIC claim status", 
    "apply for fraud protection services", "block fraudulent transactions on PayPal", 
    "file a chargeback request online", "report unauthorized debit transactions", 
    "freeze credit through Equifax", "lock credit file through Experian", 
    "file report of elder financial abuse", "access fraud department for Capital One", 
    "dispute transaction with Chase bank", "report fraudulent Zelle payment",

    # Government forms and services
    "apply for Social Security card replacement", "access Medicare enrollment forms", 
    "file unemployment benefits claim", "submit disability benefits application", 
    "check SNAP eligibility online", "apply for WIC benefits", 
    "find FEMA disaster relief forms", "submit voter registration online", 
    "report government waste to GAO", "access OSHA workplace safety complaint form", 
    "file whistleblower complaint online", "report corruption to Department of Justice", 
    "apply for Affordable Care Act subsidies", "submit housing discrimination complaint", 
    "track Section 8 housing application", "report workplace injuries online",
    "find online utility bill assistance forms", "apply for FEMA disaster loans", 
    "report identity fraud to SSA", "track disability determination online", 
    "access Department of Labor claims portal", "submit veterans benefits application",
    # School-related actions
    "apply for school enrollment online", "check school district boundaries", 
    "submit parent-teacher conference form", "pay for school lunches online", 
    "access student attendance records", "check school supply lists", 
    "apply for special education services", "find PTA meeting schedule", 
    "access online gradebook", "register for after-school programs", 
    "pay school fees online", "download school event calendar", 
    "submit absence excuse form", "apply for school transportation", 
    "find classroom supply requests", "access teacher contact information", 
    "request school transcript", "schedule a school counselor meeting", 

    # College application and admission
    "access Common App portal", "apply for early decision online", 
    "track college admission status", "submit SAT scores to colleges", 
    "register for ACT test", "apply for college application fee waiver", 
    "check college essay requirements", "find college admission deadlines", 
    "schedule campus visit", "access college financial aid portal", 
    "find scholarship opportunities", "submit FAFSA application", 
    "track FAFSA status online", "apply for student work-study programs", 
    "submit letters of recommendation", "access AP test score portal",

    # Student services and resources
    "log in to student portal", "register for college classes", 
    "find professor office hours", "schedule academic advising appointment", 
    "pay tuition fees online", "download class schedule", 
    "apply for dormitory housing", "find roommate assignment", 
    "check dining hall menu", "access campus map", 
    "submit course withdrawal form", "apply for course overload approval", 
    "find library hours", "access campus gym schedule", 
    "log in to online learning platform", "track student loan repayment status", 
    "request campus parking permit", "find textbook list for classes", 

    # Academic and career development
    "apply for internships online", "find career fair schedule", 
    "access resume workshop details", "log in to career services portal", 
    "schedule mock interview", "find alumni networking events", 
    "apply for study abroad programs", "access research grant applications", 
    "find academic journal access portal", "apply for teaching assistantship", 
    "submit thesis proposal online", "check academic probation status", 
    "schedule tutoring session", "find group study rooms", 
    "access campus research labs", "download degree audit report", 
    "apply for graduation online", "track diploma mailing status",

    # College administration
    "submit residency reclassification form", "request change of major", 
    "apply for student health insurance", "log in to bursar's office portal", 
    "submit disability accommodation request", "access registrar's office forms", 
    "apply for academic appeals online", "find transfer credit evaluation", 
    "request official transcript mailing", "access Title IX reporting form", 
    "apply for withdrawal leave of absence", "find campus IT support login", 
    "update emergency contact information", "access financial aid appeal form",

    # clerk
    "log in to payroll system", "submit timesheets online", "access company forms", 
    "track office supply orders", "request vacation days online", 
    "check work schedule", "submit leave application", 
    "log in to employee intranet", "update contact information in HR portal", 
    "download meeting minutes", "find office seating chart", 
    "check internal email inbox", "access document approval workflow", 
    "submit expense reimbursement form",

    # techie
    "access GitHub repositories", "log in to Jira dashboard", 
    "track bug reports in Bugzilla", "find API documentation", 
    "access AWS console", "log in to Microsoft Azure portal", 
    "navigate to CI/CD pipeline", "download Docker images", 
    "access Kubernetes dashboard", "log in to GitLab issues", 
    "track server uptime in Grafana", "submit code review in Bitbucket", 
    "monitor cloud resource usage", "log in to DevOps toolkit", 
    "download SDKs from developer portal",

    # business owner
    "log in to QuickBooks Online", "access Shopify admin dashboard", 
    "track sales performance in Square", "log in to Stripe account", 
    "find business tax forms", "update products on Etsy seller account", 
    "check inventory levels on Amazon Seller Central", 
    "download profit and loss statements", "log in to Google My Business", 
    "access Facebook Ads Manager", "update employee records online", 
    "log in to HubSpot CRM", "schedule team meeting in Zoom", 
    "apply for business loan online",

    # homemaker
    "log in to grocery delivery app", "track Walmart grocery orders", 
    "find recipes on Pinterest", "access online family budget tracker", 
    "order cleaning supplies from Amazon", "log in to home security system", 
    "navigate to IKEA furniture shopping", "check delivery status on FedEx", 
    "book pest control services online", "find local daycare reviews", 
    "manage subscription on Netflix", "order home decor from Wayfair", 
    "pay electricity bill online", "schedule home repairs on Angi",

    # school teacher
    "log in to Google Classroom", "update grades in school portal", 
    "download lesson plans from Teachers Pay Teachers", 
    "schedule parent-teacher conferences", "log in to Zoom for class", 
    "access student attendance records", "submit curriculum plans online", 
    "find classroom supply discounts", "access educational research journals", 
    "track student progress reports", "apply for professional development programs", 
    "log in to school administration portal", "download assessment rubrics",

    # school student
    "log in to school portal", "submit homework on Google Classroom", 
    "download study guides", "check upcoming tests", 
    "access online library resources", "find class schedule", 
    "log in to Khan Academy", "access coding lessons on Code.org", 
    "download science fair instructions", "log in to Zoom class", 
    "submit project online", "find book reports templates", 
    "schedule tutoring sessions online", "track grades online",

    # college student
    "register for college classes online", "log in to student portal", 
    "download class syllabus", "access online course materials", 
    "submit assignments on Canvas", "track financial aid status", 
    "find scholarship application forms", "log in to university library system", 
    "schedule advisor meeting", "apply for internships online", 
    "find study group sessions", "access professor office hours schedule", 
    "pay tuition fees online", "download graduation requirements checklist",

    # accountant
    "log in to QuickBooks Online", "access payroll system", 
    "download tax forms", "file tax returns for clients", 
    "track income and expense reports", "log in to Xero accounting software", 
    "update accounts payable records", "access client financial statements", 
    "check IRS e-filing portal", "log in to Sage accounting software", 
    "schedule client meetings", "submit audit trail reports", 
    "find accounting software updates", "access expense tracker apps",

    # Auditor
    "log in to audit management portal", "access financial statement templates", 
    "download compliance checklists", "submit internal audit reports", 
    "log in to SEC filing system", "access risk assessment tools", 
    "track regulatory updates online", "download audit trail data", 
    "log in to GRC software", "schedule audit interviews online", 
    "submit client compliance feedback", "access previous audit findings", 
    "check ISO audit certification details", "review SOX compliance guidelines",

    # professional
    "log in to LinkedIn profile", "access online resume builder", 
    "register for networking events", "log in to corporate email", 
    "schedule meetings on Microsoft Teams", "log in to Slack channels", 
    "find professional certifications", "submit expense reports online", 
    "access job postings on Indeed", "track industry news on Bloomberg", 
    "log in to project management tools", "update professional portfolio online", 
    "access online learning resources", "schedule professional development workshops",

    # healthcare
    "log in to electronic health records", "submit patient prescriptions online",
    "track lab test results", "access telemedicine platform",
    "schedule patient appointments online", "check insurance eligibility",
    "log in to medical billing portal", "access diagnostic imaging systems",
    "download medical journals", "apply for medical licensing online",

    # legal 
    "access case management system", "submit legal filings online",
    "log in to legal research platform", "track court schedules",
    "download legal templates", "apply for power of attorney online",
    "file a discrimination complaint", "report workplace harassment to EEOC",
    "submit FOIA requests", "check case verdicts online",

    # freelancer
    "log in to Upwork account", "track project milestones on Fiverr", 
    "submit invoices online", "access client feedback", 
    "log in to Payoneer for payments", "check freelance job postings", 
    "apply for contracts on Freelancer.com", "manage tasks on Trello", 
    "log in to Airtable project workspace", "download design files on Canva", 
    "access portfolio on Behance", "track hours on Toggl", 
    "submit bids on PeoplePerHour", "schedule client calls on Zoom",

    # retail worker
    "log in to POS system", "access shift schedules", 
    "track inventory in stock management system", "submit timesheets online", 
    "update customer orders", "log in to employee portal", 
    "find product pricing details", "download safety training modules", 
    "access loyalty program data", "submit return or refund requests", 
    "log in to retail analytics dashboard", "track daily sales goals",
    
    # delivery driver
    "log in to Uber Eats driver portal", "find delivery routes on Google Maps", 
    "check order pick-up details", "track earnings on DoorDash app", 
    "access customer delivery instructions", "log in to Grubhub driver account", 
    "find fuel discount programs", "access Lyft driver support", 
    "schedule shifts on Amazon Flex", "log in to Postmates Fleet dashboard", 
    "track delivery performance metrics", "access vehicle maintenance records",

    # artist navigation
    "log in to Etsy seller dashboard", "access Behance portfolio", 
    "upload new designs to Redbubble", "log in to Patreon creator account", 
    "track art commissions on DeviantArt", "download design templates from Canva", 
    "access tutorial videos on Skillshare", "log in to Adobe Creative Cloud", 
    "manage gallery submissions online", "upload digital art to Procreate gallery", 
    "track merchandise orders on Printful", "submit artwork to competitions online",

    # engineer
    "log in to CAD software portal", "access circuit simulation tools online", 
    "download blueprints from company database", "track project timelines in Jira", 
    "log in to MATLAB for simulations", "access engineering standards online", 
    "submit technical reports online", "schedule maintenance checks in SAP", 
    "access IoT device dashboards", "log in to 3D printing tools", 
    "find engineering webinars online", "submit equipment calibration forms",

    # content creator
    "log in to YouTube Studio", "access video analytics on TikTok Creator Portal", 
    "track engagement metrics on Instagram Insights", "upload podcast episodes on Spotify", 
    "find trending topics on Twitter", "log in to Canva for graphic design", 
    "schedule posts on Hootsuite", "edit videos on Final Cut Pro", 
    "manage ad revenue on Facebook Creator Studio", "download media from Dropbox", 
    "log in to Twitch affiliate dashboard", "check copyright claims on YouTube",

    # sports fan
    "log in to ESPN Fantasy Football", "track live scores on NBA app", 
    "check upcoming NFL schedules", "log in to Strava fitness app", 
    "access training programs on Nike Training Club", "find local marathon events", 
    "log in to fitness tracker dashboard", "track workout history on Fitbit", 
    "upload running routes to Garmin Connect", "check cricket scores on ESPN Cricinfo", 
    "find soccer leagues near me", "track progress on Peloton app",

    # entrepreneur
    "log in to business loan portal", "track sales on Shopify dashboard", 
    "access marketing analytics on HubSpot", "apply for a business license online", 
    "log in to investor relations portal", "download business plan templates", 
    "access pitch deck templates", "schedule team meetings on Microsoft Teams", 
    "apply for venture capital funding", "track project progress on Monday.com", 
    "log in to Zoom for investor calls", "track customer feedback on SurveyMonkey",

    # govt employee
    "log in to federal employee portal", "submit payroll forms online", 
    "access compliance training modules", "schedule inter-agency meetings on WebEx", 
    "submit travel reimbursement requests", "track citizen service requests", 
    "access state records management system", "log in to public health database", 
    "submit procurement forms online", "track grants in federal funding system", 
    "download public policy updates", "log in to internal GIS dashboard",

]

# Travel-related query fragments and destination names.
# Layout: generic travel phrases first (some are open-ended prefixes such as
# "flight to" / "hostels near"), then themed groups of US destinations and
# events, then single-word travel cues.
# NOTE(review): presumably consumed as seed text for building travel-intent
# examples elsewhere in the notebook -- confirm against the generation code.
# Entries are kept unique so no phrase is over-represented when sampled.
travel_examples_partial = [
    # Generic travel phrases
    "flight to", "visit", "visa requirements", "trip planner", "tourist spots",
    "cheap flights", "best places to visit", "hotel booking", "sightseeing",
    "tourist attractions", "road trip", "travel insurance", "best time to visit",
    "budget travel", "local tours", "vacation packages", "recommended destinations",
    "cruise deals", "vacation spots", "passport renewal", "visa rules",
    "beach resorts", "airfare deals", "holiday trips", "affordable destinations",
    "international flights", "city tours", "top landmarks", "country guide",
    "tropical destinations", "travel guide", "family vacations", "romantic getaways",
    "honeymoon ideas", "backpacking trips", "adventure travel", "historical sites",
    "weekend escapes", "wilderness tours", "hotel reviews",
    "best tour agencies", "theme park trips", "water park tickets", "beach excursions",
    "cultural heritage sites", "spa resorts", "all-inclusive deals", "mountain climbing trips",
    "scenic routes", "train to", "luxury resorts", "ski resorts", "travel checklist",
    "best food spots", "travel restrictions", "city guide", "cultural experiences",
    "eco-friendly travel", "destination weddings", "local food tours", "historical landmarks",
    "best museums", "national parks", "guided tours", "travel blogs", "itinerary ideas",
    "vacation rentals", "staycations", "must-visit islands", "nature trails", "wildlife safaris",
    "travel deals", "popular road trips", "long weekend trips", "day trips", "romantic stays",
    "budget hotels", "festival dates", "safari lodges", "desert tours", "exploring islands",
    "wine tours", "biking tours", "remote locations", "travel gadgets", "hiking trails",
    "outdoor adventures", "wildlife tours", "road trip essentials", "unexplored destinations",
    "beach holidays", "rural getaways", "city breaks", "glamping spots", "hostels near",
    "overwater bungalows", "luxury train journeys", "heritage hotels", "guided city walks",
    "solo travel tips", "local guides", "temple tours", "urban exploration", "roadside attractions",
    "adventure parks", "best hiking spots", "spa hotels", "island escapes", "mountain retreats",
    "desert safaris", "hidden gems", "UNESCO world heritage sites", "fishing trips",
    "hot air balloon rides", "cultural festivals", "top skiing destinations", "outdoor excursions",
    "family-friendly hotels", "wildlife sanctuaries", "sailing trips", "beach clubs", "travel essentials",
    "outdoor campsites", "eco-tourism", "ferry trips", "popular cruise lines", "historical city tours",
    "luxury beach resorts", "best places for sunset", "city skyline views", "hidden beaches",
    "cooking classes abroad", "famous street markets", "travel discounts", "offbeat destinations",
    "airport transfers", "last-minute getaways", "camping near", "castle tours", "mountain trekking",
    "road trips with kids", "beach activities", "travel booking", "urban sightseeing",
    "exotic travel spots", "local dining", "city festivals", "budget airlines", "remote island resorts",
    "best travel agencies", "heritage villages", "road trip routes", "weekend hikes",
    "tropical beach resorts", "volcano tours", "camping essentials", "national park passes",
    "city walking tours", "local cuisine tasting", "nightlife hotspots", "family-friendly resorts",
    "road trip itinerary", "where to snorkel", "destination guides", "pet-friendly hotels",
    "skiing and snowboarding", "lake getaways", "historic inns", "guided mountain hikes",
    "waterfall tours", "snowboard rentals", "sunset cruises", "island hopping",
    "group travel discounts", "botanical gardens", "roadside diners", "where to scuba dive",
    "road trip packing list", "desert camping", "travel credit cards", "luxury travel experiences",
    "golf resorts", "photography spots", "stargazing tours", "festival packages",
    "architecture tours", "ghost town tours", "city nightlife", "climbing expeditions",
    "rural stays", "best beach towns", "lighthouse visits", "fishing expeditions",
    "cultural exhibitions", "best places for diving", "mountain bike trails", "wine tasting tours",
    "eco lodges", "luxury camping", "hiking with pets", "cruise excursions", "zip lining adventures",
    "remote mountain villages", "volunteering abroad", "sunrise viewpoints", "bird watching tours",
    "yoga retreats", "ferry schedules", "local handicrafts", "wellness retreats", "pilgrimage sites",
    "city skylines", "seafood markets", "mountain lodges", "oceanfront villas", "bicycle rentals",
    "travel souvenirs", "bike tours", "haunted locations", "picnic spots", "romantic sunsets",
    "night market tours", "expedition cruises", "historical reenactments", "luxury spas",
    "weekend villas", "urban parks", "cheap car rentals", "temple stays", "architecture marvels",
    "cliff diving spots", "beach house rentals", "public transport guides", "rooftop restaurants",
    "vintage markets", "remote villages", "water sports rentals", "art museum tours",
    "sustainable travel tips", "cultural food festivals", "boating rentals", "seasonal events",
    "island retreats", "ancient ruins tours", "safari trips", "adventure resorts", "UNESCO sites nearby",
    "travel vaccinations", "lake cabins", "train journeys", "cruise ship tours", "underwater hotels",
    "iconic landmarks", "wilderness camping", "unique Airbnbs", "fine dining experiences",
    "cheap destinations", "secluded beaches", "budget adventures", "wildlife preserves", "water sports activities",

    # National Parks
    "Grand Canyon National Park", "Yellowstone National Park",
    "Yosemite National Park", "Zion National Park", "Glacier National Park",
    "Great Smoky Mountains National Park", "Arches National Park",
    "Rocky Mountain National Park", "Bryce Canyon National Park",
    "Acadia National Park", "Sequoia National Park", "Joshua Tree National Park",
    "Grand Teton National Park", "Mount Rainier National Park",
    "Everglades National Park", "Denali National Park",
    "Badlands National Park", "Death Valley National Park",
    "Shenandoah National Park", "Big Bend National Park",

    # Beaches and Coastal Destinations
    "Maui, Hawaii", "Waikiki Beach, Oahu", "Clearwater Beach, Florida",
    "South Beach, Miami", "Santa Monica Beach, California",
    "Myrtle Beach, South Carolina", "Destin, Florida",
    "Hilton Head Island, South Carolina", "Outer Banks, North Carolina",
    "Cape Cod, Massachusetts", "Malibu, California", "Laguna Beach, California",
    "Siesta Key, Florida", "Naples, Florida", "Kauai, Hawaii",
    "Cannon Beach, Oregon", "Virginia Beach, Virginia",
    "Sanibel Island, Florida", "Key West, Florida", "Rehoboth Beach, Delaware",

    # Landmarks and Monuments
    "Statue of Liberty, New York", "Empire State Building, New York",
    "Golden Gate Bridge, San Francisco", "Mount Rushmore, South Dakota",
    "The White House, Washington, D.C.", "Lincoln Memorial, Washington, D.C.",
    "Hollywood Sign, Los Angeles", "Gateway Arch, St. Louis",
    "Space Needle, Seattle", "Alcatraz Island, San Francisco",
    "The Pentagon, Virginia", "Freedom Tower, New York",
    "Hoover Dam, Nevada", "Graceland, Memphis",
    "Independence Hall, Philadelphia", "The Liberty Bell, Philadelphia",
    "United States Capitol, Washington, D.C.", "Brooklyn Bridge, New York",
    "Times Square, New York", "Biltmore Estate, North Carolina",

    # Historic Sites
    "Pearl Harbor, Hawaii", "Gettysburg National Military Park, Pennsylvania",
    "Colonial Williamsburg, Virginia", "Ellis Island, New York",
    "Salem Witch Museum, Massachusetts", "Plymouth Rock, Massachusetts",
    "Monticello, Virginia", "Alamo Mission, Texas", "Fort Sumter, South Carolina",
    "Mesa Verde National Park, Colorado", "San Antonio Missions, Texas",
    "Martin Luther King Jr. National Historic Site, Georgia",
    "Chaco Culture National Historical Park, New Mexico",
    "Trail of Tears National Historic Trail", "Antietam National Battlefield, Maryland",
    "Harper's Ferry, West Virginia", "Jamestown Settlement, Virginia",
    "Wounded Knee, South Dakota", "Fredericksburg Battlefield, Virginia",
    "Fort McHenry, Maryland",

    # Cultural Destinations
    "Broadway, New York", "Metropolitan Museum of Art, New York",
    "Smithsonian Institution, Washington, D.C.", "Art Institute of Chicago, Illinois",
    "The Getty Center, Los Angeles", "MoMA (Museum of Modern Art), New York",
    "National Gallery of Art, Washington, D.C.", "American Museum of Natural History, New York",
    "The Field Museum, Chicago", "Boston Museum of Fine Arts, Massachusetts",
    "Rock and Roll Hall of Fame, Cleveland", "Griffith Observatory, Los Angeles",
    "The Aquarium of the Pacific, Long Beach", "Georgia Aquarium, Atlanta",
    "Space Center Houston, Texas", "National Museum of African American History, Washington, D.C.",
    "The Henry Ford Museum, Michigan", "Country Music Hall of Fame, Nashville",
    "Dollywood, Tennessee", "Stax Museum of American Soul Music, Memphis",

    # Family-Friendly Attractions
    "Disneyland, California", "Walt Disney World, Florida",
    "Universal Studios, Florida", "Universal Studios Hollywood, California",
    "SeaWorld Orlando, Florida", "LEGOLAND Florida, Winter Haven",
    "San Diego Zoo, California", "Monterey Bay Aquarium, California",
    "Disney's Animal Kingdom, Florida", "Epcot, Florida",
    "Magic Kingdom, Florida", "Hollywood Studios, Florida",
    "Adventure Island, Tampa", "Six Flags Magic Mountain, California",
    "Busch Gardens, Virginia", "Knott's Berry Farm, California",
    "Hersheypark, Pennsylvania", "Cedar Point, Ohio", "Kings Island, Ohio",
    "Silver Dollar City, Missouri",

    # Unique Natural Attractions
    "Niagara Falls, New York", "Antelope Canyon, Arizona", "Horseshoe Bend, Arizona",
    "Monument Valley, Arizona/Utah", "Sedona, Arizona", "Bryce Canyon Hoodoos, Utah",
    "Devil's Tower, Wyoming", "Lake Tahoe, California/Nevada",
    "Crater Lake, Oregon", "Carlsbad Caverns, New Mexico",
    "Great Salt Lake, Utah", "The Wave, Arizona", "Mammoth Cave, Kentucky",
    "Mount St. Helens, Washington", "Hot Springs National Park, Arkansas",
    "The Everglades, Florida", "Big Sur, California", "Lake Powell, Arizona",
    "Gulf Shores, Alabama", "The Painted Desert, Arizona",

    # Unique and Unusual Destinations
    "Area 51, Nevada", "Roswell, New Mexico", "Salvation Mountain, California",
    "Winchester Mystery House, California", "Mystic Seaport, Connecticut",
    "The House on the Rock, Wisconsin", "Voodoo Museum, New Orleans",
    "International UFO Museum, New Mexico", "Carhenge, Nebraska",
    "World's Largest Ball of Twine, Kansas", "Cadillac Ranch, Texas",
    "Devil's Kettle, Minnesota", "Salton Sea, California",
    "Biosphere 2, Arizona", "Gravity Hill, Pennsylvania", "Portland Underground, Oregon",
    "The Thing, Arizona", "Fallingwater, Pennsylvania", "Neon Museum, Las Vegas",
    "The Mütter Museum, Philadelphia",

    # Popular Events and Festivals
    "Mardi Gras, New Orleans", "Coachella, California", "Burning Man, Nevada",
    "South by Southwest (SXSW), Texas", "Comic-Con, California",
    "Kentucky Derby, Kentucky", "Albuquerque Balloon Fiesta, New Mexico",
    "The Masters Golf Tournament, Georgia", "Lollapalooza, Illinois",
    "Sturgis Motorcycle Rally, South Dakota", "Jazz Fest, New Orleans",
    "The Rose Parade, California", "Times Square Ball Drop, New York",
    "Thanksgiving Day Parade, New York", "Sundance Film Festival, Utah",
    "Easter Jeep Safari, Utah", "Art Basel Miami Beach, Florida",
    "Nashville Film Festival, Tennessee", "American Royal BBQ Contest, Kansas",
    "Taste of Chicago, Illinois",

    # Single-word travel cues
    "airport", "station", "hotel", "beach", "mountain", "valley",
    "ocean", "river", "lake", "park", "forest", "museum", "theater",
    "tourist", "vacation", "weekend", "holiday",
]

purchase_examples_partial = [
    "buy", "discounts on", "product reviews", "top deals", "best price for",
    "shopping for", "sale on", "cheap", "where to find", "store near",
    "affordable", "buying guide", "online shopping", "deal of the day", "compare",
    "electronics deals", "clearance", "best gift ideas", "smartphone discounts",
    "shop online", "find discount", "order now", "best rated", "hot deals",
    "where to buy", "gift for friends", "find cheap", "cost comparison",
    "monthly deals", "discounted gadgets", "price check", "stock availability",
    "sale event", "on discount", "store open hours", "free shipping", 
    "how much is", "good quality items", "deal store", "bargain finder",
    "warranty coverage", "best for price", "flash sale", "limited offers",
    "smart home devices", "eco-friendly items", "vintage collectibles", "buy used items",
    "cheap deals", "limited stock", "special offers", "price match", "buy one get one", 
    "gift ideas for family", "budget-friendly", "exclusive deals", "best price guarantee", 
    "bundle offers", "best deals on gadgets", "customer reviews", "discount codes", 
    "holiday discounts", "best budget options", "shopping deals", "affordable gifts", 
    "seasonal sale", "luxury items on sale", "new arrivals", "best in category", 
    "coupon codes", "cashback offers", "limited-time sale", "new discounts", 
    "exclusive online deals", "shop by brand", "last-minute gift ideas", "price drops", 
    "shopping near me", "pre-order", "buy for less", "gift cards", "price tracker", 
    "trending items", "budget-friendly gifts", "buy gift cards", "clearance items", 
    "discount store", "special promotions", "home appliances on sale", "best-selling items", 
    "limited edition", "budget options", "discount for students", "free returns", 
    "seasonal clearance", "price alerts", "find bargains", "subscribe and save", 
    "on sale today", "holiday specials", "discount electronics", "order for delivery", 
    "wholesale prices", "compare online prices", "best gift options", "discount on shipping", 
    "weekly discounts", "in-store pickup", "flash discount", "holiday sales", "new collection", 
    "luxury deals", "limited-time discount", "find in stock", "refurbished items", 
    "discount furniture", "price discounts", "compare gadgets", "outlet store", 
    "affordable tech", "buy more save more", "kids’ toys on sale", "new in store", 
    "exclusive membership discounts", "gifts under $50", "back-to-school deals", 
    "best value items", "store offers", "top-rated electronics", "shop today", 
    "affordable fashion", "best budget laptops", "appliance deals", "gift for him", 
    "gift for her", "pet supplies sale", "baby products on sale", "student discounts", 
    "outdoor gear deals", "flash price drop", "get quotes", "bulk purchase discounts", 
    "subscribe for offers", "personalized gifts", "compare TVs", "high-quality items", 
    "best phone under $500", "kitchen appliances sale", "budget picks", "latest deals",
    "gift sets for holidays", "best-rated products", "limited stock available",
    "financing options", "pre-owned items", "upcoming sales", "best sellers in category",
    "affordable options near me", "deals on essentials", "rewards programs",
    "online exclusive items", "referral discounts", "new stock release", "wholesale electronics",
    "price comparisons for brands", "reliable product options", "holiday gift bundles",
    "daily price drops", "in-store discounts", "gift for pet lovers", "seasonal décor discounts",
    "best deals this week", "multi-buy discounts", "shopping rewards points",
    "compare subscription boxes", "limited edition sale items", "outlet for electronics",
    "price comparison website", "gift baskets under $100", "clearance on winter wear",
    "gift card sale", "budget-friendly finds", "high-rated items on sale",
    "low-cost delivery options", "home and garden deals", "flash sales this weekend",
    "price guarantee policy", "trending holiday gifts", "big savings on furniture",
    "best Black Friday deals on TVs", "Cyber Monday smartphone discounts", 
    "discounted laptops for sale", "affordable tablets this holiday", 
    "smartwatches on sale", "best earbuds for gifts", "4K TV holiday discounts", 
    "gaming consoles Black Friday deals", "top-rated smart home devices on sale", 
    "holiday deals on headphones", "Cyber Monday smartwatch deals", 
    "best holiday discounts on laptops", "VR headset offers", "wireless speakers on sale", 
    "holiday sale on digital cameras", "gaming laptop Black Friday deal", 
    "discount on wireless earbuds", "home security gadgets on discount", 
    "budget laptops Cyber Monday deals", "buy drones on sale", 
    "holiday discounts on gaming PCs", "tablet Black Friday offers", 
    "latest smartphone deals", "smart thermostats on sale", 
    "fitness trackers discounted", "holiday deals on portable chargers", 
    "affordable smart TVs", "Cyber Monday deals on computer monitors", 
    "deals on gaming accessories", "best Black Friday soundbar offers", 
    "discounts on streaming devices", "smart kitchen gadgets holiday deals", 
    "robot vacuums on sale", "best wireless chargers to buy", 
    "discounts on noise-canceling headphones", "holiday deals on Bluetooth speakers", 
    "best Black Friday deals on cameras", "affordable smartwatches", 
    "top gadgets on sale this holiday", "smart bulbs Black Friday deals", 
    "holiday sales on gaming chairs", "discount on external hard drives", 
    "smartphone accessory discounts", "top holiday deals on tablets", 
    "Cyber Monday deals on PC accessories", "budget-friendly smart home gadgets", 
    "best deals on wearable tech", "Black Friday sale on laptops", 
    "Bluetooth earbuds holiday discounts", "holiday deals on action cameras", 
    "best smartwatch under $200", "buy holiday gift tech items", 
    "discount on electric scooters", "Cyber Monday laptop sale", 
    "tablet accessory discounts", "4K projectors holiday deals", 
    "gaming headset Black Friday sale", "affordable video doorbells", 
    "best budget tech gifts", "holiday discounts on PC parts", 
    "discounted smart locks", "popular tech deals this season", 
    "best tech under $50 holiday sale", "Black Friday deals on tablets", 
    "smart home starter kits on sale", "affordable drones for kids", 
    "holiday discount on laptops", "smart home assistants on discount", 
    "best gaming gadgets for gifts", "holiday sale on charging accessories", 
    "budget-friendly tech for teens", "holiday price drops on gadgets",
    "best deals on smart home gadgets", "trending kitchen appliances on Amazon", 
    "top-rated fitness trackers", "latest wireless earbuds", "best pet care products", 
    "most popular home decor items", "best holiday gifts on Amazon", 
    "discounted beauty and skincare products", "trending books this season", 
    "must-have camping gear", "top video games on sale", "popular Amazon devices", 
    "best new releases in electronics", "hot fashion items on eBay", 
    "trending children's toys", "best-selling phone accessories", 
    "latest smartwatches", "trending holiday decor", "discounted fitness equipment", 
    "popular laptop bags and cases", "new trending gadgets", "must-have kitchen tools", 
    "trending DIY tools", "affordable trending tech accessories", "most-wanted gaming consoles", 
    "top-rated skincare tools", "most searched for drones", "trending home office essentials", 
    "discounted sports gear", "latest model phones on Amazon", "affordable gaming accessories", 
    "popular outdoor gear", "best-selling pet toys", "affordable beauty products", 
    "trending holiday outfits", "top-selling jewelry items", "trending supplements", 
    "top gifts for gadget lovers", "most popular smart speakers", "trending electric scooters", 
    "popular kids' educational toys", "top-rated phone cases", "discounts on smart plugs", 
    "new popular beauty tools", "best-selling air purifiers", "trending cleaning supplies", 
    "latest workout equipment", "top deals on wearable tech", "popular car accessories", 
    "trending board games", "hot deals on small appliances", "new Amazon home essentials", 
    "affordable holiday gift ideas", "best-selling kitchen appliances", 
    "trending men's grooming kits", "top deals on baby essentials", 
    "best new home fitness products", "popular home organization items", 
    "top-rated clothing on eBay", "latest model headphones", "must-have travel accessories", 
    "popular holiday gift baskets", "trending video doorbells", "best-selling cookbooks", 
    "top-rated sports gear", "discounted home automation gadgets", 
    "best new releases in toys", "popular subscription boxes", "trending wellness products", 
    "top hair care products", "affordable smart light bulbs", "trending car gadgets", 
    "must-have holiday gift sets", "best discounts on essentials", 
    "popular kids' STEM toys", "best-rated massage guns", "trending water bottles", 
    "most-wanted kitchen gadgets", "top pet grooming products", 
    "latest deals on tech gifts", "affordable holiday party supplies", 
    "new releases in fitness trackers", "top-rated baby gear", "most popular e-books", 
    "best trending Bluetooth speakers", "popular luxury gift items", 
    "trending electric shavers", "best-selling skincare kits", "trending fitness bands", 
    "top-rated baby monitors", "affordable fashion accessories", "must-have holiday tech items", 
    "new kitchen essentials on Amazon", "top trending sneakers", "popular office supplies",
    "affordable wireless earbuds", "latest electric shavers", "top-rated water purifiers",
    "portable air conditioners on sale", "discounted robot vacuums", 
    "eco-friendly reusable bags", "best protein powders", "budget smartphones on Amazon",
    "affordable ergonomic chairs", "trending video editing software", "best-selling backpacks",
    "kids' learning tablets", "affordable noise-canceling headphones", "best gaming routers",
    "affordable juicers", "best deals on blenders", "home theater systems sale",
    "discounted adjustable dumbbells", "affordable yoga mats", "reliable air fryers", 
    "best-selling essential oils", "new arrivals in winter jackets", "top-rated bed sheets",
    "discounted skincare gift sets", "outdoor patio furniture deals", "holiday sales on cameras",
    "affordable garden tools", "popular craft supplies", "trending LED lights for rooms",
    "best handheld vacuums", "discounted pressure cookers", "high-rated cat furniture",
    "affordable baby monitors", "budget kitchen knives", "new skincare serums", 
    "trending graphic T-shirts", "best gaming headsets", "electric toothbrush deals", 
    "discounts on wireless keyboards", "popular camping tents", "best air purifiers for homes",
    "affordable humidifiers", "smart light strips on sale", "home gym equipment deals",
    "top-rated inflatable pools", "affordable electric kettles", "best-selling printers",
    "trending wireless chargers", "new baking supplies", "best camping gear for families",
    "affordable pet strollers", "trending wall art", "best value office chairs", 
    "kids' smartwatches on sale", "top-rated wine coolers", "affordable gaming chairs", 
    "best deals on projectors", "popular hand sanitizers", "budget-friendly kitchen scales",
    "reliable electric blankets", "affordable air mattresses", "popular reusable water bottles",
    "best-selling patio heaters", "discounted storage bins", "top-rated computer desks",
    "affordable photo frames", "popular exercise bikes", "best portable power banks", 
    "new smartwatch models", "affordable resistance bands", "best robot mop deals", 
    "popular outdoor string lights", "affordable garden planters", "trending weighted blankets",
    "best-selling makeup brushes", "kids' electric cars on sale", "top-rated laundry baskets",
    "best infrared thermometers", "budget-friendly ceiling fans", "affordable spice racks",
    "top-rated portable generators", "popular home security cameras", "affordable phone stands",
    "best sewing machines for beginners", "affordable heated jackets", "best-selling dish racks",
    "top-rated water bottles for kids", "discounts on slow cookers", "popular home tool sets",
    "affordable coffee tables", "best LED makeup mirrors", "top-rated cycling helmets",
    "trending travel backpacks", "affordable baby high chairs", "new security camera models",
    "best holiday gift baskets", "affordable 3D printers", "popular mini fridges",
    "best luxury watches on sale", "affordable drones for beginners", "reliable floor steamers",
    "best-selling kitchen tongs", "popular dehumidifiers", "discounts on storage shelves",
    "trending patio umbrellas", "best-rated comforters", "affordable electric griddles",
    # Electronics and Gadgets
    "Smartphones", "Laptops", "Tablets", "Smartwatches", "Wireless earbuds", 
    "Bluetooth speakers", "4K TVs", "Gaming consoles", "VR headsets", "Gaming monitors", 
    "External hard drives", "Portable chargers", "Smart home devices", "Streaming devices", 
    "Digital cameras", "Action cameras", "Drone cameras", "Smart thermostats", 
    "Wireless keyboards", "Noise-canceling headphones", "Fitness trackers", 
    "Gaming mice", "Wi-Fi routers", "Dash cams", "Projectors", 
    "USB hubs", "Smart plugs", "Car phone holders", "Webcams", "Video doorbells",

    # Home and Kitchen Products
    "Air fryers", "Instant Pots", "Blenders", "Coffee makers", "Vacuum cleaners", 
    "Robot vacuums", "Water purifiers", "Electric kettles", "Non-stick cookware sets", 
    "Cast iron skillets", "Cutting boards", "Knife sets", "Storage containers", 
    "Dish racks", "Food processors", "Baking tools", "Microwave ovens", 
    "Toaster ovens", "Bread makers", "Portable heaters", "Ceiling fans", 
    "Air purifiers", "Humidifiers", "Essential oil diffusers", "Mattresses", 
    "Weighted blankets", "Comforter sets", "Bed sheets", "Throw pillows", 
    "Electric blankets", "Area rugs", "Curtains", "Laundry hampers", 
    "Bookshelves", "Standing desks", "Office chairs", "Storage bins", 
    "Couches", "Bar stools", "Ottomans", "Patio furniture", "Outdoor grills",

    # Fashion and Accessories
    "Sneakers", "Running shoes", "Winter boots", "Sandals", "High heels", 
    "Handbags", "Backpacks", "Luggage sets", "Sunglasses", "Watches", 
    "Earrings", "Necklaces", "Bracelets", "Rings", "Hats", 
    "Scarves", "Gloves", "Belts", "Swimwear", "Activewear", 
    "Jeans", "T-shirts", "Sweaters", "Jackets", "Coats", 
    "Dresses", "Suits", "Pajamas", "Socks", "Underwear", 
    "Raincoats", "Yoga pants", "Leggings", "Shirts", "Blouses",

    # Beauty and Personal Care
    "Skincare products", "Facial cleansers", "Moisturizers", "Serums", "Face masks", 
    "Makeup palettes", "Lipsticks", "Mascaras", "Eyeliners", "Foundations", 
    "Hair dryers", "Flat irons", "Curling wands", "Shampoos", "Conditioners", 
    "Hair oils", "Body lotions", "Perfumes", "Sunscreens", "Electric toothbrushes", 
    "Men’s razors", "Beard trimmers", "Epilators", "Hair removal kits", 
    "Deodorants", "Nail polish sets", "Press-on nails", "Bath bombs", "Lip balms",

    # Health and Fitness Products
    "Yoga mats", "Dumbbells", "Resistance bands", "Treadmills", "Ellipticals", 
    "Stationary bikes", "Foam rollers", "Protein powders", "Pre-workout supplements", 
    "Fitness trackers", "Massage guns", "Water bottles", "First aid kits", 
    "Pulse oximeters", "Blood pressure monitors", "Thermometers", 
    "Multivitamins", "Probiotics", "Essential oils", "Knee braces", 
    "Compression socks", "Hand grips", "Pull-up bars", "Ab rollers", "Jump ropes",

    # Baby and Kids Products
    "Diapers", "Baby wipes", "Strollers", "Car seats", "Baby monitors", 
    "Cribs", "High chairs", "Baby bottles", "Pacifiers", "Teething toys", 
    "Baby carriers", "Play mats", "Activity gyms", "Toddler beds", 
    "Children’s books", "Educational toys", "Ride-on toys", "Kids' backpacks", 
    "School supplies", "Lunch boxes", "Kids’ scooters", "Bikes", "LEGO sets", 
    "Action figures", "Dolls", "Board games", "Art supplies", "Plush toys",

    # Pet Products
    "Dog food", "Cat food", "Pet collars", "Leashes", "Pet carriers", 
    "Pet beds", "Pet toys", "Litter boxes", "Dog crates", "Aquariums", 
    "Cat trees", "Pet grooming kits", "Pet shampoo", "Pet clothing", 
    "Automatic pet feeders", "Pet fountains", "Pet training pads", 
    "Dog harnesses", "Scratching posts", "Bird cages",

    # Outdoor and Sports Products
    "Camping tents", "Sleeping bags", "Hiking boots", "Backpacking gear", 
    "Fishing rods", "Kayaks", "Paddleboards", "Bicycles", "Trekking poles", 
    "Binoculars", "Coolers", "Portable grills", "Portable generators", 
    "Outdoor chairs", "Hammocks", "Picnic blankets", "Golf clubs", 
    "Tennis rackets", "Soccer balls", "Basketballs", "Football gear", 
    "Baseball gloves", "Skateboards", "Rollerblades", "Snowboards", 
    "Ski equipment", "Hunting gear", "Archery kits", "Air mattresses",

    # Home Improvement and Tools
    "Power drills", "Cordless screwdrivers", "Tool sets", "Ladders", 
    "Paint sprayers", "Gardening tools", "Lawn mowers", "Chainsaws", 
    "Pressure washers", "Solar lights", "Smart locks", "Security cameras", 
    "Doorbell cameras", "Thermostats", "Wall mounts", "Curtain rods", 
    "Faucets", "Showerheads", "Screwdrivers", "Pliers",

    # Miscellaneous
    "Books", "eBooks", "Gift cards", "Board games", "Puzzle sets", 
    "Calendars", "Planners", "Notebooks", "Pens and markers", 
    "Phone cases", "Laptop sleeves", "Portable coffee mugs", 
    "Reusable water bottles", "Eco-friendly straws", "Bike helmets", 
    "Electric scooters", "Camera lenses", "Tripods", "Greenhouse kits", 
    "Seed starter kits", "Fitness apps", "Streaming subscriptions",

    # common nouns
    "car", "bicycle", "phone", "computer", "dress", "shoe",
    "watch", "jewelry", "bread", "rice", "chocolate", 
    "pizza", "burger", "cake", "milk", "cheese", "butter",
    "knife", "spoon", "fork", "glass", "bottle",
    
]

# Seed fragments for the translation intent. Most are open-ended stems
# ("translate to", "English to", "phrases in") rather than complete queries.
translation_examples_partial = [
    "translate to",
    "how to say in",
    "meaning of word",
    "translate phrase",
    "language translation",
    "definition in",
    "translate from",
    "dictionary for",
    "how to pronounce",
    "spell in",
    "word meaning",
    "basic phrases",
    "translation for",
    "translate app",
    "how to write",
    "pronunciation guide",
    "vocabulary",
    "common phrases",
    "phrasebook",
    "translate sentence",
    "dictionary lookup",
    "phrase meaning",
    "find synonyms",
    "language converter",
    "grammar check",
    "English to",
    "how to read",
    "how to spell",
    "pronunciation of",
    "definition lookup",
    "language helper",
    "correct spelling",
    "translation options",
    "language tutor",
    "vocabulary booster",
    "common questions",
    "phrases in",
    "meaning finder",
    "language app",
    "multi-language support",
    "phrase examples",
    "learn expressions",
    "phrase structure",
    "language guide",
    "convert to",
]

# Seed queries for the catch-all "unknown" label: invented nonsense words,
# keyboard mashes ('qwerty', 'asdfgh'), placeholder/filler phrases, plus some
# ordinary words and "<Word> City" names near the end of the list.
# NOTE(review): "freedom", "crumplish" and "grimpish" each appear twice. That is
# harmless once the strings become dict keys, but it inflates len() of this list.
# NOTE(review): 'spoon' is also listed under "# common nouns" in the list defined
# just before translation_examples_partial. This list is merged last into
# partial_queries_with_intents, so that shared key ends up labelled "unknown".
unknown_examples_partial = [
    'snoozlegrip', 'shenanigans', 'kerplunk', 'clip', 'snappyy', 'spindlywhack', 'crinkly', 'pressed enter too soon', 
    'try this', 'query here', 'mistyped selection', 'smorgasbord', 'crumplify', 'snooze', 'twonkle', 'bamboozlemate', 
    'this doesn’t matter', 'zap', 'mind blank', 'hiss', 'snagged', 'splurgy', 'snagglebash', 'guess', 'zapz', 'frap', 
    'blotter', "don't even know", 'don’t know answer', 'spindletastic', 'zizzlesplat', 'jinkled', 'placeholder search', 
    'uncertain search', 'splode', 'abcxyz', 'twangleblop', 'shifty', 'bumfuzzle', 'plunge', 'thingy', 
    'swooshenator', 'quark', 'tatterblast', 'frizzlefry', 'something random', 'puff', 'blobby', 'placeholder attempt', 
    'weird example', 'wiggle', 'snortleboo', 'bouncy', 'qwerty', 'whirl', 'nix', 'idk what', 'random search', 
    'glimmering', 'guzzle', 'strange text', 'accidental hit', 'forgot keypress', 'dazzleplunk', 'snurply', 
    'confused', 'weird gibberish', 'idc either', 'test123', 'huff', 'supercalifragilistic', 'clap', 'whoopsie', 'nump', 
    'lorem ipsum', 'snuffle', 'unknown phrase', 'whizz', 'bloop', 'glitch', 'zomp', 'clappy', 'gush', 'zappletastic', 
    'hooey', 'bing', 'slap', 'ting', 'miscellaneous', 'jingle', 'idk just looking', 'twangy', 'dinglefrizzle', 
    'just clicking', 'quizzical', 'splatterdash', 'kerplunkitude', 'fizzlematic', 'piff', 'jazz', 'jib', 'random phrase', 
    'flapper', 'uhmm', 'nothing much', 'sdf', 'snub', 'confusing example', 'keyboard smash', 'randomized words', 
    'nothing useful', 'random sentence', 'placeholder input', 'splattergrip', 'zorp', 'fluffernutter', 'splopp', 
    'incomplete search', 'check this out', 'woozle', 'bananarama', 'quiz', 'spiffy', 'undefined', 'confusing term', 'sploom', 
    'randomized example', 'spliffy', 'ooze', 'blazing', 'uncertain input', 'unknown search', 'random guesses', 
    'unknown', 'concept unclear', 'accidental input', 'sporkinator', 'whats this', 'maybe', 'ignore this', 'twinkle', 
    'whatchamacallit', 'splank', 'weird thing', 'huh', 'into the unknown', 'chaos', 'wigglie', 'twistamatic', 'kerflapify', 
    'twizzletude', 'mock', 'thud', 'shrug', 'grizzed', 'jibberjabber', 'weirdness', 'anything', 'plop', 'dazzlicious', 
    'random selection', 'splatt', 'abracadabra', 'whooshenator', 'random mouse click', 'sparklefish', 'banal', 
    "what's the word", 'mistyped search', 'twinklebash', 'splush', 'splazz', 'forgot search term', 'crumplamatic', 'glee', 
    'whizzy', 'whizzlemate', 'jumpy', 'dork', 'randomxyz', 'gobsmacktastic', 'no clue what', 'zazz', 'beyond the void', 
    'weird try', 'drift', 'yank', 'yodelsnap', 'biff', 'forgot randomness', 'splatterblast', 'no idea', 'smooshify', 
    'peep', 'rick', 'splendiferous', 'squishy', 'muff', 'flabbergizmo', 'confuzzled', 'I think so', 'zing', 
    'meaningless typing', 'shush', 'zany', 'don’t need help', 'randomly chosen', 'warpydash', 'forgot words', 
    'placeholder typing', 'spunky', 'spindleplop', 'crash', 'flabbergast', 'snaggleplop', 'hootnanny', 'blurp', 
    'miff', 'snarkle', 'snookie', 'gleamitude', 'hello world', 'zag', 'accidental gibberish', 'nothing in mind', 
    'bash', 'spiv', 'rift', 'don’t know what to search', 'splong', 'no point', 'forgot attempt', 'fluttermate', 
    'flub', 'guff', 'dazzled', 'doodad', 'forgot term', 'blotchy', 'odd', 'kerplazzle', 'grubby', 'try to see', 'glop', 
    'whooshify', 'snicker', 'snuffly', 'random thought', 'mixed up stuff', 'zapper', 'sort of searching', 'slushy', 
    'blurification', 'mop', 'smit', 'splurge', 'meaningless input', 'quix', 'zapplarific', 'splang', 'zoinkalicious', 
    'unclear selection', 'splushy', 'guesstimate', 'snazzie', 'what about this', 'input fail', 'codswallop', 'dink', 'splunk', 
    'unclear', 'strange example', 'jitter', 'sploff', 'blip', 'unknown meaning', 'nope', 'gadzooks', 'odd example', 
    'zappomatic', 'janglystorm', 'ink', 'wobbled', 'wigglyy', 'typed by mistake', 'twirly', 'lurk', 'kerplottify', 
    'twizzlefang', 'muck', 'clunky', 'splatterific', 'clippy', 'oops input', 'what am I doing', 'qazwsxedc', 'does it matter', 
    'nonsensical', 'swooshinator', 'poiuuy', 'splish', 'mistyped query', 'squizzlewhack', 'what now', 'spluzz', 'glim', 
    'placeholder keypress', 'mistyped randomness', 'what is it', 'don’t know why', 'quibbleplop', 'guess what', 'snizzlezap', 
    'meaning of nothing', 'wiggles', 'zxcvbn', 'spur', 'uncertain term', 'what am I typing', 'zoodleblorp', 'floppy', 'asdfasdf', 
    'confused input', 'unclear sentence', 'snortlematic', 'smooshinator', 'random term', 'searching something', 
    'snorflemate', 'twinkly', 'skip', 'quib', 'forgotten term', 'oops', 'splodge', 'meaningless words', 'unclear input', 
    'unclear phrase', 'zoom', 'sneeze', 'cat on keyboard', 'nincompoop', 'zappification', 'warpington', 'splurty', 
    'do I know', 'splott', 'splurb', 'plink', 'dazzlematic', 'could be anything', 'lost thoughts', 'what', 'pizz', 
    'jiggles', 'splodgy', 'twang', 'i forgot', 'meaningless term', 'unclear search', 'thunderplunk', 'just pressing keys', 
    'splodgify', 'flit', 'snazzify', 'zoop', 'totally confused', 'quip', 'womp', 'wham', 'wigglyz', 'fuzzyy', 'why is this here', 
    'malarkey', 'widget', 'don’t care', 'scoff', 'randomized search', 'unclear example', 'pop', 'quash', 'uh oh', 
    'placeholder randomness', 'splatification', 'snickerplunk', 'nutterbutter', 'whisk', 'nibs', 'help', 'strange attempt', 
    'blurptacular', 'gizmo', 'forgotten query', 'spazzy', 'ding', 'lost search', 'buzzing', 'hum', 'nonsensicality', 
    'gloop', 'globby', 'lost meaning', 'plopperific', 'hard to say', 'snappy', 'don’t type this', 'blunderous', 'twizzlegrip', 
    'flappy', 'random keypress', 'zizzlewhack', 'forgot what I typed', 'zingerdoodle', 'randomized attempt', 'unsure words', 
    'strange sentence', 'asfjkl', 'frizz', 'idk', 'gobbledygook', 'flibbertigibbet', 'gadzookify', 'flabberzap', 'vroom', 
    'splitch', 'glimmerstorm', 'blurt', 'frizzle', 'meaningless search', 'thingamajig', 'murmur', 'not this', 'sploof', 
    'fiddlewhip', 'mumbojumbo', 'something strange', 'splurg', 'fake input', 'whiffle', 'forgot query', 'search mix', 
    'yapplify', 'zippy', 'splurpy', 'splat', 'zoinks', 'bizz', 'crumby', 'meaningless query', 'snickerdoodle', 'weird word', 
    'squidge', 'don’t know term', 'spangletude', 'spazzmatic', 'just testing', 'baffled', 'splurt', 'gaze', 'frizzy', 
    'bamboozling', 'slurp', 'zappertude', 'splorch', 'swooshtastic', 'dunk', 'honk', 'smudgy', 'flimmerstorm', 'tizz', 
    'uncertain randomness', 'jangletude', 'perhaps this', 'placeholder search term', 'whoosh', 'spike', 'glitterbop', 
    'idiosyncratic', 'odd typing', 'blob', 'bazzlemate', 'crumpleton', 'clutterbomb', 'whatever', 'kerfuffle', 'test input', 
    'randomized keypress', 'meaningless randomness', 'why not', 'snizzleblap', 'bonk', 'forgot search', 'zonk', 'whatsisname', 
    'doesn’t matter', 'splurgz', 'twig', 'ramblethorp', 'fake query', 'ping', 'smack', 'buzz', 'tingly', 'warpydoodle', 
    'filler words', 'buzzed', 'unclear thought', 'weird input', 'blap', 'snazzy', 'look for this', 'snorkelwhip', 'spoon', 
    'just guessing', 'glitche', 'swirl', 'snooker', 'search fail', 'random gibberish', 'abstract thought', 'spindelicious', 
    'snorple', 'fell asleep typing', 'splunge', 'twit', 'grippy', 'flip', 'whatsisface', 'maybe something', 'bamboozle', 
    'zinger', 'drizzleblip', 'splonky', 'what do I search', 'blat', 'another try', 'odd randomness', 'yarn', 'squib', 
    'confused term', 'flabbergasted', 'testing input', 'don’t know', 'thunderbop', 'blurpsational', 'janglydash', 'brouhaha', 
    'find out about', 'strange randomness', 'kerplizzle', 'meaningless attempt', 'spud', 'placeholder term', 'woof', 'splaff', 
    'jigglez', 'fuzzed', 'blahblah', 'grizzle', 'something here', 'blink', 'snuggly', 'yelp', 'chop', 'eternal question', 'splift', 
    'what do you mean', 'hullabazoo', 'cloggy', 'wrong key pressed', 'test again', 'don’t ask me', 'blur', 'twisty', 'flapperdash', 
    'crinklewhip', 'plinky', 'gobbleplop', 'I don’t understand', 'random', 'dummy text', 'blurblenator', 'try something', 'input here', 
    'thing', 'fringe', 'no answer', 'placeholder selection', 'test', 'spangleplop', 'splash', 'lost in thought', 'zest', 
    'fiddleplop', 'bunk', 'snag', 'vex', 'placeholder randomness example', 'spat', 'placeholder phrase', 'random search term', 
    'squigg', 'tinge', 'random words', 'unknown query', 'not useful', 'snuzzlefrump', 'type here', 'snuzzle', 'drip', 'gibberish', 
    'hodgepodge', 'forgot the term', 'completely random', 'doesn’t make sense', 'lost', 'splatterstorm', 'meaningless text', 
    'twizzle', 'find something', 'twinkletude', 'zine', 'spunked', 'crikey', 'mistaken input', 'no idea what this is', 'spork', 
    'glimmertastic', 'sloppy', 'twirky', 'abstract query', 'fluffytude', 'randomized selection', 'randomized randomness', 
    'nudge', 'gawk', 'buzzer', 'nonsensical search', 'i was curious', 'zapplify', 'cloppy', 'doohickey', 'snickly', 'doodle', 
    'placeholder example', 'placeholder text', 'nonsense search', 'why search this', "this doesn't work", 'splendiferific', 
    'crappy', 'what are words', 'clop', 'randomized term', 'weird', 'snazztastic', 'whizzbang', 'blaze', 'twangaloo', 
    'strange keypress', 'placeholder query', 'skew', 'splink', 'lkjhgfd', 'unclear meaning', 'flummoxify', 'lollygag', 
    'odd gibberish', 'clunk', 'snap', 'zapf', 'flummoxed', 'yawn', 'random input', 'strange word', 'zapplomatic', 
    'does this work', 'gasp', 'typing nothing', 'idk anymore', 'empty thoughts', 'pluck', 'randomized test', 
    'brain fog', 'squibbletude', 'fizzle', 'jinglyy', 'mistyped term', 'confused mind', 'random typing', 'asdfgh', 
    'infinity', 'twist', 'something typed', 'kerplunktastic', 'just trying this', 'mistaken search', 'sparklematic', 
    'woop', 'jittery', 'oopsie', 'snippy', 'splinky', 'splint', 'swooshification', 'spit', 'zinged', 'blop', 'lost words',
    'crux', 'blurbleplop', 'balderdash', 'perhaps not', 'flibber', 'snickerwhack', 'try later', 'zork', 'void', 
    'accidental query', 'fumble', 'snarked', 'don’t care search', 'just looking', 'spindling', 'snip', 'squish', 
    'blazer', 'splo', 'splunky', 'unclear randomness', 'spliff', 'not this either', 'nonsensical words', 
    'testing random', 'snigglewhap', 'odd input', 'whizzlegrip', 'dazzlegrip', 'fling', 'meaning of gibberish', 
    'weird thoughts', 'gunk', 'does this help', 'flux', 'wink', 'wonky', 'wisp', 'drizzlematic', 'another test', 
    'test search', 'just wondering', 'crumblewhack', 'spaz', 'splung', 'skid', 'quirky', 'odd search', 'accidental term', 
    'dunno', 'quizzicality', 'gleam', 'glimmer', 'don’t press enter', 'gadget', 'whizzleplop', 'don’t know exactly', 
    'odd words', 'blotty', 'thunderblop', 'maybe not', 'spludge', 'discombobulated', 'stuff', 'halfway done', 
    'sparklenator', 'zang', 'jolt', 'accidental search', 'what is going on', 'wiggler', 'mnbvcxz', 'yip', 'wriggle', 
    'hullaballoo', 'janglenut', 'zapplesmash', 'janglitude', 'what is this', 'whip', 'tiddlywinks', 'wiggly', 'weird randomness', 
    'sporkalicious', 'wriggy', 'meaningless selection', 'crumble', 'weird thought', 'splurch', 'don’t understand', 
    'sploosh', 'yap', 'nonsense', 'wobble', 'question of life', 'randomly typed', 'snuggle', 'snizzlegrip', 'oops I typed', 
    'zappy', 'twinkleplop', 'uncertain example', 'idc', 'mash', 'not sure', 'pandemonium', 'perhaps later', 'quirked', 
    'smug', 'warp', 'dash', 'could be nothing', 'unsure search', 'jumbled phrases', 'hush', 'wibble', 'weird search', 
    'quibberish', 'flop', 'discombobulate', 'this makes no sense', 'fizz', 'quirkitude', 'zingzang', 'dank', 'limitless', 
    'this is random', 'crunch', 'vibe', 'nothing specific', 'forgot', 'not important', 'slosh', 'question mark', 'zoopendous', 
    'flummify', 'splosh', 'splorp', 'splishy', 'snurkle', 'blah', 'guess answer', 'twitch', 'flap', 'snooperdoodle', 
    'janglybits', 'snizzleflap', 'slush', 'snortlemate', 'quirk', 'void query', 'fizzled', 'lollygagging', 'wonkifying', 
    'nothing', 'splunch', 'hullabaloo', 'thingamabob', 'dazzlebash', 'whizzie', 'this and that', 'shard', 'twix',
    # Further invented nonsense words (double-quoted from here on).
    "crumpled", "splizzle", "gargle", "mangled", "shamble", "wobblish", "drizzlepop",
    "splinker", "fiddlest", "twizzlepop", "blurzzle", "snizzlewick", "wozzle", 
    "cracklepop", "glibbish", "twezzle", "boondock", "sizzleflip", "snigglemash",
    "zazzle", "fizzlepot", "scramble", "tinglish", "sprozzle", "blimble", "zibble",
    "slapdash", "gobstork", "ziggler", "flingle", "wrangly", "twizzlebit", "brambly",
    "snubble", "splintery", "fizznack", "tibber", "quaggly", "whooshpop", "snibble",
    "plunkish", "glimflash", "wobbert", "squidgy", "kerplonk", "fobble", "blurzy",
    "scriggly", "smudgify", "tassler", "whipple", "snuzzify", "zaggle", "plonker",
    "smizzle", "quiggle", "spongle", "shizzle", "drippity", "bogglepop", "twiddly",
    "puzzleth", "flummish", "sniggleflop", "crumplish", "twiggle", "nubbish", 
    "splurkle", "whibber", "jibblish", "twonker", "fizzlewhip", "spazzle", "splorpish",
    "snuffler", "hubble", "twinkler", "crumpler", "wimbley", "twazzle", "blurbonic",
    "zapplepop", "flippery", "snuzle", "quizzwhip", "clatter", "garglunk", "splingle",
    "drabbler", "spunkly", "jumbler", "snappish", "zingify", "buzzpop", "snizzlehop",
    "plobber", "scribble", "twongle", "scrabbly", "sniggler", "bimblepop", "snorplebop",
    "wizzle", "blimpy", "splinglepop", "frizzlepop", "grizzleton", "whizbang", 
    "tinklish", "blopple", "blurbit", "wozzly", "zingpong", "splimble", "twinklypop",
    "spinkly", "snubbleton", "glozzle", "splonkle", "quizzle", "drizzlebot", "snarbly",
    "twizzleth", "whizzleton", "crumblish", "snapple", "splozzle", "glimmish", 
    "plimbish", "snuzzleblop", "twinklish", "fizzywhip", "snorblish", "drizzler", 
    "flopplish", "smizzlepop", "crumpledash", "twizzlefizz", "plumbly", "smuzzle",
    "tizzler", "gobblish", "splunkton", "jibberdash", "sproingly", "snizzler", 
    "glabble", "twinkleflip", "flobble", "twonklepop", "splittish", "grumblepop",
    "whimblish", "splingledash", "snarpish", "twinklybit", "spindlish", "grubble",
    "smarple", "twonkerish", "sniffly", "snibbleton", "grizzlepop", "tazzler", 
    "splinsh", "snazzler", "twinklepuff", "zopple", "glunkish", "crizzlepop", 
    "snarklebot", "whibblish", "flimmerdash", "splurpyton", "snuzzlepop", "wigglerish",
    "sniggleplop", "jigglish", "splurble", "buzzsnip", "plomble", "splattypop", 
    "twinklepip", "twonglish", "flobber", "grimpish", "quaggler", "sporkish", 
    "drizzleth", "squiggler", "splobber", "ploppish", "snigglerish", "splingleth",
    "grizzleblop", "sploblish", "snarbler", "smarvish", "quizzlet", "snapplish",
    "snuzleflip", "plongish", "crizzlebot", "grimpish", "twinklebot", "blurpish",
    "splopple", "gizzleth", "drizzlepuff", "twonklish", "snubbler", "blurblebot",
    "splizzy", "twinkleton", "jibbler", "splizzlepop", "splurbit", "plobblish", 
    "crumplish", "snizzlebit", "twinklishbot", "spinkler", "snibbleflip", "wigglebot",
    "twonglishbot", "snizzleton", "splongle", "blonker", "glimmerbit", "snarvish",
    # Ordinary words (feelings, occasions, seasons, ordinals, fruit, ...) that are
    # also given the "unknown" label.
    "love", "anger", "hope", "dream", "thought", "courage", 
    "strength", "patience", "birthday", "anniversary", 
    "vacation", "weekend", "holiday", "winter", "summer", 
    "autumn", "spring", "success", "failure", "freedom", "peace", "wisdom", 
    "kindness", "respect", "free", "freedom", "great", "best", "worst", "last", "first", "second", 
    "next", "there", "proposal",
    "proposa", "big city", "banana", "mango", "pineapple", "apple", "grapes", "orange",
    # Two-word "<Word> City" names.
    "Big City", "Silver City", "Golden City", "Mystic City",
    "Sunset City", "Iron City", "Emerald City", "Shadow City", "Crystal City",
    "Harmony City", "Aurora City", "Dream City", "Thorn City", "Lunar City", "Twilight City", 
    "Velvet City", "Willow City", "Ivory City", "Eclipse City",
    "Storm City", "Bliss City", "Shimmer City", "Echo City", "Frost City",
    "Sapphire City", "Obsidian City", "Tranquil City", "Starlight City",
    "Drift City", "Amber City", "Hollow City", "Gilded City", "Quartz City",
    "Meadow City", "Rosewood City", "Timber City", "Bright City", "Fox City",
    "Dusk City", "Goldenleaf City", "Wind City", "Harbor City", "Cedar City",
    "Azure City", "Elder City", "Crescent City", "Pine City", "Summit City",
    "Cobalt City", "Bluff City", "Stone City",

]




In [None]:
# Report how many seed examples each intent list holds, one line per list.
for list_name, seed_examples in (
    ("information_examples_partial", information_examples_partial),
    ("yelp_examples_partial", yelp_examples_partial),
    ("weather_examples_partial", weather_examples_partial),
    ("navigation_examples_partial", navigation_examples_partial),
    ("travel_examples_partial", travel_examples_partial),
    ("purchase_examples_partial", purchase_examples_partial),
    ("translation_examples_partial", translation_examples_partial),
    ("unknown_examples_partial", unknown_examples_partial),
):
    print(f"#{list_name} examples = {len(seed_examples)}")


In [None]:
# Add noise to simulate partial, incomplete, or typo-filled queries
def generate_partial_variations(base_phrase, num_partial_variations=3):
    """Return noisy variants of ``base_phrase`` that mimic a half-typed query.

    Each attempt:
      1. drops the last one or two characters and strips surrounding whitespace,
      2. overwrites one randomly chosen character with a random lowercase
         letter (this may hit a space, or pick the same letter again),
      3. with roughly 1% probability each, adds a generic prefix and/or suffix
         word.

    Args:
        base_phrase: Query text to perturb.
        num_partial_variations: Number of attempts to make.

    Returns:
        A list with at most ``num_partial_variations`` strings; duplicates are
        possible. An attempt whose truncation leaves an empty string is skipped
        (previously it raised ``ValueError`` from ``random.randint(0, -1)``), so
        empty, whitespace-only, or one-to-two character phrases may produce
        fewer variants or none.
    """
    variations = []
    for _ in range(num_partial_variations):  # One noisy variant per attempt
        # Truncate phrase: keep everything but the last 1-2 characters.
        trunc_index = random.randint(len(base_phrase) - 2, len(base_phrase) - 1)
        truncated = base_phrase[:trunc_index].strip()
        if not truncated:
            # Nothing left to put a typo into; skip instead of crashing below.
            continue

        # Add typos: replace exactly one character of the truncated text.
        typo_index = random.randint(0, len(truncated) - 1)
        typo_variation = truncated[:typo_index] + random.choice("abcdefghijklmnopqrstuvwxyz") + truncated[typo_index + 1:]

        # Add random prefix/suffix (rare: each fires only when random() > 0.99).
        prefix = random.choice(["how", "find", "get", "best", "near"]) if random.random() > 0.99 else ""
        suffix = random.choice(["info", "details", "nearby", "today", "now"]) if random.random() > 0.99 else ""

        # Combine; strip() removes the padding left by an empty prefix/suffix.
        combined_variation = f"{prefix} {typo_variation} {suffix}".strip()
        variations.append(combined_variation)

    return variations

# Qualifier phrases prepended to yelp / purchase seed queries.
preModifiers = [
    "best", "cheap", "good", "affordable", "budget",
    "top quality", "absolutely best", "cheapest", "best rated", "best local",
    "the best", "local", "find local", "small local", "list of",
]


def generate_large_entry_dict(base_examples, target_intent, num_variations):
    """Expand seed queries into a ``{query: target_intent}`` dictionary.

    For every seed in ``base_examples`` the result holds:
      * the seed itself,
      * for ``'yelp_intent'`` five, and for ``'purchase_intent'`` one,
        "<modifier> <seed>" entries, with modifiers drawn with replacement
        from ``preModifiers``,
      * noisy variants from ``generate_partial_variations``.

    Args:
        base_examples: Iterable of seed query strings.
        target_intent: Label stored as the value of every entry.
        num_variations: Size cap that only gates the noisy variants. Seeds and
            modifier-prefixed entries are always added, so the result can be
            larger than this value.

    Returns:
        dict mapping query text to ``target_intent``.
    """
    # The intent is fixed for the whole call, so resolve its settings once:
    # (number of modifier-prefixed copies, number of noisy variants per seed).
    if target_intent == 'yelp_intent':
        modifier_count, partial_count = 5, 2
    elif target_intent == 'purchase_intent':
        modifier_count, partial_count = 1, 1
    elif target_intent == 'unknown' or target_intent == 'navigation_intent':
        modifier_count, partial_count = 0, 1
    elif target_intent == 'information_intent':
        modifier_count, partial_count = 0, 0
    else:
        modifier_count, partial_count = 0, 2

    entries = {}
    for seed in base_examples:
        entries[seed] = target_intent

        if modifier_count:
            for modifier in random.choices(preModifiers, k=modifier_count):
                entries[modifier + " " + seed] = target_intent

        for candidate in generate_partial_variations(seed, partial_count):
            if len(candidate) <= 3 or candidate in entries:
                continue  # too short to be useful, or already present
            if len(entries) < num_variations:
                entries[candidate] = target_intent
    return entries

# Compile all intents with increased variation.
# Each row is (seed list, intent label, cap passed to generate_large_entry_dict).
# Intents are merged in this order and a later intent overwrites an earlier one
# when both produce the same query string. That used to happen silently, so the
# collisions are now collected and reported.
# NOTE(review): generation draws from the global `random` state; seed it before
# this cell if a reproducible dataset is required.
intent_generation_plan = [
    (information_examples_partial, "information_intent", 5000),
    (yelp_examples_partial, "yelp_intent", 12000),
    (weather_examples_partial, "weather_intent", 3000),
    (navigation_examples_partial, "navigation_intent", 3000),
    (travel_examples_partial, "travel_intent", 3000),
    (purchase_examples_partial, "purchase_intent", 3000),
    (translation_examples_partial, "translation_intent", 500),
    (unknown_examples_partial, "unknown", 3000),
]

partial_queries_with_intents = {}
cross_intent_collisions = {}  # query -> (label that was replaced, label that won)
for seed_examples, intent_label, max_entries in intent_generation_plan:
    generated_entries = generate_large_entry_dict(seed_examples, intent_label, max_entries)
    for query_text in generated_entries:
        previous_label = partial_queries_with_intents.get(query_text)
        if previous_label is not None and previous_label != intent_label:
            cross_intent_collisions[query_text] = (previous_label, intent_label)
    partial_queries_with_intents.update(generated_entries)

print(f"Queries generated for more than one intent (last intent wins): {len(cross_intent_collisions)}")
for query_text, (replaced_label, winning_label) in list(cross_intent_collisions.items())[:10]:
    print(f"  {query_text!r}: {replaced_label} -> {winning_label}")

# Verify the total count
print(f"Total entries generated: {len(partial_queries_with_intents)}")

In [None]:
# Side-by-side size check of the purchase and information seed lists.
len(purchase_examples_partial), len(information_examples_partial)

In [None]:
# purchase_examples_partial

In [None]:
# Preview the first few generated (query -> intent) pairs. Echoing the whole
# dictionary floods the notebook output with every generated entry.
dict(list(partial_queries_with_intents.items())[:20])

In [None]:
# Turn the query -> intent mapping into one record per query, then into a frame
# with the columns 'sequence' (query text) and 'target' (intent label).
partial_queries = [
    {'sequence': query_text, 'target': intent_label}
    for query_text, intent_label in partial_queries_with_intents.items()
]
partial_queries_df = pd.DataFrame(partial_queries)
print(len(partial_queries_df))
partial_queries_df

In [None]:
# Number of generated queries per intent label.
partial_queries_df['target'].value_counts()

In [None]:
# Show the generated rows labelled 'navigation_intent'.
partial_queries_df.loc[partial_queries_df['target'] == 'navigation_intent']

In [None]:
# def apply_target_mapping(df, target_mapping):
#     mapped_text_set = set()
#     for ngram in target_mapping.keys():
#         # mask = df['sequence'].apply(lambda text: ngram in text)
#         mask = df['sequence'].apply(lambda text: ngram in text and text not in mapped_text_set)
#         print(f'Number of matches found for "{ngram}"  = {mask.sum()}')
#         print(f'size of mapped_text_set = {len(mapped_text_set)}')
#         df.loc[mask, 'target'] = target_mapping[ngram]
#         mapped_text_set.update(df.loc[mask, 'sequence'].values.tolist())
#         print()

In [None]:
def apply_target_mapping(df, target_mapping, ngram, mapped_text_set):
    """Label every not-yet-mapped row of ``df`` whose sequence contains ``ngram``.

    Args:
        df: DataFrame with a 'sequence' column. Its 'target' column is
            written in place for the matching rows.
        target_mapping: Mapping from n-gram to the target value to assign.
        ngram: A key of ``target_mapping``; matched as a plain substring of
            each sequence.
        mapped_text_set: Set of sequences labelled by earlier calls. Rows
            whose sequence is already in it are skipped, and the sequences
            labelled by this call are added to it in place.

    Raises:
        KeyError: If ``ngram`` is not a key of ``target_mapping``.
    """
    def _is_unmapped_match(text):
        # Missing sequences (None / NaN) are not strings; treating them as
        # "no match" avoids a TypeError from the `in` test.
        return (
            isinstance(text, str)
            and ngram in text
            and text not in mapped_text_set
        )

    # astype(bool) keeps the mask boolean even when ``df`` has no rows.
    mask = df['sequence'].apply(_is_unmapped_match).astype(bool)
    print(f'Number of matches found for "{ngram}"  = {mask.sum()}')
    # Reported before the update below, i.e. the size seen by this call.
    print(f'size of mapped_text_set = {len(mapped_text_set)}')
    df.loc[mask, 'target'] = target_mapping[ngram]
    mapped_text_set.update(df.loc[mask, 'sequence'].tolist())
    print()

In [None]:
# Split the MARCO queries by whether they already carry a target label.
labelled = marco_df.loc[marco_df['target'].notna()].copy()
to_be_labelled = marco_df.loc[marco_df['target'].isna()].copy()

In [None]:
len(to_be_labelled), len(labelled)

In [None]:
# Load the hand-labelled queries and keep only the rows that have a label.
manual_labelled = pd.read_csv("../data/manual_labels_v2.csv")
manual_labelled = manual_labelled.loc[manual_labelled['target'].notna()]
print(len(manual_labelled))
print(manual_labelled['target'].value_counts())
# sequence -> target lookup; for a repeated sequence the last row wins.
manual_labelled_lkp = manual_labelled.set_index('sequence')['target'].to_dict()
manual_labelled.head()

In [None]:
def apply_manual_mapping(df, manual_labelled_lkp):
    """Copy manual labels onto the rows of ``df`` whose sequence has one.

    Args:
        df: DataFrame with a 'sequence' column; its 'target' column is
            written in place for the matching rows.
        manual_labelled_lkp: Mapping from sequence text to target label.
    """
    def _has_manual_label(text):
        return text in manual_labelled_lkp

    matched = df['sequence'].apply(_has_manual_label)
    print(f'Number of matches found in manual labels = {matched.sum()}')
    manual_targets = df.loc[matched, 'sequence'].map(manual_labelled_lkp)
    df.loc[matched, 'target'] = manual_targets
    print()

In [None]:
# Transfer the manual labels, then move every newly labelled row from the
# unlabelled pool into `labelled`.
apply_manual_mapping(to_be_labelled, manual_labelled_lkp)
# Fixed seed so this shuffle is repeatable for the same input rows.
labelled = pd.concat(
    [labelled, to_be_labelled.loc[to_be_labelled['target'].notna()]],
    axis=0,
).sample(frac=1.0, random_state=42)
# .copy() makes the remaining pool an independent frame: it is written to in
# place later (df.loc[mask, 'target'] = ...), which would otherwise raise
# SettingWithCopyWarning on a slice.
to_be_labelled = to_be_labelled.loc[to_be_labelled['target'].isna()].copy()
print(f"to_be_labelled: {len(to_be_labelled)}, labelled: {len(labelled)}")

In [None]:

print(f"Number of examples labeled = {len(labelled)}")
print(f"Number of examples to be labeled = {len(to_be_labelled)}")
print(f"Label stats \n{labelled['target'].value_counts()}\n")

# Step 3: Count the most common n-grams among the still-unlabelled queries.
n = 2  # n-gram size: 1 for unigrams, 2 for bigrams, 3 for trigrams
to_be_labelled_sequence_list = to_be_labelled['sequence'].values.tolist()
ngram_counter = count_ngrams(to_be_labelled_sequence_list, n)
most_common_ngrams = ngram_counter.most_common(100)

# Display the most common n-grams
print(most_common_ngrams)

# Print at most `max_results` of the queries returned for the search words.
max_results = 100
for cnt, query in enumerate(search_queries_by_words("5 star", to_be_labelled_sequence_list), start=1):
    if cnt > max_results:
        break
    print(cnt, query)



In [None]:
translate_intent_additional_queries_df

In [None]:
weather_examples

In [None]:
general_yelp_keyword[:10]

In [None]:
# Keep only the short keywords: at most one space, i.e. one or two
# space-separated tokens (despite the "one_word" name).
one_word_yelp_examples = [kw for kw in general_yelp_keyword if kw.count(" ") <= 1]
len(one_word_yelp_examples)

In [None]:
# Repeat the short keyword list three times and tag every row as Yelp intent.
one_word_yelp_examples_df = pd.DataFrame(
    {"sequence": one_word_yelp_examples * 3}
).assign(target='yelp_intent')
one_word_yelp_examples_df

In [None]:
# Pool the already-labelled rows with every hand-built / generated example set.
# NOTE(review): this cell re-binds `labelled`, so running it a second time
# appends the example sets again.
labelled_parts = [labelled,
                  weather_examples,
                  yelp_examples,
                  purchase_intent_examples,
                  yelp_intent_additional_queries_df,
                  navigation_intent_additional_queries_df,
                  travel_intent_additional_queries_df,
                  translate_intent_additional_queries_df,
                  unknown_intent_additional_queries_df,
                  # information_intent_additional_queries_df,
                  partial_queries_df,
                  one_word_yelp_examples_df,
                  ]
# Running row count, so the per-iteration report does not need a concat.
labelled_count = sum(len(part) for part in labelled_parts)

# Work on an independent frame: apply_target_mapping writes 'target' in place,
# which raises SettingWithCopyWarning when the frame is a slice.
to_be_labelled = to_be_labelled.copy()

mapped_text_set = set()
for i, ngram in enumerate(target_mapping.keys()):
    print()
    print(f"iteration {i+1}: to_be_labelled: {len(to_be_labelled)}, labelled: {labelled_count}")
    apply_target_mapping(to_be_labelled, target_mapping, ngram, mapped_text_set)
    has_target = to_be_labelled['target'].notna()
    newly_labelled = to_be_labelled.loc[has_target]
    labelled_parts.append(newly_labelled)
    labelled_count += len(newly_labelled)
    to_be_labelled = to_be_labelled.loc[~has_target].copy()

# Concatenate once at the end instead of re-copying the whole (growing)
# `labelled` frame on every n-gram.
labelled = pd.concat(labelled_parts, axis=0)

In [None]:
# Shuffle the rows; the fixed seed makes this shuffle repeatable for the same
# input rows instead of changing on every run.
labelled = labelled.sample(frac=1.0, replace=False, random_state=42)

In [None]:
labelled[:60]

#### Optional: prepare a query list for manual labeling (skip unless needed)

In [None]:
## Only needed when a special list for the manual labeling process is wanted; otherwise skip.

SKIP_MANUAL_LABEL_PREP = True
if not SKIP_MANUAL_LABEL_PREP:
    # Look at no more than this many search hits (repeated hits count too).
    max_hits = 10000
    special_list = set()

    for cnt, query in enumerate(search_queries_by_words("how much", to_be_labelled_sequence_list)):
        if cnt >= max_hits:
            break
        special_list.add(query)

    # Sort before writing: the iteration order of a set of strings is not
    # stable between interpreter sessions, so an unsorted export would change
    # row order from run to run.
    pd.DataFrame(sorted(special_list), columns=['sequence']).to_csv('special_list_manual_label.csv', index=False)

In [None]:
to_be_labelled

In [None]:
labelled['target'].value_counts()

In [None]:
# Horizontal bar chart of how many training rows each target has.
fig, ax = plt.subplots(figsize=(6, 3))
ax.set_title("training data distribution")
labelled['target'].value_counts().sort_values().plot.barh(ax=ax);

In [None]:
# labelled.loc[labelled['target'] == 'translation_intent']['sequence'].sample(100).values

In [None]:
# labelled.loc[labelled['sequence'].apply(lambda q: "sf " in q)]['sequence'].values

In [None]:
# Labelled and still-unlabelled rows in one frame with a fresh 0..n-1 index.
combined = pd.concat([labelled, to_be_labelled], axis=0, ignore_index=True)
print(len(combined))
combined

In [None]:
labelled['target'].value_counts()

In [None]:
# Shuffle once more and export the training set; the fixed seed makes this
# shuffle repeatable for the same input rows.
labelled.sample(frac=1.0, replace=False, random_state=42).to_csv("../data/marco_train_v7.csv", index=False)

In [None]:
import pandas as pd
from umap import UMAP
from sklearn.pipeline import make_pipeline 
from embetter.text import SentenceEncoder


# Set to True to reuse the coordinates cached in marco_ready.csv instead of
# re-encoding every query.
SKIP_ENCODING = False
if not SKIP_ENCODING:
    # Build a sentence encoder pipeline with UMAP at the end.
    # NOTE(review): UMAP() is created without a random_state, so the 2-D
    # layout is presumably not identical from run to run -- confirm.
    enc = SentenceEncoder('all-MiniLM-L6-v2')
    umap = UMAP()

    text_emb_pipeline = make_pipeline(
      enc, umap
    )

    # Load sentences
    X = combined['sequence'].values.tolist()

    # Calculate embeddings
    X_tfm = text_emb_pipeline.fit_transform(X)

    # Write to disk. Note! Text column must be named "text"
    df = pd.DataFrame({"text": X})
    df['x'] = X_tfm[:, 0]
    df['y'] = X_tfm[:, 1]
    df.to_csv("marco_ready.csv", index=False)
    # Added after the export, so the file holds only text/x/y. The rows were
    # built from `combined` in order, so a positional assignment is exact;
    # to_numpy() keeps it positional whatever index `combined` carries.
    df['target'] = combined['target'].fillna('unknown').to_numpy()
else:
    df = pd.read_csv("marco_ready.csv")
    # Join on the query text rather than on row position: the cache may have
    # been written from a `combined` with a different row order or length,
    # and a positional assignment would then attach labels to the wrong
    # queries. For a repeated sequence the first label in `combined` is used.
    target_by_text = (
        combined
        .drop_duplicates(subset='sequence', keep='first')
        .set_index('sequence')['target']
    )
    df['target'] = df['text'].map(target_by_text).fillna('unknown')

In [None]:
combined

In [None]:
df

In [None]:
import plotly.express as px

In [None]:
# 2-D map of the queries: one point per row of df, coloured by its target.
scatter_options = dict(
    x='x',
    y='y',
    color=df['target'],
    labels={'color': 'target'},
    hover_name="text",
    opacity=0.3,
    title="marcos web search queries intents map",
)
fig_2d = px.scatter(df, **scatter_options)



In [None]:
fig_2d

In [None]:
fig_2d.write_html("../reports/web_search_intents.html")

In [None]:
# [query for query in labelled.loc[labelled['target'] == 'yelp_intent']['sequence'].values.tolist() if 'medication' in query]

In [None]:
labelled.loc[labelled['target'] == 'yelp_intent']

In [None]:
len(to_be_labelled)

In [None]:
to_be_labelled

In [None]:
to_be_labelled.to_csv('../data/to_be_labelled.csv', index=False)