The purpose of this notebook is to generate additional validation data for the Yelp intent and the weather intent.

In [1]:
import random
import pandas as pd


In [2]:
# Seed Python's global `random` generator; generate_queries shuffles its
# output with random.shuffle, which draws from this generator.
random.seed(42)

In [3]:
# Sample locations in "City, ST" form
city_states = [
    "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ",
    "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "San Jose, CA",
    "Miami, FL", "San Francisco, CA", "Seattle, WA", "Atlanta, GA", "Denver, CO",
    "Boston, MA", "Las Vegas, NV", "Orlando, FL", "Austin, TX", "Nashville, TN",
    "Detroit, MI", "Portland, OR", "Charlotte, NC", "Baltimore, MD", "St. Louis, MO",
    "Tampa, FL", "Minneapolis, MN", "Cleveland, OH", "Pittsburgh, PA", "Cincinnati, OH"
]

# Bare city names: the part before ", ST" of each entry above, so the two
# lists always stay index-aligned.
cities = [location.split(", ")[0] for location in city_states]

In [4]:
# Function to generate queries
def generate_queries(queries, pre_modifiers, post_modifiers, cities, city_states, num_queries=1000, seed=None):
    """Return a shuffled sample of unique "<pre> <query> <post>" search strings.

    Every keyword is combined with every pre-modifier and every post-modifier.
    Post-modifiers may contain the placeholders ``{city}`` and/or
    ``{city_state}``; such templates are expanded once per location, while the
    remaining post-modifiers are used verbatim.

    Args:
        queries: Keyword strings.
        pre_modifiers: Strings placed before the keyword ("" for none).
        post_modifiers: Strings placed after the keyword ("" for none).
        cities: City names, index-aligned with ``city_states``.
        city_states: "City, ST" strings. They are paired with ``cities`` via
            ``zip``, so surplus entries of the longer list are ignored.
        num_queries: Maximum number of queries returned (``None`` returns all).
        seed: Optional seed for a private ``random.Random`` instance. When
            ``None`` the module-level ``random`` state is used.

    Returns:
        list[str]: At most ``num_queries`` distinct queries in shuffled order,
        each with single spaces between words and no leading/trailing space.
    """
    placeholders = ("{city}", "{city_state}")

    # Expand the post-modifiers once, up front: location templates are filled
    # for every (city, city_state) pair, everything else is kept as-is. Doing
    # this outside the keyword loop avoids rebuilding the same generic query
    # once per city.
    locations = list(zip(cities, city_states))
    filled_posts = []
    for post in post_modifiers:
        if any(placeholder in post for placeholder in placeholders):
            for city, city_state in locations:
                filled_posts.append(
                    post.replace("{city_state}", city_state).replace("{city}", city)
                )
        else:
            filled_posts.append(post)

    # split()/join() collapses the stray spaces that an empty pre- or
    # post-modifier would otherwise leave at the edges or in the middle.
    unique_queries = {
        " ".join(f"{pre} {query} {post}".split())
        for query in queries
        for pre in pre_modifiers
        for post in filled_posts
    }

    # Set iteration order for strings varies between interpreter runs (hash
    # randomization), so sort first to make the seeded shuffle reproducible.
    all_queries = sorted(unique_queries)

    # Randomize the output to avoid any specific order
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(all_queries)

    # Limit the number of queries to the desired amount
    return all_queries[:num_queries]

#### Yelp val data generation

In [5]:
# Pre-modifiers: phrases placed in front of each keyword by generate_queries.
# The empty string produces a query with no prefix.
yelp_pre_modifiers = [
    "", "list of local", "cost of", "cost of a", "cost to", "average cost of", 
    "average cost of a", "average price for", "average price of", 
    "average price of a", "average price to", "find", "find a", 
    "find me the best", "looking for the best"
]

# Post-modifiers: phrases placed after each keyword. "{city}" and
# "{city_state}" are placeholders that generate_queries replaces with entries
# from `cities` / `city_states`; the empty string produces no suffix.
yelp_post_modifiers = [
    "","near me", "nearby", "delivery", "in my area", "in {city}", "near {city}", "in {city_state}"
]

# Example queries: the keyword part of each generated query, grouped by theme.
# NOTE(review): some keywords already end in "near me" (e.g. "electricians near me")
# or start with "best"; combined with the modifiers above they yield stacked
# phrases such as "photographers near me delivery" — confirm this noise is intended.
yelp_keywords = [
    # Original Queries
    "24 hour cleaning services", 
    "appliance movers", 
    "roofing company", 
    "house painting service", 
    "window cleaning", 
    "5 star restaurants", 
    "carpet cleaning", 
    "house cleaning service", 
    "movers", 
    "apartment cleaning services",
    
    # Services
    "plumbing services", 
    "electricians near me", 
    "HVAC repair", 
    "locksmith services", 
    "handyman services", 
    "pool cleaning service", 
    "pest control", 
    "lawn care service", 
    "home cleaning services", 
    "trash removal services", 
    "junk removal", 
    "tree trimming services", 
    "landscaping companies", 
    "gutter cleaning services", 
    "septic tank cleaning",
    
    # Local Businesses
    "dry cleaners", 
    "tailor near me", 
    "barber shop", 
    "nail salon", 
    "hair salon", 
    "spa near me", 
    "massage therapy", 
    "pet grooming", 
    "dog walking services", 
    "veterinary clinic", 
    "auto repair", 
    "tire shop", 
    "body shop", 
    "car wash", 
    "oil change services",
    
    # Restaurants & Dining
    "best sushi near me", 
    "pizza delivery", 
    "Mexican restaurants", 
    "vegan restaurants", 
    "brunch spots", 
    "fine dining restaurants", 
    "seafood restaurants", 
    "Thai food near me", 
    "BBQ restaurants", 
    "Italian restaurants", 
    "coffee shops", 
    "bakery near me", 
    "food trucks", 
    "dine-in restaurants", 
    "steakhouses",
    
    # Health & Fitness
    "gyms near me", 
    "personal trainer", 
    "yoga classes", 
    "crossfit gyms", 
    "boxing gyms", 
    "pilates classes", 
    "spinning classes", 
    "martial arts studios", 
    "swimming lessons", 
    "fitness boot camps", 
    "physical therapy clinics", 
    "acupuncture near me", 
    "chiropractor services", 
    "nutritionists", 
    "wellness centers",
    
    # Shopping & Retail
    "grocery stores", 
    "furniture stores", 
    "antique shops", 
    "thrift stores", 
    "clothing stores", 
    "shoe stores", 
    "jewelry stores", 
    "malls near me", 
    "outlet malls", 
    "toy stores", 
    "pet stores", 
    "bookstores", 
    "wine shops", 
    "gift shops", 
    "hardware stores",
    
    # Event & Activity Services
    "party rental services", 
    "photographers near me", 
    "DJ services", 
    "wedding planners", 
    "catering services", 
    "event venues", 
    "balloon delivery services", 
    "cake decorators", 
    "florists near me", 
    "karaoke bars", 
    "comedy clubs", 
    "escape rooms", 
    "movie theaters", 
    "bowling alleys", 
    "mini golf courses"
]



In [6]:
# Sample up to 2000 unique Yelp-style queries from the keyword/modifier grid
yelp_generated_queries = generate_queries(
    yelp_keywords,
    yelp_pre_modifiers,
    yelp_post_modifiers,
    cities,
    city_states,
    num_queries=2000,
)

# One row per query, every row labelled with the Yelp intent
yelp_val_generated_data = pd.DataFrame(
    yelp_generated_queries, columns=["queries"]
).assign(target="yelp_intent")

# Persist the validation set (without the index column) and display it
yelp_val_generated_data.to_csv("../data/yelp_val_generated_data.csv", index=False)
yelp_val_generated_data

Unnamed: 0,queries,target
0,find wine shops near Nashville,yelp_intent
1,average price of photographers near me delivery,yelp_intent
2,average price of event venues near Detroit,yelp_intent
3,"average cost of a DJ services in Philadelphia, PA",yelp_intent
4,average price to hardware stores in Seattle,yelp_intent
...,...,...
1995,looking for the best antique shops near Las Vegas,yelp_intent
1996,list of local home cleaning services near Tampa,yelp_intent
1997,cost of a movie theaters near Orlando,yelp_intent
1998,cost of a window cleaning in San Antonio,yelp_intent


#### Weather val data generation

In [7]:
# Weather keywords, including deliberate misspellings ("forcast", "wether", ...)
# and Spanish terms ("tiempo", "clima"). Each keyword is listed exactly once;
# generate_queries de-duplicates its output anyway, so repeating an entry here
# would add work without adding queries.
# NOTE(review): "rain in" / "storm in" combined with location post-modifiers
# yield phrases such as "storm in near Houston" — confirm this is intended.
weather_keywords = [
    "weather", "forecast", "windy", "humidity", "monsoon", "flooding", "rain in", 
    "storms", "storm in", "forcast", "wether", "wather", "weahter", "weater", 
    "weaher", "vindy", "sunny", "rain", "cloudy", "air quality", 
    "thunderstorm", "tornado", "hurricane", "pollen", "snow", "blizzard", "radar", 
    "tiempo", "clima", "doppler radar", "local radar", "local weather", "map", 
    "us weather radar", "weather radar near me", "radar near me", "temperature"
]

# Pre-modifiers: phrases placed in front of each keyword ("" means no prefix)
weather_pre_modifiers = [
    "", "current", "hourly", "daily", "weekly", "10-day", "weekend", "live", "doppler", 
    "interactive", "national", "regional", "severe", "latest", "future", "local", "us"
]

# Post-modifiers (same structure as in Yelp queries): "{city}" and
# "{city_state}" are placeholders filled in by generate_queries
weather_post_modifiers = [
    "", "near me", "in my area", "in {city}", "near {city}", "in {city_state}", 
    "near {city_state}", "for {city}", "for {city_state}"
]

In [8]:
# Sample up to 2000 unique weather-style queries from the keyword/modifier grid
weather_generated_queries = generate_queries(
    weather_keywords,
    weather_pre_modifiers,
    weather_post_modifiers,
    cities,
    city_states,
    num_queries=2000,
)

# One row per query, every row labelled with the weather intent
weather_val_generated_data = pd.DataFrame(
    weather_generated_queries, columns=["queries"]
).assign(target="weather_intent")

# Persist the validation set (without the index column) and display it
weather_val_generated_data.to_csv("../data/weather_val_generated_data.csv", index=False)
weather_val_generated_data

Unnamed: 0,queries,target
0,weekly monsoon in St. Louis,weather_intent
1,future weahter in Denver,weather_intent
2,"doppler cloudy in Cleveland, OH",weather_intent
3,regional blizzard near Phoenix,weather_intent
4,"severe clima near Orlando, FL",weather_intent
...,...,...
1995,latest clima in Detroit,weather_intent
1996,weater near Denver,weather_intent
1997,doppler storm in near Houston,weather_intent
1998,"future sunny near Cincinnati, OH",weather_intent
