notebooks/intent_val_data_prep.ipynb (542 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "07c76f5c-0bfc-4b51-97fe-5906e8f9c5dd",
"metadata": {},
"source": [
"The purpose of this notebook is to generate additional validation data for the Yelp intent and the weather intent."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "52b3b4ba-146e-44f7-819a-b0d3932ea7e6",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6d4bb365-5057-4816-bae6-cf01b3ef3f8e",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ec6bde28-9edc-447f-964b-95df0a91eb01",
"metadata": {},
"outputs": [],
"source": [
"# Sample cities and city-state combinations\n",
"cities = [\n",
" \"New York\", \"Los Angeles\", \"Chicago\", \"Houston\", \"Phoenix\", \n",
" \"Philadelphia\", \"San Antonio\", \"San Diego\", \"Dallas\", \"San Jose\",\n",
" \"Miami\", \"San Francisco\", \"Seattle\", \"Atlanta\", \"Denver\", \n",
" \"Boston\", \"Las Vegas\", \"Orlando\", \"Austin\", \"Nashville\", \n",
" \"Detroit\", \"Portland\", \"Charlotte\", \"Baltimore\", \"St. Louis\", \n",
" \"Tampa\", \"Minneapolis\", \"Cleveland\", \"Pittsburgh\", \"Cincinnati\"\n",
"]\n",
"\n",
"city_states = [\n",
" \"New York, NY\", \"Los Angeles, CA\", \"Chicago, IL\", \"Houston, TX\", \"Phoenix, AZ\", \n",
" \"Philadelphia, PA\", \"San Antonio, TX\", \"San Diego, CA\", \"Dallas, TX\", \"San Jose, CA\",\n",
" \"Miami, FL\", \"San Francisco, CA\", \"Seattle, WA\", \"Atlanta, GA\", \"Denver, CO\", \n",
" \"Boston, MA\", \"Las Vegas, NV\", \"Orlando, FL\", \"Austin, TX\", \"Nashville, TN\", \n",
" \"Detroit, MI\", \"Portland, OR\", \"Charlotte, NC\", \"Baltimore, MD\", \"St. Louis, MO\", \n",
" \"Tampa, FL\", \"Minneapolis, MN\", \"Cleveland, OH\", \"Pittsburgh, PA\", \"Cincinnati, OH\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "09cebdbf-9c99-46fc-97c6-b45c366df71a",
"metadata": {},
"outputs": [],
"source": [
"# Function to generate queries\n",
"def generate_queries(queries, pre_modifiers, post_modifiers, cities, city_states, num_queries=1000):\n",
" all_queries = []\n",
"\n",
" for query in queries:\n",
" # Generate queries with city and city_state substitutions\n",
" for city, city_state in zip(cities, city_states):\n",
" for pre in pre_modifiers:\n",
" for post in post_modifiers:\n",
" if \"{city}\" in post:\n",
" city_query = f\"{pre} {query} {post.replace('{city}', city)}\"\n",
" all_queries.append(city_query)\n",
" elif \"{city_state}\" in post:\n",
" city_state_query = f\"{pre} {query} {post.replace('{city_state}', city_state)}\"\n",
" all_queries.append(city_state_query)\n",
" else:\n",
" generic_query = f\"{pre} {query} {post}\"\n",
" all_queries.append(generic_query)\n",
"\n",
" all_queries = list(set(all_queries))\n",
" # Randomize the output to avoid any specific order\n",
" random.shuffle(all_queries)\n",
" \n",
" # Limit the number of queries to the desired amount\n",
" return all_queries[:num_queries]"
]
},
{
"cell_type": "markdown",
"id": "55ff1eae-2440-4d6a-9dcb-9d989924d6d2",
"metadata": {},
"source": [
"#### Yelp val data generation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "29ecc694-e5da-4498-bffb-ffe91761515c",
"metadata": {},
"outputs": [],
"source": [
"# Pre-modifiers\n",
"yelp_pre_modifiers = [\n",
" \"\", \"list of local\", \"cost of\", \"cost of a\", \"cost to\", \"average cost of\", \n",
" \"average cost of a\", \"average price for\", \"average price of\", \n",
" \"average price of a\", \"average price to\", \"find\", \"find a\", \n",
" \"find me the best\", \"looking for the best\"\n",
"]\n",
"\n",
"# Post-modifiers\n",
"yelp_post_modifiers = [\n",
" \"\",\"near me\", \"nearby\", \"delivery\", \"in my area\", \"in {city}\", \"near {city}\", \"in {city_state}\"\n",
"]\n",
"\n",
"# Example queries\n",
"yelp_keywords = [\n",
" # Original Queries\n",
" \"24 hour cleaning services\", \n",
" \"appliance movers\", \n",
" \"roofing company\", \n",
" \"house painting service\", \n",
" \"window cleaning\", \n",
" \"5 star restaurants\", \n",
" \"carpet cleaning\", \n",
" \"house cleaning service\", \n",
" \"movers\", \n",
" \"apartment cleaning services\",\n",
" \n",
" # Services\n",
" \"plumbing services\", \n",
" \"electricians near me\", \n",
" \"HVAC repair\", \n",
" \"locksmith services\", \n",
" \"handyman services\", \n",
" \"pool cleaning service\", \n",
" \"pest control\", \n",
" \"lawn care service\", \n",
" \"home cleaning services\", \n",
" \"trash removal services\", \n",
" \"junk removal\", \n",
" \"tree trimming services\", \n",
" \"landscaping companies\", \n",
" \"gutter cleaning services\", \n",
" \"septic tank cleaning\",\n",
" \n",
" # Local Businesses\n",
" \"dry cleaners\", \n",
" \"tailor near me\", \n",
" \"barber shop\", \n",
" \"nail salon\", \n",
" \"hair salon\", \n",
" \"spa near me\", \n",
" \"massage therapy\", \n",
" \"pet grooming\", \n",
" \"dog walking services\", \n",
" \"veterinary clinic\", \n",
" \"auto repair\", \n",
" \"tire shop\", \n",
" \"body shop\", \n",
" \"car wash\", \n",
" \"oil change services\",\n",
" \n",
" # Restaurants & Dining\n",
" \"best sushi near me\", \n",
" \"pizza delivery\", \n",
" \"Mexican restaurants\", \n",
" \"vegan restaurants\", \n",
" \"brunch spots\", \n",
" \"fine dining restaurants\", \n",
" \"seafood restaurants\", \n",
" \"Thai food near me\", \n",
" \"BBQ restaurants\", \n",
" \"Italian restaurants\", \n",
" \"coffee shops\", \n",
" \"bakery near me\", \n",
" \"food trucks\", \n",
" \"dine-in restaurants\", \n",
" \"steakhouses\",\n",
" \n",
" # Health & Fitness\n",
" \"gyms near me\", \n",
" \"personal trainer\", \n",
" \"yoga classes\", \n",
" \"crossfit gyms\", \n",
" \"boxing gyms\", \n",
" \"pilates classes\", \n",
" \"spinning classes\", \n",
" \"martial arts studios\", \n",
" \"swimming lessons\", \n",
" \"fitness boot camps\", \n",
" \"physical therapy clinics\", \n",
" \"acupuncture near me\", \n",
" \"chiropractor services\", \n",
" \"nutritionists\", \n",
" \"wellness centers\",\n",
" \n",
" # Shopping & Retail\n",
" \"grocery stores\", \n",
" \"furniture stores\", \n",
" \"antique shops\", \n",
" \"thrift stores\", \n",
" \"clothing stores\", \n",
" \"shoe stores\", \n",
" \"jewelry stores\", \n",
" \"malls near me\", \n",
" \"outlet malls\", \n",
" \"toy stores\", \n",
" \"pet stores\", \n",
" \"bookstores\", \n",
" \"wine shops\", \n",
" \"gift shops\", \n",
" \"hardware stores\",\n",
" \n",
" # Event & Activity Services\n",
" \"party rental services\", \n",
" \"photographers near me\", \n",
" \"DJ services\", \n",
" \"wedding planners\", \n",
" \"catering services\", \n",
" \"event venues\", \n",
" \"balloon delivery services\", \n",
" \"cake decorators\", \n",
" \"florists near me\", \n",
" \"karaoke bars\", \n",
" \"comedy clubs\", \n",
" \"escape rooms\", \n",
" \"movie theaters\", \n",
" \"bowling alleys\", \n",
" \"mini golf courses\"\n",
"]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "930b941a-20ad-4888-8bf8-21dbaff61654",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>queries</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>find wine shops near Nashville</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>average price of photographers near me delivery</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>average price of event venues near Detroit</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>average cost of a DJ services in Philadelphia, PA</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>average price to hardware stores in Seattle</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1995</th>\n",
" <td>looking for the best antique shops near Las Vegas</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996</th>\n",
" <td>list of local home cleaning services near Tampa</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1997</th>\n",
" <td>cost of a movie theaters near Orlando</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1998</th>\n",
" <td>cost of a window cleaning in San Antonio</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1999</th>\n",
" <td>average price for spinning classes in Tampa, FL</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" queries target\n",
"0 find wine shops near Nashville yelp_intent\n",
"1 average price of photographers near me delivery yelp_intent\n",
"2 average price of event venues near Detroit yelp_intent\n",
"3 average cost of a DJ services in Philadelphia, PA yelp_intent\n",
"4 average price to hardware stores in Seattle yelp_intent\n",
"... ... ...\n",
"1995 looking for the best antique shops near Las Vegas yelp_intent\n",
"1996 list of local home cleaning services near Tampa yelp_intent\n",
"1997 cost of a movie theaters near Orlando yelp_intent\n",
"1998 cost of a window cleaning in San Antonio yelp_intent\n",
"1999 average price for spinning classes in Tampa, FL yelp_intent\n",
"\n",
"[2000 rows x 2 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yelp_generated_queries = generate_queries(yelp_keywords, yelp_pre_modifiers, yelp_post_modifiers, cities, city_states, num_queries=2000)\n",
"yelp_val_generated_data = pd.DataFrame(yelp_generated_queries, columns=['queries'])\n",
"yelp_val_generated_data['target'] = 'yelp_intent'\n",
"yelp_val_generated_data.to_csv(\"../data/yelp_val_generated_data.csv\", index=False)\n",
"yelp_val_generated_data"
]
},
{
"cell_type": "markdown",
"id": "5b5faca4-54f3-4798-8b05-654e264dc4b8",
"metadata": {},
"source": [
"#### Weather val data generation"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0021c37d-ba16-47d3-a617-e147dc487d0a",
"metadata": {},
"outputs": [],
"source": [
"weather_keywords = [\n",
" \"weather\", \"forecast\", \"windy\", \"humidity\", \"monsoon\", \"flooding\", \"rain in\", \n",
" \"storms\", \"storm in\", \"forcast\", \"wether\", \"wather\", \"weahter\", \"weater\", \n",
" \"weaher\", \"vindy\", \"sunny\", \"rain\", \"windy\", \"cloudy\", \"storms\", \"air quality\", \n",
" \"thunderstorm\", \"tornado\", \"hurricane\", \"pollen\", \"snow\", \"blizzard\", \"radar\", \n",
" \"tiempo\", \"clima\", \"doppler radar\", \"local radar\", \"local weather\", \"map\", \n",
" \"us weather radar\", \"weather radar near me\", \"radar near me\", \"temperature\"\n",
"]\n",
"\n",
"weather_pre_modifiers = [\n",
" \"\", \"current\", \"hourly\", \"daily\", \"weekly\", \"10-day\", \"weekend\", \"live\", \"doppler\", \n",
" \"interactive\", \"national\", \"regional\", \"severe\", \"latest\", \"future\", \"local\", \"us\"\n",
"]\n",
"\n",
"# Post-modifiers (same structure as in Yelp queries)\n",
"weather_post_modifiers = [\n",
" \"\", \"near me\", \"in my area\", \"in {city}\", \"near {city}\", \"in {city_state}\", \n",
" \"near {city_state}\", \"for {city}\", \"for {city_state}\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6095c619-92a0-4e4e-9b07-5da7109bce3d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>queries</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>weekly monsoon in St. Louis</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>future weahter in Denver</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>doppler cloudy in Cleveland, OH</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>regional blizzard near Phoenix</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>severe clima near Orlando, FL</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1995</th>\n",
" <td>latest clima in Detroit</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1996</th>\n",
" <td>weater near Denver</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1997</th>\n",
" <td>doppler storm in near Houston</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1998</th>\n",
" <td>future sunny near Cincinnati, OH</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1999</th>\n",
" <td>local hurricane for St. Louis, MO</td>\n",
" <td>weather_intent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" queries target\n",
"0 weekly monsoon in St. Louis weather_intent\n",
"1 future weahter in Denver weather_intent\n",
"2 doppler cloudy in Cleveland, OH weather_intent\n",
"3 regional blizzard near Phoenix weather_intent\n",
"4 severe clima near Orlando, FL weather_intent\n",
"... ... ...\n",
"1995 latest clima in Detroit weather_intent\n",
"1996 weater near Denver weather_intent\n",
"1997 doppler storm in near Houston weather_intent\n",
"1998 future sunny near Cincinnati, OH weather_intent\n",
"1999 local hurricane for St. Louis, MO weather_intent\n",
"\n",
"[2000 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weather_generated_queries = generate_queries(weather_keywords, weather_pre_modifiers, weather_post_modifiers, cities, city_states, num_queries=2000)\n",
"weather_val_generated_data = pd.DataFrame(weather_generated_queries, columns=['queries'])\n",
"weather_val_generated_data['target'] = 'weather_intent'\n",
"weather_val_generated_data.to_csv(\"../data/weather_val_generated_data.csv\", index=False)\n",
"weather_val_generated_data"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (dl-course)",
"language": "python",
"name": "dl-course"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}