notebooks/intent_val_data_prep.ipynb (542 lines of code) (raw):

# %% [markdown]
# The purpose of this notebook is to generate additional validation data for
# the yelp intent and the weather intent.

# %%
import random

import pandas as pd

# %%
# Seed the global RNG used by `generate_queries` so sampling is repeatable.
random.seed(42)

# %%
# Sample cities and city-state combinations (the two lists are index-aligned).
cities = [
    "New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose",
    "Miami", "San Francisco", "Seattle", "Atlanta", "Denver",
    "Boston", "Las Vegas", "Orlando", "Austin", "Nashville",
    "Detroit", "Portland", "Charlotte", "Baltimore", "St. Louis",
    "Tampa", "Minneapolis", "Cleveland", "Pittsburgh", "Cincinnati"
]

city_states = [
    "New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ",
    "Philadelphia, PA", "San Antonio, TX", "San Diego, CA", "Dallas, TX", "San Jose, CA",
    "Miami, FL", "San Francisco, CA", "Seattle, WA", "Atlanta, GA", "Denver, CO",
    "Boston, MA", "Las Vegas, NV", "Orlando, FL", "Austin, TX", "Nashville, TN",
    "Detroit, MI", "Portland, OR", "Charlotte, NC", "Baltimore, MD", "St. Louis, MO",
    "Tampa, FL", "Minneapolis, MN", "Cleveland, OH", "Pittsburgh, PA", "Cincinnati, OH"
]


# %%
def generate_queries(queries, pre_modifiers, post_modifiers, cities, city_states, num_queries=1000):
    """Build a shuffled sample of unique "<pre> <query> <post>" search strings.

    Every keyword is combined with every pre-modifier and every post-modifier.
    A post-modifier containing ``{city}`` is expanded once per entry of
    ``cities``; one containing ``{city_state}`` is expanded once per entry of
    ``city_states``; any other post-modifier is used verbatim.

    Args:
        queries: Iterable of keyword strings.
        pre_modifiers: Iterable of prefixes; ``""`` means "no prefix".
        post_modifiers: Iterable of suffix templates; ``""`` means "no suffix".
        cities: City names substituted for ``{city}``.
        city_states: "City, ST" strings substituted for ``{city_state}``.
        num_queries: Maximum number of queries to return (``None`` returns all).

    Returns:
        A list of at most ``num_queries`` distinct query strings, in an order
        drawn from the global ``random`` state.
    """
    queries = list(queries)
    pre_modifiers = list(pre_modifiers)

    unique_queries = set()
    for post in post_modifiers:
        # Expand the template once per post-modifier instead of once per
        # (keyword, city) pair; generic suffixes need no per-city repetition.
        # The two location lists are walked independently so that lists of
        # different length are not silently truncated by zip().
        if "{city}" in post:
            suffixes = [post.replace("{city}", city) for city in cities]
        elif "{city_state}" in post:
            suffixes = [post.replace("{city_state}", city_state) for city_state in city_states]
        else:
            suffixes = [post]

        for query in queries:
            for pre in pre_modifiers:
                for suffix in suffixes:
                    # Join only the non-empty parts: an empty modifier must not
                    # leave a leading/trailing space, which would also defeat
                    # de-duplication of otherwise identical queries.
                    text = " ".join(part for part in (pre, query, suffix) if part)
                    if text:
                        unique_queries.add(text)

    # Set iteration order for strings depends on per-process hash
    # randomization, so sort first; otherwise seeding `random` would not make
    # the shuffled sample reproducible across kernel restarts.
    all_queries = sorted(unique_queries)
    random.shuffle(all_queries)

    # Limit the number of queries to the desired amount.
    return all_queries[:num_queries]


# %% [markdown]
# #### Yelp val data generation

# %%
# Pre-modifiers
yelp_pre_modifiers = [
    "", "list of local", "cost of", "cost of a", "cost to", "average cost of",
    "average cost of a", "average price for", "average price of",
    "average price of a", "average price to", "find", "find a",
    "find me the best", "looking for the best"
]

# Post-modifiers
yelp_post_modifiers = [
    "", "near me", "nearby", "delivery", "in my area", "in {city}", "near {city}", "in {city_state}"
]

# Example queries
# NOTE(review): several keywords already end in "near me" (e.g. "electricians
# near me"); combined with the post-modifiers above they yield strings such as
# "... near me near me" or "... near me delivery" - confirm this is intended.
yelp_keywords = [
    # Original Queries
    "24 hour cleaning services",
    "appliance movers",
    "roofing company",
    "house painting service",
    "window cleaning",
    "5 star restaurants",
    "carpet cleaning",
    "house cleaning service",
    "movers",
    "apartment cleaning services",

    # Services
    "plumbing services",
    "electricians near me",
    "HVAC repair",
    "locksmith services",
    "handyman services",
    "pool cleaning service",
    "pest control",
    "lawn care service",
    "home cleaning services",
    "trash removal services",
    "junk removal",
    "tree trimming services",
    "landscaping companies",
    "gutter cleaning services",
    "septic tank cleaning",

    # Local Businesses
    "dry cleaners",
    "tailor near me",
    "barber shop",
    "nail salon",
    "hair salon",
    "spa near me",
    "massage therapy",
    "pet grooming",
    "dog walking services",
    "veterinary clinic",
    "auto repair",
    "tire shop",
    "body shop",
    "car wash",
    "oil change services",

    # Restaurants & Dining
    "best sushi near me",
    "pizza delivery",
    "Mexican restaurants",
    "vegan restaurants",
    "brunch spots",
    "fine dining restaurants",
    "seafood restaurants",
    "Thai food near me",
    "BBQ restaurants",
    "Italian restaurants",
    "coffee shops",
    "bakery near me",
    "food trucks",
    "dine-in restaurants",
    "steakhouses",

    # Health & Fitness
    "gyms near me",
    "personal trainer",
    "yoga classes",
    "crossfit gyms",
    "boxing gyms",
    "pilates classes",
    "spinning classes",
    "martial arts studios",
    "swimming lessons",
    "fitness boot camps",
    "physical therapy clinics",
    "acupuncture near me",
    "chiropractor services",
    "nutritionists",
    "wellness centers",

    # Shopping & Retail
    "grocery stores",
    "furniture stores",
    "antique shops",
    "thrift stores",
    "clothing stores",
    "shoe stores",
    "jewelry stores",
    "malls near me",
    "outlet malls",
    "toy stores",
    "pet stores",
    "bookstores",
    "wine shops",
    "gift shops",
    "hardware stores",

    # Event & Activity Services
    "party rental services",
    "photographers near me",
    "DJ services",
    "wedding planners",
    "catering services",
    "event venues",
    "balloon delivery services",
    "cake decorators",
    "florists near me",
    "karaoke bars",
    "comedy clubs",
    "escape rooms",
    "movie theaters",
    "bowling alleys",
    "mini golf courses"
]
" <td>average price of photographers near me delivery</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>average price of event venues near Detroit</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>average cost of a DJ services in Philadelphia, PA</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>average price to hardware stores in Seattle</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>1995</th>\n", " <td>looking for the best antique shops near Las Vegas</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>1996</th>\n", " <td>list of local home cleaning services near Tampa</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>1997</th>\n", " <td>cost of a movie theaters near Orlando</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>1998</th>\n", " <td>cost of a window cleaning in San Antonio</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " <tr>\n", " <th>1999</th>\n", " <td>average price for spinning classes in Tampa, FL</td>\n", " <td>yelp_intent</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>2000 rows × 2 columns</p>\n", "</div>" ], "text/plain": [ " queries target\n", "0 find wine shops near Nashville yelp_intent\n", "1 average price of photographers near me delivery yelp_intent\n", "2 average price of event venues near Detroit yelp_intent\n", "3 average cost of a DJ services in Philadelphia, PA yelp_intent\n", "4 average price to hardware stores in Seattle yelp_intent\n", "... ... 
# %%
def _save_labeled_queries(queries, target, csv_path):
    """Label generated queries with one intent, write them to CSV, return the frame.

    Args:
        queries: List of query strings; becomes the ``queries`` column.
        target: Intent label written to every row of the ``target`` column.
        csv_path: Destination passed to ``DataFrame.to_csv`` (index omitted).

    Returns:
        The two-column DataFrame (``queries``, ``target``) that was written.
    """
    labeled = pd.DataFrame(queries, columns=["queries"]).assign(target=target)
    labeled.to_csv(csv_path, index=False)
    return labeled


# %%
yelp_generated_queries = generate_queries(
    yelp_keywords, yelp_pre_modifiers, yelp_post_modifiers,
    cities, city_states, num_queries=2000,
)
yelp_val_generated_data = _save_labeled_queries(
    yelp_generated_queries, "yelp_intent", "../data/yelp_val_generated_data.csv"
)
yelp_val_generated_data

# %% [markdown]
# #### Weather val data generation

# %%
# Weather keywords, including deliberate misspellings and two Spanish terms.
# Exact duplicate entries ("windy", "storms") were dropped; generate_queries
# de-duplicates its output, so the generated set is unchanged.
# NOTE(review): "rain in" / "storm in" end in a preposition and the post-modifiers
# add another one (the saved output shows "doppler storm in near Houston");
# "weather radar near me" / "radar near me" likewise double up with "near me".
# Confirm whether these seeds should be trimmed.
weather_keywords = [
    "weather", "forecast", "windy", "humidity", "monsoon", "flooding", "rain in",
    "storms", "storm in", "forcast", "wether", "wather", "weahter", "weater",
    "weaher", "vindy", "sunny", "rain", "cloudy", "air quality",
    "thunderstorm", "tornado", "hurricane", "pollen", "snow", "blizzard", "radar",
    "tiempo", "clima", "doppler radar", "local radar", "local weather", "map",
    "us weather radar", "weather radar near me", "radar near me", "temperature"
]

weather_pre_modifiers = [
    "", "current", "hourly", "daily", "weekly", "10-day", "weekend", "live", "doppler",
    "interactive", "national", "regional", "severe", "latest", "future", "local", "us"
]

# Post-modifiers (same structure as in Yelp queries)
weather_post_modifiers = [
    "", "near me", "in my area", "in {city}", "near {city}", "in {city_state}",
    "near {city_state}", "for {city}", "for {city_state}"
]

# %%
weather_generated_queries = generate_queries(
    weather_keywords, weather_pre_modifiers, weather_post_modifiers,
    cities, city_states, num_queries=2000,
)
weather_val_generated_data = _save_labeled_queries(
    weather_generated_queries, "weather_intent", "../data/weather_val_generated_data.csv"
)
weather_val_generated_data