notebooks/NER_val_data_prep.ipynb (321 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "07c76f5c-0bfc-4b51-97fe-5906e8f9c5dd",
"metadata": {},
"source": [
"The purpose of this notebook is to generate additional validation data for NER (city and state entities)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "52b3b4ba-146e-44f7-819a-b0d3932ea7e6",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6d4bb365-5057-4816-bae6-cf01b3ef3f8e",
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "29ecc694-e5da-4498-bffb-ffe91761515c",
"metadata": {},
"outputs": [],
"source": [
"query_templates = [\n",
" # Weather-related queries\n",
" \"{city} weather forecast\", # City at start\n",
" \"current weather in {city}\", # City in middle\n",
" \"weather update for {city}, {state_code}\", # City and state code in end\n",
" \"is it raining in {city}, {state}\", # City and state at end\n",
" \"humidity levels in {city}\", # City in middle\n",
" \"storm warning for {state_code}\", # State code at end\n",
" \"temperature in {city}, {state_code}\", # City and state code in end\n",
" \"{city} weather radar\", # City at start\n",
" \"weather report for {city}, {state}\", # City and state at end\n",
" \n",
" # Travel-related queries\n",
" \"flights to {city}, {state_code}\", # City and state code at end\n",
" \"cheapest hotels in {city}\", # City in middle\n",
" \"best time to visit {city}, {state}\", # City and state at end\n",
" \"tourist attractions in {city}, {state_code}\", # City and state code at end\n",
" \"car rentals in {city}, {state}\", # City and state at end\n",
" \"how far is {city} from {state_code}\", # City and state code in middle\n",
" \"vacation spots in {state}\", # State in middle\n",
" \"{city}, {state_code} travel guide\", # City and state code in start\n",
" \"{city} airport information\", # City at start\n",
" \"things to do in {city}, {state_code}\", # City and state code at end\n",
" \n",
" # Real Estate queries\n",
" \"{city} real estate market\", # City at start\n",
" \"{state} housing prices\", # State in middle\n",
" \"property for sale in {city}, {state_code}\", # City and state code at end\n",
" \"homes for rent in {city}, {state}\", # City and state at end\n",
" \"real estate agents in {city}, {state_code}\", # City and state code in end\n",
" \"average home price in {city}\", # City in middle\n",
" \"mortgage rates in {state}\", # State in middle\n",
" \"{city}, {state_code} real estate trends\", # City and state code at start\n",
" \n",
" # Service-related queries\n",
" \"plumbing services in {city}, {state}\", # City and state at end\n",
" \"electricians in {city}\", # City in middle\n",
" \"find cleaning services near {city}, {state_code}\", # City and state code at end\n",
" \"best roofing companies in {city}, {state}\", # City and state at end\n",
" \"{state_code} licensed contractors\", # State code at end\n",
" \"{city}, {state_code} locksmith services\", # City and state code at start\n",
" \"pool cleaning services in {city}, {state}\", # City and state at end\n",
" \"HVAC repair in {city}\", # City in middle\n",
" \n",
" # Event-related queries\n",
" \"events in {city} this weekend\", # City in middle\n",
" \"upcoming concerts in {city}, {state_code}\", # City and state code at end\n",
" \"food festivals in {city}, {state}\", # City and state at end\n",
" \"{city} sports events\", # City at start\n",
" \"music events in {city}, {state_code}\", # City and state code at end\n",
" \"theater performances in {state}\", # State in middle\n",
" \"{city} marathon registration\", # City at start\n",
" \"comedy shows in {city}, {state_code}\", # City and state code at end\n",
" \n",
" # Transportation-related queries\n",
" \"public transportation in {city}, {state}\", # City and state at end\n",
" \"bus schedule in {city}, {state_code}\", # City and state code at end\n",
" \"train stations near {city}\", # City in middle\n",
" \"bike rentals in {city}, {state}\", # City and state at end\n",
" \"{city}, {state_code} subway map\", # City and state code at start\n",
" \"{city} parking information\", # City at start\n",
" \"taxi services in {city}, {state}\", # City and state at end\n",
" \"directions to {city}, {state_code}\", # City and state code in end\n",
" \n",
" # Education-related queries\n",
" \"top universities in {city}, {state}\", # City and state at end\n",
" \"colleges near {city}, {state_code}\", # City and state code in end\n",
" \"find schools in {state_code}\", # State code at end\n",
" \"{city}, {state_code} public schools\", # City and state code at start\n",
" \"{state} high school rankings\", # State at start\n",
" \"private schools in {city}, {state}\", # City and state at end\n",
" \n",
" # Government-related queries\n",
" \"{state_code} DMV locations\", # State code at start\n",
" \"government offices in {city}, {state}\", # City and state at end\n",
" \"tax offices in {city}\", # City in middle\n",
" \"voter registration in {state_code}\", # State code at end\n",
" \"where is the courthouse in {city}, {state}\", # City and state at end\n",
" \"city hall in {city}\", # City in middle\n",
" \"{city} public library hours\", # City at start\n",
" \n",
" # Health-related queries\n",
" \"hospitals in {city}, {state}\", # City and state at end\n",
" \"urgent care near {city}, {state_code}\", # City and state code in end\n",
" \"COVID-19 testing in {city}, {state}\", # City and state at end\n",
" \"pharmacies in {city}\", # City in middle\n",
" \"doctors in {city}, {state_code}\", # City and state code in end\n",
" \"mental health services in {state}\", # State in middle\n",
" \"{city} vaccination centers\", # City at start\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "458354db-ebd5-4e44-8489-9bffe4935dfb",
"metadata": {},
"outputs": [],
"source": [
"# Cities, States, and State Codes\n",
"cities = [\n",
" \"New York\", \"Los Angeles\", \"Chicago\", \"Houston\", \"Phoenix\", \n",
" \"Philadelphia\", \"San Antonio\", \"San Diego\", \"Dallas\", \"San Jose\",\n",
" \"Miami\", \"San Francisco\", \"Seattle\", \"Atlanta\", \"Denver\", \n",
" \"Boston\", \"Las Vegas\", \"Orlando\", \"Austin\", \"Nashville\", \n",
" \"Detroit\", \"Portland\", \"Charlotte\", \"Baltimore\", \"St. Louis\", \n",
" \"Tampa\", \"Minneapolis\", \"Cleveland\", \"Pittsburgh\", \"Cincinnati\"\n",
"]\n",
"\n",
"states = [\n",
" \"New York\", \"California\", \"Illinois\", \"Texas\", \"Arizona\", \n",
" \"Pennsylvania\", \"Texas\", \"California\", \"Texas\", \"California\",\n",
" \"Florida\", \"California\", \"Washington\", \"Georgia\", \"Colorado\", \n",
" \"Massachusetts\", \"Nevada\", \"Florida\", \"Texas\", \"Tennessee\", \n",
" \"Michigan\", \"Oregon\", \"North Carolina\", \"Maryland\", \"Missouri\", \n",
" \"Florida\", \"Minnesota\", \"Ohio\", \"Pennsylvania\", \"Ohio\"\n",
"]\n",
"\n",
"state_codes = [\n",
" \"NY\", \"CA\", \"IL\", \"TX\", \"AZ\", \n",
" \"PA\", \"TX\", \"CA\", \"TX\", \"CA\",\n",
" \"FL\", \"CA\", \"WA\", \"GA\", \"CO\", \n",
" \"MA\", \"NV\", \"FL\", \"TX\", \"TN\", \n",
" \"MI\", \"OR\", \"NC\", \"MD\", \"MO\", \n",
" \"FL\", \"MN\", \"OH\", \"PA\", \"OH\"\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d3b24680-c7ab-4f77-8b45-2d3c4e4a1bc6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>queries</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train stations near detroit</td>\n",
" <td>detroit</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>voter registration in wa</td>\n",
" <td>None</td>\n",
" <td>wa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>weather report for los angeles, california</td>\n",
" <td>los angeles</td>\n",
" <td>california</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>city hall in boston</td>\n",
" <td>boston</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>bike rentals in baltimore, maryland</td>\n",
" <td>baltimore</td>\n",
" <td>maryland</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" queries city state\n",
"0 train stations near detroit detroit None\n",
"1 voter registration in wa None wa\n",
"2 weather report for los angeles, california los angeles california\n",
"3 city hall in boston boston None\n",
"4 bike rentals in baltimore, maryland baltimore maryland"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Function to generate queries with city, state, and state_code columns\n",
"def generate_named_entity_queries_with_columns(cities, states, state_codes, query_templates, num_queries=1000):\n",
" all_queries = []\n",
"\n",
" for city, state, state_code in zip(cities, states, state_codes):\n",
" for template in query_templates:\n",
" query = template.replace(\"{city}\", city).replace(\"{state}\", state).replace(\"{state_code}\", state_code)\n",
" \n",
" # Determine if the query contains the state or state code\n",
" if \"{state}\" in template:\n",
" state_value = state\n",
" elif \"{state_code}\" in template:\n",
" state_value = state_code\n",
" else:\n",
" state_value = random.choice([state, state_code]) # Randomly assign state or state code if not specified\n",
"\n",
" # Add query, city, and state/state_code to the list\n",
" all_queries.append({\n",
" \"queries\": query,\n",
" \"city\": city if \"{city}\" in template else None,\n",
" \"state\": state_value if \"{state\" in template else None,\n",
" })\n",
" \n",
" random.shuffle(all_queries) # Shuffle to randomize order\n",
" return all_queries[:num_queries]\n",
"\n",
"# Draw the validation sample: up to 1000 labelled queries from the template x location combinations\n",
"generated_named_entity_queries = generate_named_entity_queries_with_columns(\n",
"    cities, states, state_codes, query_templates, num_queries=1000\n",
")\n",
"\n",
"# Tabulate the records and lower-case all three text columns in a single chained step\n",
"named_entity_val_generated_data = pd.DataFrame(generated_named_entity_queries).assign(\n",
"    queries=lambda frame: frame[\"queries\"].str.lower(),\n",
"    city=lambda frame: frame[\"city\"].str.lower(),\n",
"    state=lambda frame: frame[\"state\"].str.lower(),\n",
")\n",
"\n",
"# Persist the table without the index column\n",
"named_entity_val_generated_data.to_csv(\"../data/named_entity_val_generated_data.csv\", index=False)\n",
"\n",
"# Preview the first rows as the cell output\n",
"named_entity_val_generated_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fcee0bc-55f3-4276-9a86-c44414d93119",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (dl-course)",
"language": "python",
"name": "dl-course"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}