notebooks/city_state_exploration_and_dataprep_v2.ipynb (2,238 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "73d1863d-1d54-4cdd-843c-c033b28f15f6",
"metadata": {},
"source": [
"Explore whether the weather keywords and locations are captured correctly"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd4805cc-8d46-40fa-8d39-35158d9212d4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import re\n",
"from datasets import load_dataset, Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b64db933-17ab-47cc-b0ba-ae37e89e450a",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import random\n",
"from collections import Counter"
]
},
{
"cell_type": "markdown",
"id": "8bcf91d7-8344-4b5e-9641-461b2630cb0f",
"metadata": {},
"source": [
"#### Read the data/geonames-cities-states.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "738661a5-668f-4b2c-8823-dc3c0c92be94",
"metadata": {},
"outputs": [],
"source": [
"import json \n",
"\n",
"def get_geonames_city_state_data():\n",
" geonames_file = \"../data/geonames-cities-states.json\"\n",
" with open(geonames_file, 'r') as f:\n",
" geonames_dict = json.load(f)\n",
" \n",
" \n",
" cities_data = pd.DataFrame(geonames_dict['cities'])\\\n",
" .rename(columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'})\n",
" cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]\n",
" states_data = pd.DataFrame(geonames_dict['states_by_abbr'].values())\\\n",
" .rename(columns={'admin1_code': 'state_code', 'name': 'state_name'})\n",
" states_data = states_data[['state_code', 'state_name']]\n",
" city_states_data = cities_data.merge(states_data, how='left', on='state_code')\n",
" city_states_data['city_weight'] = city_states_data['city_popln'] / city_states_data['city_popln'].sum()\n",
" return city_states_data\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3aeb4bd-2e84-4121-84b7-8ffb1118ca37",
"metadata": {},
"outputs": [],
"source": [
"city_states_data = get_geonames_city_state_data()\n",
"print(len(city_states_data))\n",
"city_states_data"
]
},
{
"cell_type": "markdown",
"id": "45711e5b-1f06-4cac-aea8-97deaea292a5",
"metadata": {},
"source": [
"<!-- #### Add some partial city names for capturing the consumer needs \n",
"if they type partial city names such as `coffee near me sunnyval` -->"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f31c7125-c7ea-426c-b85c-1f71d1507fdd",
"metadata": {},
"outputs": [],
"source": [
"# city_states_data['city_name'].apply(len).describe(percentiles=[.1, .2, .25, .3, .4, .5, .6 ,.7, .75, .8, .9, .95, .98, .99])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a53d52c-5d9d-4963-90a6-31cb269bf71d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4c84a18b-1f21-4006-a634-9da9ff725070",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d35076ae-1d45-4699-8257-e98612500e43",
"metadata": {},
"outputs": [],
"source": [
"city_states_data.sort_values('city_weight', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad8ca7e6-7511-42f2-92df-a02526637f23",
"metadata": {},
"outputs": [],
"source": [
"city_weights = city_states_data[['city_name', 'city_weight']].set_index('city_name').to_dict()['city_weight']\n",
"# city_weights"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0eca185b-0ff3-4cad-a878-92f1d065081c",
"metadata": {},
"outputs": [],
"source": [
"city_info = city_states_data[['city_name', 'alternate_names']].set_index('city_name').to_dict()['alternate_names']\n",
"state_info = city_states_data[['state_code', 'state_name']].set_index('state_code').to_dict()['state_name']\n",
"city_state_code_info = city_states_data[['city_name', 'state_code', 'city_weight']].copy()\n",
"city_state_name_info = city_states_data[['city_name', 'state_name', 'city_weight']].copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "986b55c1-c92f-4722-91b8-29e48bbe2813",
"metadata": {},
"outputs": [],
"source": [
"# city_info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9cc31eb8-b5eb-4daa-a466-f873db8e3038",
"metadata": {},
"outputs": [],
"source": [
"city_state_code_info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bfa88c33-da07-4261-b890-2aa111988d3c",
"metadata": {},
"outputs": [],
"source": [
"city_state_name_info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "116d168e-dd72-407c-8cad-865d11143307",
"metadata": {},
"outputs": [],
"source": [
"# list(city_info.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3692858c-faa7-4219-8a4b-d25c82144e34",
"metadata": {},
"outputs": [],
"source": [
"fake_cities = [\n",
" 'Umber Glade', 'Crimson Hollow', 'Midland Creek', 'Boulderfield', 'Fairbrook', 'Mossmere', 'Hearthfield', 'Norwyn', \n",
" 'Elysian Ridge', 'Dover Hollow', 'Mistral Cove', 'Starfall', 'Eaglebrooke', 'Granite Ridge', 'Umbrafield', 'Goldenstone', \n",
" 'Palisade Brook', 'Willowfield', 'Noblehaven', 'Frostgrove', 'Oasis Ridge', 'Larkspur Vale', 'Elderstone', 'Forest Vale', \n",
" 'Yonder Bluff', 'Cloverstone', 'Kingsvale', 'Ashen Bluff', 'Yarrow Bluff', 'Zion Hollow', 'Velvet Pine', 'Fernspire', \n",
" 'Inkwell', 'Eaglewood', 'Driftshade', 'Prairiefield', 'Northshade', 'Riverwatch', 'Sapphire Hollow', 'Jadehaven', \n",
" 'Lunaris', 'Quailstone', 'Birchvale', 'Blossom Creek', 'Union Hollow', 'Whispering Brook', 'Yarrowstone', 'Candlevale', \n",
" 'Ravenshire', 'Willowhaven', 'Wyrmrest', 'Frostshade', 'Silverbrook', 'Azure Hollow', 'Tanglefield', 'Umberstone', \n",
" 'Glimmerbrook', 'Ravencrest', 'Larkridge', 'Windspire', 'Oakheart', 'Obsidian Point', 'Newstone', 'Moonlit Vale', \n",
" 'Tranquil Ridge', 'Gilded Summit', 'Lunarshade', 'Seabrook', 'Quartzwood', 'Juniper Crest', 'Norvale', 'Hollowmere', \n",
" 'Kindlewood', 'Dawnspire', 'Obelisk Point', 'Kindred Hollow', 'Autumn Hollow', 'Orchard Ridge', 'Underbrook', 'Kingshaven', \n",
" 'Ebonwood', 'Violet Haven', 'Peregrine Spire', 'Summitwood', 'Lakeshore Valley', 'Umbra Shores', 'Trillium Vale', \n",
" 'Halcyon Creek', 'Xander Cove', 'Glenstone', 'Nimbus Grove', 'Willowfern', 'Vista Hollow', 'Jasperwood', 'Jasmine Vale', \n",
" 'Rustvale', 'Quillbrook', 'Ravenmere', 'Zerith Hollow', 'Golden Ridge', 'Thistlewood', 'Quiet Hollow', \n",
" 'Ridgevale', 'Bluewater Ridge', 'Unity Crest', 'Cedar Hollow', 'Bluffstone', 'Larchfield', 'Quarry Hollow', \n",
" 'Laurel Ridge', 'Yellowfield', 'Amberfield', 'Quartz Creek', 'Zephyr Vale', 'Larkfield', 'Verdant Hollow', \n",
" 'Cinder Hollow', 'Havencliff', 'Harborwood', 'Onyx Ridge', \"Kite's Hollow\", 'Brookfield', 'Brightveil', 'Redhawk', \n",
" 'Valleywood', 'Havenwood', 'Thornhill', 'Silverwood', 'Duskfield', 'Tidesreach', 'Cypress Vale', 'Fernwood', \n",
" 'Moonwillow', 'Verdant Shade', 'Willowthorn', 'Garnet Crossing', 'Ivy Hollow', 'Kestrel Cove', 'Amberpeak', 'Meadowcrest', \n",
" 'Yellowvine', 'Violet Sands', 'Ironwood', 'Timber Shade', 'Dovewood Creek', 'Pinecairn', 'Driftvale', 'Crescent Vale', \n",
" 'Juniper Grove', 'Ridgehaven', 'Timbervale', 'Hollowstone', 'Dawnbreak', 'Oceangrove', 'Pinegrove', 'Alderstone', \n",
" 'Primrose Point', 'Jasper Vale', 'Pinevale', 'Quartzfield', 'Crescent Bluff', 'Jasperstone', 'Umbra Vale', \n",
" 'Violet Ridge', 'Knollfield', 'Ironshade', 'Zephyr Crossing', 'Zenith Valley', 'Ashmoor', 'Xyron Bay', 'Everstone', \n",
" 'Moonstone Creek', 'Foxshade', 'Ashfield', 'Xyros Hill', 'Sapphire Ridge', 'Elmfield', 'Ivoryfield', 'Hollowvale', 'Frostbluff', \n",
" 'Xenia Ridge', 'Briarcliff', 'Kestrel Bluff', 'Nightingale Ridge', 'Peridot Bay', 'Islefield', 'Ivory Spire', 'Solace Grove', \n",
" 'Xanadu Grove', 'Ecliptus', 'Zephyr Hollow', 'Oakenhill', 'Glade Ridge', 'Winterridge', 'Jadestone', 'Indigo Bay', 'Duskhaven',\n",
" \"Shadowpine\", \"Crystal Vale\", \"Harbor Reach\", \"Eldermoor\",\n",
" \"Thornhollow\", \"Silverpeak\", \"Mistwood\", \"Shadowfall\",\n",
" \"Willowbright\", \"Dusklight\", \"Havenvale\", \"Starcrest\",\n",
" \"Glacier Hollow\", \"Cinderbluff\", \"Ironpeak\", \"Frostwood\",\n",
" \"Embergrove\", \"Aurora Ridge\", \"Driftmoor\", \"Mooncrest\",\n",
" \"Stonehearth\", \"Riverwood\", \"Briarfrost\", \"Quillhaven\",\n",
" \"Stormvale\", \"Eaglesong\", \"Wanderwood\", \"Summervale\",\n",
" \"Brightwood\", \"Cloudspire\", \"Snowhaven\", \"Golden Hollow\",\n",
" \"Northcove\", \"Miststone\", \"Clearbrook\", \"Suncrest\",\n",
" \"Twilight Vale\", \"Aspen Hollow\", \"Boulderhaven\", \"Shimmerwood\",\n",
" \"Darkspire\", \"Oakbluff\", \"Hollowbright\", \"Sablewood\",\n",
" \"Lunarfrost\", \"Dovewood Point\", \"Crescent Glade\", \"Wraithstone\",\n",
" \"Foxwood Hollow\", \"Amberwood\", \"Midnight Ridge\", \"Garnet Hollow\",\n",
" \"big city\", \"Big City\", \"Silver City\", \"Golden City\", \"Mystic City\",\n",
" \"Sunset City\", \"Iron City\", \"Emerald City\", \"Shadow City\", \"Crystal City\",\n",
" \"Harmony City\", \"Aurora City\", \"Dream City\", \"Thorn City\", \"Lunar City\", \"Twilight City\", \n",
" \"Velvet City\", \"Willow City\", \"Ivory City\", \"Eclipse City\",\n",
" \"Storm City\", \"Bliss City\", \"Shimmer City\", \"Echo City\", \"Frost City\",\n",
" \"Sapphire City\", \"Obsidian City\", \"Tranquil City\", \"Starlight City\",\n",
" \"Drift City\", \"Amber City\", \"Hollow City\", \"Gilded City\", \"Quartz City\",\n",
" \"Meadow City\", \"Rosewood City\", \"Timber City\", \"Bright City\", \"Fox City\",\n",
" \"Dusk City\", \"Goldenleaf City\", \"Wind City\", \"Harbor City\", \"Cedar City\",\n",
" \"Azure City\", \"Elder City\", \"Crescent City\", \"Pine City\", \"Summit City\",\n",
" \"Cobalt City\", \"Bluff City\", \"Stone City\",\n",
"]\n",
"\n",
"fake_state_names = [\n",
" 'Meadowvale', 'Boulderwatch', 'Harperfield', 'Verdantia', 'Redhaven', 'Ashspire', 'Ecliptica', 'Cindermist', \n",
" 'Stormhaven', 'Crystalbourne', 'Sunspire', 'Twilight Hollow', 'Frostspire', 'Silverwatch', 'Keystone Ridge', \n",
" 'Gilded Vale', 'Bluewater', 'Jadewood', 'Northgate', 'Timberland', 'Ravenmark', 'Auroravale', 'Zephyr Bay', \n",
" 'Stormspire', 'Stonemeadow', 'Quintarra', 'Stonepeak', 'Willowcrown', 'Thistledown', 'Verdantreach', 'Lunaris', \n",
" 'Oakenshire', 'Brightwatch', 'Dawnhaven', 'Northreach', 'Verdant Hollow', 'Horizon Ridge', 'Xantria', 'Ironvale', \n",
" 'Amberreach', 'Silverveil', 'Moonwatch', 'Umbershade', 'Windswept', 'Shadowpine', 'Shadowreach', 'Zionshade', \n",
" 'Oasisland', 'Goldmere', 'Frosthaven', 'Drakemont', 'Emberland', 'Rivermist', 'Duskland', 'Firgrove', 'Driftstone', \n",
" 'Frostveil', 'Amberwyn', 'Velvet Ridge', 'Mystic Vale', 'Snowpoint', 'Bluehaven', 'Opal Grove', 'Jasper Hollow', \n",
" 'Tideridge', 'Crimson Bay', 'Aurorawood', 'Larkland', 'Thornvale', 'Shadewind', 'Ridgefall', 'Darkfall', 'Silvercrown', \n",
" 'Goldenreach', 'Ivory Plains', 'Nobleshore', 'Yellowcove', 'Hollowbrook', 'Ravendale', 'Frostwood', 'Brightshade', 'Brightmere', \n",
" 'Wytherstone', 'Eaglecrest', 'Frostmere', 'Moonbrooke', 'Goldenvale', 'Quillsprings', 'Pinemark', 'Prairiefield', \n",
" 'Cascade', 'Kindlemark', 'Aspenvale', 'Ivoryreach', 'Thorncrest', 'Cloudwood', 'Jade Ridge', 'Westmarch', \n",
" 'Wintercrest', 'Copperfield', 'Prairiefrost', 'Bladewind', 'Everwind', 'Quarrycrest', 'Lunashire', 'Hollowreach', \n",
" 'Whispering Pines', 'Blueshore', 'Glacier Point', 'Gildan', 'Zephyrlight', 'Sablepeak', 'Northspire', 'Starhearth', \n",
" 'Whispercrown', 'Valewind', 'Umbravale', 'Kindleland', 'Westwatch',\n",
"]\n",
"\n",
"fake_state_codes = [\n",
" 'QT', 'WX', 'CZ', 'GW', 'FR', 'VW', 'BN', 'BM', 'LS', 'ZR', 'QN', 'KP', 'WS', 'ZZ', 'YW', 'XK', 'LR', 'NX', \n",
" 'SW', 'XT', 'QB', 'ZT', 'SR', 'CW', 'JT', 'RP', 'HW', 'JV', 'FV', 'XW', 'PD', 'WR', 'QQ', 'UV', 'LK', 'LD', \n",
" 'LM', 'HT', 'VR', 'XY', 'RG', 'UR', 'NT', 'PT', 'YT', 'MQ', 'DR', 'SP', 'FG', 'YS', 'ZS', 'PW', 'FN', 'XF', \n",
" 'LV', 'RX', 'TG', 'CQ', 'LW', 'MX', 'BL', 'TF', 'GH', 'DX', 'QT', 'KV', 'RW', 'XL', 'FW', 'JR', 'PL', 'FB', \n",
" 'ZN', 'KR', 'QZ', 'DF', 'HD',\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d4509f1-6cb0-4478-8f89-90328e8e4f5a",
"metadata": {},
"outputs": [],
"source": [
"len(fake_state_names), len(set(fake_state_names))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d45a21d0-b71f-4464-80e1-e365b8bf20dd",
"metadata": {},
"outputs": [],
"source": [
"len(fake_state_codes), len(set(fake_state_codes))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e83904d-9a24-412a-b3fa-bd2c66d20f8b",
"metadata": {},
"outputs": [],
"source": [
"len(fake_cities), len(set(fake_cities))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95deb546-c1cb-44ac-ba5a-52cfff3402b9",
"metadata": {},
"outputs": [],
"source": [
"# print(set(fake_state_codes))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12c31739-2930-438e-9515-47a8bf4ca8ee",
"metadata": {},
"outputs": [],
"source": [
"# valid_state_codes = set(city_state_code_info['state_code'].values.tolist())\n",
"# len(valid_state_codes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65bdb448-77b9-480e-955b-17d6f4193607",
"metadata": {},
"outputs": [],
"source": [
"# print([state_code for state_code in fake_state_codes if state_code not in valid_state_codes])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e49ae90-5560-42b7-8483-5ddeae911b7e",
"metadata": {},
"outputs": [],
"source": [
"# # fake_state_names\n",
"# valid_state_names = set(city_state_name_info['state_name'].values.tolist())\n",
"\n",
"# print([state_name for state_name in fake_state_names if state_name not in valid_state_names])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2bda51cd-13fd-4ba8-9dbd-ed19d3f11250",
"metadata": {},
"outputs": [],
"source": [
"# len(valid_state_names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ae739c3-6486-4a66-9978-9b3d3decb1fd",
"metadata": {},
"outputs": [],
"source": [
"# # fake_cities\n",
"# # city_info\n",
"# print([city_name for city_name in fake_cities if city_name not in city_info])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddd77b4d-13c8-4017-acd6-8714d98f1579",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac238a0e-7526-40d3-8606-42d65fda3bd9",
"metadata": {},
"outputs": [],
"source": [
"label_map = {\n",
" 0: \"O\", # Outside any named entity\n",
" 1: \"B-PER\", # Beginning of a person entity\n",
" 2: \"I-PER\", # Inside a person entity\n",
" 3: \"B-ORG\", # Beginning of an organization entity\n",
" 4: \"I-ORG\", # Inside an organization entity\n",
" 5: \"B-CITY\", # Beginning of a city entity\n",
" 6: \"I-CITY\", # Inside a city entity\n",
" 7: \"B-STATE\", # Beginning of a state entity\n",
" 8: \"I-STATE\", # Inside a state entity\n",
" 9: \"B-CITYSTATE\", # Beginning of a city_state entity\n",
" 10: \"I-CITYSTATE\", # Inside a city_state entity\n",
" }\n",
"\n",
"\n",
"persons = [\n",
" 'Donald Trump', 'John Smith', 'Roger Williams', 'Michelle Obama', 'Elon Musk',\n",
" 'Barack Obama', 'Bill Gates', 'Steve Jobs', 'Warren Buffett', 'Oprah Winfrey',\n",
" 'Jeff Bezos', 'Taylor Swift', 'Jennifer Lawrence', 'Brad Pitt', 'Leonardo DiCaprio',\n",
" 'Katy Perry', 'Tom Hanks', 'Emma Watson', 'Johnny Depp', 'Scarlett Johansson',\n",
" 'Mark Zuckerberg', 'Sheryl Sandberg', 'Ivanka Trump', 'Joe Biden', 'Kamala Harris',\n",
" 'Serena Williams', 'Michael Jordan', 'LeBron James', 'Tiger Woods', 'Cristiano Ronaldo',\n",
" 'Lionel Messi', 'Roger Federer', 'Usain Bolt', 'Simone Biles', 'Tom Brady',\n",
" 'Peyton Manning', 'David Beckham', 'Rafael Nadal', 'Novak Djokovic', 'Andy Murray',\n",
" 'George Clooney', 'Matt Damon', 'Julia Roberts', 'Angelina Jolie', 'Morgan Freeman',\n",
" 'Chris Hemsworth', 'Dwayne Johnson', 'Vin Diesel', 'Keanu Reeves', 'Robert Downey Jr.',\n",
" 'Chris Evans', 'Will Smith', 'Johnny Cash', 'Bob Dylan', 'Paul McCartney',\n",
" 'Ringo Starr', 'John Lennon', 'George Harrison', 'Madonna', 'Prince',\n",
" 'Bruce Springsteen', 'Elton John', 'David Bowie', 'Whitney Houston', 'Celine Dion',\n",
" 'Marilyn Monroe', 'Audrey Hepburn', 'Albert Einstein', 'Isaac Newton', 'Marie Curie',\n",
" 'Galileo Galilei', 'Nikola Tesla', 'Stephen Hawking', 'Richard Feynman', 'Carl Sagan',\n",
" 'Neil Armstrong', 'Yuri Gagarin', 'Sally Ride', 'Jane Goodall', 'Charles Darwin',\n",
" 'Mahatma Gandhi', 'Nelson Mandela', 'Martin Luther King Jr.', 'Malala Yousafzai', 'Angela Merkel',\n",
" 'Theresa May', 'Vladimir Putin', 'Xi Jinping', 'Justin Trudeau', 'Jacinda Ardern',\n",
" 'Pope Francis', 'Dalai Lama', 'Queen Elizabeth II', 'Prince William', 'Prince Harry',\n",
" 'James Anderson', 'Michael Brown', 'David Clark', 'John Doe', 'Robert Evans',\n",
" 'Christopher Foster', 'William Garcia', 'Charles Hall', 'Joseph Harris', 'Daniel Jackson',\n",
" 'Matthew Johnson', 'George King', 'Anthony Lewis', 'Mark Miller', 'Paul Moore',\n",
" 'Steven Nelson', 'Kevin Perry', 'Thomas Reed', 'Brian Roberts', 'Jason Scott',\n",
" 'Andrew Smith', 'Joshua Thompson', 'Ryan Turner', 'Brandon Walker', 'Nicholas White',\n",
" 'Jonathan Young', 'Adam Baker', 'Justin Carter', 'Benjamin Collins', 'Aaron Cook',\n",
" 'Alexander Davis', 'Tyler Edwards', 'Zachary Fisher', 'Ethan Graham', 'Jacob Green',\n",
" 'Austin Hernandez', 'Mason Hill', 'Logan Hughes', 'Owen Jenkins', 'Lucas Kelly',\n",
" 'Nathan Lee', 'Caleb Long', 'Henry Martinez', 'Dylan Mitchell', 'Gabriel Morris',\n",
" 'Jack Murphy', 'Connor Myers', 'Liam Parker', 'Isaac Patterson', 'Evan Phillips',\n",
" 'Hunter Price', 'Noah Richardson', 'Samuel Rivera', 'Gavin Rogers', 'Aiden Ross',\n",
" 'Christian Russell', 'Ian Sanders', 'Eli Simmons', 'Chase Stewart', 'Cameron Sullivan',\n",
" 'Bryan Taylor', 'Cole Thomas', 'Jake Thompson', 'Luke Torres', 'Blake Turner',\n",
" 'Jesse Ward', 'Joel Watson', 'Derek Williams', 'Mitchell Wright', 'Dustin Young',\n",
" 'Megan Allen', 'Jennifer Bailey', 'Jessica Bennett', 'Emily Brooks', 'Sarah Campbell',\n",
" 'Amanda Carter', 'Rebecca Collins', 'Samantha Cooper', 'Stephanie Diaz', 'Rachel Evans',\n",
" 'Christine Flores', 'Laura Foster', 'Michelle Garcia', 'Amber Gonzales', 'Lisa Gray',\n",
" 'Kimberly Green', 'Heather Harris', 'Tiffany Henderson', 'Natalie Hernandez', 'Crystal Hill',\n",
" 'Victoria Hughes', 'Erica Jenkins', 'Nicole Johnson', 'Katherine Kelly', 'Danielle Lee',\n",
" 'Hannah Lewis', 'Melissa Lopez', 'Patricia Martin', 'Brittany Moore', 'Brenda Morgan',\n",
"\n",
" ]\n",
"organizations = [\n",
" 'Google Inc.', 'Apple Inc.', 'Amazon.com', 'Facebook Inc.', 'Microsoft Corporation',\n",
" 'Tesla Motors', 'Netflix Inc.', 'The New York Times', 'The Washington Post', 'Wall Street Journal',\n",
" 'Intel Corporation', 'Oracle Corporation', 'IBM', 'Coca-Cola Company', 'PepsiCo',\n",
" 'Starbucks', 'Walmart Inc.', 'Target Corporation', 'ExxonMobil', 'Shell Oil Company',\n",
" 'Ford Motor Company', 'General Motors', 'Toyota Motor Corporation', 'Volkswagen Group', 'BMW Group',\n",
" 'American Airlines', 'Delta Airlines', 'United Airlines', 'Boeing Company', 'Lockheed Martin',\n",
" 'SpaceX', 'NASA', 'Harvard University', 'Stanford University', 'Massachusetts Institute of Technology',\n",
" 'University of California, Berkeley', 'University of Oxford', 'University of Cambridge', 'Princeton University', 'Yale University',\n",
" 'University of Chicago', 'Columbia University', 'Johns Hopkins University', 'University of Southern California', 'University of Michigan',\n",
" 'Goldman Sachs', 'JPMorgan Chase', 'Citibank', 'Morgan Stanley', 'Bank of America',\n",
" 'Deloitte', 'Ernst & Young', 'PricewaterhouseCoopers', 'KPMG', 'McKinsey & Company',\n",
" 'Boston Consulting Group', 'Accenture', 'BlackRock', 'Fidelity Investments', 'Vanguard Group',\n",
" 'Nike Inc.', 'Adidas', 'Under Armour', 'Patagonia', 'The Walt Disney Company',\n",
" 'Time Warner', 'NBCUniversal', 'Sony Corporation', 'Warner Bros.', 'Paramount Pictures',\n",
" 'Universal Music Group', 'Sony Music Entertainment', 'Warner Music Group', 'Pfizer Inc.', 'Johnson & Johnson',\n",
" 'Novartis', 'Merck & Co.', 'GlaxoSmithKline', 'AstraZeneca', 'Moderna',\n",
" 'New York City Hospital', 'Los Angeles County Library', 'San Francisco Community College',\n",
" 'Miami International University', 'Chicago Regional Bank', 'Dallas Medical Center',\n",
" 'Boston Tech Solutions', 'Atlanta City Bank', 'Seattle Software Hub', 'Phoenix Energy Solutions',\n",
" 'Denver Financial Group', 'Houston General Hospital', 'Portland Health Services', 'Las Vegas Convention Center',\n",
" 'San Diego Software Innovations', 'Philadelphia Law Firm', 'Orlando Realty Group',\n",
" 'Austin Engineering Solutions', 'Cleveland City Schools', 'Detroit Manufacturing Hub',\n",
" 'Baltimore Technology Inc.', 'Minneapolis Insurance Group', 'St. Louis Transportation Services',\n",
" 'Tampa Healthcare Network', 'Pittsburgh Steelworks Corporation', 'Sacramento Business Ventures',\n",
" 'Indianapolis Marketing Solutions', 'Columbus Financial Advisors', 'Fort Worth Electric Company',\n",
" 'Charlotte Digital Marketing', 'Milwaukee Industrial Solutions', 'Memphis Logistics Services',\n",
" 'Washington DC Development', 'Nashville Business Enterprises', 'Louisville Fitness Center',\n",
" 'Kansas City Architectural Firm', 'Oklahoma City University', 'Virginia Beach Law Associates',\n",
" 'Raleigh Research Institute', 'Salt Lake City Analytics', 'Richmond Financial Group',\n",
" 'Newark Data Solutions', 'Anchorage Energy Solutions', 'Fresno Water Authority',\n",
" 'Omaha Financial Services', 'Colorado Springs Health Institute', 'Mesa Auto Parts',\n",
" 'Virginia Beach Shipping', 'Sacramento Community Center', 'Albuquerque Electronics Company',\n",
" 'Tucson Data Science Center', 'Miami Lakes Software Solutions', 'Wichita Steel Corporation',\n",
" 'Arlington Cybersecurity Group', 'Bakersfield Construction Services', 'Aurora Logistics Firm',\n",
" 'Anaheim Technology Hub', 'Santa Ana Healthcare Services', 'Riverside Manufacturing Co.',\n",
" 'St. Paul Medical Associates', 'Lexington University Hospital', 'Plano Technology Solutions',\n",
" 'Lincoln Manufacturing Inc.', 'Greensboro Industrial Partners', 'Jersey City Financial Group',\n",
" 'Chandler Electronics', 'Madison Biotechnology Solutions', 'Lubbock Medical Supplies',\n",
" 'Scottsdale Real Estate Group', 'Reno Venture Capitalists', 'Henderson Engineering Consultants',\n",
" 'Norfolk Health Services', 'Chesapeake Data Systems', 'Fremont Software Group',\n",
" 'Irvine Legal Services', 'San Bernardino Logistics Group', 'Boise Energy Technologies',\n",
" 'Spokane Steel Fabricators', 'Glendale Solar Power Corporation', 'Garland Medical Services',\n",
" 'Hialeah Shipping and Logistics', 'Chesapeake Financial Advisors', 'Frisco Software Hub',\n",
" 'McKinney Electronics Corporation', 'Gilbert Transportation Group', 'Baton Rouge Financial Services',\n",
" 'Shreveport Data Analytics', 'Mobile Business Solutions', 'Huntsville Rocket Technologies',\n",
" 'Knoxville Agricultural Partners', 'Dayton Software Innovations', 'Grand Rapids Healthcare Network',\n",
" 'Fort Lauderdale Construction Group', 'Tempe Electric Vehicles', 'Winston-Salem Marketing Firm',\n",
" 'Fayetteville Consulting Services', 'Springfield Realty Group', 'Yonkers Manufacturing Hub',\n",
" 'Augusta Insurance Group', 'Salem Solar Energy Solutions', 'Pasadena Legal Consultants',\n",
" 'Seattle Pacific University', 'San Diego Zoo', 'Portland Art Museum',\n",
" 'Boston Medical Group', 'Chicago Tribune', 'Dallas Cowboys Football Club',\n",
" 'Los Angeles Philharmonic Orchestra', 'New York University', 'Houston Community College',\n",
" 'Phoenix Solar Power', 'Denver Public Library', 'Miami International Airport',\n",
" 'Atlanta Symphony Orchestra', 'San Francisco Opera', 'Orlando City Soccer Club',\n",
" 'Nashville Symphony', 'Baltimore Ravens Football Team', 'Cleveland Clinic',\n",
" 'Pittsburgh Steelers Football Team', 'Detroit Institute of Arts',\n",
" 'Tampa Bay Buccaneers Football Club', 'St. Louis Cardinals Baseball Team',\n",
" 'Indianapolis Colts Football Team', 'Austin Film Society', 'Seattle Sounders Football Club',\n",
" 'Minneapolis Institute of Art', 'Charlotte Hornets Basketball Club', 'Portland Trail Blazers Basketball Team',\n",
" 'Las Vegas Convention and Visitors Authority', 'New Orleans Saints Football Club',\n",
" 'San Antonio Spurs Basketball Club', 'Philadelphia Eagles Football Club',\n",
" 'Kansas City Chiefs Football Team', 'Cincinnati Reds Baseball Club',\n",
" 'Memphis Grizzlies Basketball Team', 'Washington Wizards Basketball Club',\n",
" 'Milwaukee Bucks Basketball Club', 'Sacramento Kings Basketball Team',\n",
" 'Salt Lake City Ballet', 'Boise State University', 'Albuquerque International Balloon Fiesta',\n",
" 'Raleigh-Durham International Airport', 'Richmond Symphony', 'Fresno Pacific University',\n",
" 'Spokane Transit Authority', 'Henderson Engineering', 'Mesa Public Schools',\n",
" 'Scottsdale Museum of Contemporary Art', 'Chandler Regional Medical Center', 'Glendale Unified School District',\n",
" 'Riverside Community Hospital', 'Aurora Public Schools', 'Anaheim Ducks Hockey Team',\n",
" 'Santa Ana College', 'Stockton Unified School District', 'Irvine Company', 'San Bernardino Community College District',\n",
" 'Modesto Junior College', 'Bakersfield Condors Hockey Team', 'Fresno State University',\n",
" 'Chesapeake Energy Corporation', 'Omaha World-Herald', 'Tucson Medical Center',\n",
" 'Virginia Beach Public Schools', 'Norfolk Naval Shipyard', 'Newark Beth Israel Medical Center',\n",
" 'Fort Wayne Mad Ants Basketball Team', 'Fremont High School', 'Shreveport Regional Airport',\n",
" 'Mobile Public Library', 'Huntsville Hospital', 'Knoxville Symphony Orchestra',\n",
" 'Dayton International Airport', 'Grand Rapids Symphony', 'Winston-Salem Dash Baseball Team',\n",
" 'Fayetteville Technical Community College', 'Springfield Cardinals Baseball Team',\n",
" 'Augusta National Golf Club', 'Salem Health', 'Pasadena Playhouse', 'Yonkers Public Schools',\n",
" 'Boulder Community Health', 'Naperville North High School', 'Lansing Community College',\n",
" 'Reno-Tahoe International Airport', 'Columbia University Medical Center', 'Albany Law School',\n",
" 'Buffalo Sabres Hockey Team', 'Syracuse University', 'Toledo Museum of Art', 'Akron Public Schools',\n",
" 'Daytona International Speedway', 'Des Moines Public Library', 'Rochester Philharmonic Orchestra',\n",
" 'Flint Institute of Arts', 'Lincoln Memorial University', 'Baton Rouge Community College',\n",
" 'Chattanooga Symphony and Opera', 'Greenville Technical College', 'Cedar Rapids Opera Theatre',\n",
" 'Pensacola Naval Air Station'\n",
" ]\n",
"\n",
"products = [\n",
" 'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'Nike shoes', \n",
" 'AirPods', 'Xbox Series X', 'Canon DSLR', 'GoPro', 'Adidas sneakers', \n",
" 'Fitbit', 'Google Pixel', 'Kindle', 'Bose headphones', 'Sony TV', \n",
" 'Dyson vacuum', 'KitchenAid mixer', 'Surface Pro', 'Roomba', 'Apple Watch'\n",
"]\n",
"\n",
"countries = [\n",
" 'USA', 'France', 'Japan', 'Germany', 'Canada', \n",
" 'Australia', 'Mexico', 'China', 'Brazil', 'India', \n",
" 'Italy', 'Spain', 'South Korea', 'Russia', 'Netherlands', \n",
" 'United Kingdom', 'Sweden', 'Norway', 'Switzerland', 'Argentina'\n",
"]\n",
"\n",
"services = [\n",
" 'Netflix', 'Spotify', 'Uber', 'Amazon Prime', 'Google Drive', \n",
" 'Zoom', 'Dropbox', 'Slack', 'LinkedIn', 'Disney+', \n",
" 'YouTube Premium', 'Venmo', 'DoorDash', 'Postmates', 'Hulu', \n",
" 'Skype', 'Grubhub', 'Twitch', 'Instacart', 'Lyft'\n",
"]\n",
"\n",
"cars = [\n",
" 'Tesla Model S', 'Ford Mustang', 'Chevrolet Camaro', 'Toyota Corolla', 'Honda Civic', \n",
" 'BMW 3 Series', 'Audi A4', 'Mercedes-Benz C-Class', 'Jeep Wrangler', 'Ford F-150', \n",
" 'Hyundai Elantra', 'Mazda CX-5', 'Chevrolet Tahoe', 'Nissan Altima', 'Kia Sorento', \n",
" 'Volkswagen Golf', 'Subaru Outback', 'Tesla Model 3', 'Dodge Charger', 'Volvo XC90'\n",
"]\n",
"\n",
"gadgets = [\n",
" 'smartwatch', 'Bluetooth headphones', 'fitness tracker', 'smart speaker', 'tablet', \n",
" 'laptop', 'gaming mouse', 'wireless charger', 'VR headset', 'noise-canceling headphones', \n",
" 'dashcam', 'e-reader', 'action camera', 'portable hard drive', 'gaming console', \n",
" 'mechanical keyboard', '4K monitor', 'digital camera', 'portable power bank', 'USB-C hub'\n",
"]\n",
"\n",
"stocks = [\n",
" 'AAPL', 'GOOGL', 'AMZN', 'MSFT', 'TSLA', \n",
" 'NFLX', 'FB', 'BABA', 'NVDA', 'JPM', \n",
" 'V', 'PYPL', 'BRK.A', 'DIS', 'INTC', \n",
" 'PFE', 'NKE', 'ORCL', 'VZ', 'BA'\n",
"]\n",
"\n",
"moneys = [\n",
" 'cryptocurrency', 'cash', 'PayPal', 'credit card', 'Bitcoin', \n",
" 'Ethereum', 'bank transfer', 'wire transfer', 'Western Union', 'Venmo', \n",
" 'debit card', 'Zelle', 'Apple Pay', 'Google Pay', 'Coinbase', \n",
" 'Tether', 'Litecoin', 'Dogecoin', 'cash app', 'Ripple'\n",
"]\n",
"\n",
"finances = [\n",
" '401(k)', 'IRA', 'mutual funds', 'mortgage', 'student loan', \n",
" 'savings account', 'retirement fund', 'bond', 'annuity', 'index fund', \n",
" 'Roth IRA', 'tax-free savings account', 'pension', 'trust fund', 'hedge fund', \n",
" 'credit score', 'auto loan', 'home equity loan', 'personal loan', 'debt consolidation'\n",
"]\n",
"\n",
"travels = [\n",
" 'flights', 'hotels', 'car rentals', 'vacation packages', 'cruise trips', \n",
" 'road trips', 'train tickets', 'adventure tours', 'guided tours', 'backpacking trips',\n",
" 'honeymoon destinations', 'beach resorts', 'luxury travel', 'budget travel', 'camping gear', \n",
" 'family vacations', 'ski trips', 'all-inclusive resorts', 'last-minute deals', 'travel insurance'\n",
"]\n",
"\n",
"foods = [\n",
" 'pizza', 'sushi', 'burgers', 'pasta', 'salads', \n",
" 'vegan food', 'barbecue', 'fried chicken', 'ramen', 'tacos', \n",
" 'sandwiches', 'noodles', 'soups', 'cakes', 'ice cream', \n",
" 'steak', 'seafood', 'breakfast food', 'brunch', 'desserts',\n",
" 'hot dogs', 'waffles', 'pancakes', 'donuts', 'cookies',\n",
" 'bagels', 'burritos', 'pho', 'fried rice', 'dim sum',\n",
" 'smoothies', 'milkshakes', 'cupcakes', 'cheesecake', 'crepes',\n",
" 'nachos', 'guacamole', 'shawarma', 'gyros', 'kebabs',\n",
" 'clam chowder', 'chili', 'mac and cheese', 'meatballs', 'lasagna',\n",
" 'quesadillas', 'falafel', 'curry', 'pork ribs', 'buffalo wings',\n",
" 'brownies', 'apple pie', 'frozen yogurt', 'churros', 'stuffed crust pizza',\n",
" 'poutine', 'pad thai', 'dim sum', 'korean barbecue', 'bibimbap',\n",
" 'tandoori chicken', 'naan', 'samosa', 'biryani', 'dumplings',\n",
" 'bao buns', 'poke bowl', 'ceviche', 'tamales', 'empanadas',\n",
" 'shabu shabu', 'jollof rice', 'laksa', 'banh mi', 'spring rolls',\n",
" 'paella', 'gnocchi', 'risotto', 'french fries', 'croissants',\n",
" 'hummus', 'tzatziki', 'miso soup', 'kimchi', 'baklava',\n",
" 'souvlaki', 'galbi', 'arepas', 'roti', 'malai kofta',\n",
" 'sichuan chicken', 'teriyaki', 'yakitori', 'fettuccine alfredo',\n",
" 'gnocchi', 'ratatouille', 'tempura', 'onigiri', 'calamari',\n",
" 'chimichurri steak', 'goulash', 'pierogi', 'fondue', 'strudel',\n",
" 'schnitzel', 'tikka masala', 'paneer', 'plantains', 'croquettes',\n",
" 'coffee', \n",
"]\n",
"\n",
"restaurants = [\n",
" 'Italian restaurants', 'Mexican restaurants', 'Japanese restaurants', 'Chinese restaurants', 'Indian restaurants', \n",
" 'fast food chains', 'fine dining', 'vegan restaurants', 'steakhouses', 'seafood restaurants', \n",
" 'barbecue joints', 'sushi bars', 'cafes', 'pizzerias', 'buffet restaurants', \n",
" 'food trucks', 'family-friendly restaurants', 'gastropubs', 'brunch spots', 'diner',\n",
"]\n",
"\n",
"## Additional partial terms\n",
"# Sport names, most of them with the trailing letter(s) dropped (e.g. \"footbal\", \"baske\").\n",
"# A few entries are complete words (\"golf\", \"rugby\", \"darts\", \"racquetball\", \"bowling\"), and both\n",
"# \"bowlin\" and \"bowling\" are present.\n",
"# NOTE(review): presumably fills the {sports_term} template placeholder - confirm.\n",
"sports_terms_missing = [\n",
" \"footbal\", \"baske\", \"socce\", \"golf\", \"cricke\", \"rugby\", \"hocke\", \"tenni\", \n",
" \"swimmin\", \"athleti\", \"fishi\", \"basebal\", \"volleybal\", \"badminto\", \"maratho\", \n",
" \"skatin\", \"climbin\", \"racquetball\", \"bowlin\", \"darts\", \"gymnasti\", \"bikin\", \"bowling\",\n",
"]\n",
"\n",
"# Generic landmark / place-type words; some are truncated (\"cathedra\", \"observato\", \"conservato\",\n",
"# \"boardwal\"), the rest are complete words.\n",
"# NOTE(review): presumably fills the {location_and_landmark} template placeholder - confirm.\n",
"locations_and_landmarks = [\n",
" \"statue\", \"museum\", \"plaza\", \"zoo\", \"church\", \"theater\", \"stadium\", \"mountain\", \n",
" \"park\", \"lake\", \"beach\", \"river\", \"palace\", \"cathedra\", \"mansion\", \"monument\", \n",
" \"temple\", \"observato\", \"canyon\", \"garden\", \"conservato\", \"boardwal\", \"forest\", \n",
" \"pier\", \"lighthouse\", \"arena\",\n",
"]\n",
"\n",
"# Activity / event words, mostly truncated prefixes (\"conc\", \"exhib\", \"festi\", ...), with a few\n",
"# complete words (\"meet\", \"game\", \"gala\", \"seminar\", \"workout\", \"bowling\").\n",
"# NOTE(review): presumably fills the {activity_and_event} template placeholder - confirm.\n",
"activities_and_events = [\n",
" \"conc\", \"exhib\", \"meet\", \"parad\", \"festi\", \"tourn\", \"game\", \"sho\", \"even\", \n",
" \"gala\", \"confere\", \"seminar\", \"webina\", \"worksho\", \"lectur\", \"symposiu\", \n",
" \"screenin\", \"rall\", \"celebratio\", \"ceremon\", \"get-togethe\", \"perfor\", \n",
" \"gatherin\", \"competitio\", \"maratho\", \"speec\", \"workout\", \"showcas\", \"bowling\"\n",
"]\n",
"\n",
"# Food and eatery words, a mix of truncated prefixes (\"sush\", \"pizz\", \"brunc\", \"del\", ...) and\n",
"# complete words (\"ramen\", \"bbq\", \"steak\", \"pasta\", \"buffet\", ...).\n",
"# NOTE(review): presumably fills the {food_m} template placeholder - confirm.\n",
"food_missing = [\n",
" \"sush\", \"pizz\", \"ramen\", \"bbq\", \"vega\", \"steak\", \"taco\", \"burg\", \"pasta\", \n",
" \"brunc\", \"desse\", \"drink\", \"grill\", \"bake\", \"buffet\", \"sandwich\", \"noodle\", \n",
" \"cafe\", \"taver\", \"gastro\", \"bistro\", \"del\", \"saloo\", \"barbecue\", \"snack\", \n",
" \"confectio\", \"pub\",\n",
"]\n",
"\n",
"# Transport / navigation words, a mix of truncated prefixes (\"direc\", \"subwa\", \"ticke\", ...) and\n",
"# complete words (\"map\", \"bus\", \"train\", \"car\", \"park\", ...).\n",
"# NOTE(review): the templates comment mentions a transport_and_direction placeholder; whether this\n",
"# list feeds it is not visible here - confirm.\n",
"transport_and_directions = [\n",
" \"direc\", \"map\", \"bus\", \"train\", \"car\", \"park\", \"taxi\", \"subwa\", \"fly\", \n",
" \"plane\", \"ticke\", \"pass\", \"ferr\", \"bicycl\", \"scoote\", \"shuttl\", \"walkin\", \n",
" \"rideshar\", \"transi\", \"toll\", \"metr\", \"road\", \"route\", \"stop\"\n",
"]\n",
"\n",
"# Names of well-known people, grouped by the category comments inside the list.\n",
"# Many names are listed under more than one category (e.g. \"Leonardo DiCaprio\", \"Brad Pitt\",\n",
"# \"Harrison Ford\"), so the list contains duplicates; a uniform pick over it would favor those names.\n",
"# NOTE(review): presumably fills the {celebrity} template placeholder - confirm, and confirm that the\n",
"# duplicates are intended.\n",
"celebrities = [\n",
" \"Leonardo DiCaprio\", \"Tom Cruise\", \"Dwayne Johnson\", \"Zendaya\", \n",
" \"Timothée Chalamet\", \"Florence Pugh\", \"Margot Robbie\", \"Chris Hemsworth\", \n",
" \"Robert Downey Jr.\", \"Scarlett Johansson\", \"Tom Holland\", \"Ryan Reynolds\", \n",
" \"Gal Gadot\", \"Pedro Pascal\", \"Elizabeth Olsen\", \"Jenna Ortega\", \n",
" \"Millie Bobby Brown\", \"Finn Wolfhard\", \"Anya Taylor-Joy\", \"Jason Momoa\", \n",
" \"Chris Evans\", \"Natalie Portman\", \"Henry Cavill\", \"Daniel Radcliffe\", \n",
" \"Emma Watson\", \"Rupert Grint\", \"Michael B. Jordan\", \"Anne Hathaway\", \n",
" \"Brad Pitt\", \"Angelina Jolie\", \"Keanu Reeves\", \"Sandra Bullock\", \n",
" \"Jake Gyllenhaal\", \"Christian Bale\", \"Cate Blanchett\", \"Hugh Jackman\", \n",
" \"Jennifer Lawrence\", \"Will Smith\", \"Jada Pinkett Smith\", \"Viola Davis\", \n",
" \"Austin Butler\", \"Jamie Lee Curtis\", \"Paul Mescal\", \"Tobey Maguire\", \n",
" \"Andrew Garfield\", \"Harrison Ford\", \"Helen Mirren\", \"Brendan Fraser\", \n",
"\n",
" # Classic Hollywood Legends\n",
" \"Marlon Brando\", \"James Dean\", \"Audrey Hepburn\", \"Marilyn Monroe\", \n",
" \"Humphrey Bogart\", \"Clark Gable\", \"Bette Davis\", \"Elizabeth Taylor\",\n",
" \"Fred Astaire\", \"Ginger Rogers\", \"Ingrid Bergman\", \"Greta Garbo\", \n",
" \"Katharine Hepburn\", \"Cary Grant\", \"Spencer Tracy\", \"Rita Hayworth\",\n",
" \"Grace Kelly\", \"Vivien Leigh\", \"Judy Garland\", \"Henry Fonda\",\n",
" \"Lauren Bacall\", \"Paul Newman\", \"Charlton Heston\", \"Joan Crawford\",\n",
"\n",
" # Modern Hollywood Icons\n",
" \"Meryl Streep\", \"Tom Hanks\", \"Denzel Washington\", \"Robert De Niro\", \n",
" \"Al Pacino\", \"Jack Nicholson\", \"Julia Roberts\", \"Leonardo DiCaprio\",\n",
" \"Brad Pitt\", \"Angelina Jolie\", \"George Clooney\", \"Cate Blanchett\",\n",
" \"Johnny Depp\", \"Tom Cruise\", \"Sandra Bullock\", \"Nicole Kidman\", \n",
" \"Halle Berry\", \"Harrison Ford\", \"Sigourney Weaver\", \"Morgan Freeman\", \n",
" \"Michelle Pfeiffer\", \"Dustin Hoffman\", \"Robin Williams\", \"Will Smith\",\n",
"\n",
" # Franchise and Action-Adventure Stars\n",
" \"Orlando Bloom\", \"Viggo Mortensen\", \"Ian McKellen\", \"Elijah Wood\",\n",
" \"Sean Astin\", \"Dominic Monaghan\", \"Billy Boyd\", \"Liv Tyler\", \n",
" \"Hugo Weaving\", \"Andy Serkis\", \"Keira Knightley\", \"Geoffrey Rush\",\n",
" \"Johnny Depp\", \"Daniel Radcliffe\", \"Emma Watson\", \"Rupert Grint\",\n",
" \"Helena Bonham Carter\", \"Ralph Fiennes\", \"Alan Rickman\", \"Michael Gambon\",\n",
" \"Ewan McGregor\", \"Liam Neeson\", \"Natalie Portman\", \"Hayden Christensen\",\n",
" \"Mark Hamill\", \"Carrie Fisher\", \"Harrison Ford\", \"Daisy Ridley\",\n",
" \"Adam Driver\", \"John Boyega\", \"Oscar Isaac\", \"Diego Luna\", \n",
" \"Felicity Jones\", \"Pedro Pascal\", \"Chris Hemsworth\", \"Chris Evans\", \n",
" \"Scarlett Johansson\", \"Robert Downey Jr.\", \"Mark Ruffalo\", \"Chris Pratt\",\n",
" \"Tom Holland\", \"Zendaya\", \"Benedict Cumberbatch\", \"Tobey Maguire\", \n",
" \"Andrew Garfield\", \"Hugh Jackman\", \"Patrick Stewart\", \"Ian McKellen\", \n",
" \"Ryan Reynolds\", \"Gal Gadot\", \"Henry Cavill\", \"Jason Momoa\", \n",
" \"Ben Affleck\", \"Zoe Saldaña\", \"Dave Bautista\", \"Karen Gillan\",\n",
"\n",
" # Versatile and Popular Contemporary Actors\n",
" \"Christian Bale\", \"Amy Adams\", \"Ryan Gosling\", \"Emma Stone\",\n",
" \"Anne Hathaway\", \"Jennifer Lawrence\", \"Joaquin Phoenix\", \"Margot Robbie\",\n",
" \"Adam Driver\", \"Michael B. Jordan\", \"Florence Pugh\", \"Timothée Chalamet\",\n",
" \"Austin Butler\", \"Jessica Chastain\", \"Mahershala Ali\", \"Viola Davis\", \n",
" \"Octavia Spencer\", \"Toni Collette\", \"Rami Malek\", \"Lakeith Stanfield\",\n",
" \"Cillian Murphy\", \"Matt Damon\", \"Ben Affleck\", \"Jeremy Renner\", \n",
"\n",
" # Young Rising Stars\n",
" \"Millie Bobby Brown\", \"Finn Wolfhard\", \"Sadie Sink\", \"Noah Schnapp\", \n",
" \"Anya Taylor-Joy\", \"Jenna Ortega\", \"Hunter Schafer\", \"Hailee Steinfeld\", \n",
" \"Lucas Hedges\", \"Elle Fanning\", \"Dakota Fanning\", \"Jacob Elordi\", \n",
" \"Sydney Sweeney\", \"Joey King\", \"Sophie Turner\", \"Maisie Williams\",\n",
"\n",
" # Comedy and Character Actors\n",
" \"Steve Carell\", \"Tina Fey\", \"Amy Poehler\", \"Melissa McCarthy\", \n",
" \"Kristen Wiig\", \"Seth Rogen\", \"Will Ferrell\", \"Paul Rudd\", \n",
" \"Bill Hader\", \"Jason Bateman\", \"Jonah Hill\", \"Michael Cera\",\n",
" \"Ken Jeong\", \"Kevin Hart\", \"Maya Rudolph\", \"Chris Rock\", \n",
"\n",
" # Iconic Action and Adventure Stars\n",
" \"Dwayne Johnson\", \"Arnold Schwarzenegger\", \"Sylvester Stallone\", \n",
" \"Bruce Willis\", \"Jason Statham\", \"Keanu Reeves\", \"Vin Diesel\", \n",
" \"Charlize Theron\", \"Emily Blunt\", \"John Cena\", \"Liam Neeson\", \n",
" \"Daniel Craig\", \"Idris Elba\", \"Pierce Brosnan\", \"Angelina Jolie\", \n",
" \"Kate Beckinsale\", \"Milla Jovovich\",\n",
"\n",
" # Supporting Actors and Other Notables\n",
" \"John Goodman\", \"Jeff Goldblum\", \"J.K. Simmons\", \"Stanley Tucci\",\n",
" \"Frances McDormand\", \"Allison Janney\", \"Angela Bassett\", \"Regina King\",\n",
" \"Jessica Lange\", \"Bryan Cranston\", \"Aaron Paul\", \"Bob Odenkirk\", \n",
" \"Giancarlo Esposito\", \"David Harbour\", \"Winona Ryder\", \n",
"\n",
" # Diverse and Internationally Acclaimed Actors\n",
" \"Salma Hayek\", \"Antonio Banderas\", \"Diego Luna\", \"Oscar Isaac\", \n",
" \"Gael García Bernal\", \"Eva Longoria\", \"Jessica Alba\", \n",
" \"Awkwafina\", \"Sandra Oh\", \"Steven Yeun\", \"Simu Liu\", \n",
" \"Lucy Liu\", \"Gemma Chan\", \"Mindy Kaling\", \"Ali Wong\", \n",
" \"Lupita Nyong'o\", \"Chadwick Boseman\", \"Daniel Kaluuya\", \"Letitia Wright\",\n",
" \"Dev Patel\", \"Riz Ahmed\", \"Zazie Beetz\", \"Mahershala Ali\",\n",
"\n",
" # Sports\n",
" \"Lionel Messi\", \"Cristiano Ronaldo\", \"Neymar Jr.\", \"Kylian Mbappé\", \n",
" \"LeBron James\", \"Serena Williams\", \"Roger Federer\", \"Novak Djokovic\", \n",
" \"Rafael Nadal\", \"Simone Biles\", \"Naomi Osaka\", \"Stephen Curry\", \n",
" \"Kevin Durant\", \"Tom Brady\", \"Patrick Mahomes\", \"Virat Kohli\", \n",
" \"Rohit Sharma\", \"Shaquille O'Neal\", \"Tiger Woods\", \"Lewis Hamilton\", \n",
" \"Max Verstappen\", \"Charles Leclerc\", \"Usain Bolt\", \"Megan Rapinoe\", \n",
" \"Alex Morgan\", \"Katie Ledecky\", \"Michael Phelps\", \"Giannis Antetokounmpo\", \n",
" \"Damian Lillard\", \"Anthony Davis\", \"Zlatan Ibrahimović\", \"Harry Kane\", \n",
" \"Sadio Mané\", \"Karim Benzema\", \"Gareth Bale\", \"Robert Lewandowski\", \n",
" \"Erling Haaland\", \"Venus Williams\", \"Iga Świątek\", \"Aryna Sabalenka\", \n",
"\n",
" # Politics and Leaders\n",
" \"Joe Biden\", \"Kamala Harris\", \"Barack Obama\", \"Michelle Obama\", \n",
" \"Donald Trump\", \"Melania Trump\", \"Emmanuel Macron\", \"Olaf Scholz\", \n",
" \"Volodymyr Zelenskyy\", \"Rishi Sunak\", \"Narendra Modi\", \"Jacinda Ardern\", \n",
" \"Justin Trudeau\", \"Xi Jinping\", \"Vladimir Putin\", \"Angela Merkel\", \n",
" \"Elizabeth II\", \"King Charles III\", \"Prince William\", \"Prince Harry\", \n",
" \"Meghan Markle\", \"Queen Letizia\", \"Pope Francis\", \"Dalai Lama\", \n",
" \"Greta Thunberg\", \"Alexandria Ocasio-Cortez\", \"Bernie Sanders\", \n",
" \"Nicolas Maduro\", \"Jair Bolsonaro\", \"Fumio Kishida\", \"Yoon Suk-yeol\",\n",
"\n",
" # Business and Technology\n",
" \"Elon Musk\", \"Jeff Bezos\", \"Mark Zuckerberg\", \"Bill Gates\", \"Tim Cook\", \n",
" \"Sundar Pichai\", \"Satya Nadella\", \"Warren Buffett\", \"Bernard Arnault\", \n",
" \"Larry Page\", \"Sergey Brin\", \"Steve Wozniak\", \"Reed Hastings\", \"Susan Wojcicki\", \n",
" \"Jack Ma\", \"Daniel Ek\", \"Evan Spiegel\", \"Andrew Ng\", \"Sam Altman\", \n",
" \"Sheryl Sandberg\", \"Peter Thiel\", \"Marc Benioff\", \"Richard Branson\", \n",
" \"Oprah Winfrey\", \"Howard Schultz\", \"Larry Ellison\", \"David Baszucki\", \n",
" \"Parag Agrawal\", \"Adam Neumann\", \"Kylie Jenner\", \"Kim Kardashian\", \n",
" \"Khloé Kardashian\", \"Kris Jenner\", \"Robert Kiyosaki\", \"Barbara Corcoran\", \n",
"\n",
" # Science and Innovation\n",
" \"Jane Goodall\", \"Neil deGrasse Tyson\", \"Brian Cox\", \"Michio Kaku\", \n",
" \"Katherine Johnson\", \"Jennifer Doudna\", \"Emmanuelle Charpentier\", \"Tim Berners-Lee\", \n",
" \"Mae Jemison\", \"Katie Bouman\", \"Brian Greene\", \"James Lovelock\", \n",
" \"Roger Penrose\", \"Dmitry Muratov\", \"Frances Arnold\", \"Venki Ramakrishnan\", \n",
" \"Paul Nurse\", \"Elizabeth Blackburn\", \"Carol Greider\", \"David Julius\", \n",
" \"Abhijit Banerjee\", \"Esther Duflo\", \"Michael Kremer\", \"Andrea Ghez\", \n",
" # NOTE(review): \"Jennifer Hudson\" on the next line looks out of place under this category - confirm the intended name.\n",
" \"Reinhard Genzel\", \"Jennifer Hudson\", \"Ashoke Sen\", \"Subrahmanyan Chandrasekhar\", \n",
"\n",
" # Others\n",
" \"Ellen DeGeneres\", \"Oprah Winfrey\", \"Trevor Noah\", \"Jimmy Fallon\", \n",
" \"Stephen Colbert\", \"John Oliver\", \"James Corden\", \"Conan O'Brien\", \n",
" \"Dolly Parton\", \"Gordon Ramsay\", \"David Beckham\", \"Victoria Beckham\", \n",
" \"RuPaul\", \"Chris Rock\", \"Dave Chappelle\", \"Trevor Noah\", \"Hasan Minhaj\", \n",
" \"Ali Wong\", \"Bo Burnham\", \"Jo Koy\", \"Kevin Hart\", \"Sarah Silverman\", \n",
" \"Tiffany Haddish\", \"Joe Rogan\", \"Logan Paul\", \"MrBeast\", \"PewDiePie\", \n",
" \"Emma Chamberlain\", \"Charli D'Amelio\", \"Addison Rae\", \"Bella Poarch\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a166f90-7262-4047-92e4-f83a18c7c5d4",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def get_sample_from_cities(city_info, city_weights, actual_threshold=0.7, city_partial_threshold=0.1):\n",
"    \"\"\"Sample one city mention: the city name, a truncated name, or an alternate name.\n",
"\n",
"    A city is picked with ``random.choices`` weighted by ``city_weights``; one uniform\n",
"    draw then selects the surface form:\n",
"\n",
"    * draw <= both thresholds and the name is longer than 6 characters: the name\n",
"      with its last character dropped;\n",
"    * otherwise, draw <= ``actual_threshold``: the name unchanged;\n",
"    * otherwise: a random entry of ``city_info[city]``, or the name itself when\n",
"      that entry is empty or ``None``.\n",
"\n",
"    Args:\n",
"        city_info: mapping of city name -> sequence of alternate names.\n",
"        city_weights: mapping of city name -> sampling weight; it is indexed with\n",
"            every key of ``city_info``.\n",
"        actual_threshold: upper bound of the draw for returning the city name.\n",
"        city_partial_threshold: upper bound of the draw for returning the\n",
"            truncated name.\n",
"\n",
"    Returns:\n",
"        The sampled string.\n",
"    \"\"\"\n",
"    cities = list(city_info.keys())\n",
"    weights = [city_weights[city] for city in cities]\n",
"    city_random = random.choices(cities, weights=weights, k=1)[0]\n",
"    # A single draw is compared against both thresholds, so the truncated form\n",
"    # can only be produced when the draw is below the smaller of the two.\n",
"    rand_val = random.random()\n",
"    if rand_val <= actual_threshold:\n",
"        if rand_val <= city_partial_threshold and len(city_random) > 6:\n",
"            return city_random[:-1]\n",
"        return city_random\n",
"    alternates = city_info[city_random]\n",
"    # random.choice raises IndexError on an empty sequence; a city without\n",
"    # alternate names falls back to its own name instead of crashing.\n",
"    if alternates is None or len(alternates) == 0:\n",
"        return city_random\n",
"    return random.choice(alternates)\n",
"\n",
"def get_sample_from_states(state_info, actual_threshold=0.5):\n",
"    \"\"\"Sample a state mention: a key of ``state_info`` or the value stored under it.\n",
"\n",
"    Args:\n",
"        state_info: mapping from one spelling of a state to another.\n",
"            NOTE(review): presumably code <-> name - confirm against the caller.\n",
"        actual_threshold: a uniform draw <= this returns the key, otherwise the value.\n",
"\n",
"    Returns:\n",
"        A uniformly chosen key, or ``state_info[key]`` exactly as stored.\n",
"    \"\"\"\n",
"    chosen_key = random.choice(list(state_info.keys()))\n",
"    keep_key = random.random() <= actual_threshold\n",
"    if keep_key:\n",
"        return chosen_key\n",
"    # The value is wrapped in a one-element list, so it comes back whole even if it\n",
"    # is itself a list. NOTE(review): if values are lists of alternates this should\n",
"    # probably pick one of them - confirm before changing.\n",
"    stored_value = state_info[chosen_key]\n",
"    return random.choice([stored_value])\n",
"\n",
"def get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8, comma_threshold=0.6):\n",
"    \"\"\"Sample a combined city + state string from the weighted city tables.\n",
"\n",
"    One uniform draw selects the format:\n",
"\n",
"    * draw <= ``comma_threshold`` (and <= ``state_code_threshold``): ``city_name, state_code``\n",
"    * otherwise, draw <= ``state_code_threshold``: ``city_name state_code``\n",
"    * draw > ``state_code_threshold``: ``city_name, state_name``\n",
"\n",
"    Args:\n",
"        city_state_code_info: table with ``city_name``, ``state_code`` and ``city_weight`` columns.\n",
"        city_state_name_info: table with ``city_name``, ``state_name`` and ``city_weight`` columns.\n",
"        state_code_threshold: upper bound of the draw for using the state code.\n",
"        comma_threshold: upper bound of the draw for the comma-separated code form.\n",
"\n",
"    Returns:\n",
"        The formatted string for one row sampled with ``city_weight`` as weights.\n",
"    \"\"\"\n",
"    draw = random.random()\n",
"    if draw <= state_code_threshold:\n",
"        table, state_column = city_state_code_info, 'state_code'\n",
"        separator = ', ' if draw <= comma_threshold else ' '\n",
"    else:\n",
"        table, state_column, separator = city_state_name_info, 'state_name', ', '\n",
"    sampled_row = table.sample(1, weights='city_weight', replace=True)\n",
"    city_and_state = sampled_row[['city_name', state_column]].values.tolist()[0]\n",
"    return separator.join(city_and_state)\n",
"\n",
"def get_random_choice_from_list(choices_list):\n",
"    \"\"\"Return one element of ``choices_list`` picked with ``random.choice``.\"\"\"\n",
"    picked = random.choice(choices_list)\n",
"    return picked\n",
" \n",
"def get_sample_fake_city():\n",
"    \"\"\"Return one entry of the notebook-level ``fake_cities`` collection via ``get_random_choice_from_list``.\"\"\"\n",
"    fake_city = get_random_choice_from_list(fake_cities)\n",
"    return fake_city\n",
"\n",
"def get_sample_fake_state_code():\n",
"    \"\"\"Return one entry of the notebook-level ``fake_state_codes`` collection via ``get_random_choice_from_list``.\"\"\"\n",
"    fake_state_code = get_random_choice_from_list(fake_state_codes)\n",
"    return fake_state_code\n",
"\n",
"def get_sample_fake_state_name():\n",
"    \"\"\"Return one entry of the notebook-level ``fake_state_names`` collection via ``get_random_choice_from_list``.\"\"\"\n",
"    fake_state_name = get_random_choice_from_list(fake_state_names)\n",
"    return fake_state_name"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4f1fd1a-cd74-4ebe-b9aa-c3cd07dcf2bb",
"metadata": {},
"outputs": [],
"source": [
"# for _ in range(100):\n",
"# print(get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb760f94-1dba-4ec3-816f-7cfb3bb9b7b0",
"metadata": {},
"outputs": [],
"source": [
"templates = [\n",
" # Simple City-Based Queries\n",
" \"weather {city}\",\n",
" \"{city} temperature\",\n",
" \"sushi {city}\",\n",
" \"ramen {city}\",\n",
" \"pizza {city}\",\n",
" \"plumber {city}\",\n",
" \"electrician {city}\",\n",
" \"roof repair {city}\",\n",
" \"physio therapy {city}\",\n",
" \"hospital {city}\",\n",
" \"doctor {city}\",\n",
" \"nurse {city}\",\n",
" \"home improvement {city}\",\n",
" \"home services {city}\",\n",
" \"weather forecast {city}\",\n",
" \"current weather {city}\",\n",
" \"best restaurants {city}\",\n",
" \"top yelp reviews {city}\",\n",
" \"places to visit in {city}\",\n",
" \"best cafes in {city}\",\n",
" \"emergency services {city}\",\n",
" \"gyms in {city}\",\n",
" \"car repair {city}\",\n",
" \"florist {city}\",\n",
" \"lawyers in {city}\",\n",
" \"real estate agents {city}\",\n",
" \"hiking trails {city}\",\n",
" \"parks in {city}\",\n",
" \"movie theaters {city}\",\n",
" \"top hotels in {city}\",\n",
" \"events in {city} this weekend\",\n",
" \"pharmacies {city}\",\n",
" \"{food} near me {city}\",\n",
" \"coffee near me {city}\",\n",
" \"breakfast near me {city}\",\n",
" \"restaurants near me {city}\",\n",
"\n",
" # State-Based Queries\n",
" \"home services in {state}\",\n",
" \"best restaurants in {state}\",\n",
" \"real estate agents {state}\",\n",
" \"roof repair services {state}\",\n",
" \"hospitals in {state}\",\n",
" \"weather {state}\",\n",
" \"temperature {state}\",\n",
" \"physio therapy {state}\",\n",
" \"doctors in {state}\",\n",
" \"top-rated plumbers {state}\",\n",
" \"electricians {state}\",\n",
" \"emergency services {state}\",\n",
" \"sushi {state}\",\n",
" \"ramen {state}\",\n",
" \"pizza {state}\",\n",
" \"parks in {state}\",\n",
" \"hiking trails {state}\",\n",
" \"pharmacies in {state}\",\n",
" \"best cafes {state}\",\n",
" \"movie theaters {state}\",\n",
"\n",
" # City-State Combination Queries (Now using {city_state})\n",
" \"weather {city_state}\",\n",
" \"{city_state} temperature\",\n",
" \"sushi {city_state}\",\n",
" \"plumber {city_state}\",\n",
" \"best restaurants in {city_state}\",\n",
" \"top-rated roof repair {city_state}\",\n",
" \"hospital {city_state}\",\n",
" \"physio therapy {city_state}\",\n",
" \"doctor {city_state}\",\n",
" \"events in {city_state} this weekend\",\n",
" \"lawyers in {city_state}\",\n",
" \"home improvement services {city_state}\",\n",
" \"florist {city_state}\",\n",
" \"best cafes in {city_state}\",\n",
" \"parks in {city_state}\",\n",
" \"movie theaters {city_state}\",\n",
" \"top hotels in {city_state}\",\n",
" \"emergency services {city_state}\",\n",
" \"car repair {city_state}\",\n",
" \"pharmacies {city_state}\",\n",
"\n",
" \"sushi {city_state}\",\n",
" \"ramen {city_state}\",\n",
" \"pizza {city_state}\",\n",
" \"parks {city_state}\",\n",
" \"hiking trails {city_state}\",\n",
" \"pharmacies {city_state}\",\n",
" \"best cafes {city_state}\",\n",
" \"movie theaters {city_state}\",\n",
" \"hamburgers {city_state}\",\n",
" \"burgers {city_state}\",\n",
" \"pasta {city_state}\",\n",
" \"salads {city_state}\",\n",
" \"vegan food {city_state}\",\n",
" \"fried chicken {city_state}\",\n",
" \"ramen {city_state}\",\n",
" \"tacos {city_state}\",\n",
" \"sandwiches {city_state}\",\n",
" \"noodles {city_state}\",\n",
" \"soups {city_state}\",\n",
" \"cakes {city_state}\",\n",
" \"ice cream {city_state}\",\n",
" \"steak {city_state}\",\n",
" \"seafood {city_state}\",\n",
" \"breakfast food {city_state}\",\n",
" \"brunch {city_state}\",\n",
" \"desserts {city_state}\",\n",
" \n",
" # CITY state order swapped\n",
" \"{city_state} sushi\",\n",
" \"{city_state} ramen\",\n",
" \"{city_state} pizza\",\n",
" \"{city_state} parks\",\n",
" \"{city_state} hiking trails\",\n",
" \"{city_state} pharmacies\",\n",
" \"{city_state} best cafes\",\n",
" \"{city_state} movie theaters\",\n",
" \"{city_state} hamburgers\",\n",
" \"{city_state} burgers\",\n",
" \"{city_state} pasta\",\n",
" \"{city_state} salads\",\n",
" \"{city_state} vegan food\",\n",
" \"{city_state} fried chicken\",\n",
" \"{city_state} ramen\",\n",
" \"{city_state} tacos\",\n",
" \"{city_state} sandwiches\",\n",
" \"{city_state} noodles\",\n",
" \"{city_state} soups\",\n",
" \"{city_state} cakes\",\n",
" \"{city_state} ice cream\",\n",
" \"{city_state} steak\",\n",
" \"{city_state} seafood\",\n",
" \"{city_state} breakfast food\",\n",
" \"{city_state} brunch\",\n",
" \"{city_state} desserts\",\n",
" \n",
" # Organization-Based Queries\n",
" \"{organization} in {city_state}\",\n",
" \"contact {organization} in {city}\",\n",
" \"locations of {organization} in {state}\",\n",
" \"does {organization} provide home repair services in {city}?\",\n",
" \"can I book a doctor appointment at {organization} in {state}?\",\n",
" \"does {organization} offer roof repair in {city_state}?\",\n",
" \"hours of {organization} in {city}\",\n",
" \"{organization} reviews in {state}\",\n",
" \"best rated {organization} in {city_state}\",\n",
" \"nearest branch of {organization} in {city}\",\n",
" \n",
" # Person-Based Queries\n",
" \"Where is {person} hosting an event?\",\n",
" \"Can I meet {person} in {city_state}?\",\n",
" \"Is {person} available for an appointment in {city}?\",\n",
" \"Is {person} traveling to {state} next week?\",\n",
" \"Does {person} have a speech in {city_state}?\",\n",
" \n",
" # Mixed and Specialized Queries\n",
" \"roof repair near {city}\",\n",
" \"best sushi in {city_state}\",\n",
" \"what's the weather forecast for {city}?\",\n",
" \"who are the top doctors in {city_state}?\",\n",
" \"restaurants near {city} with good reviews\",\n",
" \"plumbing services in {city_state}\",\n",
" \"upcoming events in {city} this weekend\",\n",
" \"find hiking trails in {city_state}\",\n",
" \"local electricians in {city_state}\",\n",
" \"ramen places in {city}\",\n",
" \"home improvement contractors near {city_state}\",\n",
" \"best pizza near {city}\",\n",
" \"does {organization} operate in {city_state}?\",\n",
" \"find top-rated hospitals in {city_state}\",\n",
" \"home maintenance services in {city_state}\",\n",
" \"weather forecast for {city} this weekend\",\n",
" \"roof repair specialists in {city}\",\n",
" \"top-rated movie theaters in {city_state}\",\n",
" \n",
"\n",
" # City-State Queries\n",
" \"Best {restaurant} in {city_state}\",\n",
" \"Top-rated {restaurant} in {city_state}\",\n",
" \"Affordable {restaurant} in {city_state}\",\n",
" \"Where to find the best {food} in {city_state}?\",\n",
" \"Popular {food} places in {city_state}\",\n",
" \"Top destinations for {travel} in {city_state}\",\n",
" \"Best deals on {travel} in {city_state}\",\n",
" \"Where to eat {food} in {city_state}?\",\n",
" \"What are the most famous {restaurant} in {city_state}?\",\n",
" \"Top {food} restaurants in {city_state} this weekend\",\n",
"\n",
" # Non-City/State Queries\n",
" \"Best {restaurant} in the country\",\n",
" \"Where to find the best {food} near me?\",\n",
" \"Top destinations for {travel} this summer\",\n",
" \"Best deals on {travel} packages\",\n",
" \"Where to find cheap {travel} options?\",\n",
" \"Popular {food} dishes in the USA\",\n",
" \"Best {restaurant} chains in the country\",\n",
" \"What are the healthiest {food} options?\",\n",
" \"How to book affordable {travel} for families?\",\n",
" \"Most popular {restaurant} for takeout\",\n",
"\n",
" # Additional Templates\n",
" \"What is the best {food} to eat for dinner?\",\n",
" \"Where to order {food} online?\",\n",
" \"Best {restaurant} for date night\",\n",
" \"Top {travel} websites for booking vacations\",\n",
" \"Where to find {restaurant} reviews?\",\n",
" \"What are the top-rated {travel} apps?\",\n",
" \"Best {restaurant} near tourist attractions\",\n",
" \"What is the most popular {food} in the USA?\",\n",
" \"Best deals on {travel} for students\",\n",
" \"Top {restaurant} for family gatherings\",\n",
" \"Most affordable {food} delivery services\",\n",
" \"What are the best {travel} insurance options?\",\n",
" \"How to find luxury {restaurant} reservations\",\n",
" \"Where to get authentic {food} near me?\",\n",
" \"Top {restaurant} for business lunches\",\n",
" \"How to plan a {travel} adventure?\",\n",
" \"Best {restaurant} for weekend brunch\",\n",
" \"What are the most popular {food} trends?\",\n",
" \"Best {restaurant} for a large group\",\n",
" \"How to get discounts on {travel} bookings?\",\n",
"\n",
" # Product-Based Queries\n",
" \"Where to buy {product} online?\",\n",
" \"Best deals on {product}\",\n",
" \"How to repair a {product}?\",\n",
" \"Latest reviews of {product}\",\n",
" \"When will the next {product} be released?\",\n",
" \"Top features of {product}\",\n",
" \"Is {product} worth buying in 2024?\",\n",
" \"User reviews of {product}\",\n",
" \"Alternatives to {product}\",\n",
" \"What is the price of {product}?\",\n",
"\n",
" # Country-Based Queries\n",
" \"How to travel to {country}?\",\n",
" \"Best tourist destinations in {country}\",\n",
" \"Top hotels to stay in {country}\",\n",
" \"Do I need a visa to visit {country}?\",\n",
" \"Cultural traditions in {country}\",\n",
" \"What is the official language of {country}?\",\n",
" \"How to do business in {country}?\",\n",
" \"What are the top exports of {country}?\",\n",
" \"Current political situation in {country}\",\n",
" \"Famous landmarks in {country}\",\n",
"\n",
" # Service-Based Queries\n",
" \"How to cancel my {service} subscription?\",\n",
" \"Is {service} worth the price?\",\n",
" \"How does {service} compare to competitors?\",\n",
" \"User reviews of {service}\",\n",
" \"How to get a discount on {service}?\",\n",
" \"What are the benefits of {service}?\",\n",
" \"Best alternatives to {service}\",\n",
" \"How to troubleshoot issues with {service}?\",\n",
" \"Does {service} have a free trial?\",\n",
" \"Is {service} available internationally?\",\n",
"\n",
" # Cars-Based Queries\n",
" \"What is the top speed of {car}?\",\n",
" \"User reviews of {car}\",\n",
" \"How to finance a {car}?\",\n",
" \"Fuel efficiency of {car}\",\n",
" \"How to buy a second-hand {car}?\",\n",
" \"What are the safety features of {car}?\",\n",
" \"Maintenance costs of owning a {car}\",\n",
" \"What is the resale value of {car}?\",\n",
" \"Is {car} electric or gas-powered?\",\n",
" \"Best upgrades for {car}\",\n",
"\n",
" # Gadgets-Based Queries\n",
" \"What are the best apps for {gadget}?\",\n",
" \"How to set up a {gadget}?\",\n",
" \"User reviews of {gadget}\",\n",
" \"Best accessories for {gadget}\",\n",
" \"What are the health benefits of using a {gadget}?\",\n",
" \"What is the battery life of {gadget}?\",\n",
" \"How to sync {gadget} with my phone?\",\n",
" \"Alternatives to {gadget}\",\n",
" \"What are the best productivity apps for {gadget}?\",\n",
" \"Is {gadget} waterproof?\",\n",
"\n",
" # Stocks-Based Queries\n",
" \"What is the latest price of {stock}?\",\n",
" \"How to buy shares of {stock}?\",\n",
" \"Is {stock} a good investment in 2024?\",\n",
" \"What are analysts saying about {stock}?\",\n",
" \"Current stock performance of {stock}\",\n",
" \"What is the market cap of {stock}?\",\n",
" \"How to invest in {stock}?\",\n",
" \"Latest earnings report of {stock}\",\n",
" \"What are the dividend yields of {stock}?\",\n",
" \"How to trade {stock} on the stock market?\",\n",
"\n",
" # Money-Based Queries\n",
" \"How to convert {money} to another currency?\",\n",
" \"Best ways to transfer {money} internationally\",\n",
" \"What are the risks of using {money}?\",\n",
" \"How to save {money} for the future?\",\n",
" \"What is the best way to invest {money}?\",\n",
" \"How to protect {money} from fraud?\",\n",
" \"What are the fees for using {money}?\",\n",
" \"Is {money} safe for online transactions?\",\n",
" \"Best apps for managing {money}\",\n",
" \"How to track spending with {money}?\",\n",
"\n",
" # Finance-Based Queries\n",
" \"How to invest in a {finance}?\",\n",
" \"What are the benefits of having a {finance}?\",\n",
" \"How to calculate the returns on {finance}?\",\n",
" \"What are the risks of investing in {finance}?\",\n",
" \"How to get advice for managing my {finance}?\",\n",
" \"How to apply for a {finance}?\",\n",
" \"What are the tax benefits of {finance}?\",\n",
" \"What are the best options for a {finance}?\",\n",
" \"How to open a {finance} account?\",\n",
" \"What is the interest rate on {finance}?\",\n",
"\n",
" # sports_term, location_and_landmark, activity_and_event, food_m, transport_and_direction\n",
" # incomplete or misspelled sport/activity names\n",
" \"{sports_term} near me\", \n",
" \"find {sports_term}\", \n",
" \"{sports_term} schedule\", \n",
" \"{sports_term} news\", \n",
" \"book {sports_term} tickets\", \n",
" \"{sports_term} team\", \n",
" \"{sports_term} game time\", \n",
" \"when is the {sports_term} game\", \n",
" \"top {sports_term} players\", \n",
" \"local {sports_term} clubs\", \n",
" \"where to play {sports_term}\", \n",
" \"best {sports_term} venues\", \n",
" \"{sports_term} tournament\",\n",
" \"{sports_term}\",\n",
"\n",
" # Generic landmarks and location queries\n",
" \"{location_and_landmark} nearby\", \n",
" \"famous {location_and_landmark}\", \n",
" \"{location_and_landmark} open now\", \n",
" \"visit {location_and_landmark}\", \n",
" \"{location_and_landmark} directions\", \n",
" \"how to get to {location_and_landmark}\", \n",
" \"nearest {location_and_landmark}\", \n",
" \"{location_and_landmark} address\", \n",
" \"top-rated {location_and_landmark}\", \n",
" \"{location_and_landmark} hours\", \n",
" \"find {location_and_landmark} near me\", \n",
" \"{location_and_landmark} entry fee\", \n",
" \"best {location_and_landmark} in {city}\",\n",
"\n",
" # Food and dining queries\n",
" \"{food_m} place\", \n",
" \"find {food_m}\", \n",
" \"best {food_m} spot\", \n",
" \"{food_m} delivery\", \n",
" \"{food_m} open near me\", \n",
" \"order {food_m}\", \n",
" \"{food_m} deals\", \n",
" \"{food_m} options\", \n",
" \"{food_m} near me\", \n",
" \"{food_m} reservation\", \n",
" \"top-rated {food_m} restaurants\", \n",
" \"{food_m} reviews\", \n",
" \"{food_m} menu\", \n",
" \"popular {food_m} dishes\", \n",
" \"where to eat {food_m}\",\n",
"\n",
" # activities_and_events\n",
" \"{activity_and_event} tickets\", \n",
" \"nearest {activity_and_event}\", \n",
" \"{activity_and_event} today\", \n",
" \"upcoming {activity_and_event}\", \n",
" \"book {activity_and_event}\", \n",
" \"{activity_and_event} in {city}\", \n",
" \"find {activity_and_event}\", \n",
" \"{activity_and_event} schedule\", \n",
" \"{activity_and_event} near me\", \n",
" \"top-rated {activity_and_event} venues\", \n",
" \"{activity_and_event} details\", \n",
" \"how to attend {activity_and_event}\", \n",
" \"{activity_and_event} location\", \n",
" \"{activity_and_event} opening hours\",\n",
" \"{activity_and_event}\",\n",
"\n",
" # Single-word incomplete or ambiguous queries (standalone)\n",
" # Sports and Games (single or incomplete)\n",
" \"footbal\", \"baske\", \"golf\", \"sush\", \"pizz\", \"zoo\", \"conc\", \"direc\", \n",
" \"theate\", \"stadiu\", \"brunc\", \"tourn\", \"parad\", \"swimmin\", \"train\", \"taxi\", \n",
" \"game\", \"meet\", \"mountain\", \"beac\", \"lake\", \"forest\", \"ligh\", \"restauran\", \n",
" \"parki\", \"stor\", \"monumen\", \"aren\", \"boardwal\",\n",
" # Locations and Landmarks (single or incomplete)\n",
" \"statue\", \"museum\", \"plaza\", \"zoo\", \"church\", \"theater\", \"stadium\", \"mountain\", \n",
" \"park\", \"lake\", \"beach\", \"river\", \"palace\", \"cathedra\", \"mansion\", \"monument\", \n",
" \"temple\", \"observato\", \"canyon\", \"garden\", \"conservato\", \"boardwal\", \"forest\", \n",
" \"pier\", \"lighthouse\", \"arena\", \"campgroun\", \"arch\", \"reservoi\", \"dam\", \"fountai\", \n",
" \"waterfal\", \"galleri\", \"amphitheate\", \"sculptur\", \"trail\", \"cliff\", \"tower\", \"islan\",\n",
" # Activities and Events (single or incomplete)\n",
" \"conc\", \"exhib\", \"meet\", \"parad\", \"festi\", \"tourn\", \"game\", \"sho\", \"even\", \"gala\", \n",
" \"confere\", \"seminar\", \"webina\", \"worksho\", \"lectur\", \"symposiu\", \"screenin\", \n",
" \"rall\", \"celebratio\", \"ceremon\", \"get-togethe\", \"perfor\", \"gatherin\", \"competitio\", \n",
" \"maratho\", \"speec\", \"workout\", \"exercis\", \"demonstratio\", \"ceremony\", \"readin\", \n",
" \"daytrip\", \"lectur\", \"social\", \"activit\", \"performanc\", \"worksho\", \"openin\", \n",
" \"finale\", \"comedy\", \"poetr\", \"talent\", \"match\",\n",
" # Restaurants and Food Types (single or incomplete)\n",
" \"sush\", \"pizz\", \"ramen\", \"bbq\", \"vega\", \"steak\", \"taco\", \"burg\", \"pasta\", \"brunc\", \n",
" \"desse\", \"drink\", \"grill\", \"bake\", \"buffet\", \"sandwich\", \"noodle\", \"cafe\", \n",
" \"taver\", \"gastro\", \"bistro\", \"deli\", \"saloo\", \"barbecue\", \"snack\", \"confectio\", \n",
" \"pub\", \"salad\", \"cuisine\", \"fries\", \"wings\", \"pantr\", \"meatbal\", \"sub\", \"omel\", \n",
" \"crepe\", \"wrap\", \"beverag\", \"dessert\", \"smoothie\", \"juice\", \"shake\", \"frappe\", \"coffee\",\n",
" # Transport and Directions (single or incomplete)\n",
" \"direc\", \"map\", \"bus\", \"train\", \"car\", \"park\", \"taxi\", \"subwa\", \"fly\", \"plane\", \n",
" \"ticke\", \"pass\", \"ferr\", \"bicycl\", \"scoote\", \"shuttl\", \"walkin\", \"rideshar\", \n",
" \"transi\", \"toll\", \"metr\", \"road\", \"route\", \"stop\", \"junctio\", \"termina\", \"highwa\", \n",
" \"pathwa\", \"drivewa\", \"loop\", \"intersectio\", \"trailhead\", \"tub\", \"sidestro\", \n",
" \"crosswal\", \"rout\", \"navigatio\", \"crossing\", \"pave\", \"deck\", \"lane\",\n",
" # Technology and Gadgets (single or incomplete)\n",
" \"lapt\", \"smartphon\", \"comput\", \"tablet\", \"earbuds\", \"bluetooth\", \"charg\", \"cabl\", \n",
" \"headset\", \"monitor\", \"consol\", \"keyboard\", \"drive\", \"storag\", \"gaming\", \"mouse\", \n",
" \"projector\", \"flashdriv\", \"powerban\", \"adapter\", \"webcam\", \"router\", \"modem\", \n",
" \"camcorder\", \"printer\", \"copier\", \"recorde\", \"remote\", \"surge\", \"extend\", \"plug\", \n",
" \"portabl\", \"backu\", \"networ\", \"recharge\", \"uplo\", \"downlo\", \"strea\", \"screencas\", \n",
" \"googl\", \"apple\", \"micros\", \"andr\",\n",
"\n",
" # actual city and states\n",
" \"{food} {city}\", \n",
" \"{food} {state}\", \n",
" \"{city} {food}\", \n",
" \"{state} {food}\", \n",
" \"{food} in {city}\", \n",
" \"{food} in {state}\", \n",
"\n",
" # fake cities and states\n",
" \"{food} {fake_cty}\", \n",
" \"{food} {fake_state_cd}\", \n",
" \"{food} {fake_state_nam}\", \n",
" \"{fake_cty} {food}\", \n",
" \"{fake_state_cd} {food}\", \n",
" \"{fake_state_nam} {food}\", \n",
" \"{food} in {fake_cty}\", \n",
" \"{food} in {fake_state_cd}\", \n",
" \"{food} in {fake_state_nam}\", \n",
"\n",
" # celebrities\n",
" \"{celebrity}\",\n",
" \"{celebrity} age\",\n",
" \"{celebrity} net worth\",\n",
" \"{celebrity} movies\",\n",
" \"What shows has {celebrity} been on?\",\n",
" \"What awards has {celebrity} won?\",\n",
" \"Where does {celebrity} live?\",\n",
" \"What are {celebrity}'s upcoming projects?\",\n",
" \"{celebrity} diet\",\n",
" \"is {celebrity} married?\",\n",
" \"does {celebrity} live in {city}\",\n",
"\n",
" ## unknown random queries\n",
" 'snoozlegrip', 'shenanigans', 'kerplunk', 'clip', 'snappyy', 'spindlywhack', 'crinkly', 'pressed enter too soon', \n",
" 'try this', 'query here', 'mistyped selection', 'smorgasbord', 'crumplify', 'snooze', 'twonkle', 'bamboozlemate', \n",
" 'this doesn’t matter', 'zap', 'mind blank', 'hiss', 'snagged', 'splurgy', 'snagglebash', 'guess', 'zapz', 'frap', \n",
" 'blotter', \"don't even know\", 'don’t know answer', 'spindletastic', 'zizzlesplat', 'jinkled', 'placeholder search', \n",
" 'uncertain search', 'splode', 'abcxyz', 'twangleblop', 'shifty', 'bumfuzzle', 'plunge', 'thingy', \n",
" 'swooshenator', 'quark', 'tatterblast', 'frizzlefry', 'something random', 'puff', 'blobby', 'placeholder attempt', \n",
" 'weird example', 'wiggle', 'snortleboo', 'bouncy', 'qwerty', 'whirl', 'nix', 'idk what', 'random search', \n",
" 'glimmering', 'guzzle', 'strange text', 'accidental hit', 'forgot keypress', 'dazzleplunk', 'snurply', \n",
" 'confused', 'weird gibberish', 'idc either', 'test123', 'huff', 'supercalifragilistic', 'clap', 'whoopsie', 'nump', \n",
" 'lorem ipsum', 'snuffle', 'unknown phrase', 'whizz', 'bloop', 'glitch', 'zomp', 'clappy', 'gush', 'zappletastic', \n",
" 'hooey', 'bing', 'slap', 'ting', 'miscellaneous', 'jingle', 'idk just looking', 'twangy', 'dinglefrizzle', \n",
" 'just clicking', 'quizzical', 'splatterdash', 'kerplunkitude', 'fizzlematic', 'piff', 'jazz', 'jib', 'random phrase', \n",
" 'flapper', 'uhmm', 'nothing much', 'sdf', 'snub', 'confusing example', 'keyboard smash', 'randomized words', \n",
" 'nothing useful', 'random sentence', 'placeholder input', 'splattergrip', 'zorp', 'fluffernutter', 'splopp', \n",
" 'incomplete search', 'check this out', 'woozle', 'bananarama', 'quiz', 'spiffy', 'undefined', 'confusing term', 'sploom', \n",
" 'randomized example', 'spliffy', 'ooze', 'blazing', 'uncertain input', 'unknown search', 'random guesses', \n",
" 'unknown', 'concept unclear', 'accidental input', 'sporkinator', 'whats this', 'maybe', 'ignore this', 'twinkle', \n",
" 'whatchamacallit', 'splank', 'weird thing', 'huh', 'into the unknown', 'chaos', 'wigglie', 'twistamatic', 'kerflapify', \n",
" 'twizzletude', 'mock', 'thud', 'shrug', 'grizzed', 'jibberjabber', 'weirdness', 'anything', 'plop', 'dazzlicious', \n",
" 'random selection', 'splatt', 'abracadabra', 'whooshenator', 'random mouse click', 'sparklefish', 'banal', \n",
" \"what's the word\", 'mistyped search', 'twinklebash', 'splush', 'splazz', 'forgot search term', 'crumplamatic', 'glee', \n",
" 'whizzy', 'whizzlemate', 'jumpy', 'dork', 'randomxyz', 'gobsmacktastic', 'no clue what', 'zazz', 'beyond the void', \n",
" 'weird try', 'drift', 'yank', 'yodelsnap', 'biff', 'forgot randomness', 'splatterblast', 'no idea', 'smooshify', \n",
" 'peep', 'rick', 'splendiferous', 'squishy', 'muff', 'flabbergizmo', 'confuzzled', 'I think so', 'zing', \n",
" 'meaningless typing', 'shush', 'zany', 'don’t need help', 'randomly chosen', 'warpydash', 'forgot words', \n",
" 'placeholder typing', 'spunky', 'spindleplop', 'crash', 'flabbergast', 'snaggleplop', 'hootnanny', 'blurp', \n",
" 'miff', 'snarkle', 'snookie', 'gleamitude', 'hello world', 'zag', 'accidental gibberish', 'nothing in mind', \n",
" 'bash', 'spiv', 'rift', 'don’t know what to search', 'splong', 'no point', 'forgot attempt', 'fluttermate', \n",
" 'flub', 'guff', 'dazzled', 'doodad', 'forgot term', 'blotchy', 'odd', 'kerplazzle', 'grubby', 'try to see', 'glop', \n",
" 'whooshify', 'snicker', 'snuffly', 'random thought', 'mixed up stuff', 'zapper', 'sort of searching', 'slushy', \n",
" 'blurification', 'mop', 'smit', 'splurge', 'meaningless input', 'quix', 'zapplarific', 'splang', 'zoinkalicious', \n",
" 'unclear selection', 'splushy', 'guesstimate', 'snazzie', 'what about this', 'input fail', 'codswallop', 'dink', 'splunk', \n",
" 'unclear', 'strange example', 'jitter', 'sploff', 'blip', 'unknown meaning', 'nope', 'gadzooks', 'odd example', \n",
" 'zappomatic', 'janglystorm', 'ink', 'wobbled', 'wigglyy', 'typed by mistake', 'twirly', 'lurk', 'kerplottify', \n",
" 'twizzlefang', 'muck', 'clunky', 'splatterific', 'clippy', 'oops input', 'what am I doing', 'qazwsxedc', 'does it matter', \n",
" 'nonsensical', 'swooshinator', 'poiuuy', 'splish', 'mistyped query', 'squizzlewhack', 'what now', 'spluzz', 'glim', \n",
" 'placeholder keypress', 'mistyped randomness', 'what is it', 'don’t know why', 'quibbleplop', 'guess what', 'snizzlezap', \n",
" 'meaning of nothing', 'wiggles', 'zxcvbn', 'spur', 'uncertain term', 'what am I typing', 'zoodleblorp', 'floppy', 'asdfasdf', \n",
" 'confused input', 'unclear sentence', 'snortlematic', 'smooshinator', 'random term', 'searching something', \n",
" 'snorflemate', 'twinkly', 'skip', 'quib', 'forgotten term', 'oops', 'splodge', 'meaningless words', 'unclear input', \n",
" 'unclear phrase', 'zoom', 'sneeze', 'cat on keyboard', 'nincompoop', 'zappification', 'warpington', 'splurty', \n",
" 'do I know', 'splott', 'splurb', 'plink', 'dazzlematic', 'could be anything', 'lost thoughts', 'what', 'pizz', \n",
" 'jiggles', 'splodgy', 'twang', 'i forgot', 'meaningless term', 'unclear search', 'thunderplunk', 'just pressing keys', \n",
" 'splodgify', 'flit', 'snazzify', 'zoop', 'totally confused', 'quip', 'womp', 'wham', 'wigglyz', 'fuzzyy', 'why is this here', \n",
" 'malarkey', 'widget', 'don’t care', 'scoff', 'randomized search', 'unclear example', 'pop', 'quash', 'uh oh', \n",
" 'placeholder randomness', 'splatification', 'snickerplunk', 'nutterbutter', 'whisk', 'nibs', 'help', 'strange attempt', \n",
" 'blurptacular', 'gizmo', 'forgotten query', 'spazzy', 'ding', 'lost search', 'buzzing', 'hum', 'nonsensicality', \n",
" 'gloop', 'globby', 'lost meaning', 'plopperific', 'hard to say', 'snappy', 'don’t type this', 'blunderous', 'twizzlegrip', \n",
" 'flappy', 'random keypress', 'zizzlewhack', 'forgot what I typed', 'zingerdoodle', 'randomized attempt', 'unsure words', \n",
" 'strange sentence', 'asfjkl', 'frizz', 'idk', 'gobbledygook', 'flibbertigibbet', 'gadzookify', 'flabberzap', 'vroom', \n",
" 'splitch', 'glimmerstorm', 'blurt', 'frizzle', 'meaningless search', 'thingamajig', 'murmur', 'not this', 'sploof', \n",
" 'fiddlewhip', 'mumbojumbo', 'something strange', 'splurg', 'fake input', 'whiffle', 'forgot query', 'search mix', \n",
" 'yapplify', 'zippy', 'splurpy', 'splat', 'zoinks', 'bizz', 'crumby', 'meaningless query', 'snickerdoodle', 'weird word', \n",
" 'squidge', 'don’t know term', 'spangletude', 'spazzmatic', 'just testing', 'baffled', 'splurt', 'gaze', 'frizzy', \n",
" 'bamboozling', 'slurp', 'zappertude', 'splorch', 'swooshtastic', 'dunk', 'honk', 'smudgy', 'flimmerstorm', 'tizz', \n",
" 'uncertain randomness', 'jangletude', 'perhaps this', 'placeholder search term', 'whoosh', 'spike', 'glitterbop', \n",
" 'idiosyncratic', 'odd typing', 'blob', 'bazzlemate', 'crumpleton', 'clutterbomb', 'whatever', 'kerfuffle', 'test input', \n",
" 'randomized keypress', 'meaningless randomness', 'why not', 'snizzleblap', 'bonk', 'forgot search', 'zonk', 'whatsisname', \n",
" 'doesn’t matter', 'splurgz', 'twig', 'ramblethorp', 'fake query', 'ping', 'smack', 'buzz', 'tingly', 'warpydoodle', \n",
" 'filler words', 'buzzed', 'unclear thought', 'weird input', 'blap', 'snazzy', 'look for this', 'snorkelwhip', 'spoon', \n",
" 'just guessing', 'glitche', 'swirl', 'snooker', 'search fail', 'random gibberish', 'abstract thought', 'spindelicious', \n",
" 'snorple', 'fell asleep typing', 'splunge', 'twit', 'grippy', 'flip', 'whatsisface', 'maybe something', 'bamboozle', \n",
" 'zinger', 'drizzleblip', 'splonky', 'what do I search', 'blat', 'another try', 'odd randomness', 'yarn', 'squib', \n",
" 'confused term', 'flabbergasted', 'testing input', 'don’t know', 'thunderbop', 'blurpsational', 'janglydash', 'brouhaha', \n",
" 'find out about', 'strange randomness', 'kerplizzle', 'meaningless attempt', 'spud', 'placeholder term', 'woof', 'splaff', \n",
" 'jigglez', 'fuzzed', 'blahblah', 'grizzle', 'something here', 'blink', 'snuggly', 'yelp', 'chop', 'eternal question', 'splift', \n",
" 'what do you mean', 'hullabazoo', 'cloggy', 'wrong key pressed', 'test again', 'don’t ask me', 'blur', 'twisty', 'flapperdash', \n",
" 'crinklewhip', 'plinky', 'gobbleplop', 'I don’t understand', 'random', 'dummy text', 'blurblenator', 'try something', 'input here', \n",
" 'thing', 'fringe', 'no answer', 'placeholder selection', 'test', 'spangleplop', 'splash', 'lost in thought', 'zest', \n",
" 'fiddleplop', 'bunk', 'snag', 'vex', 'placeholder randomness example', 'spat', 'placeholder phrase', 'random search term', \n",
" 'squigg', 'tinge', 'random words', 'unknown query', 'not useful', 'snuzzlefrump', 'type here', 'snuzzle', 'drip', 'gibberish', \n",
" 'hodgepodge', 'forgot the term', 'completely random', 'doesn’t make sense', 'lost', 'splatterstorm', 'meaningless text', \n",
" 'twizzle', 'find something', 'twinkletude', 'zine', 'spunked', 'crikey', 'mistaken input', 'no idea what this is', 'spork', \n",
" 'glimmertastic', 'sloppy', 'twirky', 'abstract query', 'fluffytude', 'randomized selection', 'randomized randomness', \n",
" 'nudge', 'gawk', 'buzzer', 'nonsensical search', 'i was curious', 'zapplify', 'cloppy', 'doohickey', 'snickly', 'doodle', \n",
" 'placeholder example', 'placeholder text', 'nonsense search', 'why search this', \"this doesn't work\", 'splendiferific', \n",
" 'crappy', 'what are words', 'clop', 'randomized term', 'weird', 'snazztastic', 'whizzbang', 'blaze', 'twangaloo', \n",
" 'strange keypress', 'placeholder query', 'skew', 'splink', 'lkjhgfd', 'unclear meaning', 'flummoxify', 'lollygag', \n",
" 'odd gibberish', 'clunk', 'snap', 'zapf', 'flummoxed', 'yawn', 'random input', 'strange word', 'zapplomatic', \n",
" 'does this work', 'gasp', 'typing nothing', 'idk anymore', 'empty thoughts', 'pluck', 'randomized test', \n",
" 'brain fog', 'squibbletude', 'fizzle', 'jinglyy', 'mistyped term', 'confused mind', 'random typing', 'asdfgh', \n",
" 'infinity', 'twist', 'something typed', 'kerplunktastic', 'just trying this', 'mistaken search', 'sparklematic', \n",
" 'woop', 'jittery', 'oopsie', 'snippy', 'splinky', 'splint', 'swooshification', 'spit', 'zinged', 'blop', 'lost words',\n",
" 'crux', 'blurbleplop', 'balderdash', 'perhaps not', 'flibber', 'snickerwhack', 'try later', 'zork', 'void', \n",
" 'accidental query', 'fumble', 'snarked', 'don’t care search', 'just looking', 'spindling', 'snip', 'squish', \n",
" 'blazer', 'splo', 'splunky', 'unclear randomness', 'spliff', 'not this either', 'nonsensical words', \n",
" 'testing random', 'snigglewhap', 'odd input', 'whizzlegrip', 'dazzlegrip', 'fling', 'meaning of gibberish', \n",
" 'weird thoughts', 'gunk', 'does this help', 'flux', 'wink', 'wonky', 'wisp', 'drizzlematic', 'another test', \n",
" 'test search', 'just wondering', 'crumblewhack', 'spaz', 'splung', 'skid', 'quirky', 'odd search', 'accidental term', \n",
" 'dunno', 'quizzicality', 'gleam', 'glimmer', 'don’t press enter', 'gadget', 'whizzleplop', 'don’t know exactly', \n",
" 'odd words', 'blotty', 'thunderblop', 'maybe not', 'spludge', 'discombobulated', 'stuff', 'halfway done', \n",
" 'sparklenator', 'zang', 'jolt', 'accidental search', 'what is going on', 'wiggler', 'mnbvcxz', 'yip', 'wriggle', \n",
" 'hullaballoo', 'janglenut', 'zapplesmash', 'janglitude', 'what is this', 'whip', 'tiddlywinks', 'wiggly', 'weird randomness', \n",
" 'sporkalicious', 'wriggy', 'meaningless selection', 'crumble', 'weird thought', 'splurch', 'don’t understand', \n",
" 'sploosh', 'yap', 'nonsense', 'wobble', 'question of life', 'randomly typed', 'snuggle', 'snizzlegrip', 'oops I typed', \n",
" 'zappy', 'twinkleplop', 'uncertain example', 'idc', 'mash', 'not sure', 'pandemonium', 'perhaps later', 'quirked', \n",
" 'smug', 'warp', 'dash', 'could be nothing', 'unsure search', 'jumbled phrases', 'hush', 'wibble', 'weird search', \n",
" 'quibberish', 'flop', 'discombobulate', 'this makes no sense', 'fizz', 'quirkitude', 'zingzang', 'dank', 'limitless', \n",
" 'this is random', 'crunch', 'vibe', 'nothing specific', 'forgot', 'not important', 'slosh', 'question mark', 'zoopendous', \n",
" 'flummify', 'splosh', 'splorp', 'splishy', 'snurkle', 'blah', 'guess answer', 'twitch', 'flap', 'snooperdoodle', \n",
" 'janglybits', 'snizzleflap', 'slush', 'snortlemate', 'quirk', 'void query', 'fizzled', 'lollygagging', 'wonkifying', \n",
" 'nothing', 'splunch', 'hullabaloo', 'thingamabob', 'dazzlebash', 'whizzie', 'this and that', 'shard', 'twix',\n",
" \"crumpled\", \"splizzle\", \"gargle\", \"mangled\", \"shamble\", \"wobblish\", \"drizzlepop\",\n",
" \"splinker\", \"fiddlest\", \"twizzlepop\", \"blurzzle\", \"snizzlewick\", \"wozzle\", \n",
" \"cracklepop\", \"glibbish\", \"twezzle\", \"boondock\", \"sizzleflip\", \"snigglemash\",\n",
" \"zazzle\", \"fizzlepot\", \"scramble\", \"tinglish\", \"sprozzle\", \"blimble\", \"zibble\",\n",
" \"slapdash\", \"gobstork\", \"ziggler\", \"flingle\", \"wrangly\", \"twizzlebit\", \"brambly\",\n",
" \"snubble\", \"splintery\", \"fizznack\", \"tibber\", \"quaggly\", \"whooshpop\", \"snibble\",\n",
" \"plunkish\", \"glimflash\", \"wobbert\", \"squidgy\", \"kerplonk\", \"fobble\", \"blurzy\",\n",
" \"scriggly\", \"smudgify\", \"tassler\", \"whipple\", \"snuzzify\", \"zaggle\", \"plonker\",\n",
" \"smizzle\", \"quiggle\", \"spongle\", \"shizzle\", \"drippity\", \"bogglepop\", \"twiddly\",\n",
" \"puzzleth\", \"flummish\", \"sniggleflop\", \"crumplish\", \"twiggle\", \"nubbish\", \n",
" \"splurkle\", \"whibber\", \"jibblish\", \"twonker\", \"fizzlewhip\", \"spazzle\", \"splorpish\",\n",
" \"snuffler\", \"hubble\", \"twinkler\", \"crumpler\", \"wimbley\", \"twazzle\", \"blurbonic\",\n",
" \"zapplepop\", \"flippery\", \"snuzle\", \"quizzwhip\", \"clatter\", \"garglunk\", \"splingle\",\n",
" \"drabbler\", \"spunkly\", \"jumbler\", \"snappish\", \"zingify\", \"buzzpop\", \"snizzlehop\",\n",
" \"plobber\", \"scribble\", \"twongle\", \"scrabbly\", \"sniggler\", \"bimblepop\", \"snorplebop\",\n",
" \"wizzle\", \"blimpy\", \"splinglepop\", \"frizzlepop\", \"grizzleton\", \"whizbang\", \n",
" \"tinklish\", \"blopple\", \"blurbit\", \"wozzly\", \"zingpong\", \"splimble\", \"twinklypop\",\n",
" \"spinkly\", \"snubbleton\", \"glozzle\", \"splonkle\", \"quizzle\", \"drizzlebot\", \"snarbly\",\n",
" \"twizzleth\", \"whizzleton\", \"crumblish\", \"snapple\", \"splozzle\", \"glimmish\", \n",
" \"plimbish\", \"snuzzleblop\", \"twinklish\", \"fizzywhip\", \"snorblish\", \"drizzler\", \n",
" \"flopplish\", \"smizzlepop\", \"crumpledash\", \"twizzlefizz\", \"plumbly\", \"smuzzle\",\n",
" \"tizzler\", \"gobblish\", \"splunkton\", \"jibberdash\", \"sproingly\", \"snizzler\", \n",
" \"glabble\", \"twinkleflip\", \"flobble\", \"twonklepop\", \"splittish\", \"grumblepop\",\n",
" \"whimblish\", \"splingledash\", \"snarpish\", \"twinklybit\", \"spindlish\", \"grubble\",\n",
" \"smarple\", \"twonkerish\", \"sniffly\", \"snibbleton\", \"grizzlepop\", \"tazzler\", \n",
" \"splinsh\", \"snazzler\", \"twinklepuff\", \"zopple\", \"glunkish\", \"crizzlepop\", \n",
" \"snarklebot\", \"whibblish\", \"flimmerdash\", \"splurpyton\", \"snuzzlepop\", \"wigglerish\",\n",
" \"sniggleplop\", \"jigglish\", \"splurble\", \"buzzsnip\", \"plomble\", \"splattypop\", \n",
" \"twinklepip\", \"twonglish\", \"flobber\", \"grimpish\", \"quaggler\", \"sporkish\", \n",
" \"drizzleth\", \"squiggler\", \"splobber\", \"ploppish\", \"snigglerish\", \"splingleth\",\n",
" \"grizzleblop\", \"sploblish\", \"snarbler\", \"smarvish\", \"quizzlet\", \"snapplish\",\n",
" \"snuzleflip\", \"plongish\", \"crizzlebot\", \"grimpish\", \"twinklebot\", \"blurpish\",\n",
" \"splopple\", \"gizzleth\", \"drizzlepuff\", \"twonklish\", \"snubbler\", \"blurblebot\",\n",
" \"splizzy\", \"twinkleton\", \"jibbler\", \"splizzlepop\", \"splurbit\", \"plobblish\", \n",
" \"crumplish\", \"snizzlebit\", \"twinklishbot\", \"spinkler\", \"snibbleflip\", \"wigglebot\",\n",
" \"twonglishbot\", \"snizzleton\", \"splongle\", \"blonker\", \"glimmerbit\", \"snarvish\",\n",
" \"love\", \"anger\", \"hope\", \"dream\", \"thought\", \"courage\", \n",
" \"strength\", \"patience\", \"birthday\", \"anniversary\", \n",
" \"vacation\", \"weekend\", \"holiday\", \"winter\", \"summer\", \n",
" \"autumn\", \"spring\", \"success\", \"failure\", \"freedom\", \"peace\", \"wisdom\", \n",
" \"kindness\", \"respect\", \"free\", \"freedom\", \"great\", \"best\", \"worst\", \"last\", \"first\", \"second\", \n",
" \"next\", \"there\", \"banana\", \"apple\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc3a0477-6ae4-4794-8fef-b562d79dbbe9",
"metadata": {},
"outputs": [],
"source": [
"len(templates)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad319a0f-7d5c-4e97-8632-c23007424ae0",
"metadata": {},
"outputs": [],
"source": [
"PERSON_ENTITY = \"{person}\"\n",
"ORG_ENTITY = \"{organization}\"\n",
"CITY_ENTITY = \"{city}\"\n",
"STATE_ENTITY = \"{state}\"\n",
"CITY_STATE_ENTITY = \"{city_state}\"\n",
"PRODUCT_ENTITY = \"{product}\"\n",
"COUNTRY_ENTITY = \"{country}\"\n",
"SERVICE_ENTITY = \"{services}\"\n",
"CAR_ENTITY = \"{car}\"\n",
"GADGET_ENTITY = \"{gadget}\"\n",
"STOCK_ENTITY = \"{stock}\"\n",
"MONEY_ENTITY = \"{money}\"\n",
"FINANCE_ENTITY = \"{finance}\"\n",
"TRAVEL_ENTITY = \"{travel}\"\n",
"FOOD_ENTITY = \"{food}\"\n",
"RESTAURANT_ENTITY = \"{restaurant}\"\n",
"SPORTS_TERMS_MISSING_ENTITY = \"{sports_term}\"\n",
"LOCATIONS_AND_LANDMARKS_ENTITY = \"{location_and_landmark}\"\n",
"ACTIVTIES_AND_EVENTS_ENTITY = \"{activity_and_event}\"\n",
"FOOD_MISSING_ENTITY = \"{food_m}\"\n",
"TRANSPORT_AND_DIRECTIONS_ENTITY = \"{transport_and_direction}\"\n",
"\n",
"FAKE_CITY_ENTITY = \"{fake_cty}\"\n",
"FAKE_STATE_CODE_ENTITY = \"{fake_state_cd}\"\n",
"FAKE_STATE_NAME_ENTITY = \"{fake_state_nam}\"\n",
"CELEBRITY_ENTITY = \"{celebrity}\"\n",
"\n",
"\n",
"def detect_entity(entity_name, template):\n",
" return entity_name in template\n",
"\n",
"def tokenize(text):\n",
" # Use regular expression to split words while keeping punctuation as separate tokens\n",
" return re.findall(r'\\w+|[^\\w\\s]', text)\n",
"\n",
"# Tokenize the query and generate corresponding NER labels\n",
"def tokenize_and_label(query, city, state, city_state, organization, person, celebrity):\n",
" tokens = tokenize(query) # Tokenize the query using the improved function\n",
" ner_labels = [0] * len(tokens) # Initialize all labels as \"O\" (outside any entity)\n",
" \n",
" # Label city_state entity\n",
" if city_state:\n",
" city_state_tokens = tokenize(city_state)\n",
" start_idx = find_token_index(tokens, city_state_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 9 # CSB-LOC (beginning of city_state)\n",
" for i in range(1, len(city_state_tokens)):\n",
" ner_labels[start_idx + i] = 10 # CSI-LOC (inside city_state)\n",
"\n",
" # Label city entity\n",
" if city:\n",
" city_tokens = tokenize(city)\n",
" start_idx = find_token_index(tokens, city_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 5 # CB-LOC (beginning of city)\n",
" for i in range(1, len(city_tokens)):\n",
" ner_labels[start_idx + i] = 6 # CI-LOC (inside city)\n",
" \n",
" # Label state entity\n",
" if state:\n",
" state_tokens = tokenize(state)\n",
" start_idx = find_token_index(tokens, state_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 7 # SB-LOC (beginning of state)\n",
" for i in range(1, len(state_tokens)):\n",
" ner_labels[start_idx + i] = 8 # SI-LOC (inside state)\n",
"\n",
" # Label organization entity\n",
" if organization:\n",
" org_tokens = tokenize(organization)\n",
" start_idx = find_token_index(tokens, org_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 3 # B-ORG (beginning of organization)\n",
" for i in range(1, len(org_tokens)):\n",
" ner_labels[start_idx + i] = 4 # I-ORG (inside organization)\n",
"\n",
" # Label person entity\n",
" if person:\n",
" person_tokens = tokenize(person)\n",
" start_idx = find_token_index(tokens, person_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 1 # B-PER (beginning of person)\n",
" for i in range(1, len(person_tokens)):\n",
" ner_labels[start_idx + i] = 2 # I-PER (inside person)\n",
" # Label person entity\n",
" if celebrity:\n",
" person_tokens = tokenize(celebrity)\n",
" start_idx = find_token_index(tokens, person_tokens)\n",
" if start_idx is not None:\n",
" ner_labels[start_idx] = 1 # B-PER (beginning of person)\n",
" for i in range(1, len(person_tokens)):\n",
" ner_labels[start_idx + i] = 2 # I-PER (inside person)\n",
" \n",
" return tokens, ner_labels\n",
"\n",
"# Function to find the starting index of an entity's tokens in the query tokens\n",
"def find_token_index(tokens, entity_tokens):\n",
" for i in range(len(tokens) - len(entity_tokens) + 1):\n",
" if tokens[i:i + len(entity_tokens)] == entity_tokens:\n",
" return i\n",
" return None\n",
"\n",
"def generate_queries(templates, n_queries=10000):\n",
" cnt = 0\n",
" fake_cnt = 0\n",
" celeb_cnt = 0\n",
" queries_with_labels = []\n",
" query_counter = Counter()\n",
" while cnt < n_queries:\n",
" if (cnt %10000) == 0:\n",
" print(f\"completed generating {cnt} queries\")\n",
" template = random.choice(templates)\n",
" # print(template)\n",
" person, organization, city, state, city_state = (None,) * 5\n",
" product, country, service, car, gadget, stock, money, finance, travel, food, restaurant = (None,) * 11\n",
" sports_term, location_and_landmark, activity_and_event, food_m, transport_and_direction = (None,) * 5\n",
" fake_cty, fake_state_cd, fake_state_nam, celebrity = (None,) * 4\n",
"\n",
" if detect_entity(PERSON_ENTITY, template):\n",
" person=get_random_choice_from_list(persons)\n",
" if detect_entity(ORG_ENTITY, template):\n",
" organization = get_random_choice_from_list(organizations)\n",
" if detect_entity(PRODUCT_ENTITY, template):\n",
" product = get_random_choice_from_list(products)\n",
" if detect_entity(COUNTRY_ENTITY, template):\n",
" country = get_random_choice_from_list(countries)\n",
" if detect_entity(COUNTRY_ENTITY, template):\n",
" service = get_random_choice_from_list(services)\n",
" if detect_entity(CAR_ENTITY, template):\n",
" car = get_random_choice_from_list(cars)\n",
" if detect_entity(GADGET_ENTITY, template):\n",
" gadget = get_random_choice_from_list(gadgets)\n",
" if detect_entity(STOCK_ENTITY, template):\n",
" stock = get_random_choice_from_list(stocks)\n",
" if detect_entity(MONEY_ENTITY, template):\n",
" money = get_random_choice_from_list(moneys)\n",
" if detect_entity(FINANCE_ENTITY, template):\n",
" finance = get_random_choice_from_list(finances)\n",
" if detect_entity(TRAVEL_ENTITY, template):\n",
" travel = get_random_choice_from_list(travels)\n",
" if detect_entity(FOOD_ENTITY, template):\n",
" food = get_random_choice_from_list(foods)\n",
" if detect_entity(RESTAURANT_ENTITY, template):\n",
" restaurant = get_random_choice_from_list(restaurants)\n",
" if detect_entity(SPORTS_TERMS_MISSING_ENTITY, template):\n",
" sports_term = get_random_choice_from_list(sports_terms_missing)\n",
" if detect_entity(LOCATIONS_AND_LANDMARKS_ENTITY, template):\n",
" location_and_landmark = get_random_choice_from_list(locations_and_landmarks)\n",
" if detect_entity(ACTIVTIES_AND_EVENTS_ENTITY, template):\n",
" activity_and_event = get_random_choice_from_list(activities_and_events)\n",
" if detect_entity(FOOD_MISSING_ENTITY, template):\n",
" food_m = get_random_choice_from_list(food_missing)\n",
" if detect_entity(TRANSPORT_AND_DIRECTIONS_ENTITY, template):\n",
" transport_and_direction = get_random_choice_from_list(transport_and_directions)\n",
"\n",
" if detect_entity(FAKE_CITY_ENTITY, template):\n",
" fake_cty = get_sample_fake_city()\n",
" if detect_entity(FAKE_STATE_CODE_ENTITY, template):\n",
" fake_state_cd = get_sample_fake_state_code()\n",
" if detect_entity(FAKE_STATE_NAME_ENTITY, template):\n",
" fake_state_nam = get_sample_fake_state_name()\n",
"\n",
" if detect_entity(CITY_ENTITY, template):\n",
" city=get_sample_from_cities(city_info, city_weights, actual_threshold=0.7)\n",
" if detect_entity(STATE_ENTITY, template):\n",
" state=get_sample_from_states(state_info, actual_threshold=0.5)\n",
" if detect_entity(CITY_STATE_ENTITY, template):\n",
" city_state=get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)\n",
"\n",
" if detect_entity(CELEBRITY_ENTITY, template):\n",
" celebrity=get_random_choice_from_list(celebrities)\n",
" \n",
" query = template.format(person=person,\n",
" organization=organization,\n",
" city=city,\n",
" state=state,\n",
" city_state=city_state,\n",
" product=product,\n",
" country=country,\n",
" service=service,\n",
" car=car,\n",
" gadget=gadget,\n",
" stock=stock,\n",
" money=money,\n",
" finance=finance,\n",
" travel=travel,\n",
" food=food,\n",
" restaurant=restaurant,\n",
" sports_term=sports_term,\n",
" location_and_landmark=location_and_landmark,\n",
" activity_and_event=activity_and_event,\n",
" food_m=food_m,\n",
" transport_and_direction=transport_and_direction,\n",
" fake_cty=fake_cty,\n",
" fake_state_cd=fake_state_cd,\n",
" fake_state_nam=fake_state_nam,\n",
" celebrity=celebrity\n",
" )\n",
" tokens, ner_labels = tokenize_and_label(query, city, state, city_state, organization, person, celebrity)\n",
" if query_counter.get(query, 0) == 0:\n",
" queries_with_labels.append((query, tokens, ner_labels))\n",
" query_counter.update([query])\n",
" cnt += 1\n",
" if (detect_entity(FAKE_CITY_ENTITY, template) or \n",
" detect_entity(FAKE_STATE_CODE_ENTITY, template) or \n",
" detect_entity(FAKE_STATE_NAME_ENTITY, template)):\n",
" fake_cnt += 1\n",
" if detect_entity(CELEBRITY_ENTITY, template):\n",
" celeb_cnt += 1\n",
" print(f\"fake_cnt = {fake_cnt}\")\n",
" print(f\"celeb_cnt = {celeb_cnt}\")\n",
" print(f\"cnt = {cnt}\")\n",
" return queries_with_labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fad4a249-7151-4c50-833d-9584819b4105",
"metadata": {},
"outputs": [],
"source": [
"queries_with_labels = generate_queries(templates, n_queries=450000) # 300000"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1270765-2a56-4dda-9e25-f0f6ac26f473",
"metadata": {},
"outputs": [],
"source": [
"len(queries_with_labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4145760e-8b8c-4896-a072-87ac5484dcf3",
"metadata": {},
"outputs": [],
"source": [
"# queries_with_labels[:10]\n",
"df_ner_examples = pd.DataFrame(queries_with_labels, columns=['query', 'tokens', 'ner_tags'])\n",
"df_ner_examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6b071e2-c7bf-4891-a662-5c14898da9f8",
"metadata": {},
"outputs": [],
"source": [
"df_ner_examples['ner_tags'].apply(lambda tags: len([tag for tag in tags if tag > 4])).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "022690de-9348-4d98-9007-b730571d6d6a",
"metadata": {},
"outputs": [],
"source": [
"label_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5d49f63-72c7-41da-a16c-536789713297",
"metadata": {},
"outputs": [],
"source": [
"df_ner_examples['query'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2de441c8-4e9c-4b59-9bcc-cfbe009718bf",
"metadata": {},
"outputs": [],
"source": [
"# df_ner_examples.to_csv(\"../data/df_ner_examples_v3.csv\", index=False)\n",
"# df_ner_examples.to_csv(\"../data/df_ner_examples_v4.csv\", index=False)\n",
"# df_ner_examples.to_csv(\"../data/df_ner_examples_v5.csv\", index=False) # Additional partial cities\n",
"df_ner_examples.to_csv(\"../data/df_ner_examples_v6.csv\", index=False) # Additional partial cities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df043822-779c-4f9c-89eb-b331e2b0de19",
"metadata": {},
"outputs": [],
"source": [
"# useful for post processing to standardize the city names\n",
"def build_lookup(dataframe):\n",
" # Initialize an empty dictionary for the lookup\n",
" lookup = {}\n",
" \n",
" # Iterate over each row in the DataFrame\n",
" for index, row in dataframe.iterrows():\n",
" city_name = row['city_name']\n",
" alternate_names = row['alternate_names']\n",
" \n",
" # Iterate over the list of alternate names and map them to the city_name\n",
" for alt_name in alternate_names:\n",
" lookup[alt_name.lower()] = city_name # Convert alternate names to lowercase for consistency\n",
" \n",
" return lookup\n",
"\n",
"city_alternate_to_city_lkp = build_lookup(city_states_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62a392e3-e18e-470f-9f95-ad35ebaebca8",
"metadata": {},
"outputs": [],
"source": [
"len(city_alternate_to_city_lkp)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca88070b-318e-4f7f-850a-e6ed176748a0",
"metadata": {},
"outputs": [],
"source": [
"# city_alternate_to_city_lkp"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85bdeff1-a3f2-443e-a31b-d80e836c6ebe",
"metadata": {},
"outputs": [],
"source": [
"# !python -m pip install onnxruntime"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "689e6844-2a90-4b7a-a9a5-bb298dce2b70",
"metadata": {},
"outputs": [],
"source": [
"# !python -m pip freeze| grep onnxruntime"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fc61067c-6e8a-499a-9d08-07fb4fb0eb2f",
"metadata": {},
"outputs": [],
"source": [
"# !mkdir ../models"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74bca5a8-0bb0-46c1-8429-598e172f34af",
"metadata": {},
"outputs": [],
"source": [
"import onnxruntime as ort\n",
"import numpy as np\n",
"from transformers import AutoTokenizer, BertTokenizer\n",
"\n",
"# Download the ONNX model\n",
"# model_url = \"https://huggingface.co/Xenova/bert-base-NER/resolve/main/onnx/model_quantized.onnx\"\n",
"# model_url = \"https://huggingface.co/Mozilla/distilbert-NER-LoRA/resolve/main/onnx/model_quantized.onnx\"\n",
"model_url = \"https://huggingface.co/Mozilla/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx\"\n",
"# model_url = \"https://huggingface.co/chidamnat2002/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx\"\n",
"# model_path = \"../models/distilbert-NER-LoRA.onnx\"\n",
"model_path = \"../models/distilbert-uncased-NER-LoRA.onnx\"\n",
"\n",
"# Download the ONNX model if not already present\n",
"response = requests.get(model_url)\n",
"with open(model_path, 'wb') as f:\n",
" f.write(response.content)\n",
"\n",
"# Load the ONNX model using ONNX Runtime\n",
"session = ort.InferenceSession(model_path)\n",
"\n",
"# Load the tokenizer (assuming it's based on BERT)\n",
"# tokenizer = BertTokenizer.from_pretrained(\"Mozilla/distilbert-NER-LoRA\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Mozilla/distilbert-uncased-NER-LoRA\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "838001d1-a252-4a4f-bfab-8c7698b7c79b",
"metadata": {},
"outputs": [],
"source": [
"def compute_model_inputs_and_outputs(session, tokenizer, query):\n",
" # Tokenize the input\n",
" # inputs = tokenizer(query, return_tensors=\"np\", truncation=True, padding=True)\n",
" inputs = tokenizer(query, return_tensors=\"np\", truncation=True, padding='max_length', max_length=64)\n",
" # is_split_into_words=True,\n",
" # truncation=True,\n",
" # padding='max_length',\n",
" # max_length=64\n",
" \n",
" # The ONNX model expects 'input_ids', 'attention_mask', and 'token_type_ids'\n",
" # Convert all necessary inputs to numpy arrays and prepare the input feed\n",
" input_feed = {\n",
" 'input_ids': inputs['input_ids'].astype(np.int64),\n",
" 'attention_mask': inputs['attention_mask'].astype(np.int64),\n",
" # 'token_type_ids': inputs['token_type_ids'].astype(np.int64) # Some models might not need this; check if it's really required\n",
" }\n",
" \n",
" # Run inference with the ONNX model\n",
" outputs = session.run(None, input_feed)\n",
" # print(outputs)\n",
" return inputs, outputs\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f66190d3-5601-4593-b7b9-0eebde13e23e",
"metadata": {},
"outputs": [],
"source": [
"label_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08ecb315-3896-4a7e-8c03-37e3ecb1fa9a",
"metadata": {},
"outputs": [],
"source": [
"## With Xenova/bert-base-NER\n",
"# Number of examples = 349\n",
"# #hits = 135; #hit rate = 0.3868194842406877\n",
"\n",
"## After finetuning the Mozilla/distilbert-NER-LoRA\n",
"#hits = 220; #hit rate = 0.6303724928366762\n",
"\n",
"## After finetuning the chidamnat2002/distilbert-uncased-NER-LoRA\n",
"#hits = 207; #hit rate = 0.5931232091690545\n",
"\n",
"## After finetuning the Mozilla/distilbert-uncased-NER-LoRA\n",
"#hits = 252; #hit rate = 0.7220630372492837"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1eed2554-784c-4f49-aad5-72b795f19295",
"metadata": {},
"outputs": [],
"source": [
"# len(missing_locations)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "feaed0b3-5fb8-4686-b57a-3a8d9764ec79",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# print(missing_locations)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d04d5258-16b4-4773-b585-b5f31db3926c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ef09b219-dd01-4d66-92e2-c438935e8654",
"metadata": {},
"source": [
"#### Looking into CONLL 2003 dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4233afed-374f-4f2f-baaa-078447959367",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, Dataset\n",
"import re\n",
"\n",
"# Load the CoNLL-2003 dataset\n",
"dataset = load_dataset(\"conll2003\")\n",
"\n",
"loc_examples = dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14216057-228f-467a-aa8e-02108d56cb92",
"metadata": {},
"outputs": [],
"source": [
"# dataset['train'].to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e259586a-f67b-42b2-9665-a571da352f57",
"metadata": {},
"outputs": [],
"source": [
"# dataset['train']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12e91919-6dc4-4ad3-a388-e5b90d4efa79",
"metadata": {},
"outputs": [],
"source": [
"synthetic_loc_dataset = Dataset.from_pandas(df_ner_examples.drop('query', axis=1))\n",
"print(synthetic_loc_dataset)\n",
"\n",
"print(synthetic_loc_dataset[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d91ba34-cb67-418a-8a4e-4b442b144be6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "496a76a7-3329-4849-affa-63166d427183",
"metadata": {},
"outputs": [],
"source": [
"# loc_dataset = dataset['train'].filter(lambda example: 5 in example['ner_tags'])\n",
"loc_dataset = dataset['train']\n",
"loc_dataset_filtered = loc_dataset.remove_columns(['pos_tags', 'chunk_tags'])\n",
"\n",
"# Set the format to ensure the order is 'id', 'tokens', and 'ner_tags'\n",
"loc_dataset_filtered[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42652aaf-399f-413f-a8f6-e082f1057e3f",
"metadata": {},
"outputs": [],
"source": [
"# loc_dataset_filtered[-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c47584e0-0612-400b-81e9-212a61209b94",
"metadata": {},
"outputs": [],
"source": [
"from datasets import concatenate_datasets\n",
"\n",
"from datasets import Sequence, ClassLabel, Value\n",
"\n",
"# Step 1: Get the full feature schema from synthetic_loc_dataset\n",
"features = synthetic_loc_dataset.features\n",
"\n",
"# Step 2: Update the 'ner_tags' feature to use ClassLabel from loc_dataset_filtered\n",
"# features['ner_tags'] = Sequence(feature=ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names))\n",
"features['ner_tags'] = Sequence(feature=ClassLabel(names=list(label_map.values())))\n",
"\n",
"# Step 3: Cast synthetic_loc_dataset to the updated feature schema\n",
"synthetic_loc_dataset = synthetic_loc_dataset.cast(features)\n",
"\n",
"# Check the updated features to confirm\n",
"print(synthetic_loc_dataset.features)\n",
"\n",
"# Now concatenate the datasets\n",
"# combined_dataset = concatenate_datasets([loc_dataset_filtered, synthetic_loc_dataset])\n",
"\n",
"# Verify the combined dataset\n",
"print(synthetic_loc_dataset[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15f8ec72-8a43-43f2-932a-ef76b5efb4d2",
"metadata": {},
"outputs": [],
"source": [
"# ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e3b90ed-9bbf-4b8a-9990-b5db059de0ea",
"metadata": {},
"outputs": [],
"source": [
"# ClassLabel(names=list(label_map.values()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6138a427-f03b-4355-bdac-ffec783f5a2b",
"metadata": {},
"outputs": [],
"source": [
"len(synthetic_loc_dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "caac8e36-6d1c-4a42-8acd-7e81f816fa9b",
"metadata": {},
"outputs": [],
"source": [
"synthetic_loc_dataset[3]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2aa98e69-bf5f-4bcc-b387-2abdc60a99be",
"metadata": {},
"outputs": [],
"source": [
"synthetic_loc_dataset = synthetic_loc_dataset.map(\n",
" lambda example, idx: {'id': idx}, # Assign running count as the new 'id'\n",
" with_indices=True # Ensures we get an index for each example\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5906e294-6a1b-436d-a229-628f99190887",
"metadata": {},
"outputs": [],
"source": [
"synthetic_loc_dataset.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46c0d423-3b8c-47ed-a8ae-a3316cd78bd0",
"metadata": {},
"outputs": [],
"source": [
"synthetic_loc_dataset[-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c35b1a0b-303c-4eee-bc31-770872c212e5",
"metadata": {},
"outputs": [],
"source": [
"# synthetic_loc_dataset.to_parquet(\"../data/synthetic_loc_dataset_v3.parquet\")\n",
"# synthetic_loc_dataset.to_parquet(\"../data/synthetic_loc_dataset_v4.parquet\")\n",
"synthetic_loc_dataset.to_parquet(\"../data/synthetic_loc_dataset_v6.parquet\") # some partial cities examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d33bb9a1-bd49-49cd-aa90-5428d46fbad7",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
"from transformers import pipeline\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Mozilla/distilbert-uncased-NER-LoRA\")\n",
"model = AutoModelForTokenClassification.from_pretrained(\"Mozilla/distilbert-uncased-NER-LoRA\")\n",
"\n",
"nlp = pipeline(\"ner\", model=model, tokenizer=tokenizer)\n",
"example = \"New York\"\n",
"\n",
"ner_results = nlp(example)\n",
"print(ner_results)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32524933-23f7-41ae-8597-da0300e6ac60",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}