notebooks/city_state_exploration_and_dataprep_v2.ipynb (2,238 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "73d1863d-1d54-4cdd-843c-c033b28f15f6",
   "metadata": {},
   "source": [
    "Explore whether the weather keywords and locations are captured correctly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd4805cc-8d46-40fa-8d39-35158d9212d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "from datasets import load_dataset, Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b64db933-17ab-47cc-b0ba-ae37e89e450a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bcf91d7-8344-4b5e-9641-461b2630cb0f",
   "metadata": {},
   "source": [
    "#### Read the data/geonames-cities-states.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "738661a5-668f-4b2c-8823-dc3c0c92be94",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "\n",
    "def get_geonames_city_state_data(geonames_file='../data/geonames-cities-states.json'):\n",
    "    \"\"\"Load the GeoNames export and return one row per city joined to its state.\n",
    "\n",
    "    Args:\n",
    "        geonames_file: Path to a JSON file holding a 'cities' collection of\n",
    "            records and a 'states_by_abbr' mapping of state records.\n",
    "            Defaults to the data file shipped next to the notebooks.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns id, state_code, city_name, city_popln,\n",
    "        alternate_names, state_name and city_weight, where city_weight is the\n",
    "        city's share of the summed city_popln column.\n",
    "    \"\"\"\n",
    "    # Pin the encoding so loading does not depend on the platform default.\n",
    "    with open(geonames_file, 'r', encoding='utf-8') as f:\n",
    "        geonames_dict = json.load(f)\n",
    "\n",
    "    cities_data = (\n",
    "        pd.DataFrame(geonames_dict['cities'])\n",
    "        .rename(columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'})\n",
    "    )\n",
    "    cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]\n",
    "\n",
    "    # Materialise the dict view so the constructor always receives a plain list of records.\n",
    "    states_data = (\n",
    "        pd.DataFrame(list(geonames_dict['states_by_abbr'].values()))\n",
    "        .rename(columns={'admin1_code': 'state_code', 'name': 'state_name'})\n",
    "    )\n",
    "    states_data = states_data[['state_code', 'state_name']]\n",
    "\n",
    "    # Left join: a city is kept even when its state_code has no matching state row.\n",
    "    city_states_data = cities_data.merge(states_data, how='left', on='state_code')\n",
    "    city_states_data['city_weight'] = city_states_data['city_popln'] / city_states_data['city_popln'].sum()\n",
    "    return city_states_data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id":
"a3aeb4bd-2e84-4121-84b7-8ffb1118ca37",
   "metadata": {},
   "outputs": [],
   "source": [
    "city_states_data = get_geonames_city_state_data()\n",
    "print(len(city_states_data))\n",
    "city_states_data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45711e5b-1f06-4cac-aea8-97deaea292a5",
   "metadata": {},
   "source": [
    "<!-- #### Add some partial city names for capturing the consumer needs \n",
    "if they type partial city names such as `coffee near me sunnyval` -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f31c7125-c7ea-426c-b85c-1f71d1507fdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# city_states_data['city_name'].apply(len).describe(percentiles=[.1, .2, .25, .3, .4, .5, .6 ,.7, .75, .8, .9, .95, .98, .99])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a53d52c-5d9d-4963-90a6-31cb269bf71d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c84a18b-1f21-4006-a634-9da9ff725070",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d35076ae-1d45-4699-8257-e98612500e43",
   "metadata": {},
   "outputs": [],
   "source": [
    "city_states_data.sort_values('city_weight', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad8ca7e6-7511-42f2-92df-a02526637f23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lookup from city name to its population share.\n",
    "# NOTE(review): the dict is keyed by city_name only, so if a name occurs in more\n",
    "# than one row the later row overwrites the earlier one - confirm this is intended.\n",
    "city_weights = city_states_data.set_index('city_name')['city_weight'].to_dict()\n",
    "# city_weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0eca185b-0ff3-4cad-a878-92f1d065081c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name-keyed lookups (same overwrite-on-repeat behaviour as city_weights).\n",
    "city_info = city_states_data.set_index('city_name')['alternate_names'].to_dict()\n",
    "state_info = city_states_data.set_index('state_code')['state_name'].to_dict()\n",
    "\n",
    "# Independent copies so later edits do not write back into city_states_data.\n",
    "city_state_code_info = city_states_data[['city_name', 'state_code', 'city_weight']].copy()\n",
    "city_state_name_info = city_states_data[['city_name', 'state_name', 'city_weight']].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "986b55c1-c92f-4722-91b8-29e48bbe2813",
   "metadata": {},
   "outputs": [],
   "source": [
    "# city_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cc31eb8-b5eb-4daa-a466-f873db8e3038",
   "metadata": {},
   "outputs": [],
   "source": [
    "city_state_code_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa88c33-da07-4261-b890-2aa111988d3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "city_state_name_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "116d168e-dd72-407c-8cad-865d11143307",
   "metadata": {},
   "outputs": [],
   "source": [
    "# list(city_info.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3692858c-faa7-4219-8a4b-d25c82144e34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invented city names.\n",
    "fake_cities = [\n",
    "    'Umber Glade', 'Crimson Hollow', 'Midland Creek', 'Boulderfield', 'Fairbrook', 'Mossmere', 'Hearthfield', 'Norwyn',\n",
    "    'Elysian Ridge', 'Dover Hollow', 'Mistral Cove', 'Starfall', 'Eaglebrooke', 'Granite Ridge', 'Umbrafield', 'Goldenstone',\n",
    "    'Palisade Brook', 'Willowfield', 'Noblehaven', 'Frostgrove', 'Oasis Ridge', 'Larkspur Vale', 'Elderstone', 'Forest Vale',\n",
    "    'Yonder Bluff', 'Cloverstone', 'Kingsvale', 'Ashen Bluff', 'Yarrow Bluff', 'Zion Hollow', 'Velvet Pine', 'Fernspire',\n",
    "    'Inkwell', 'Eaglewood', 'Driftshade', 'Prairiefield', 'Northshade', 'Riverwatch', 'Sapphire Hollow', 'Jadehaven',\n",
    "    'Lunaris', 'Quailstone', 'Birchvale', 'Blossom Creek', 'Union Hollow', 'Whispering Brook', 'Yarrowstone', 'Candlevale',\n",
    "    'Ravenshire', 'Willowhaven', 'Wyrmrest', 'Frostshade', 'Silverbrook', 'Azure Hollow', 'Tanglefield', 'Umberstone',\n",
    "    'Glimmerbrook', 'Ravencrest', 'Larkridge', 'Windspire', 'Oakheart', 'Obsidian Point', 'Newstone', 'Moonlit Vale',\n",
    "    'Tranquil Ridge', 'Gilded Summit', 'Lunarshade', 'Seabrook', 'Quartzwood', 'Juniper Crest', 'Norvale', 'Hollowmere',\n",
    "    'Kindlewood', 'Dawnspire', 'Obelisk Point', 'Kindred Hollow', 'Autumn Hollow', 'Orchard Ridge', 'Underbrook', 'Kingshaven',\n",
    "    'Ebonwood', 'Violet Haven', 'Peregrine Spire', 'Summitwood', 'Lakeshore Valley', 'Umbra Shores', 'Trillium Vale',\n",
    "    'Halcyon Creek', 'Xander Cove', 'Glenstone', 'Nimbus Grove', 'Willowfern', 'Vista Hollow', 'Jasperwood', 'Jasmine Vale',\n",
    "    'Rustvale', 'Quillbrook', 'Ravenmere', 'Zerith Hollow', 'Golden Ridge', 'Thistlewood', 'Quiet Hollow',\n",
    "    'Ridgevale', 'Bluewater Ridge', 'Unity Crest', 'Cedar Hollow', 'Bluffstone', 'Larchfield', 'Quarry Hollow',\n",
    "    'Laurel Ridge', 'Yellowfield', 'Amberfield', 'Quartz Creek', 'Zephyr Vale', 'Larkfield', 'Verdant Hollow',\n",
    "    'Cinder Hollow', 'Havencliff', 'Harborwood', 'Onyx Ridge', \"Kite's Hollow\", 'Brookfield', 'Brightveil', 'Redhawk',\n",
    "    'Valleywood', 'Havenwood', 'Thornhill', 'Silverwood', 'Duskfield', 'Tidesreach', 'Cypress Vale', 'Fernwood',\n",
    "    'Moonwillow', 'Verdant Shade', 'Willowthorn', 'Garnet Crossing', 'Ivy Hollow', 'Kestrel Cove', 'Amberpeak', 'Meadowcrest',\n",
    "    'Yellowvine', 'Violet Sands', 'Ironwood', 'Timber Shade', 'Dovewood Creek', 'Pinecairn', 'Driftvale', 'Crescent Vale',\n",
    "    'Juniper Grove', 'Ridgehaven', 'Timbervale', 'Hollowstone', 'Dawnbreak', 'Oceangrove', 'Pinegrove', 'Alderstone',\n",
    "    'Primrose Point', 'Jasper Vale', 'Pinevale', 'Quartzfield', 'Crescent Bluff', 'Jasperstone', 'Umbra Vale',\n",
    "    'Violet Ridge', 'Knollfield', 'Ironshade', 'Zephyr Crossing', 'Zenith Valley', 'Ashmoor', 'Xyron Bay', 'Everstone',\n",
    "    'Moonstone Creek', 'Foxshade', 'Ashfield', 'Xyros Hill', 'Sapphire Ridge', 'Elmfield', 'Ivoryfield', 'Hollowvale', 'Frostbluff',\n",
    "    'Xenia Ridge', 'Briarcliff', 'Kestrel Bluff', 'Nightingale Ridge', 'Peridot Bay', 'Islefield', 'Ivory Spire', 'Solace Grove',\n",
    "    'Xanadu Grove', 'Ecliptus', 'Zephyr Hollow', 'Oakenhill', 'Glade Ridge', 'Winterridge', 'Jadestone', 'Indigo Bay', 'Duskhaven',\n",
    "    \"Shadowpine\", \"Crystal Vale\", \"Harbor Reach\", \"Eldermoor\",\n",
    "    \"Thornhollow\", \"Silverpeak\", \"Mistwood\", \"Shadowfall\",\n",
    "    \"Willowbright\", \"Dusklight\", \"Havenvale\", \"Starcrest\",\n",
    "    \"Glacier Hollow\", \"Cinderbluff\", \"Ironpeak\", \"Frostwood\",\n",
    "    \"Embergrove\", \"Aurora Ridge\", \"Driftmoor\", \"Mooncrest\",\n",
    "    \"Stonehearth\", \"Riverwood\", \"Briarfrost\", \"Quillhaven\",\n",
    "    \"Stormvale\", \"Eaglesong\", \"Wanderwood\", \"Summervale\",\n",
    "    \"Brightwood\", \"Cloudspire\", \"Snowhaven\", \"Golden Hollow\",\n",
    "    \"Northcove\", \"Miststone\", \"Clearbrook\", \"Suncrest\",\n",
    "    \"Twilight Vale\", \"Aspen Hollow\", \"Boulderhaven\", \"Shimmerwood\",\n",
    "    \"Darkspire\", \"Oakbluff\", \"Hollowbright\", \"Sablewood\",\n",
    "    \"Lunarfrost\", \"Dovewood Point\", \"Crescent Glade\", \"Wraithstone\",\n",
    "    \"Foxwood Hollow\", \"Amberwood\", \"Midnight Ridge\", \"Garnet Hollow\",\n",
    "    \"big city\", \"Big City\", \"Silver City\", \"Golden City\", \"Mystic City\",\n",
    "    \"Sunset City\", \"Iron City\", \"Emerald City\", \"Shadow City\", \"Crystal City\",\n",
    "    \"Harmony City\", \"Aurora City\", \"Dream City\", \"Thorn City\", \"Lunar City\", \"Twilight City\",\n",
    "    \"Velvet City\", \"Willow City\", \"Ivory City\", \"Eclipse City\",\n",
    "    \"Storm City\", \"Bliss City\", \"Shimmer City\", \"Echo City\", \"Frost City\",\n",
    "    \"Sapphire City\", \"Obsidian City\", \"Tranquil City\", \"Starlight City\",\n",
    "    \"Drift City\", \"Amber City\", \"Hollow City\", \"Gilded City\", \"Quartz City\",\n",
    "    \"Meadow City\", \"Rosewood City\", \"Timber City\", \"Bright City\", \"Fox City\",\n",
    "    \"Dusk City\", \"Goldenleaf City\", \"Wind City\", \"Harbor City\", \"Cedar City\",\n",
    "    \"Azure City\", \"Elder City\", \"Crescent City\", \"Pine City\", \"Summit City\",\n",
    "    \"Cobalt City\", \"Bluff City\", \"Stone City\",\n",
    "]\n",
    "\n",
    "# Invented state names.\n",
    "fake_state_names = [\n",
    "    'Meadowvale', 'Boulderwatch', 'Harperfield', 'Verdantia', 'Redhaven', 'Ashspire', 'Ecliptica', 'Cindermist',\n",
    "    'Stormhaven', 'Crystalbourne', 'Sunspire', 'Twilight Hollow', 'Frostspire', 'Silverwatch', 'Keystone Ridge',\n",
    "    'Gilded Vale', 'Bluewater', 'Jadewood', 'Northgate', 'Timberland', 'Ravenmark', 'Auroravale', 'Zephyr Bay',\n",
    "    'Stormspire', 'Stonemeadow', 'Quintarra', 'Stonepeak', 'Willowcrown', 'Thistledown', 'Verdantreach', 'Lunaris',\n",
    "    'Oakenshire', 'Brightwatch', 'Dawnhaven', 'Northreach', 'Verdant Hollow', 'Horizon Ridge', 'Xantria', 'Ironvale',\n",
    "    'Amberreach', 'Silverveil', 'Moonwatch', 'Umbershade', 'Windswept', 'Shadowpine', 'Shadowreach', 'Zionshade',\n",
    "    'Oasisland', 'Goldmere', 'Frosthaven', 'Drakemont', 'Emberland', 'Rivermist', 'Duskland', 'Firgrove', 'Driftstone',\n",
    "    'Frostveil', 'Amberwyn', 'Velvet Ridge', 'Mystic Vale', 'Snowpoint', 'Bluehaven', 'Opal Grove', 'Jasper Hollow',\n",
    "    'Tideridge', 'Crimson Bay', 'Aurorawood', 'Larkland', 'Thornvale', 'Shadewind', 'Ridgefall', 'Darkfall', 'Silvercrown',\n",
    "    'Goldenreach', 'Ivory Plains', 'Nobleshore', 'Yellowcove', 'Hollowbrook', 'Ravendale', 'Frostwood', 'Brightshade', 'Brightmere',\n",
    "    'Wytherstone', 'Eaglecrest', 'Frostmere', 'Moonbrooke', 'Goldenvale', 'Quillsprings', 'Pinemark', 'Prairiefield',\n",
    "    'Cascade', 'Kindlemark', 'Aspenvale', 'Ivoryreach', 'Thorncrest', 'Cloudwood', 'Jade Ridge', 'Westmarch',\n",
    "    'Wintercrest', 'Copperfield', 'Prairiefrost', 'Bladewind', 'Everwind', 'Quarrycrest', 'Lunashire', 'Hollowreach',\n",
    "    'Whispering Pines', 'Blueshore', 'Glacier Point', 'Gildan', 'Zephyrlight', 'Sablepeak', 'Northspire', 'Starhearth',\n",
    "    'Whispercrown', 'Valewind', 'Umbravale', 'Kindleland', 'Westwatch',\n",
    "]\n",
    "\n",
    "# Invented two-letter state codes. 'QT' used to be listed twice; the repeat was\n",
    "# dropped so the len(...) == len(set(...)) check on this list holds.\n",
    "fake_state_codes = [\n",
    "    'QT', 'WX', 'CZ', 'GW', 'FR', 'VW', 'BN', 'BM', 'LS', 'ZR', 'QN', 'KP', 'WS', 'ZZ', 'YW', 'XK', 'LR', 'NX',\n",
    "    'SW', 'XT', 'QB', 'ZT', 'SR', 'CW', 'JT', 'RP', 'HW', 'JV', 'FV', 'XW', 'PD', 'WR', 'QQ', 'UV', 'LK', 'LD',\n",
    "    'LM', 'HT', 'VR', 'XY', 'RG', 'UR', 'NT', 'PT', 'YT', 'MQ', 'DR', 'SP', 'FG', 'YS', 'ZS', 'PW', 'FN', 'XF',\n",
    "    'LV', 'RX', 'TG', 'CQ', 'LW', 'MX', 'BL', 'TF', 'GH', 'DX', 'KV', 'RW', 'XL', 'FW', 'JR', 'PL', 'FB',\n",
    "    'ZN', 'KR', 'QZ', 'DF', 'HD',\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d4509f1-6cb0-4478-8f89-90328e8e4f5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(fake_state_names), len(set(fake_state_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d45a21d0-b71f-4464-80e1-e365b8bf20dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(fake_state_codes), len(set(fake_state_codes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e83904d-9a24-412a-b3fa-bd2c66d20f8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(fake_cities), len(set(fake_cities))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95deb546-c1cb-44ac-ba5a-52cfff3402b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(set(fake_state_codes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12c31739-2930-438e-9515-47a8bf4ca8ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# valid_state_codes = set(city_state_code_info['state_code'].values.tolist())\n",
    "# len(valid_state_codes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65bdb448-77b9-480e-955b-17d6f4193607",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print([state_code for state_code in fake_state_codes if state_code not in valid_state_codes])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e49ae90-5560-42b7-8483-5ddeae911b7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # fake_state_names\n",
    "# valid_state_names = set(city_state_name_info['state_name'].values.tolist())\n",
    "\n",
    "# print([state_name for state_name in fake_state_names if state_name not in valid_state_names])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bda51cd-13fd-4ba8-9dbd-ed19d3f11250",
   "metadata": {},
   "outputs": [],
   "source": [
    "# len(valid_state_names)"
   ]
  },
  {
"cell_type": "code", "execution_count": null, "id": "5ae739c3-6486-4a66-9978-9b3d3decb1fd", "metadata": {}, "outputs": [], "source": [ "# # fake_cities\n", "# # city_info\n", "# print([city_name for city_name in fake_cities if city_name not in city_info])" ] }, { "cell_type": "code", "execution_count": null, "id": "ddd77b4d-13c8-4017-acd6-8714d98f1579", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ac238a0e-7526-40d3-8606-42d65fda3bd9", "metadata": {}, "outputs": [], "source": [ "label_map = {\n", " 0: \"O\", # Outside any named entity\n", " 1: \"B-PER\", # Beginning of a person entity\n", " 2: \"I-PER\", # Inside a person entity\n", " 3: \"B-ORG\", # Beginning of an organization entity\n", " 4: \"I-ORG\", # Inside an organization entity\n", " 5: \"B-CITY\", # Beginning of a city entity\n", " 6: \"I-CITY\", # Inside a city entity\n", " 7: \"B-STATE\", # Beginning of a state entity\n", " 8: \"I-STATE\", # Inside a state entity\n", " 9: \"B-CITYSTATE\", # Beginning of a city_state entity\n", " 10: \"I-CITYSTATE\", # Inside a city_state entity\n", " }\n", "\n", "\n", "persons = [\n", " 'Donald Trump', 'John Smith', 'Roger Williams', 'Michelle Obama', 'Elon Musk',\n", " 'Barack Obama', 'Bill Gates', 'Steve Jobs', 'Warren Buffett', 'Oprah Winfrey',\n", " 'Jeff Bezos', 'Taylor Swift', 'Jennifer Lawrence', 'Brad Pitt', 'Leonardo DiCaprio',\n", " 'Katy Perry', 'Tom Hanks', 'Emma Watson', 'Johnny Depp', 'Scarlett Johansson',\n", " 'Mark Zuckerberg', 'Sheryl Sandberg', 'Ivanka Trump', 'Joe Biden', 'Kamala Harris',\n", " 'Serena Williams', 'Michael Jordan', 'LeBron James', 'Tiger Woods', 'Cristiano Ronaldo',\n", " 'Lionel Messi', 'Roger Federer', 'Usain Bolt', 'Simone Biles', 'Tom Brady',\n", " 'Peyton Manning', 'David Beckham', 'Rafael Nadal', 'Novak Djokovic', 'Andy Murray',\n", " 'George Clooney', 'Matt Damon', 'Julia Roberts', 'Angelina Jolie', 'Morgan Freeman',\n", " 'Chris Hemsworth', 'Dwayne Johnson', 'Vin 
Diesel', 'Keanu Reeves', 'Robert Downey Jr.',\n", " 'Chris Evans', 'Will Smith', 'Johnny Cash', 'Bob Dylan', 'Paul McCartney',\n", " 'Ringo Starr', 'John Lennon', 'George Harrison', 'Madonna', 'Prince',\n", " 'Bruce Springsteen', 'Elton John', 'David Bowie', 'Whitney Houston', 'Celine Dion',\n", " 'Marilyn Monroe', 'Audrey Hepburn', 'Albert Einstein', 'Isaac Newton', 'Marie Curie',\n", " 'Galileo Galilei', 'Nikola Tesla', 'Stephen Hawking', 'Richard Feynman', 'Carl Sagan',\n", " 'Neil Armstrong', 'Yuri Gagarin', 'Sally Ride', 'Jane Goodall', 'Charles Darwin',\n", " 'Mahatma Gandhi', 'Nelson Mandela', 'Martin Luther King Jr.', 'Malala Yousafzai', 'Angela Merkel',\n", " 'Theresa May', 'Vladimir Putin', 'Xi Jinping', 'Justin Trudeau', 'Jacinda Ardern',\n", " 'Pope Francis', 'Dalai Lama', 'Queen Elizabeth II', 'Prince William', 'Prince Harry',\n", " 'James Anderson', 'Michael Brown', 'David Clark', 'John Doe', 'Robert Evans',\n", " 'Christopher Foster', 'William Garcia', 'Charles Hall', 'Joseph Harris', 'Daniel Jackson',\n", " 'Matthew Johnson', 'George King', 'Anthony Lewis', 'Mark Miller', 'Paul Moore',\n", " 'Steven Nelson', 'Kevin Perry', 'Thomas Reed', 'Brian Roberts', 'Jason Scott',\n", " 'Andrew Smith', 'Joshua Thompson', 'Ryan Turner', 'Brandon Walker', 'Nicholas White',\n", " 'Jonathan Young', 'Adam Baker', 'Justin Carter', 'Benjamin Collins', 'Aaron Cook',\n", " 'Alexander Davis', 'Tyler Edwards', 'Zachary Fisher', 'Ethan Graham', 'Jacob Green',\n", " 'Austin Hernandez', 'Mason Hill', 'Logan Hughes', 'Owen Jenkins', 'Lucas Kelly',\n", " 'Nathan Lee', 'Caleb Long', 'Henry Martinez', 'Dylan Mitchell', 'Gabriel Morris',\n", " 'Jack Murphy', 'Connor Myers', 'Liam Parker', 'Isaac Patterson', 'Evan Phillips',\n", " 'Hunter Price', 'Noah Richardson', 'Samuel Rivera', 'Gavin Rogers', 'Aiden Ross',\n", " 'Christian Russell', 'Ian Sanders', 'Eli Simmons', 'Chase Stewart', 'Cameron Sullivan',\n", " 'Bryan Taylor', 'Cole Thomas', 'Jake Thompson', 'Luke Torres', 'Blake 
Turner',\n", " 'Jesse Ward', 'Joel Watson', 'Derek Williams', 'Mitchell Wright', 'Dustin Young',\n", " 'Megan Allen', 'Jennifer Bailey', 'Jessica Bennett', 'Emily Brooks', 'Sarah Campbell',\n", " 'Amanda Carter', 'Rebecca Collins', 'Samantha Cooper', 'Stephanie Diaz', 'Rachel Evans',\n", " 'Christine Flores', 'Laura Foster', 'Michelle Garcia', 'Amber Gonzales', 'Lisa Gray',\n", " 'Kimberly Green', 'Heather Harris', 'Tiffany Henderson', 'Natalie Hernandez', 'Crystal Hill',\n", " 'Victoria Hughes', 'Erica Jenkins', 'Nicole Johnson', 'Katherine Kelly', 'Danielle Lee',\n", " 'Hannah Lewis', 'Melissa Lopez', 'Patricia Martin', 'Brittany Moore', 'Brenda Morgan',\n", "\n", " ]\n", "organizations = [\n", " 'Google Inc.', 'Apple Inc.', 'Amazon.com', 'Facebook Inc.', 'Microsoft Corporation',\n", " 'Tesla Motors', 'Netflix Inc.', 'The New York Times', 'The Washington Post', 'Wall Street Journal',\n", " 'Intel Corporation', 'Oracle Corporation', 'IBM', 'Coca-Cola Company', 'PepsiCo',\n", " 'Starbucks', 'Walmart Inc.', 'Target Corporation', 'ExxonMobil', 'Shell Oil Company',\n", " 'Ford Motor Company', 'General Motors', 'Toyota Motor Corporation', 'Volkswagen Group', 'BMW Group',\n", " 'American Airlines', 'Delta Airlines', 'United Airlines', 'Boeing Company', 'Lockheed Martin',\n", " 'SpaceX', 'NASA', 'Harvard University', 'Stanford University', 'Massachusetts Institute of Technology',\n", " 'University of California, Berkeley', 'University of Oxford', 'University of Cambridge', 'Princeton University', 'Yale University',\n", " 'University of Chicago', 'Columbia University', 'Johns Hopkins University', 'University of Southern California', 'University of Michigan',\n", " 'Goldman Sachs', 'JPMorgan Chase', 'Citibank', 'Morgan Stanley', 'Bank of America',\n", " 'Deloitte', 'Ernst & Young', 'PricewaterhouseCoopers', 'KPMG', 'McKinsey & Company',\n", " 'Boston Consulting Group', 'Accenture', 'BlackRock', 'Fidelity Investments', 'Vanguard Group',\n", " 'Nike Inc.', 'Adidas', 'Under 
Armour', 'Patagonia', 'The Walt Disney Company',\n", " 'Time Warner', 'NBCUniversal', 'Sony Corporation', 'Warner Bros.', 'Paramount Pictures',\n", " 'Universal Music Group', 'Sony Music Entertainment', 'Warner Music Group', 'Pfizer Inc.', 'Johnson & Johnson',\n", " 'Novartis', 'Merck & Co.', 'GlaxoSmithKline', 'AstraZeneca', 'Moderna',\n", " 'New York City Hospital', 'Los Angeles County Library', 'San Francisco Community College',\n", " 'Miami International University', 'Chicago Regional Bank', 'Dallas Medical Center',\n", " 'Boston Tech Solutions', 'Atlanta City Bank', 'Seattle Software Hub', 'Phoenix Energy Solutions',\n", " 'Denver Financial Group', 'Houston General Hospital', 'Portland Health Services', 'Las Vegas Convention Center',\n", " 'San Diego Software Innovations', 'Philadelphia Law Firm', 'Orlando Realty Group',\n", " 'Austin Engineering Solutions', 'Cleveland City Schools', 'Detroit Manufacturing Hub',\n", " 'Baltimore Technology Inc.', 'Minneapolis Insurance Group', 'St. 
Louis Transportation Services',\n", " 'Tampa Healthcare Network', 'Pittsburgh Steelworks Corporation', 'Sacramento Business Ventures',\n", " 'Indianapolis Marketing Solutions', 'Columbus Financial Advisors', 'Fort Worth Electric Company',\n", " 'Charlotte Digital Marketing', 'Milwaukee Industrial Solutions', 'Memphis Logistics Services',\n", " 'Washington DC Development', 'Nashville Business Enterprises', 'Louisville Fitness Center',\n", " 'Kansas City Architectural Firm', 'Oklahoma City University', 'Virginia Beach Law Associates',\n", " 'Raleigh Research Institute', 'Salt Lake City Analytics', 'Richmond Financial Group',\n", " 'Newark Data Solutions', 'Anchorage Energy Solutions', 'Fresno Water Authority',\n", " 'Omaha Financial Services', 'Colorado Springs Health Institute', 'Mesa Auto Parts',\n", " 'Virginia Beach Shipping', 'Sacramento Community Center', 'Albuquerque Electronics Company',\n", " 'Tucson Data Science Center', 'Miami Lakes Software Solutions', 'Wichita Steel Corporation',\n", " 'Arlington Cybersecurity Group', 'Bakersfield Construction Services', 'Aurora Logistics Firm',\n", " 'Anaheim Technology Hub', 'Santa Ana Healthcare Services', 'Riverside Manufacturing Co.',\n", " 'St. 
Paul Medical Associates', 'Lexington University Hospital', 'Plano Technology Solutions',\n", " 'Lincoln Manufacturing Inc.', 'Greensboro Industrial Partners', 'Jersey City Financial Group',\n", " 'Chandler Electronics', 'Madison Biotechnology Solutions', 'Lubbock Medical Supplies',\n", " 'Scottsdale Real Estate Group', 'Reno Venture Capitalists', 'Henderson Engineering Consultants',\n", " 'Norfolk Health Services', 'Chesapeake Data Systems', 'Fremont Software Group',\n", " 'Irvine Legal Services', 'San Bernardino Logistics Group', 'Boise Energy Technologies',\n", " 'Spokane Steel Fabricators', 'Glendale Solar Power Corporation', 'Garland Medical Services',\n", " 'Hialeah Shipping and Logistics', 'Chesapeake Financial Advisors', 'Frisco Software Hub',\n", " 'McKinney Electronics Corporation', 'Gilbert Transportation Group', 'Baton Rouge Financial Services',\n", " 'Shreveport Data Analytics', 'Mobile Business Solutions', 'Huntsville Rocket Technologies',\n", " 'Knoxville Agricultural Partners', 'Dayton Software Innovations', 'Grand Rapids Healthcare Network',\n", " 'Fort Lauderdale Construction Group', 'Tempe Electric Vehicles', 'Winston-Salem Marketing Firm',\n", " 'Fayetteville Consulting Services', 'Springfield Realty Group', 'Yonkers Manufacturing Hub',\n", " 'Augusta Insurance Group', 'Salem Solar Energy Solutions', 'Pasadena Legal Consultants',\n", " 'Seattle Pacific University', 'San Diego Zoo', 'Portland Art Museum',\n", " 'Boston Medical Group', 'Chicago Tribune', 'Dallas Cowboys Football Club',\n", " 'Los Angeles Philharmonic Orchestra', 'New York University', 'Houston Community College',\n", " 'Phoenix Solar Power', 'Denver Public Library', 'Miami International Airport',\n", " 'Atlanta Symphony Orchestra', 'San Francisco Opera', 'Orlando City Soccer Club',\n", " 'Nashville Symphony', 'Baltimore Ravens Football Team', 'Cleveland Clinic',\n", " 'Pittsburgh Steelers Football Team', 'Detroit Institute of Arts',\n", " 'Tampa Bay Buccaneers Football Club', 'St. 
Louis Cardinals Baseball Team',\n", " 'Indianapolis Colts Football Team', 'Austin Film Society', 'Seattle Sounders Football Club',\n", " 'Minneapolis Institute of Art', 'Charlotte Hornets Basketball Club', 'Portland Trail Blazers Basketball Team',\n", " 'Las Vegas Convention and Visitors Authority', 'New Orleans Saints Football Club',\n", " 'San Antonio Spurs Basketball Club', 'Philadelphia Eagles Football Club',\n", " 'Kansas City Chiefs Football Team', 'Cincinnati Reds Baseball Club',\n", " 'Memphis Grizzlies Basketball Team', 'Washington Wizards Basketball Club',\n", " 'Milwaukee Bucks Basketball Club', 'Sacramento Kings Basketball Team',\n", " 'Salt Lake City Ballet', 'Boise State University', 'Albuquerque International Balloon Fiesta',\n", " 'Raleigh-Durham International Airport', 'Richmond Symphony', 'Fresno Pacific University',\n", " 'Spokane Transit Authority', 'Henderson Engineering', 'Mesa Public Schools',\n", " 'Scottsdale Museum of Contemporary Art', 'Chandler Regional Medical Center', 'Glendale Unified School District',\n", " 'Riverside Community Hospital', 'Aurora Public Schools', 'Anaheim Ducks Hockey Team',\n", " 'Santa Ana College', 'Stockton Unified School District', 'Irvine Company', 'San Bernardino Community College District',\n", " 'Modesto Junior College', 'Bakersfield Condors Hockey Team', 'Fresno State University',\n", " 'Chesapeake Energy Corporation', 'Omaha World-Herald', 'Tucson Medical Center',\n", " 'Virginia Beach Public Schools', 'Norfolk Naval Shipyard', 'Newark Beth Israel Medical Center',\n", " 'Fort Wayne Mad Ants Basketball Team', 'Fremont High School', 'Shreveport Regional Airport',\n", " 'Mobile Public Library', 'Huntsville Hospital', 'Knoxville Symphony Orchestra',\n", " 'Dayton International Airport', 'Grand Rapids Symphony', 'Winston-Salem Dash Baseball Team',\n", " 'Fayetteville Technical Community College', 'Springfield Cardinals Baseball Team',\n", " 'Augusta National Golf Club', 'Salem Health', 'Pasadena Playhouse', 
'Yonkers Public Schools',\n", " 'Boulder Community Health', 'Naperville North High School', 'Lansing Community College',\n", " 'Reno-Tahoe International Airport', 'Columbia University Medical Center', 'Albany Law School',\n", " 'Buffalo Sabres Hockey Team', 'Syracuse University', 'Toledo Museum of Art', 'Akron Public Schools',\n", " 'Daytona International Speedway', 'Des Moines Public Library', 'Rochester Philharmonic Orchestra',\n", " 'Flint Institute of Arts', 'Lincoln Memorial University', 'Baton Rouge Community College',\n", " 'Chattanooga Symphony and Opera', 'Greenville Technical College', 'Cedar Rapids Opera Theatre',\n", " 'Pensacola Naval Air Station'\n", " ]\n", "\n", "products = [\n", " 'iPhone', 'Samsung Galaxy', 'MacBook', 'PlayStation 5', 'Nike shoes', \n", " 'AirPods', 'Xbox Series X', 'Canon DSLR', 'GoPro', 'Adidas sneakers', \n", " 'Fitbit', 'Google Pixel', 'Kindle', 'Bose headphones', 'Sony TV', \n", " 'Dyson vacuum', 'KitchenAid mixer', 'Surface Pro', 'Roomba', 'Apple Watch'\n", "]\n", "\n", "countries = [\n", " 'USA', 'France', 'Japan', 'Germany', 'Canada', \n", " 'Australia', 'Mexico', 'China', 'Brazil', 'India', \n", " 'Italy', 'Spain', 'South Korea', 'Russia', 'Netherlands', \n", " 'United Kingdom', 'Sweden', 'Norway', 'Switzerland', 'Argentina'\n", "]\n", "\n", "services = [\n", " 'Netflix', 'Spotify', 'Uber', 'Amazon Prime', 'Google Drive', \n", " 'Zoom', 'Dropbox', 'Slack', 'LinkedIn', 'Disney+', \n", " 'YouTube Premium', 'Venmo', 'DoorDash', 'Postmates', 'Hulu', \n", " 'Skype', 'Grubhub', 'Twitch', 'Instacart', 'Lyft'\n", "]\n", "\n", "cars = [\n", " 'Tesla Model S', 'Ford Mustang', 'Chevrolet Camaro', 'Toyota Corolla', 'Honda Civic', \n", " 'BMW 3 Series', 'Audi A4', 'Mercedes-Benz C-Class', 'Jeep Wrangler', 'Ford F-150', \n", " 'Hyundai Elantra', 'Mazda CX-5', 'Chevrolet Tahoe', 'Nissan Altima', 'Kia Sorento', \n", " 'Volkswagen Golf', 'Subaru Outback', 'Tesla Model 3', 'Dodge Charger', 'Volvo XC90'\n", "]\n", "\n", "gadgets = [\n", " 
'smartwatch', 'Bluetooth headphones', 'fitness tracker', 'smart speaker', 'tablet', \n", " 'laptop', 'gaming mouse', 'wireless charger', 'VR headset', 'noise-canceling headphones', \n", " 'dashcam', 'e-reader', 'action camera', 'portable hard drive', 'gaming console', \n", " 'mechanical keyboard', '4K monitor', 'digital camera', 'portable power bank', 'USB-C hub'\n", "]\n", "\n", "stocks = [\n", " 'AAPL', 'GOOGL', 'AMZN', 'MSFT', 'TSLA', \n", " 'NFLX', 'FB', 'BABA', 'NVDA', 'JPM', \n", " 'V', 'PYPL', 'BRK.A', 'DIS', 'INTC', \n", " 'PFE', 'NKE', 'ORCL', 'VZ', 'BA'\n", "]\n", "\n", "moneys = [\n", " 'cryptocurrency', 'cash', 'PayPal', 'credit card', 'Bitcoin', \n", " 'Ethereum', 'bank transfer', 'wire transfer', 'Western Union', 'Venmo', \n", " 'debit card', 'Zelle', 'Apple Pay', 'Google Pay', 'Coinbase', \n", " 'Tether', 'Litecoin', 'Dogecoin', 'cash app', 'Ripple'\n", "]\n", "\n", "finances = [\n", " '401(k)', 'IRA', 'mutual funds', 'mortgage', 'student loan', \n", " 'savings account', 'retirement fund', 'bond', 'annuity', 'index fund', \n", " 'Roth IRA', 'tax-free savings account', 'pension', 'trust fund', 'hedge fund', \n", " 'credit score', 'auto loan', 'home equity loan', 'personal loan', 'debt consolidation'\n", "]\n", "\n", "travels = [\n", " 'flights', 'hotels', 'car rentals', 'vacation packages', 'cruise trips', \n", " 'road trips', 'train tickets', 'adventure tours', 'guided tours', 'backpacking trips',\n", " 'honeymoon destinations', 'beach resorts', 'luxury travel', 'budget travel', 'camping gear', \n", " 'family vacations', 'ski trips', 'all-inclusive resorts', 'last-minute deals', 'travel insurance'\n", "]\n", "\n", "foods = [\n", " 'pizza', 'sushi', 'burgers', 'pasta', 'salads', \n", " 'vegan food', 'barbecue', 'fried chicken', 'ramen', 'tacos', \n", " 'sandwiches', 'noodles', 'soups', 'cakes', 'ice cream', \n", " 'steak', 'seafood', 'breakfast food', 'brunch', 'desserts',\n", " 'hot dogs', 'waffles', 'pancakes', 'donuts', 'cookies',\n", " 'bagels', 
'burritos', 'pho', 'fried rice', 'dim sum',\n", " 'smoothies', 'milkshakes', 'cupcakes', 'cheesecake', 'crepes',\n", " 'nachos', 'guacamole', 'shawarma', 'gyros', 'kebabs',\n", " 'clam chowder', 'chili', 'mac and cheese', 'meatballs', 'lasagna',\n", " 'quesadillas', 'falafel', 'curry', 'pork ribs', 'buffalo wings',\n", " 'brownies', 'apple pie', 'frozen yogurt', 'churros', 'stuffed crust pizza',\n", " 'poutine', 'pad thai', 'dim sum', 'korean barbecue', 'bibimbap',\n", " 'tandoori chicken', 'naan', 'samosa', 'biryani', 'dumplings',\n", " 'bao buns', 'poke bowl', 'ceviche', 'tamales', 'empanadas',\n", " 'shabu shabu', 'jollof rice', 'laksa', 'banh mi', 'spring rolls',\n", " 'paella', 'gnocchi', 'risotto', 'french fries', 'croissants',\n", " 'hummus', 'tzatziki', 'miso soup', 'kimchi', 'baklava',\n", " 'souvlaki', 'galbi', 'arepas', 'roti', 'malai kofta',\n", " 'sichuan chicken', 'teriyaki', 'yakitori', 'fettuccine alfredo',\n", " 'gnocchi', 'ratatouille', 'tempura', 'onigiri', 'calamari',\n", " 'chimichurri steak', 'goulash', 'pierogi', 'fondue', 'strudel',\n", " 'schnitzel', 'tikka masala', 'paneer', 'plantains', 'croquettes',\n", " 'coffee', \n", "]\n", "\n", "restaurants = [\n", " 'Italian restaurants', 'Mexican restaurants', 'Japanese restaurants', 'Chinese restaurants', 'Indian restaurants', \n", " 'fast food chains', 'fine dining', 'vegan restaurants', 'steakhouses', 'seafood restaurants', \n", " 'barbecue joints', 'sushi bars', 'cafes', 'pizzerias', 'buffet restaurants', \n", " 'food trucks', 'family-friendly restaurants', 'gastropubs', 'brunch spots', 'diner',\n", "]\n", "\n", "## Additional partial terms\n", "sports_terms_missing = [\n", " \"footbal\", \"baske\", \"socce\", \"golf\", \"cricke\", \"rugby\", \"hocke\", \"tenni\", \n", " \"swimmin\", \"athleti\", \"fishi\", \"basebal\", \"volleybal\", \"badminto\", \"maratho\", \n", " \"skatin\", \"climbin\", \"racquetball\", \"bowlin\", \"darts\", \"gymnasti\", \"bikin\", \"bowling\",\n", "]\n", "\n", 
"locations_and_landmarks = [\n", " \"statue\", \"museum\", \"plaza\", \"zoo\", \"church\", \"theater\", \"stadium\", \"mountain\", \n", " \"park\", \"lake\", \"beach\", \"river\", \"palace\", \"cathedra\", \"mansion\", \"monument\", \n", " \"temple\", \"observato\", \"canyon\", \"garden\", \"conservato\", \"boardwal\", \"forest\", \n", " \"pier\", \"lighthouse\", \"arena\",\n", "]\n", "\n", "activities_and_events = [\n", " \"conc\", \"exhib\", \"meet\", \"parad\", \"festi\", \"tourn\", \"game\", \"sho\", \"even\", \n", " \"gala\", \"confere\", \"seminar\", \"webina\", \"worksho\", \"lectur\", \"symposiu\", \n", " \"screenin\", \"rall\", \"celebratio\", \"ceremon\", \"get-togethe\", \"perfor\", \n", " \"gatherin\", \"competitio\", \"maratho\", \"speec\", \"workout\", \"showcas\", \"bowling\"\n", "]\n", "\n", "food_missing = [\n", " \"sush\", \"pizz\", \"ramen\", \"bbq\", \"vega\", \"steak\", \"taco\", \"burg\", \"pasta\", \n", " \"brunc\", \"desse\", \"drink\", \"grill\", \"bake\", \"buffet\", \"sandwich\", \"noodle\", \n", " \"cafe\", \"taver\", \"gastro\", \"bistro\", \"del\", \"saloo\", \"barbecue\", \"snack\", \n", " \"confectio\", \"pub\",\n", "]\n", "\n", "transport_and_directions = [\n", " \"direc\", \"map\", \"bus\", \"train\", \"car\", \"park\", \"taxi\", \"subwa\", \"fly\", \n", " \"plane\", \"ticke\", \"pass\", \"ferr\", \"bicycl\", \"scoote\", \"shuttl\", \"walkin\", \n", " \"rideshar\", \"transi\", \"toll\", \"metr\", \"road\", \"route\", \"stop\"\n", "]\n", "\n", "celebrities = [\n", " \"Leonardo DiCaprio\", \"Tom Cruise\", \"Dwayne Johnson\", \"Zendaya\", \n", " \"Timothée Chalamet\", \"Florence Pugh\", \"Margot Robbie\", \"Chris Hemsworth\", \n", " \"Robert Downey Jr.\", \"Scarlett Johansson\", \"Tom Holland\", \"Ryan Reynolds\", \n", " \"Gal Gadot\", \"Pedro Pascal\", \"Elizabeth Olsen\", \"Jenna Ortega\", \n", " \"Millie Bobby Brown\", \"Finn Wolfhard\", \"Anya Taylor-Joy\", \"Jason Momoa\", \n", " \"Chris Evans\", \"Natalie Portman\", \"Henry 
Cavill\", \"Daniel Radcliffe\", \n", " \"Emma Watson\", \"Rupert Grint\", \"Michael B. Jordan\", \"Anne Hathaway\", \n", " \"Brad Pitt\", \"Angelina Jolie\", \"Keanu Reeves\", \"Sandra Bullock\", \n", " \"Jake Gyllenhaal\", \"Christian Bale\", \"Cate Blanchett\", \"Hugh Jackman\", \n", " \"Jennifer Lawrence\", \"Will Smith\", \"Jada Pinkett Smith\", \"Viola Davis\", \n", " \"Austin Butler\", \"Jamie Lee Curtis\", \"Paul Mescal\", \"Tobey Maguire\", \n", " \"Andrew Garfield\", \"Harrison Ford\", \"Helen Mirren\", \"Brendan Fraser\", \n", "\n", " # Classic Hollywood Legends\n", " \"Marlon Brando\", \"James Dean\", \"Audrey Hepburn\", \"Marilyn Monroe\", \n", " \"Humphrey Bogart\", \"Clark Gable\", \"Bette Davis\", \"Elizabeth Taylor\",\n", " \"Fred Astaire\", \"Ginger Rogers\", \"Ingrid Bergman\", \"Greta Garbo\", \n", " \"Katharine Hepburn\", \"Cary Grant\", \"Spencer Tracy\", \"Rita Hayworth\",\n", " \"Grace Kelly\", \"Vivien Leigh\", \"Judy Garland\", \"Henry Fonda\",\n", " \"Lauren Bacall\", \"Paul Newman\", \"Charlton Heston\", \"Joan Crawford\",\n", "\n", " # Modern Hollywood Icons\n", " \"Meryl Streep\", \"Tom Hanks\", \"Denzel Washington\", \"Robert De Niro\", \n", " \"Al Pacino\", \"Jack Nicholson\", \"Julia Roberts\", \"Leonardo DiCaprio\",\n", " \"Brad Pitt\", \"Angelina Jolie\", \"George Clooney\", \"Cate Blanchett\",\n", " \"Johnny Depp\", \"Tom Cruise\", \"Sandra Bullock\", \"Nicole Kidman\", \n", " \"Halle Berry\", \"Harrison Ford\", \"Sigourney Weaver\", \"Morgan Freeman\", \n", " \"Michelle Pfeiffer\", \"Dustin Hoffman\", \"Robin Williams\", \"Will Smith\",\n", "\n", " # Franchise and Action-Adventure Stars\n", " \"Orlando Bloom\", \"Viggo Mortensen\", \"Ian McKellen\", \"Elijah Wood\",\n", " \"Sean Astin\", \"Dominic Monaghan\", \"Billy Boyd\", \"Liv Tyler\", \n", " \"Hugo Weaving\", \"Andy Serkis\", \"Keira Knightley\", \"Geoffrey Rush\",\n", " \"Johnny Depp\", \"Daniel Radcliffe\", \"Emma Watson\", \"Rupert Grint\",\n", " \"Helena Bonham Carter\", 
\"Ralph Fiennes\", \"Alan Rickman\", \"Michael Gambon\",\n", " \"Ewan McGregor\", \"Liam Neeson\", \"Natalie Portman\", \"Hayden Christensen\",\n", " \"Mark Hamill\", \"Carrie Fisher\", \"Harrison Ford\", \"Daisy Ridley\",\n", " \"Adam Driver\", \"John Boyega\", \"Oscar Isaac\", \"Diego Luna\", \n", " \"Felicity Jones\", \"Pedro Pascal\", \"Chris Hemsworth\", \"Chris Evans\", \n", " \"Scarlett Johansson\", \"Robert Downey Jr.\", \"Mark Ruffalo\", \"Chris Pratt\",\n", " \"Tom Holland\", \"Zendaya\", \"Benedict Cumberbatch\", \"Tobey Maguire\", \n", " \"Andrew Garfield\", \"Hugh Jackman\", \"Patrick Stewart\", \"Ian McKellen\", \n", " \"Ryan Reynolds\", \"Gal Gadot\", \"Henry Cavill\", \"Jason Momoa\", \n", " \"Ben Affleck\", \"Zoe Saldaña\", \"Dave Bautista\", \"Karen Gillan\",\n", "\n", " # Versatile and Popular Contemporary Actors\n", " \"Christian Bale\", \"Amy Adams\", \"Ryan Gosling\", \"Emma Stone\",\n", " \"Anne Hathaway\", \"Jennifer Lawrence\", \"Joaquin Phoenix\", \"Margot Robbie\",\n", " \"Adam Driver\", \"Michael B. 
Jordan\", \"Florence Pugh\", \"Timothée Chalamet\",\n", " \"Austin Butler\", \"Jessica Chastain\", \"Mahershala Ali\", \"Viola Davis\", \n", " \"Octavia Spencer\", \"Toni Collette\", \"Rami Malek\", \"Lakeith Stanfield\",\n", " \"Cillian Murphy\", \"Matt Damon\", \"Ben Affleck\", \"Jeremy Renner\", \n", "\n", " # Young Rising Stars\n", " \"Millie Bobby Brown\", \"Finn Wolfhard\", \"Sadie Sink\", \"Noah Schnapp\", \n", " \"Anya Taylor-Joy\", \"Jenna Ortega\", \"Hunter Schafer\", \"Hailee Steinfeld\", \n", " \"Lucas Hedges\", \"Elle Fanning\", \"Dakota Fanning\", \"Jacob Elordi\", \n", " \"Sydney Sweeney\", \"Joey King\", \"Sophie Turner\", \"Maisie Williams\",\n", "\n", " # Comedy and Character Actors\n", " \"Steve Carell\", \"Tina Fey\", \"Amy Poehler\", \"Melissa McCarthy\", \n", " \"Kristen Wiig\", \"Seth Rogen\", \"Will Ferrell\", \"Paul Rudd\", \n", " \"Bill Hader\", \"Jason Bateman\", \"Jonah Hill\", \"Michael Cera\",\n", " \"Ken Jeong\", \"Kevin Hart\", \"Maya Rudolph\", \"Chris Rock\", \n", "\n", " # Iconic Action and Adventure Stars\n", " \"Dwayne Johnson\", \"Arnold Schwarzenegger\", \"Sylvester Stallone\", \n", " \"Bruce Willis\", \"Jason Statham\", \"Keanu Reeves\", \"Vin Diesel\", \n", " \"Charlize Theron\", \"Emily Blunt\", \"John Cena\", \"Liam Neeson\", \n", " \"Daniel Craig\", \"Idris Elba\", \"Pierce Brosnan\", \"Angelina Jolie\", \n", " \"Kate Beckinsale\", \"Milla Jovovich\",\n", "\n", " # Supporting Actors and Other Notables\n", " \"John Goodman\", \"Jeff Goldblum\", \"J.K. 
Simmons\", \"Stanley Tucci\",\n", " \"Frances McDormand\", \"Allison Janney\", \"Angela Bassett\", \"Regina King\",\n", " \"Jessica Lange\", \"Bryan Cranston\", \"Aaron Paul\", \"Bob Odenkirk\", \n", " \"Giancarlo Esposito\", \"David Harbour\", \"Winona Ryder\", \n", "\n", " # Diverse and Internationally Acclaimed Actors\n", " \"Salma Hayek\", \"Antonio Banderas\", \"Diego Luna\", \"Oscar Isaac\", \n", " \"Gael García Bernal\", \"Eva Longoria\", \"Jessica Alba\", \n", " \"Awkwafina\", \"Sandra Oh\", \"Steven Yeun\", \"Simu Liu\", \n", " \"Lucy Liu\", \"Gemma Chan\", \"Mindy Kaling\", \"Ali Wong\", \n", " \"Lupita Nyong'o\", \"Chadwick Boseman\", \"Daniel Kaluuya\", \"Letitia Wright\",\n", " \"Dev Patel\", \"Riz Ahmed\", \"Zazie Beetz\", \"Mahershala Ali\",\n", "\n", " # Sports\n", " \"Lionel Messi\", \"Cristiano Ronaldo\", \"Neymar Jr.\", \"Kylian Mbappé\", \n", " \"LeBron James\", \"Serena Williams\", \"Roger Federer\", \"Novak Djokovic\", \n", " \"Rafael Nadal\", \"Simone Biles\", \"Naomi Osaka\", \"Stephen Curry\", \n", " \"Kevin Durant\", \"Tom Brady\", \"Patrick Mahomes\", \"Virat Kohli\", \n", " \"Rohit Sharma\", \"Shaquille O'Neal\", \"Tiger Woods\", \"Lewis Hamilton\", \n", " \"Max Verstappen\", \"Charles Leclerc\", \"Usain Bolt\", \"Megan Rapinoe\", \n", " \"Alex Morgan\", \"Katie Ledecky\", \"Michael Phelps\", \"Giannis Antetokounmpo\", \n", " \"Damian Lillard\", \"Anthony Davis\", \"Zlatan Ibrahimović\", \"Harry Kane\", \n", " \"Sadio Mané\", \"Karim Benzema\", \"Gareth Bale\", \"Robert Lewandowski\", \n", " \"Erling Haaland\", \"Venus Williams\", \"Iga Świątek\", \"Aryna Sabalenka\", \n", "\n", " # Politics and Leaders\n", " \"Joe Biden\", \"Kamala Harris\", \"Barack Obama\", \"Michelle Obama\", \n", " \"Donald Trump\", \"Melania Trump\", \"Emmanuel Macron\", \"Olaf Scholz\", \n", " \"Volodymyr Zelenskyy\", \"Rishi Sunak\", \"Narendra Modi\", \"Jacinda Ardern\", \n", " \"Justin Trudeau\", \"Xi Jinping\", \"Vladimir Putin\", \"Angela Merkel\", \n", " 
\"Elizabeth II\", \"King Charles III\", \"Prince William\", \"Prince Harry\", \n", " \"Meghan Markle\", \"Queen Letizia\", \"Pope Francis\", \"Dalai Lama\", \n", " \"Greta Thunberg\", \"Alexandria Ocasio-Cortez\", \"Bernie Sanders\", \n", " \"Nicolas Maduro\", \"Jair Bolsonaro\", \"Fumio Kishida\", \"Yoon Suk-yeol\",\n", "\n", " # Business and Technology\n", " \"Elon Musk\", \"Jeff Bezos\", \"Mark Zuckerberg\", \"Bill Gates\", \"Tim Cook\", \n", " \"Sundar Pichai\", \"Satya Nadella\", \"Warren Buffett\", \"Bernard Arnault\", \n", " \"Larry Page\", \"Sergey Brin\", \"Steve Wozniak\", \"Reed Hastings\", \"Susan Wojcicki\", \n", " \"Jack Ma\", \"Daniel Ek\", \"Evan Spiegel\", \"Andrew Ng\", \"Sam Altman\", \n", " \"Sheryl Sandberg\", \"Peter Thiel\", \"Marc Benioff\", \"Richard Branson\", \n", " \"Oprah Winfrey\", \"Howard Schultz\", \"Larry Ellison\", \"David Baszucki\", \n", " \"Parag Agrawal\", \"Adam Neumann\", \"Kylie Jenner\", \"Kim Kardashian\", \n", " \"Khloé Kardashian\", \"Kris Jenner\", \"Robert Kiyosaki\", \"Barbara Corcoran\", \n", "\n", " # Science and Innovation\n", " \"Jane Goodall\", \"Neil deGrasse Tyson\", \"Brian Cox\", \"Michio Kaku\", \n", " \"Katherine Johnson\", \"Jennifer Doudna\", \"Emmanuelle Charpentier\", \"Tim Berners-Lee\", \n", " \"Mae Jemison\", \"Katie Bouman\", \"Brian Greene\", \"James Lovelock\", \n", " \"Roger Penrose\", \"Dmitry Muratov\", \"Frances Arnold\", \"Venki Ramakrishnan\", \n", " \"Paul Nurse\", \"Elizabeth Blackburn\", \"Carol Greider\", \"David Julius\", \n", " \"Abhijit Banerjee\", \"Esther Duflo\", \"Michael Kremer\", \"Andrea Ghez\", \n", " \"Reinhard Genzel\", \"Jennifer Hudson\", \"Ashoke Sen\", \"Subrahmanyan Chandrasekhar\", \n", "\n", " # Others\n", " \"Ellen DeGeneres\", \"Oprah Winfrey\", \"Trevor Noah\", \"Jimmy Fallon\", \n", " \"Stephen Colbert\", \"John Oliver\", \"James Corden\", \"Conan O'Brien\", \n", " \"Dolly Parton\", \"Gordon Ramsay\", \"David Beckham\", \"Victoria Beckham\", \n", " \"RuPaul\", 
\"Chris Rock\", \"Dave Chappelle\", \"Trevor Noah\", \"Hasan Minhaj\", \n", " \"Ali Wong\", \"Bo Burnham\", \"Jo Koy\", \"Kevin Hart\", \"Sarah Silverman\", \n", " \"Tiffany Haddish\", \"Joe Rogan\", \"Logan Paul\", \"MrBeast\", \"PewDiePie\", \n", " \"Emma Chamberlain\", \"Charli D'Amelio\", \"Addison Rae\", \"Bella Poarch\",\n", "]" ] }, { "cell_type": "code", "execution_count": null, "id": "2a166f90-7262-4047-92e4-f83a18c7c5d4", "metadata": {}, "outputs": [], "source": [ "\n", "def get_sample_from_cities(city_info, city_weights, actual_threshold=0.7, city_partial_threshold=0.1):\n", " cities = list(city_info.keys())\n", " weights = [city_weights[city] for city in cities]\n", " city_random = random.choices(cities, weights=weights, k=1)[0]\n", " rand_val = random.random()\n", " if rand_val <= actual_threshold:\n", " if rand_val <= city_partial_threshold and len(city_random) > 6:\n", " return city_random[:-1]\n", " return city_random\n", " return random.choice(city_info[city_random])\n", "\n", "def get_sample_from_states(state_info, actual_threshold=0.5):\n", " states = list(state_info.keys())\n", " state_random = random.choice(states)\n", " rand_val = random.random()\n", " if rand_val <= actual_threshold:\n", " return state_random\n", " return random.choice([state_info[state_random]])\n", "\n", "def get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8, comma_threshold=0.6):\n", " rand_val = random.random()\n", " if rand_val <= state_code_threshold:\n", " if rand_val <= comma_threshold:\n", " return ', '.join(city_state_code_info.sample(1, weights='city_weight', replace=True)[['city_name', 'state_code']].values.tolist()[0])\n", " else:\n", " return ' '.join(city_state_code_info.sample(1, weights='city_weight', replace=True)[['city_name', 'state_code']].values.tolist()[0])\n", " return ', '.join(city_state_name_info.sample(1, weights='city_weight', replace=True)[['city_name', 'state_name']].values.tolist()[0])\n", 
"\n", "def get_random_choice_from_list(choices_list):\n", " return random.choice(choices_list)\n", " \n", "def get_sample_fake_city():\n", " return get_random_choice_from_list(fake_cities)\n", "\n", "def get_sample_fake_state_code():\n", " return get_random_choice_from_list(fake_state_codes)\n", "\n", "def get_sample_fake_state_name():\n", " return get_random_choice_from_list(fake_state_names)" ] }, { "cell_type": "code", "execution_count": null, "id": "a4f1fd1a-cd74-4ebe-b9aa-c3cd07dcf2bb", "metadata": {}, "outputs": [], "source": [ "# for _ in range(100):\n", "# print(get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8))" ] }, { "cell_type": "code", "execution_count": null, "id": "cb760f94-1dba-4ec3-816f-7cfb3bb9b7b0", "metadata": {}, "outputs": [], "source": [ "templates = [\n", " # Simple City-Based Queries\n", " \"weather {city}\",\n", " \"{city} temperature\",\n", " \"sushi {city}\",\n", " \"ramen {city}\",\n", " \"pizza {city}\",\n", " \"plumber {city}\",\n", " \"electrician {city}\",\n", " \"roof repair {city}\",\n", " \"physio therapy {city}\",\n", " \"hospital {city}\",\n", " \"doctor {city}\",\n", " \"nurse {city}\",\n", " \"home improvement {city}\",\n", " \"home services {city}\",\n", " \"weather forecast {city}\",\n", " \"current weather {city}\",\n", " \"best restaurants {city}\",\n", " \"top yelp reviews {city}\",\n", " \"places to visit in {city}\",\n", " \"best cafes in {city}\",\n", " \"emergency services {city}\",\n", " \"gyms in {city}\",\n", " \"car repair {city}\",\n", " \"florist {city}\",\n", " \"lawyers in {city}\",\n", " \"real estate agents {city}\",\n", " \"hiking trails {city}\",\n", " \"parks in {city}\",\n", " \"movie theaters {city}\",\n", " \"top hotels in {city}\",\n", " \"events in {city} this weekend\",\n", " \"pharmacies {city}\",\n", " \"{food} near me {city}\",\n", " \"coffee near me {city}\",\n", " \"breakfast near me {city}\",\n", " \"restaurants near me {city}\",\n", "\n", 
" # State-Based Queries\n", " \"home services in {state}\",\n", " \"best restaurants in {state}\",\n", " \"real estate agents {state}\",\n", " \"roof repair services {state}\",\n", " \"hospitals in {state}\",\n", " \"weather {state}\",\n", " \"temperature {state}\",\n", " \"physio therapy {state}\",\n", " \"doctors in {state}\",\n", " \"top-rated plumbers {state}\",\n", " \"electricians {state}\",\n", " \"emergency services {state}\",\n", " \"sushi {state}\",\n", " \"ramen {state}\",\n", " \"pizza {state}\",\n", " \"parks in {state}\",\n", " \"hiking trails {state}\",\n", " \"pharmacies in {state}\",\n", " \"best cafes {state}\",\n", " \"movie theaters {state}\",\n", "\n", " # City-State Combination Queries (Now using {city_state})\n", " \"weather {city_state}\",\n", " \"{city_state} temperature\",\n", " \"sushi {city_state}\",\n", " \"plumber {city_state}\",\n", " \"best restaurants in {city_state}\",\n", " \"top-rated roof repair {city_state}\",\n", " \"hospital {city_state}\",\n", " \"physio therapy {city_state}\",\n", " \"doctor {city_state}\",\n", " \"events in {city_state} this weekend\",\n", " \"lawyers in {city_state}\",\n", " \"home improvement services {city_state}\",\n", " \"florist {city_state}\",\n", " \"best cafes in {city_state}\",\n", " \"parks in {city_state}\",\n", " \"movie theaters {city_state}\",\n", " \"top hotels in {city_state}\",\n", " \"emergency services {city_state}\",\n", " \"car repair {city_state}\",\n", " \"pharmacies {city_state}\",\n", "\n", " \"sushi {city_state}\",\n", " \"ramen {city_state}\",\n", " \"pizza {city_state}\",\n", " \"parks {city_state}\",\n", " \"hiking trails {city_state}\",\n", " \"pharmacies {city_state}\",\n", " \"best cafes {city_state}\",\n", " \"movie theaters {city_state}\",\n", " \"hamburgers {city_state}\",\n", " \"burgers {city_state}\",\n", " \"pasta {city_state}\",\n", " \"salads {city_state}\",\n", " \"vegan food {city_state}\",\n", " \"fried chicken {city_state}\",\n", " \"ramen {city_state}\",\n", " 
\"tacos {city_state}\",\n", " \"sandwiches {city_state}\",\n", " \"noodles {city_state}\",\n", " \"soups {city_state}\",\n", " \"cakes {city_state}\",\n", " \"ice cream {city_state}\",\n", " \"steak {city_state}\",\n", " \"seafood {city_state}\",\n", " \"breakfast food {city_state}\",\n", " \"brunch {city_state}\",\n", " \"desserts {city_state}\",\n", " \n", " # CITY state order swapped\n", " \"{city_state} sushi\",\n", " \"{city_state} ramen\",\n", " \"{city_state} pizza\",\n", " \"{city_state} parks\",\n", " \"{city_state} hiking trails\",\n", " \"{city_state} pharmacies\",\n", " \"{city_state} best cafes\",\n", " \"{city_state} movie theaters\",\n", " \"{city_state} hamburgers\",\n", " \"{city_state} burgers\",\n", " \"{city_state} pasta\",\n", " \"{city_state} salads\",\n", " \"{city_state} vegan food\",\n", " \"{city_state} fried chicken\",\n", " \"{city_state} ramen\",\n", " \"{city_state} tacos\",\n", " \"{city_state} sandwiches\",\n", " \"{city_state} noodles\",\n", " \"{city_state} soups\",\n", " \"{city_state} cakes\",\n", " \"{city_state} ice cream\",\n", " \"{city_state} steak\",\n", " \"{city_state} seafood\",\n", " \"{city_state} breakfast food\",\n", " \"{city_state} brunch\",\n", " \"{city_state} desserts\",\n", " \n", " # Organization-Based Queries\n", " \"{organization} in {city_state}\",\n", " \"contact {organization} in {city}\",\n", " \"locations of {organization} in {state}\",\n", " \"does {organization} provide home repair services in {city}?\",\n", " \"can I book a doctor appointment at {organization} in {state}?\",\n", " \"does {organization} offer roof repair in {city_state}?\",\n", " \"hours of {organization} in {city}\",\n", " \"{organization} reviews in {state}\",\n", " \"best rated {organization} in {city_state}\",\n", " \"nearest branch of {organization} in {city}\",\n", " \n", " # Person-Based Queries\n", " \"Where is {person} hosting an event?\",\n", " \"Can I meet {person} in {city_state}?\",\n", " \"Is {person} available for an 
appointment in {city}?\",\n", " \"Is {person} traveling to {state} next week?\",\n", " \"Does {person} have a speech in {city_state}?\",\n", " \n", " # Mixed and Specialized Queries\n", " \"roof repair near {city}\",\n", " \"best sushi in {city_state}\",\n", " \"what's the weather forecast for {city}?\",\n", " \"who are the top doctors in {city_state}?\",\n", " \"restaurants near {city} with good reviews\",\n", " \"plumbing services in {city_state}\",\n", " \"upcoming events in {city} this weekend\",\n", " \"find hiking trails in {city_state}\",\n", " \"local electricians in {city_state}\",\n", " \"ramen places in {city}\",\n", " \"home improvement contractors near {city_state}\",\n", " \"best pizza near {city}\",\n", " \"does {organization} operate in {city_state}?\",\n", " \"find top-rated hospitals in {city_state}\",\n", " \"home maintenance services in {city_state}\",\n", " \"weather forecast for {city} this weekend\",\n", " \"roof repair specialists in {city}\",\n", " \"top-rated movie theaters in {city_state}\",\n", " \n", "\n", " # City-State Queries\n", " \"Best {restaurant} in {city_state}\",\n", " \"Top-rated {restaurant} in {city_state}\",\n", " \"Affordable {restaurant} in {city_state}\",\n", " \"Where to find the best {food} in {city_state}?\",\n", " \"Popular {food} places in {city_state}\",\n", " \"Top destinations for {travel} in {city_state}\",\n", " \"Best deals on {travel} in {city_state}\",\n", " \"Where to eat {food} in {city_state}?\",\n", " \"What are the most famous {restaurant} in {city_state}?\",\n", " \"Top {food} restaurants in {city_state} this weekend\",\n", "\n", " # Non-City/State Queries\n", " \"Best {restaurant} in the country\",\n", " \"Where to find the best {food} near me?\",\n", " \"Top destinations for {travel} this summer\",\n", " \"Best deals on {travel} packages\",\n", " \"Where to find cheap {travel} options?\",\n", " \"Popular {food} dishes in the USA\",\n", " \"Best {restaurant} chains in the country\",\n", " \"What are 
the healthiest {food} options?\",\n", " \"How to book affordable {travel} for families?\",\n", " \"Most popular {restaurant} for takeout\",\n", "\n", " # Additional Templates\n", " \"What is the best {food} to eat for dinner?\",\n", " \"Where to order {food} online?\",\n", " \"Best {restaurant} for date night\",\n", " \"Top {travel} websites for booking vacations\",\n", " \"Where to find {restaurant} reviews?\",\n", " \"What are the top-rated {travel} apps?\",\n", " \"Best {restaurant} near tourist attractions\",\n", " \"What is the most popular {food} in the USA?\",\n", " \"Best deals on {travel} for students\",\n", " \"Top {restaurant} for family gatherings\",\n", " \"Most affordable {food} delivery services\",\n", " \"What are the best {travel} insurance options?\",\n", " \"How to find luxury {restaurant} reservations\",\n", " \"Where to get authentic {food} near me?\",\n", " \"Top {restaurant} for business lunches\",\n", " \"How to plan a {travel} adventure?\",\n", " \"Best {restaurant} for weekend brunch\",\n", " \"What are the most popular {food} trends?\",\n", " \"Best {restaurant} for a large group\",\n", " \"How to get discounts on {travel} bookings?\",\n", "\n", " # Product-Based Queries\n", " \"Where to buy {product} online?\",\n", " \"Best deals on {product}\",\n", " \"How to repair a {product}?\",\n", " \"Latest reviews of {product}\",\n", " \"When will the next {product} be released?\",\n", " \"Top features of {product}\",\n", " \"Is {product} worth buying in 2024?\",\n", " \"User reviews of {product}\",\n", " \"Alternatives to {product}\",\n", " \"What is the price of {product}?\",\n", "\n", " # Country-Based Queries\n", " \"How to travel to {country}?\",\n", " \"Best tourist destinations in {country}\",\n", " \"Top hotels to stay in {country}\",\n", " \"Do I need a visa to visit {country}?\",\n", " \"Cultural traditions in {country}\",\n", " \"What is the official language of {country}?\",\n", " \"How to do business in {country}?\",\n", " \"What are 
the top exports of {country}?\",\n", " \"Current political situation in {country}\",\n", " \"Famous landmarks in {country}\",\n", "\n", " # Service-Based Queries\n", " \"How to cancel my {service} subscription?\",\n", " \"Is {service} worth the price?\",\n", " \"How does {service} compare to competitors?\",\n", " \"User reviews of {service}\",\n", " \"How to get a discount on {service}?\",\n", " \"What are the benefits of {service}?\",\n", " \"Best alternatives to {service}\",\n", " \"How to troubleshoot issues with {service}?\",\n", " \"Does {service} have a free trial?\",\n", " \"Is {service} available internationally?\",\n", "\n", " # Cars-Based Queries\n", " \"What is the top speed of {car}?\",\n", " \"User reviews of {car}\",\n", " \"How to finance a {car}?\",\n", " \"Fuel efficiency of {car}\",\n", " \"How to buy a second-hand {car}?\",\n", " \"What are the safety features of {car}?\",\n", " \"Maintenance costs of owning a {car}\",\n", " \"What is the resale value of {car}?\",\n", " \"Is {car} electric or gas-powered?\",\n", " \"Best upgrades for {car}\",\n", "\n", " # Gadgets-Based Queries\n", " \"What are the best apps for {gadget}?\",\n", " \"How to set up a {gadget}?\",\n", " \"User reviews of {gadget}\",\n", " \"Best accessories for {gadget}\",\n", " \"What are the health benefits of using a {gadget}?\",\n", " \"What is the battery life of {gadget}?\",\n", " \"How to sync {gadget} with my phone?\",\n", " \"Alternatives to {gadget}\",\n", " \"What are the best productivity apps for {gadget}?\",\n", " \"Is {gadget} waterproof?\",\n", "\n", " # Stocks-Based Queries\n", " \"What is the latest price of {stock}?\",\n", " \"How to buy shares of {stock}?\",\n", " \"Is {stock} a good investment in 2024?\",\n", " \"What are analysts saying about {stock}?\",\n", " \"Current stock performance of {stock}\",\n", " \"What is the market cap of {stock}?\",\n", " \"How to invest in {stock}?\",\n", " \"Latest earnings report of {stock}\",\n", " \"What are the dividend 
yields of {stock}?\",\n", " \"How to trade {stock} on the stock market?\",\n", "\n", " # Money-Based Queries\n", " \"How to convert {money} to another currency?\",\n", " \"Best ways to transfer {money} internationally\",\n", " \"What are the risks of using {money}?\",\n", " \"How to save {money} for the future?\",\n", " \"What is the best way to invest {money}?\",\n", " \"How to protect {money} from fraud?\",\n", " \"What are the fees for using {money}?\",\n", " \"Is {money} safe for online transactions?\",\n", " \"Best apps for managing {money}\",\n", " \"How to track spending with {money}?\",\n", "\n", " # Finance-Based Queries\n", " \"How to invest in a {finance}?\",\n", " \"What are the benefits of having a {finance}?\",\n", " \"How to calculate the returns on {finance}?\",\n", " \"What are the risks of investing in {finance}?\",\n", " \"How to get advice for managing my {finance}?\",\n", " \"How to apply for a {finance}?\",\n", " \"What are the tax benefits of {finance}?\",\n", " \"What are the best options for a {finance}?\",\n", " \"How to open a {finance} account?\",\n", " \"What is the interest rate on {finance}?\",\n", "\n", " # sports_term, location_and_landmark, activity_and_event, food_m, transport_and_direction\n", " # incomplete or misspelled sport/activity names\n", " \"{sports_term} near me\", \n", " \"find {sports_term}\", \n", " \"{sports_term} schedule\", \n", " \"{sports_term} news\", \n", " \"book {sports_term} tickets\", \n", " \"{sports_term} team\", \n", " \"{sports_term} game time\", \n", " \"when is the {sports_term} game\", \n", " \"top {sports_term} players\", \n", " \"local {sports_term} clubs\", \n", " \"where to play {sports_term}\", \n", " \"best {sports_term} venues\", \n", " \"{sports_term} tournament\",\n", " \"{sports_term}\",\n", "\n", " # Generic landmarks and location queries\n", " \"{location_and_landmark} nearby\", \n", " \"famous {location_and_landmark}\", \n", " \"{location_and_landmark} open now\", \n", " \"visit 
{location_and_landmark}\", \n", " \"{location_and_landmark} directions\", \n", " \"how to get to {location_and_landmark}\", \n", " \"nearest {location_and_landmark}\", \n", " \"{location_and_landmark} address\", \n", " \"top-rated {location_and_landmark}\", \n", " \"{location_and_landmark} hours\", \n", " \"find {location_and_landmark} near me\", \n", " \"{location_and_landmark} entry fee\", \n", " \"best {location_and_landmark} in {city}\",\n", "\n", " # Food and dining queries\n", " \"{food_m} place\", \n", " \"find {food_m}\", \n", " \"best {food_m} spot\", \n", " \"{food_m} delivery\", \n", " \"{food_m} open near me\", \n", " \"order {food_m}\", \n", " \"{food_m} deals\", \n", " \"{food_m} options\", \n", " \"{food_m} near me\", \n", " \"{food_m} reservation\", \n", " \"top-rated {food_m} restaurants\", \n", " \"{food_m} reviews\", \n", " \"{food_m} menu\", \n", " \"popular {food_m} dishes\", \n", " \"where to eat {food_m}\",\n", "\n", " # activities_and_events\n", " \"{activity_and_event} tickets\", \n", " \"nearest {activity_and_event}\", \n", " \"{activity_and_event} today\", \n", " \"upcoming {activity_and_event}\", \n", " \"book {activity_and_event}\", \n", " \"{activity_and_event} in {city}\", \n", " \"find {activity_and_event}\", \n", " \"{activity_and_event} schedule\", \n", " \"{activity_and_event} near me\", \n", " \"top-rated {activity_and_event} venues\", \n", " \"{activity_and_event} details\", \n", " \"how to attend {activity_and_event}\", \n", " \"{activity_and_event} location\", \n", " \"{activity_and_event} opening hours\",\n", " \"{activity_and_event}\",\n", "\n", " # Single-word incomplete or ambiguous queries (standalone)\n", " # Sports and Games (single or incomplete)\n", " \"footbal\", \"baske\", \"golf\", \"sush\", \"pizz\", \"zoo\", \"conc\", \"direc\", \n", " \"theate\", \"stadiu\", \"brunc\", \"tourn\", \"parad\", \"swimmin\", \"train\", \"taxi\", \n", " \"game\", \"meet\", \"mountain\", \"beac\", \"lake\", \"forest\", \"ligh\", 
\"restauran\", \n", " \"parki\", \"stor\", \"monumen\", \"aren\", \"boardwal\",\n", " # Locations and Landmarks (single or incomplete)\n", " \"statue\", \"museum\", \"plaza\", \"zoo\", \"church\", \"theater\", \"stadium\", \"mountain\", \n", " \"park\", \"lake\", \"beach\", \"river\", \"palace\", \"cathedra\", \"mansion\", \"monument\", \n", " \"temple\", \"observato\", \"canyon\", \"garden\", \"conservato\", \"boardwal\", \"forest\", \n", " \"pier\", \"lighthouse\", \"arena\", \"campgroun\", \"arch\", \"reservoi\", \"dam\", \"fountai\", \n", " \"waterfal\", \"galleri\", \"amphitheate\", \"sculptur\", \"trail\", \"cliff\", \"tower\", \"islan\",\n", " # Activities and Events (single or incomplete)\n", " \"conc\", \"exhib\", \"meet\", \"parad\", \"festi\", \"tourn\", \"game\", \"sho\", \"even\", \"gala\", \n", " \"confere\", \"seminar\", \"webina\", \"worksho\", \"lectur\", \"symposiu\", \"screenin\", \n", " \"rall\", \"celebratio\", \"ceremon\", \"get-togethe\", \"perfor\", \"gatherin\", \"competitio\", \n", " \"maratho\", \"speec\", \"workout\", \"exercis\", \"demonstratio\", \"ceremony\", \"readin\", \n", " \"daytrip\", \"lectur\", \"social\", \"activit\", \"performanc\", \"worksho\", \"openin\", \n", " \"finale\", \"comedy\", \"poetr\", \"talent\", \"match\",\n", " # Restaurants and Food Types (single or incomplete)\n", " \"sush\", \"pizz\", \"ramen\", \"bbq\", \"vega\", \"steak\", \"taco\", \"burg\", \"pasta\", \"brunc\", \n", " \"desse\", \"drink\", \"grill\", \"bake\", \"buffet\", \"sandwich\", \"noodle\", \"cafe\", \n", " \"taver\", \"gastro\", \"bistro\", \"deli\", \"saloo\", \"barbecue\", \"snack\", \"confectio\", \n", " \"pub\", \"salad\", \"cuisine\", \"fries\", \"wings\", \"pantr\", \"meatbal\", \"sub\", \"omel\", \n", " \"crepe\", \"wrap\", \"beverag\", \"dessert\", \"smoothie\", \"juice\", \"shake\", \"frappe\", \"coffee\",\n", " # Transport and Directions (single or incomplete)\n", " \"direc\", \"map\", \"bus\", \"train\", \"car\", \"park\", \"taxi\", 
\"subwa\", \"fly\", \"plane\", \n", " \"ticke\", \"pass\", \"ferr\", \"bicycl\", \"scoote\", \"shuttl\", \"walkin\", \"rideshar\", \n", " \"transi\", \"toll\", \"metr\", \"road\", \"route\", \"stop\", \"junctio\", \"termina\", \"highwa\", \n", " \"pathwa\", \"drivewa\", \"loop\", \"intersectio\", \"trailhead\", \"tub\", \"sidestro\", \n", " \"crosswal\", \"rout\", \"navigatio\", \"crossing\", \"pave\", \"deck\", \"lane\",\n", " # Technology and Gadgets (single or incomplete)\n", " \"lapt\", \"smartphon\", \"comput\", \"tablet\", \"earbuds\", \"bluetooth\", \"charg\", \"cabl\", \n", " \"headset\", \"monitor\", \"consol\", \"keyboard\", \"drive\", \"storag\", \"gaming\", \"mouse\", \n", " \"projector\", \"flashdriv\", \"powerban\", \"adapter\", \"webcam\", \"router\", \"modem\", \n", " \"camcorder\", \"printer\", \"copier\", \"recorde\", \"remote\", \"surge\", \"extend\", \"plug\", \n", " \"portabl\", \"backu\", \"networ\", \"recharge\", \"uplo\", \"downlo\", \"strea\", \"screencas\", \n", " \"googl\", \"apple\", \"micros\", \"andr\",\n", "\n", " # actual city and states\n", " \"{food} {city}\", \n", " \"{food} {state}\", \n", " \"{city} {food}\", \n", " \"{state} {food}\", \n", " \"{food} in {city}\", \n", " \"{food} in {state}\", \n", "\n", " # fake cities and states\n", " \"{food} {fake_cty}\", \n", " \"{food} {fake_state_cd}\", \n", " \"{food} {fake_state_nam}\", \n", " \"{fake_cty} {food}\", \n", " \"{fake_state_cd} {food}\", \n", " \"{fake_state_nam} {food}\", \n", " \"{food} in {fake_cty}\", \n", " \"{food} in {fake_state_cd}\", \n", " \"{food} in {fake_state_nam}\", \n", "\n", " # celebrities\n", " \"{celebrity}\",\n", " \"{celebrity} age\",\n", " \"{celebrity} net worth\",\n", " \"{celebrity} movies\",\n", " \"What shows has {celebrity} been on?\",\n", " \"What awards has {celebrity} won?\",\n", " \"Where does {celebrity} live?\",\n", " \"What are {celebrity}'s upcoming projects?\",\n", " \"{celebrity} diet\",\n", " \"is {celebrity} married?\",\n", " \"does 
{celebrity} live in {city}\",\n", "\n", " ## unknown random queries\n", " 'snoozlegrip', 'shenanigans', 'kerplunk', 'clip', 'snappyy', 'spindlywhack', 'crinkly', 'pressed enter too soon', \n", " 'try this', 'query here', 'mistyped selection', 'smorgasbord', 'crumplify', 'snooze', 'twonkle', 'bamboozlemate', \n", " 'this doesn’t matter', 'zap', 'mind blank', 'hiss', 'snagged', 'splurgy', 'snagglebash', 'guess', 'zapz', 'frap', \n", " 'blotter', \"don't even know\", 'don’t know answer', 'spindletastic', 'zizzlesplat', 'jinkled', 'placeholder search', \n", " 'uncertain search', 'splode', 'abcxyz', 'twangleblop', 'shifty', 'bumfuzzle', 'plunge', 'thingy', \n", " 'swooshenator', 'quark', 'tatterblast', 'frizzlefry', 'something random', 'puff', 'blobby', 'placeholder attempt', \n", " 'weird example', 'wiggle', 'snortleboo', 'bouncy', 'qwerty', 'whirl', 'nix', 'idk what', 'random search', \n", " 'glimmering', 'guzzle', 'strange text', 'accidental hit', 'forgot keypress', 'dazzleplunk', 'snurply', \n", " 'confused', 'weird gibberish', 'idc either', 'test123', 'huff', 'supercalifragilistic', 'clap', 'whoopsie', 'nump', \n", " 'lorem ipsum', 'snuffle', 'unknown phrase', 'whizz', 'bloop', 'glitch', 'zomp', 'clappy', 'gush', 'zappletastic', \n", " 'hooey', 'bing', 'slap', 'ting', 'miscellaneous', 'jingle', 'idk just looking', 'twangy', 'dinglefrizzle', \n", " 'just clicking', 'quizzical', 'splatterdash', 'kerplunkitude', 'fizzlematic', 'piff', 'jazz', 'jib', 'random phrase', \n", " 'flapper', 'uhmm', 'nothing much', 'sdf', 'snub', 'confusing example', 'keyboard smash', 'randomized words', \n", " 'nothing useful', 'random sentence', 'placeholder input', 'splattergrip', 'zorp', 'fluffernutter', 'splopp', \n", " 'incomplete search', 'check this out', 'woozle', 'bananarama', 'quiz', 'spiffy', 'undefined', 'confusing term', 'sploom', \n", " 'randomized example', 'spliffy', 'ooze', 'blazing', 'uncertain input', 'unknown search', 'random guesses', \n", " 'unknown', 'concept unclear', 
'accidental input', 'sporkinator', 'whats this', 'maybe', 'ignore this', 'twinkle', \n", " 'whatchamacallit', 'splank', 'weird thing', 'huh', 'into the unknown', 'chaos', 'wigglie', 'twistamatic', 'kerflapify', \n", " 'twizzletude', 'mock', 'thud', 'shrug', 'grizzed', 'jibberjabber', 'weirdness', 'anything', 'plop', 'dazzlicious', \n", " 'random selection', 'splatt', 'abracadabra', 'whooshenator', 'random mouse click', 'sparklefish', 'banal', \n", " \"what's the word\", 'mistyped search', 'twinklebash', 'splush', 'splazz', 'forgot search term', 'crumplamatic', 'glee', \n", " 'whizzy', 'whizzlemate', 'jumpy', 'dork', 'randomxyz', 'gobsmacktastic', 'no clue what', 'zazz', 'beyond the void', \n", " 'weird try', 'drift', 'yank', 'yodelsnap', 'biff', 'forgot randomness', 'splatterblast', 'no idea', 'smooshify', \n", " 'peep', 'rick', 'splendiferous', 'squishy', 'muff', 'flabbergizmo', 'confuzzled', 'I think so', 'zing', \n", " 'meaningless typing', 'shush', 'zany', 'don’t need help', 'randomly chosen', 'warpydash', 'forgot words', \n", " 'placeholder typing', 'spunky', 'spindleplop', 'crash', 'flabbergast', 'snaggleplop', 'hootnanny', 'blurp', \n", " 'miff', 'snarkle', 'snookie', 'gleamitude', 'hello world', 'zag', 'accidental gibberish', 'nothing in mind', \n", " 'bash', 'spiv', 'rift', 'don’t know what to search', 'splong', 'no point', 'forgot attempt', 'fluttermate', \n", " 'flub', 'guff', 'dazzled', 'doodad', 'forgot term', 'blotchy', 'odd', 'kerplazzle', 'grubby', 'try to see', 'glop', \n", " 'whooshify', 'snicker', 'snuffly', 'random thought', 'mixed up stuff', 'zapper', 'sort of searching', 'slushy', \n", " 'blurification', 'mop', 'smit', 'splurge', 'meaningless input', 'quix', 'zapplarific', 'splang', 'zoinkalicious', \n", " 'unclear selection', 'splushy', 'guesstimate', 'snazzie', 'what about this', 'input fail', 'codswallop', 'dink', 'splunk', \n", " 'unclear', 'strange example', 'jitter', 'sploff', 'blip', 'unknown meaning', 'nope', 'gadzooks', 'odd example', 
\n", " 'zappomatic', 'janglystorm', 'ink', 'wobbled', 'wigglyy', 'typed by mistake', 'twirly', 'lurk', 'kerplottify', \n", " 'twizzlefang', 'muck', 'clunky', 'splatterific', 'clippy', 'oops input', 'what am I doing', 'qazwsxedc', 'does it matter', \n", " 'nonsensical', 'swooshinator', 'poiuuy', 'splish', 'mistyped query', 'squizzlewhack', 'what now', 'spluzz', 'glim', \n", " 'placeholder keypress', 'mistyped randomness', 'what is it', 'don’t know why', 'quibbleplop', 'guess what', 'snizzlezap', \n", " 'meaning of nothing', 'wiggles', 'zxcvbn', 'spur', 'uncertain term', 'what am I typing', 'zoodleblorp', 'floppy', 'asdfasdf', \n", " 'confused input', 'unclear sentence', 'snortlematic', 'smooshinator', 'random term', 'searching something', \n", " 'snorflemate', 'twinkly', 'skip', 'quib', 'forgotten term', 'oops', 'splodge', 'meaningless words', 'unclear input', \n", " 'unclear phrase', 'zoom', 'sneeze', 'cat on keyboard', 'nincompoop', 'zappification', 'warpington', 'splurty', \n", " 'do I know', 'splott', 'splurb', 'plink', 'dazzlematic', 'could be anything', 'lost thoughts', 'what', 'pizz', \n", " 'jiggles', 'splodgy', 'twang', 'i forgot', 'meaningless term', 'unclear search', 'thunderplunk', 'just pressing keys', \n", " 'splodgify', 'flit', 'snazzify', 'zoop', 'totally confused', 'quip', 'womp', 'wham', 'wigglyz', 'fuzzyy', 'why is this here', \n", " 'malarkey', 'widget', 'don’t care', 'scoff', 'randomized search', 'unclear example', 'pop', 'quash', 'uh oh', \n", " 'placeholder randomness', 'splatification', 'snickerplunk', 'nutterbutter', 'whisk', 'nibs', 'help', 'strange attempt', \n", " 'blurptacular', 'gizmo', 'forgotten query', 'spazzy', 'ding', 'lost search', 'buzzing', 'hum', 'nonsensicality', \n", " 'gloop', 'globby', 'lost meaning', 'plopperific', 'hard to say', 'snappy', 'don’t type this', 'blunderous', 'twizzlegrip', \n", " 'flappy', 'random keypress', 'zizzlewhack', 'forgot what I typed', 'zingerdoodle', 'randomized attempt', 'unsure words', \n", " 
'strange sentence', 'asfjkl', 'frizz', 'idk', 'gobbledygook', 'flibbertigibbet', 'gadzookify', 'flabberzap', 'vroom', \n", " 'splitch', 'glimmerstorm', 'blurt', 'frizzle', 'meaningless search', 'thingamajig', 'murmur', 'not this', 'sploof', \n", " 'fiddlewhip', 'mumbojumbo', 'something strange', 'splurg', 'fake input', 'whiffle', 'forgot query', 'search mix', \n", " 'yapplify', 'zippy', 'splurpy', 'splat', 'zoinks', 'bizz', 'crumby', 'meaningless query', 'snickerdoodle', 'weird word', \n", " 'squidge', 'don’t know term', 'spangletude', 'spazzmatic', 'just testing', 'baffled', 'splurt', 'gaze', 'frizzy', \n", " 'bamboozling', 'slurp', 'zappertude', 'splorch', 'swooshtastic', 'dunk', 'honk', 'smudgy', 'flimmerstorm', 'tizz', \n", " 'uncertain randomness', 'jangletude', 'perhaps this', 'placeholder search term', 'whoosh', 'spike', 'glitterbop', \n", " 'idiosyncratic', 'odd typing', 'blob', 'bazzlemate', 'crumpleton', 'clutterbomb', 'whatever', 'kerfuffle', 'test input', \n", " 'randomized keypress', 'meaningless randomness', 'why not', 'snizzleblap', 'bonk', 'forgot search', 'zonk', 'whatsisname', \n", " 'doesn’t matter', 'splurgz', 'twig', 'ramblethorp', 'fake query', 'ping', 'smack', 'buzz', 'tingly', 'warpydoodle', \n", " 'filler words', 'buzzed', 'unclear thought', 'weird input', 'blap', 'snazzy', 'look for this', 'snorkelwhip', 'spoon', \n", " 'just guessing', 'glitche', 'swirl', 'snooker', 'search fail', 'random gibberish', 'abstract thought', 'spindelicious', \n", " 'snorple', 'fell asleep typing', 'splunge', 'twit', 'grippy', 'flip', 'whatsisface', 'maybe something', 'bamboozle', \n", " 'zinger', 'drizzleblip', 'splonky', 'what do I search', 'blat', 'another try', 'odd randomness', 'yarn', 'squib', \n", " 'confused term', 'flabbergasted', 'testing input', 'don’t know', 'thunderbop', 'blurpsational', 'janglydash', 'brouhaha', \n", " 'find out about', 'strange randomness', 'kerplizzle', 'meaningless attempt', 'spud', 'placeholder term', 'woof', 'splaff', \n", " 
'jigglez', 'fuzzed', 'blahblah', 'grizzle', 'something here', 'blink', 'snuggly', 'yelp', 'chop', 'eternal question', 'splift', \n", " 'what do you mean', 'hullabazoo', 'cloggy', 'wrong key pressed', 'test again', 'don’t ask me', 'blur', 'twisty', 'flapperdash', \n", " 'crinklewhip', 'plinky', 'gobbleplop', 'I don’t understand', 'random', 'dummy text', 'blurblenator', 'try something', 'input here', \n", " 'thing', 'fringe', 'no answer', 'placeholder selection', 'test', 'spangleplop', 'splash', 'lost in thought', 'zest', \n", " 'fiddleplop', 'bunk', 'snag', 'vex', 'placeholder randomness example', 'spat', 'placeholder phrase', 'random search term', \n", " 'squigg', 'tinge', 'random words', 'unknown query', 'not useful', 'snuzzlefrump', 'type here', 'snuzzle', 'drip', 'gibberish', \n", " 'hodgepodge', 'forgot the term', 'completely random', 'doesn’t make sense', 'lost', 'splatterstorm', 'meaningless text', \n", " 'twizzle', 'find something', 'twinkletude', 'zine', 'spunked', 'crikey', 'mistaken input', 'no idea what this is', 'spork', \n", " 'glimmertastic', 'sloppy', 'twirky', 'abstract query', 'fluffytude', 'randomized selection', 'randomized randomness', \n", " 'nudge', 'gawk', 'buzzer', 'nonsensical search', 'i was curious', 'zapplify', 'cloppy', 'doohickey', 'snickly', 'doodle', \n", " 'placeholder example', 'placeholder text', 'nonsense search', 'why search this', \"this doesn't work\", 'splendiferific', \n", " 'crappy', 'what are words', 'clop', 'randomized term', 'weird', 'snazztastic', 'whizzbang', 'blaze', 'twangaloo', \n", " 'strange keypress', 'placeholder query', 'skew', 'splink', 'lkjhgfd', 'unclear meaning', 'flummoxify', 'lollygag', \n", " 'odd gibberish', 'clunk', 'snap', 'zapf', 'flummoxed', 'yawn', 'random input', 'strange word', 'zapplomatic', \n", " 'does this work', 'gasp', 'typing nothing', 'idk anymore', 'empty thoughts', 'pluck', 'randomized test', \n", " 'brain fog', 'squibbletude', 'fizzle', 'jinglyy', 'mistyped term', 'confused mind', 
'random typing', 'asdfgh', \n", " 'infinity', 'twist', 'something typed', 'kerplunktastic', 'just trying this', 'mistaken search', 'sparklematic', \n", " 'woop', 'jittery', 'oopsie', 'snippy', 'splinky', 'splint', 'swooshification', 'spit', 'zinged', 'blop', 'lost words',\n", " 'crux', 'blurbleplop', 'balderdash', 'perhaps not', 'flibber', 'snickerwhack', 'try later', 'zork', 'void', \n", " 'accidental query', 'fumble', 'snarked', 'don’t care search', 'just looking', 'spindling', 'snip', 'squish', \n", " 'blazer', 'splo', 'splunky', 'unclear randomness', 'spliff', 'not this either', 'nonsensical words', \n", " 'testing random', 'snigglewhap', 'odd input', 'whizzlegrip', 'dazzlegrip', 'fling', 'meaning of gibberish', \n", " 'weird thoughts', 'gunk', 'does this help', 'flux', 'wink', 'wonky', 'wisp', 'drizzlematic', 'another test', \n", " 'test search', 'just wondering', 'crumblewhack', 'spaz', 'splung', 'skid', 'quirky', 'odd search', 'accidental term', \n", " 'dunno', 'quizzicality', 'gleam', 'glimmer', 'don’t press enter', 'gadget', 'whizzleplop', 'don’t know exactly', \n", " 'odd words', 'blotty', 'thunderblop', 'maybe not', 'spludge', 'discombobulated', 'stuff', 'halfway done', \n", " 'sparklenator', 'zang', 'jolt', 'accidental search', 'what is going on', 'wiggler', 'mnbvcxz', 'yip', 'wriggle', \n", " 'hullaballoo', 'janglenut', 'zapplesmash', 'janglitude', 'what is this', 'whip', 'tiddlywinks', 'wiggly', 'weird randomness', \n", " 'sporkalicious', 'wriggy', 'meaningless selection', 'crumble', 'weird thought', 'splurch', 'don’t understand', \n", " 'sploosh', 'yap', 'nonsense', 'wobble', 'question of life', 'randomly typed', 'snuggle', 'snizzlegrip', 'oops I typed', \n", " 'zappy', 'twinkleplop', 'uncertain example', 'idc', 'mash', 'not sure', 'pandemonium', 'perhaps later', 'quirked', \n", " 'smug', 'warp', 'dash', 'could be nothing', 'unsure search', 'jumbled phrases', 'hush', 'wibble', 'weird search', \n", " 'quibberish', 'flop', 'discombobulate', 'this makes 
no sense', 'fizz', 'quirkitude', 'zingzang', 'dank', 'limitless', \n", " 'this is random', 'crunch', 'vibe', 'nothing specific', 'forgot', 'not important', 'slosh', 'question mark', 'zoopendous', \n", " 'flummify', 'splosh', 'splorp', 'splishy', 'snurkle', 'blah', 'guess answer', 'twitch', 'flap', 'snooperdoodle', \n", " 'janglybits', 'snizzleflap', 'slush', 'snortlemate', 'quirk', 'void query', 'fizzled', 'lollygagging', 'wonkifying', \n", " 'nothing', 'splunch', 'hullabaloo', 'thingamabob', 'dazzlebash', 'whizzie', 'this and that', 'shard', 'twix',\n", " \"crumpled\", \"splizzle\", \"gargle\", \"mangled\", \"shamble\", \"wobblish\", \"drizzlepop\",\n", " \"splinker\", \"fiddlest\", \"twizzlepop\", \"blurzzle\", \"snizzlewick\", \"wozzle\", \n", " \"cracklepop\", \"glibbish\", \"twezzle\", \"boondock\", \"sizzleflip\", \"snigglemash\",\n", " \"zazzle\", \"fizzlepot\", \"scramble\", \"tinglish\", \"sprozzle\", \"blimble\", \"zibble\",\n", " \"slapdash\", \"gobstork\", \"ziggler\", \"flingle\", \"wrangly\", \"twizzlebit\", \"brambly\",\n", " \"snubble\", \"splintery\", \"fizznack\", \"tibber\", \"quaggly\", \"whooshpop\", \"snibble\",\n", " \"plunkish\", \"glimflash\", \"wobbert\", \"squidgy\", \"kerplonk\", \"fobble\", \"blurzy\",\n", " \"scriggly\", \"smudgify\", \"tassler\", \"whipple\", \"snuzzify\", \"zaggle\", \"plonker\",\n", " \"smizzle\", \"quiggle\", \"spongle\", \"shizzle\", \"drippity\", \"bogglepop\", \"twiddly\",\n", " \"puzzleth\", \"flummish\", \"sniggleflop\", \"crumplish\", \"twiggle\", \"nubbish\", \n", " \"splurkle\", \"whibber\", \"jibblish\", \"twonker\", \"fizzlewhip\", \"spazzle\", \"splorpish\",\n", " \"snuffler\", \"hubble\", \"twinkler\", \"crumpler\", \"wimbley\", \"twazzle\", \"blurbonic\",\n", " \"zapplepop\", \"flippery\", \"snuzle\", \"quizzwhip\", \"clatter\", \"garglunk\", \"splingle\",\n", " \"drabbler\", \"spunkly\", \"jumbler\", \"snappish\", \"zingify\", \"buzzpop\", \"snizzlehop\",\n", " \"plobber\", \"scribble\", \"twongle\", 
\"scrabbly\", \"sniggler\", \"bimblepop\", \"snorplebop\",\n", " \"wizzle\", \"blimpy\", \"splinglepop\", \"frizzlepop\", \"grizzleton\", \"whizbang\", \n", " \"tinklish\", \"blopple\", \"blurbit\", \"wozzly\", \"zingpong\", \"splimble\", \"twinklypop\",\n", " \"spinkly\", \"snubbleton\", \"glozzle\", \"splonkle\", \"quizzle\", \"drizzlebot\", \"snarbly\",\n", " \"twizzleth\", \"whizzleton\", \"crumblish\", \"snapple\", \"splozzle\", \"glimmish\", \n", " \"plimbish\", \"snuzzleblop\", \"twinklish\", \"fizzywhip\", \"snorblish\", \"drizzler\", \n", " \"flopplish\", \"smizzlepop\", \"crumpledash\", \"twizzlefizz\", \"plumbly\", \"smuzzle\",\n", " \"tizzler\", \"gobblish\", \"splunkton\", \"jibberdash\", \"sproingly\", \"snizzler\", \n", " \"glabble\", \"twinkleflip\", \"flobble\", \"twonklepop\", \"splittish\", \"grumblepop\",\n", " \"whimblish\", \"splingledash\", \"snarpish\", \"twinklybit\", \"spindlish\", \"grubble\",\n", " \"smarple\", \"twonkerish\", \"sniffly\", \"snibbleton\", \"grizzlepop\", \"tazzler\", \n", " \"splinsh\", \"snazzler\", \"twinklepuff\", \"zopple\", \"glunkish\", \"crizzlepop\", \n", " \"snarklebot\", \"whibblish\", \"flimmerdash\", \"splurpyton\", \"snuzzlepop\", \"wigglerish\",\n", " \"sniggleplop\", \"jigglish\", \"splurble\", \"buzzsnip\", \"plomble\", \"splattypop\", \n", " \"twinklepip\", \"twonglish\", \"flobber\", \"grimpish\", \"quaggler\", \"sporkish\", \n", " \"drizzleth\", \"squiggler\", \"splobber\", \"ploppish\", \"snigglerish\", \"splingleth\",\n", " \"grizzleblop\", \"sploblish\", \"snarbler\", \"smarvish\", \"quizzlet\", \"snapplish\",\n", " \"snuzleflip\", \"plongish\", \"crizzlebot\", \"grimpish\", \"twinklebot\", \"blurpish\",\n", " \"splopple\", \"gizzleth\", \"drizzlepuff\", \"twonklish\", \"snubbler\", \"blurblebot\",\n", " \"splizzy\", \"twinkleton\", \"jibbler\", \"splizzlepop\", \"splurbit\", \"plobblish\", \n", " \"crumplish\", \"snizzlebit\", \"twinklishbot\", \"spinkler\", \"snibbleflip\", \"wigglebot\",\n", " 
# ---------------------------------------------------------------------------
# Template placeholders
# ---------------------------------------------------------------------------
# Each constant is the literal "{field}" text looked for inside a template.
# The text between the braces is also the keyword passed to str.format(), so
# the two must always agree.
PERSON_ENTITY = "{person}"
ORG_ENTITY = "{organization}"
CITY_ENTITY = "{city}"
STATE_ENTITY = "{state}"
CITY_STATE_ENTITY = "{city_state}"
PRODUCT_ENTITY = "{product}"
COUNTRY_ENTITY = "{country}"
# Was "{services}". generate_queries only ever supplied `service=` to
# str.format(), so a "{services}" field could never be filled (KeyError).
SERVICE_ENTITY = "{service}"
CAR_ENTITY = "{car}"
GADGET_ENTITY = "{gadget}"
STOCK_ENTITY = "{stock}"
MONEY_ENTITY = "{money}"
FINANCE_ENTITY = "{finance}"
TRAVEL_ENTITY = "{travel}"
FOOD_ENTITY = "{food}"
RESTAURANT_ENTITY = "{restaurant}"
SPORTS_TERMS_MISSING_ENTITY = "{sports_term}"
LOCATIONS_AND_LANDMARKS_ENTITY = "{location_and_landmark}"
# Historical (misspelled) name kept so existing references keep working.
ACTIVTIES_AND_EVENTS_ENTITY = "{activity_and_event}"
ACTIVITIES_AND_EVENTS_ENTITY = ACTIVTIES_AND_EVENTS_ENTITY
FOOD_MISSING_ENTITY = "{food_m}"
TRANSPORT_AND_DIRECTIONS_ENTITY = "{transport_and_direction}"

FAKE_CITY_ENTITY = "{fake_cty}"
FAKE_STATE_CODE_ENTITY = "{fake_state_cd}"
FAKE_STATE_NAME_ENTITY = "{fake_state_nam}"
CELEBRITY_ENTITY = "{celebrity}"

# (begin-tag id, inside-tag id) written into `ner_labels` for each entity kind.
# 0 is the "outside any entity" tag.
_PERSON_TAGS = (1, 2)        # B-PER / I-PER
_ORG_TAGS = (3, 4)           # B-ORG / I-ORG
_CITY_TAGS = (5, 6)          # CB-LOC / CI-LOC
_STATE_TAGS = (7, 8)         # SB-LOC / SI-LOC
_CITY_STATE_TAGS = (9, 10)   # CSB-LOC / CSI-LOC

# (placeholder, zero-argument sampler) pairs, in the order the values are
# drawn.  The lambdas defer every lookup of the notebook-level word lists and
# sampling helpers until a template actually contains the placeholder, so a
# template without entities needs none of them to exist.
_ENTITY_SAMPLERS = (
    (PERSON_ENTITY, lambda: get_random_choice_from_list(persons)),
    (ORG_ENTITY, lambda: get_random_choice_from_list(organizations)),
    (PRODUCT_ENTITY, lambda: get_random_choice_from_list(products)),
    (COUNTRY_ENTITY, lambda: get_random_choice_from_list(countries)),
    # Previously guarded by COUNTRY_ENTITY (copy-paste bug): a service was
    # sampled for country templates and never for service templates.
    (SERVICE_ENTITY, lambda: get_random_choice_from_list(services)),
    (CAR_ENTITY, lambda: get_random_choice_from_list(cars)),
    (GADGET_ENTITY, lambda: get_random_choice_from_list(gadgets)),
    (STOCK_ENTITY, lambda: get_random_choice_from_list(stocks)),
    (MONEY_ENTITY, lambda: get_random_choice_from_list(moneys)),
    (FINANCE_ENTITY, lambda: get_random_choice_from_list(finances)),
    (TRAVEL_ENTITY, lambda: get_random_choice_from_list(travels)),
    (FOOD_ENTITY, lambda: get_random_choice_from_list(foods)),
    (RESTAURANT_ENTITY, lambda: get_random_choice_from_list(restaurants)),
    (SPORTS_TERMS_MISSING_ENTITY, lambda: get_random_choice_from_list(sports_terms_missing)),
    (LOCATIONS_AND_LANDMARKS_ENTITY, lambda: get_random_choice_from_list(locations_and_landmarks)),
    (ACTIVTIES_AND_EVENTS_ENTITY, lambda: get_random_choice_from_list(activities_and_events)),
    (FOOD_MISSING_ENTITY, lambda: get_random_choice_from_list(food_missing)),
    (TRANSPORT_AND_DIRECTIONS_ENTITY, lambda: get_random_choice_from_list(transport_and_directions)),
    (FAKE_CITY_ENTITY, lambda: get_sample_fake_city()),
    (FAKE_STATE_CODE_ENTITY, lambda: get_sample_fake_state_code()),
    (FAKE_STATE_NAME_ENTITY, lambda: get_sample_fake_state_name()),
    (CITY_ENTITY, lambda: get_sample_from_cities(city_info, city_weights, actual_threshold=0.7)),
    (STATE_ENTITY, lambda: get_sample_from_states(state_info, actual_threshold=0.5)),
    (CITY_STATE_ENTITY, lambda: get_sample_from_cities_and_states(
        city_state_code_info, city_state_name_info, state_code_threshold=0.8)),
    (CELEBRITY_ENTITY, lambda: get_random_choice_from_list(celebrities)),
)

_FAKE_PLACEHOLDERS = (FAKE_CITY_ENTITY, FAKE_STATE_CODE_ENTITY, FAKE_STATE_NAME_ENTITY)


def detect_entity(entity_name, template):
    """Return True when the placeholder text `entity_name` occurs in `template`.

    This is a plain substring test, so "{city}" does not match "{city_state}".
    """
    return entity_name in template


def tokenize(text):
    """Split `text` into word tokens and single punctuation tokens.

    Runs of word characters become one token each; every other
    non-whitespace character becomes its own token.  Whitespace is dropped.
    """
    return re.findall(r'\w+|[^\w\s]', text)


def find_token_index(tokens, entity_tokens):
    """Return the index of the first occurrence of `entity_tokens` in `tokens`.

    Returns None when the sub-sequence is absent.  An empty `entity_tokens`
    also yields None: an empty pattern "matches" at index 0, which previously
    caused token 0 to be tagged as the start of an entity.
    """
    span = len(entity_tokens)
    if span == 0:
        return None
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == entity_tokens:
            return start
    return None


def _label_entity(tokens, ner_labels, entity, tags):
    """Tag the first occurrence of `entity` in `tokens` with a (begin, inside) pair, in place."""
    if not entity:
        return
    entity_tokens = tokenize(entity)
    start_idx = find_token_index(tokens, entity_tokens)
    if start_idx is None:
        return
    begin_tag, inside_tag = tags
    ner_labels[start_idx] = begin_tag
    for offset in range(1, len(entity_tokens)):
        ner_labels[start_idx + offset] = inside_tag


def tokenize_and_label(query, city, state, city_state, organization, person, celebrity):
    """Tokenize `query` and build one integer NER tag per token.

    Each non-empty entity argument is tokenized and its first occurrence in
    the query is tagged with that kind's begin/inside ids; every other token
    keeps tag 0.  Entities are applied in the order city_state, city, state,
    organization, person, celebrity, so a later entity overwrites an earlier
    one where their spans overlap.  Celebrities share the person tags.

    Returns:
        (tokens, ner_labels): two lists of equal length.
    """
    tokens = tokenize(query)
    ner_labels = [0] * len(tokens)

    labelling_order = (
        (city_state, _CITY_STATE_TAGS),
        (city, _CITY_TAGS),
        (state, _STATE_TAGS),
        (organization, _ORG_TAGS),
        (person, _PERSON_TAGS),
        (celebrity, _PERSON_TAGS),
    )
    for entity, tags in labelling_order:
        _label_entity(tokens, ner_labels, entity, tags)

    return tokens, ner_labels


def generate_queries(templates, n_queries=10000, max_attempts=None, progress_every=10000):
    """Generate up to `n_queries` distinct synthetic queries with NER tags.

    A template is drawn at random, every placeholder it contains is filled
    with a freshly sampled value, and the resulting query is tokenized and
    labelled.  A query that was already produced is discarded and does not
    count towards `n_queries`.

    Args:
        templates: sequence of `str.format` templates (or plain strings).
        n_queries: number of distinct queries to collect.
        max_attempts: optional cap on the number of templates drawn.  The
            default (None) keeps the original behaviour of looping until
            `n_queries` distinct queries exist, which never terminates when
            the templates cannot produce that many.
        progress_every: print a progress line each time this many queries
            have been collected; 0 or None disables progress output.

    Returns:
        List of `(query, tokens, ner_labels)` tuples in generation order.
    """
    cnt = 0
    fake_cnt = 0
    celeb_cnt = 0
    attempts = 0
    queries_with_labels = []
    seen_queries = set()

    while cnt < n_queries:
        if max_attempts is not None and attempts >= max_attempts:
            print(f"stopping after {attempts} attempts with only {cnt} distinct queries")
            break
        attempts += 1

        template = random.choice(templates)

        # The format keyword is derived from the placeholder itself, so the
        # two can no longer drift apart.  Unused fields are passed as None,
        # exactly as before.
        values = {
            placeholder[1:-1]: (sampler() if detect_entity(placeholder, template) else None)
            for placeholder, sampler in _ENTITY_SAMPLERS
        }
        query = template.format(**values)

        if query in seen_queries:
            continue
        seen_queries.add(query)

        tokens, ner_labels = tokenize_and_label(
            query,
            values["city"],
            values["state"],
            values["city_state"],
            values["organization"],
            values["person"],
            values["celebrity"],
        )
        queries_with_labels.append((query, tokens, ner_labels))
        cnt += 1

        if any(detect_entity(placeholder, template) for placeholder in _FAKE_PLACEHOLDERS):
            fake_cnt += 1
        if detect_entity(CELEBRITY_ENTITY, template):
            celeb_cnt += 1

        # Report only when the count advances; the old check at the top of the
        # loop re-printed on every duplicate while cnt sat on a multiple.
        if progress_every and cnt % progress_every == 0:
            print(f"completed generating {cnt} queries")

    print(f"fake_cnt = {fake_cnt}")
    print(f"celeb_cnt = {celeb_cnt}")
    print(f"cnt = {cnt}")
    return queries_with_labels
# useful for post processing to standardize the city names
def build_lookup(dataframe):
    """Map every lower-cased alternate city name to its canonical city name.

    Args:
        dataframe: frame with a `city_name` column and an `alternate_names`
            column holding an iterable of strings per row.

    Returns:
        dict of `alternate_name.lower() -> city_name`.  When several rows
        share an alternate name, the last row in frame order wins.

    Rows whose `alternate_names` is missing (None / NaN) are skipped; they
    previously raised TypeError.
    """
    lookup = {}

    # Iterate the two columns directly: iterrows() builds a Series per row
    # and is far slower on a frame with many cities.
    for city_name, alternate_names in zip(dataframe['city_name'], dataframe['alternate_names']):
        if alternate_names is None or isinstance(alternate_names, float):
            continue  # missing value (None, or NaN which pandas stores as float)
        for alt_name in alternate_names:
            # Lower-case so look-ups are case-insensitive.
            lookup[alt_name.lower()] = city_name

    return lookup
def compute_model_inputs_and_outputs(session, tokenizer, query, max_length=64):
    """Tokenize `query` and run it through an ONNX Runtime session.

    Args:
        session: object with `run(output_names, input_feed)`; if it also has
            `get_inputs()` (as `onnxruntime.InferenceSession` does), the names
            it reports decide which tokenizer outputs are fed to the model.
        tokenizer: callable returning a mapping of NumPy arrays when called
            with `return_tensors="np"`.
        query: text (or batch of texts) to run.
        max_length: sequence length every input is padded / truncated to.
            Defaults to 64, the value that used to be hard-coded.

    Returns:
        (inputs, outputs): the tokenizer output and the list returned by
        `session.run`.
    """
    inputs = tokenizer(
        query,
        return_tensors="np",
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    # Ask the model which inputs it takes instead of hard-coding them: some
    # exports want 'token_type_ids' as well, others only the first two.
    get_inputs = getattr(session, "get_inputs", None)
    if get_inputs is not None:
        expected_names = [node.name for node in get_inputs()]
    else:
        expected_names = ['input_ids', 'attention_mask']

    # The arrays are cast to int64 before being fed, as the original code did.
    # A name the tokenizer did not produce is left out, so ONNX Runtime
    # reports the missing input itself.
    input_feed = {
        name: inputs[name].astype(np.int64)
        for name in expected_names
        if name in inputs
    }

    # None asks the session for all of its outputs.
    outputs = session.run(None, input_feed)
    return inputs, outputs
"e259586a-f67b-42b2-9665-a571da352f57", "metadata": {}, "outputs": [], "source": [ "# dataset['train']" ] }, { "cell_type": "code", "execution_count": null, "id": "12e91919-6dc4-4ad3-a388-e5b90d4efa79", "metadata": {}, "outputs": [], "source": [ "synthetic_loc_dataset = Dataset.from_pandas(df_ner_examples.drop('query', axis=1))\n", "print(synthetic_loc_dataset)\n", "\n", "print(synthetic_loc_dataset[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "0d91ba34-cb67-418a-8a4e-4b442b144be6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "496a76a7-3329-4849-affa-63166d427183", "metadata": {}, "outputs": [], "source": [ "# loc_dataset = dataset['train'].filter(lambda example: 5 in example['ner_tags'])\n", "loc_dataset = dataset['train']\n", "loc_dataset_filtered = loc_dataset.remove_columns(['pos_tags', 'chunk_tags'])\n", "\n", "# Set the format to ensure the order is 'id', 'tokens', and 'ner_tags'\n", "loc_dataset_filtered[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "42652aaf-399f-413f-a8f6-e082f1057e3f", "metadata": {}, "outputs": [], "source": [ "# loc_dataset_filtered[-1]" ] }, { "cell_type": "code", "execution_count": null, "id": "c47584e0-0612-400b-81e9-212a61209b94", "metadata": {}, "outputs": [], "source": [ "from datasets import concatenate_datasets\n", "\n", "from datasets import Sequence, ClassLabel, Value\n", "\n", "# Step 1: Get the full feature schema from synthetic_loc_dataset\n", "features = synthetic_loc_dataset.features\n", "\n", "# Step 2: Update the 'ner_tags' feature to use ClassLabel from loc_dataset_filtered\n", "# features['ner_tags'] = Sequence(feature=ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names))\n", "features['ner_tags'] = Sequence(feature=ClassLabel(names=list(label_map.values())))\n", "\n", "# Step 3: Cast synthetic_loc_dataset to the updated feature schema\n", "synthetic_loc_dataset = synthetic_loc_dataset.cast(features)\n", 
"\n", "# Check the updated features to confirm\n", "print(synthetic_loc_dataset.features)\n", "\n", "# Now concatenate the datasets\n", "# combined_dataset = concatenate_datasets([loc_dataset_filtered, synthetic_loc_dataset])\n", "\n", "# Verify the combined dataset\n", "print(synthetic_loc_dataset[0])\n" ] }, { "cell_type": "code", "execution_count": null, "id": "15f8ec72-8a43-43f2-932a-ef76b5efb4d2", "metadata": {}, "outputs": [], "source": [ "# ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names)" ] }, { "cell_type": "code", "execution_count": null, "id": "6e3b90ed-9bbf-4b8a-9990-b5db059de0ea", "metadata": {}, "outputs": [], "source": [ "# ClassLabel(names=list(label_map.values()))" ] }, { "cell_type": "code", "execution_count": null, "id": "6138a427-f03b-4355-bdac-ffec783f5a2b", "metadata": {}, "outputs": [], "source": [ "len(synthetic_loc_dataset)" ] }, { "cell_type": "code", "execution_count": null, "id": "caac8e36-6d1c-4a42-8acd-7e81f816fa9b", "metadata": {}, "outputs": [], "source": [ "synthetic_loc_dataset[3]" ] }, { "cell_type": "code", "execution_count": null, "id": "2aa98e69-bf5f-4bcc-b387-2abdc60a99be", "metadata": {}, "outputs": [], "source": [ "synthetic_loc_dataset = synthetic_loc_dataset.map(\n", " lambda example, idx: {'id': idx}, # Assign running count as the new 'id'\n", " with_indices=True # Ensures we get an index for each example\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "5906e294-6a1b-436d-a229-628f99190887", "metadata": {}, "outputs": [], "source": [ "synthetic_loc_dataset.to_pandas()" ] }, { "cell_type": "code", "execution_count": null, "id": "46c0d423-3b8c-47ed-a8ae-a3316cd78bd0", "metadata": {}, "outputs": [], "source": [ "synthetic_loc_dataset[-1]" ] }, { "cell_type": "code", "execution_count": null, "id": "c35b1a0b-303c-4eee-bc31-770872c212e5", "metadata": {}, "outputs": [], "source": [ "# synthetic_loc_dataset.to_parquet(\"../data/synthetic_loc_dataset_v3.parquet\")\n", "# 
# %%
# Write the prepared synthetic dataset to disk. Earlier exports used the
# ..._v3.parquet and ..._v4.parquet file names; those calls are disabled.
synthetic_loc_dataset.to_parquet("../data/synthetic_loc_dataset_v6.parquet")  # some partial cities examples

# %%
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Load tokenizer and token-classification model from the same checkpoint.
checkpoint = "Mozilla/distilbert-uncased-NER-LoRA"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Smoke test: run the NER pipeline on a single short query and print the result.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "New York"

ner_results = nlp(example)
print(ner_results)