Explore whether the weather keywords and locations are captured correctly

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datasets import load_dataset, Dataset

In [None]:
import numpy as np
import random
from collections import Counter

#### Read the city and state data from `data/geonames-cities-states.json`

In [None]:
import json 

def get_geonames_city_state_data(geonames_file="../data/geonames-cities-states.json"):
    """Load the GeoNames cities/states JSON and return one row per city.

    Args:
        geonames_file: Path to the JSON file. It must contain a ``cities``
            list of records (with ``id``, ``admin1_code``, ``name``,
            ``population`` and ``alternate_names``) and a ``states_by_abbr``
            mapping whose values carry ``admin1_code`` and ``name``.
            Defaults to the notebook-relative data file.

    Returns:
        pandas.DataFrame with the columns ``id``, ``state_code``,
        ``city_name``, ``city_popln``, ``alternate_names``, ``state_name``
        and ``city_weight``, where ``city_weight`` is the city's share of the
        summed population of all listed cities.
    """
    # Be explicit about UTF-8 so that any non-ASCII names are decoded the
    # same way regardless of the platform's default encoding.
    with open(geonames_file, 'r', encoding='utf-8') as f:
        geonames_dict = json.load(f)

    cities_data = (
        pd.DataFrame(geonames_dict['cities'])
        .rename(columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'})
    )
    cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]

    # Materialise the dict view so DataFrame always receives a plain list of records.
    states_data = (
        pd.DataFrame(list(geonames_dict['states_by_abbr'].values()))
        .rename(columns={'admin1_code': 'state_code', 'name': 'state_name'})
    )
    states_data = states_data[['state_code', 'state_name']]

    # Left join keeps every city, even when its state code has no matching state record.
    city_states_data = cities_data.merge(states_data, how='left', on='state_code')
    city_states_data['city_weight'] = city_states_data['city_popln'] / city_states_data['city_popln'].sum()
    return city_states_data



In [None]:
# Load the merged city/state table once; later cells derive lookups from it.
city_states_data = get_geonames_city_state_data()
# Row count, i.e. the number of city records.
print(city_states_data.shape[0])
city_states_data

<!-- #### Add partial city names to cover users who type an incomplete
city name, e.g. `coffee near me sunnyval` -->

In [None]:
# city_states_data['city_name'].apply(len).describe(percentiles=[.1, .2, .25, .3, .4, .5, .6 ,.7, .75, .8, .9, .95, .98, .99])

In [None]:
# Show the cities ordered from the largest population share to the smallest.
city_states_data.sort_values(by='city_weight', ascending=False)

In [None]:
# Lookup: city name -> population share. If a city name occurs in several
# rows, the last row wins (plain dict semantics).
city_weights = city_states_data.set_index('city_name')['city_weight'].to_dict()
# city_weights

In [None]:
# Lookups keyed by name/code; repeated keys keep the value of the last row.
city_info = city_states_data.set_index('city_name')['alternate_names'].to_dict()
state_info = city_states_data.set_index('state_code')['state_name'].to_dict()

# Independent copies pairing each city with its state code / state name and weight.
city_state_code_info = city_states_data.loc[:, ['city_name', 'state_code', 'city_weight']].copy()
city_state_name_info = city_states_data.loc[:, ['city_name', 'state_name', 'city_weight']].copy()

In [None]:
# city_info

In [None]:
# Display the (city_name, state_code, city_weight) table for inspection.
city_state_code_info

In [None]:
# Display the (city_name, state_name, city_weight) table for inspection.
city_state_name_info

In [None]:
# list(city_info.keys())

In [None]:
# Invented city names (including generic "<Word> City" forms and a lowercase
# "big city" variant).
# NOTE(review): a few entries may coincide with real place names; the
# commented-out check against `city_info` further down can confirm.
fake_cities = [
    "Umber Glade", "Crimson Hollow", "Midland Creek", "Boulderfield", "Fairbrook", "Mossmere", "Hearthfield", "Norwyn",
    "Elysian Ridge", "Dover Hollow", "Mistral Cove", "Starfall", "Eaglebrooke", "Granite Ridge", "Umbrafield", "Goldenstone",
    "Palisade Brook", "Willowfield", "Noblehaven", "Frostgrove", "Oasis Ridge", "Larkspur Vale", "Elderstone", "Forest Vale",
    "Yonder Bluff", "Cloverstone", "Kingsvale", "Ashen Bluff", "Yarrow Bluff", "Zion Hollow", "Velvet Pine", "Fernspire",
    "Inkwell", "Eaglewood", "Driftshade", "Prairiefield", "Northshade", "Riverwatch", "Sapphire Hollow", "Jadehaven",
    "Lunaris", "Quailstone", "Birchvale", "Blossom Creek", "Union Hollow", "Whispering Brook", "Yarrowstone", "Candlevale",
    "Ravenshire", "Willowhaven", "Wyrmrest", "Frostshade", "Silverbrook", "Azure Hollow", "Tanglefield", "Umberstone",
    "Glimmerbrook", "Ravencrest", "Larkridge", "Windspire", "Oakheart", "Obsidian Point", "Newstone", "Moonlit Vale",
    "Tranquil Ridge", "Gilded Summit", "Lunarshade", "Seabrook", "Quartzwood", "Juniper Crest", "Norvale", "Hollowmere",
    "Kindlewood", "Dawnspire", "Obelisk Point", "Kindred Hollow", "Autumn Hollow", "Orchard Ridge", "Underbrook", "Kingshaven",
    "Ebonwood", "Violet Haven", "Peregrine Spire", "Summitwood", "Lakeshore Valley", "Umbra Shores", "Trillium Vale",
    "Halcyon Creek", "Xander Cove", "Glenstone", "Nimbus Grove", "Willowfern", "Vista Hollow", "Jasperwood", "Jasmine Vale",
    "Rustvale", "Quillbrook", "Ravenmere", "Zerith Hollow", "Golden Ridge", "Thistlewood", "Quiet Hollow",
    "Ridgevale", "Bluewater Ridge", "Unity Crest", "Cedar Hollow", "Bluffstone", "Larchfield", "Quarry Hollow",
    "Laurel Ridge", "Yellowfield", "Amberfield", "Quartz Creek", "Zephyr Vale", "Larkfield", "Verdant Hollow",
    "Cinder Hollow", "Havencliff", "Harborwood", "Onyx Ridge", "Kite's Hollow", "Brookfield", "Brightveil", "Redhawk",
    "Valleywood", "Havenwood", "Thornhill", "Silverwood", "Duskfield", "Tidesreach", "Cypress Vale", "Fernwood",
    "Moonwillow", "Verdant Shade", "Willowthorn", "Garnet Crossing", "Ivy Hollow", "Kestrel Cove", "Amberpeak", "Meadowcrest",
    "Yellowvine", "Violet Sands", "Ironwood", "Timber Shade", "Dovewood Creek", "Pinecairn", "Driftvale", "Crescent Vale",
    "Juniper Grove", "Ridgehaven", "Timbervale", "Hollowstone", "Dawnbreak", "Oceangrove", "Pinegrove", "Alderstone",
    "Primrose Point", "Jasper Vale", "Pinevale", "Quartzfield", "Crescent Bluff", "Jasperstone", "Umbra Vale",
    "Violet Ridge", "Knollfield", "Ironshade", "Zephyr Crossing", "Zenith Valley", "Ashmoor", "Xyron Bay", "Everstone",
    "Moonstone Creek", "Foxshade", "Ashfield", "Xyros Hill", "Sapphire Ridge", "Elmfield", "Ivoryfield", "Hollowvale", "Frostbluff",
    "Xenia Ridge", "Briarcliff", "Kestrel Bluff", "Nightingale Ridge", "Peridot Bay", "Islefield", "Ivory Spire", "Solace Grove",
    "Xanadu Grove", "Ecliptus", "Zephyr Hollow", "Oakenhill", "Glade Ridge", "Winterridge", "Jadestone", "Indigo Bay", "Duskhaven",
    "Shadowpine", "Crystal Vale", "Harbor Reach", "Eldermoor", "Thornhollow", "Silverpeak", "Mistwood", "Shadowfall",
    "Willowbright", "Dusklight", "Havenvale", "Starcrest", "Glacier Hollow", "Cinderbluff", "Ironpeak", "Frostwood",
    "Embergrove", "Aurora Ridge", "Driftmoor", "Mooncrest", "Stonehearth", "Riverwood", "Briarfrost", "Quillhaven",
    "Stormvale", "Eaglesong", "Wanderwood", "Summervale", "Brightwood", "Cloudspire", "Snowhaven", "Golden Hollow",
    "Northcove", "Miststone", "Clearbrook", "Suncrest", "Twilight Vale", "Aspen Hollow", "Boulderhaven", "Shimmerwood",
    "Darkspire", "Oakbluff", "Hollowbright", "Sablewood", "Lunarfrost", "Dovewood Point", "Crescent Glade", "Wraithstone",
    "Foxwood Hollow", "Amberwood", "Midnight Ridge", "Garnet Hollow",
    "big city", "Big City", "Silver City", "Golden City", "Mystic City",
    "Sunset City", "Iron City", "Emerald City", "Shadow City", "Crystal City",
    "Harmony City", "Aurora City", "Dream City", "Thorn City", "Lunar City",
    "Twilight City", "Velvet City", "Willow City", "Ivory City", "Eclipse City",
    "Storm City", "Bliss City", "Shimmer City", "Echo City", "Frost City",
    "Sapphire City", "Obsidian City", "Tranquil City", "Starlight City", "Drift City",
    "Amber City", "Hollow City", "Gilded City", "Quartz City", "Meadow City",
    "Rosewood City", "Timber City", "Bright City", "Fox City", "Dusk City",
    "Goldenleaf City", "Wind City", "Harbor City", "Cedar City", "Azure City",
    "Elder City", "Crescent City", "Pine City", "Summit City", "Cobalt City",
    "Bluff City", "Stone City",
]

# Invented state names, the state-level counterpart of the fake city list.
fake_state_names = [
    "Meadowvale", "Boulderwatch", "Harperfield", "Verdantia", "Redhaven", "Ashspire", "Ecliptica", "Cindermist",
    "Stormhaven", "Crystalbourne", "Sunspire", "Twilight Hollow", "Frostspire", "Silverwatch", "Keystone Ridge",
    "Gilded Vale", "Bluewater", "Jadewood", "Northgate", "Timberland", "Ravenmark", "Auroravale", "Zephyr Bay",
    "Stormspire", "Stonemeadow", "Quintarra", "Stonepeak", "Willowcrown", "Thistledown", "Verdantreach", "Lunaris",
    "Oakenshire", "Brightwatch", "Dawnhaven", "Northreach", "Verdant Hollow", "Horizon Ridge", "Xantria", "Ironvale",
    "Amberreach", "Silverveil", "Moonwatch", "Umbershade", "Windswept", "Shadowpine", "Shadowreach", "Zionshade",
    "Oasisland", "Goldmere", "Frosthaven", "Drakemont", "Emberland", "Rivermist", "Duskland", "Firgrove", "Driftstone",
    "Frostveil", "Amberwyn", "Velvet Ridge", "Mystic Vale", "Snowpoint", "Bluehaven", "Opal Grove", "Jasper Hollow",
    "Tideridge", "Crimson Bay", "Aurorawood", "Larkland", "Thornvale", "Shadewind", "Ridgefall", "Darkfall", "Silvercrown",
    "Goldenreach", "Ivory Plains", "Nobleshore", "Yellowcove", "Hollowbrook", "Ravendale", "Frostwood", "Brightshade", "Brightmere",
    "Wytherstone", "Eaglecrest", "Frostmere", "Moonbrooke", "Goldenvale", "Quillsprings", "Pinemark", "Prairiefield",
    "Cascade", "Kindlemark", "Aspenvale", "Ivoryreach", "Thorncrest", "Cloudwood", "Jade Ridge", "Westmarch",
    "Wintercrest", "Copperfield", "Prairiefrost", "Bladewind", "Everwind", "Quarrycrest", "Lunashire", "Hollowreach",
    "Whispering Pines", "Blueshore", "Glacier Point", "Gildan", "Zephyrlight", "Sablepeak", "Northspire", "Starhearth",
    "Whispercrown", "Valewind", "Umbravale", "Kindleland", "Westwatch",
]

# Invented two-letter state codes.
# Every code appears exactly once: the original list carried 'QT' twice,
# which made len(fake_state_codes) differ from len(set(fake_state_codes)).
# NOTE(review): some codes may coincide with real postal abbreviations
# (e.g. 'PW'); the commented-out check against valid_state_codes can confirm.
fake_state_codes = [
    'QT', 'WX', 'CZ', 'GW', 'FR', 'VW', 'BN', 'BM', 'LS', 'ZR', 'QN', 'KP', 'WS', 'ZZ', 'YW', 'XK', 'LR', 'NX',
    'SW', 'XT', 'QB', 'ZT', 'SR', 'CW', 'JT', 'RP', 'HW', 'JV', 'FV', 'XW', 'PD', 'WR', 'QQ', 'UV', 'LK', 'LD',
    'LM', 'HT', 'VR', 'XY', 'RG', 'UR', 'NT', 'PT', 'YT', 'MQ', 'DR', 'SP', 'FG', 'YS', 'ZS', 'PW', 'FN', 'XF',
    'LV', 'RX', 'TG', 'CQ', 'LW', 'MX', 'BL', 'TF', 'GH', 'DX', 'KV', 'RW', 'XL', 'FW', 'JR', 'PL', 'FB',
    'ZN', 'KR', 'QZ', 'DF', 'HD',
]


In [None]:
# Total vs. unique count; unequal numbers mean the list contains duplicates.
len(fake_state_names), len(set(fake_state_names))

In [None]:
# Total vs. unique count; unequal numbers mean the list contains duplicates.
len(fake_state_codes), len(set(fake_state_codes))

In [None]:
# Total vs. unique count; unequal numbers mean the list contains duplicates.
len(fake_cities), len(set(fake_cities))

In [None]:
# print(set(fake_state_codes))

In [None]:
# valid_state_codes = set(city_state_code_info['state_code'].values.tolist())
# len(valid_state_codes)

In [None]:
# print([state_code for state_code in fake_state_codes if state_code not in valid_state_codes])

In [None]:
# # fake_state_names
# valid_state_names = set(city_state_name_info['state_name'].values.tolist())

# print([state_name for state_name in fake_state_names if state_name not in valid_state_names])

In [None]:
# len(valid_state_names)

In [None]:
# # fake_cities
# # city_info
# print([city_name for city_name in fake_cities if city_name not in city_info])

In [None]:
# Maps integer label ids to BIO-style named-entity tags.
# Ids are assigned by position: 0 is "O", followed by a B-/I- pair per entity type.
label_map = dict(enumerate([
    "O",            # Outside any named entity
    "B-PER",        # Beginning of a person entity
    "I-PER",        # Inside a person entity
    "B-ORG",        # Beginning of an organization entity
    "I-ORG",        # Inside an organization entity
    "B-CITY",       # Beginning of a city entity
    "I-CITY",       # Inside a city entity
    "B-STATE",      # Beginning of a state entity
    "I-STATE",      # Inside a state entity
    "B-CITYSTATE",  # Beginning of a city_state entity
    "I-CITYSTATE",  # Inside a city_state entity
]))


# Person names: well-known public figures followed by generic first/last name pairs.
# NOTE(review): presumably the source of PER entities for generated examples — confirm against the generation code.
persons = [
    # Well-known public figures
    "Donald Trump", "John Smith", "Roger Williams", "Michelle Obama", "Elon Musk",
    "Barack Obama", "Bill Gates", "Steve Jobs", "Warren Buffett", "Oprah Winfrey",
    "Jeff Bezos", "Taylor Swift", "Jennifer Lawrence", "Brad Pitt", "Leonardo DiCaprio",
    "Katy Perry", "Tom Hanks", "Emma Watson", "Johnny Depp", "Scarlett Johansson",
    "Mark Zuckerberg", "Sheryl Sandberg", "Ivanka Trump", "Joe Biden", "Kamala Harris",
    "Serena Williams", "Michael Jordan", "LeBron James", "Tiger Woods", "Cristiano Ronaldo",
    "Lionel Messi", "Roger Federer", "Usain Bolt", "Simone Biles", "Tom Brady",
    "Peyton Manning", "David Beckham", "Rafael Nadal", "Novak Djokovic", "Andy Murray",
    "George Clooney", "Matt Damon", "Julia Roberts", "Angelina Jolie", "Morgan Freeman",
    "Chris Hemsworth", "Dwayne Johnson", "Vin Diesel", "Keanu Reeves", "Robert Downey Jr.",
    "Chris Evans", "Will Smith", "Johnny Cash", "Bob Dylan", "Paul McCartney",
    "Ringo Starr", "John Lennon", "George Harrison", "Madonna", "Prince",
    "Bruce Springsteen", "Elton John", "David Bowie", "Whitney Houston", "Celine Dion",
    "Marilyn Monroe", "Audrey Hepburn", "Albert Einstein", "Isaac Newton", "Marie Curie",
    "Galileo Galilei", "Nikola Tesla", "Stephen Hawking", "Richard Feynman", "Carl Sagan",
    "Neil Armstrong", "Yuri Gagarin", "Sally Ride", "Jane Goodall", "Charles Darwin",
    "Mahatma Gandhi", "Nelson Mandela", "Martin Luther King Jr.", "Malala Yousafzai", "Angela Merkel",
    "Theresa May", "Vladimir Putin", "Xi Jinping", "Justin Trudeau", "Jacinda Ardern",
    "Pope Francis", "Dalai Lama", "Queen Elizabeth II", "Prince William", "Prince Harry",
    # Generic first/last name combinations
    "James Anderson", "Michael Brown", "David Clark", "John Doe", "Robert Evans",
    "Christopher Foster", "William Garcia", "Charles Hall", "Joseph Harris", "Daniel Jackson",
    "Matthew Johnson", "George King", "Anthony Lewis", "Mark Miller", "Paul Moore",
    "Steven Nelson", "Kevin Perry", "Thomas Reed", "Brian Roberts", "Jason Scott",
    "Andrew Smith", "Joshua Thompson", "Ryan Turner", "Brandon Walker", "Nicholas White",
    "Jonathan Young", "Adam Baker", "Justin Carter", "Benjamin Collins", "Aaron Cook",
    "Alexander Davis", "Tyler Edwards", "Zachary Fisher", "Ethan Graham", "Jacob Green",
    "Austin Hernandez", "Mason Hill", "Logan Hughes", "Owen Jenkins", "Lucas Kelly",
    "Nathan Lee", "Caleb Long", "Henry Martinez", "Dylan Mitchell", "Gabriel Morris",
    "Jack Murphy", "Connor Myers", "Liam Parker", "Isaac Patterson", "Evan Phillips",
    "Hunter Price", "Noah Richardson", "Samuel Rivera", "Gavin Rogers", "Aiden Ross",
    "Christian Russell", "Ian Sanders", "Eli Simmons", "Chase Stewart", "Cameron Sullivan",
    "Bryan Taylor", "Cole Thomas", "Jake Thompson", "Luke Torres", "Blake Turner",
    "Jesse Ward", "Joel Watson", "Derek Williams", "Mitchell Wright", "Dustin Young",
    "Megan Allen", "Jennifer Bailey", "Jessica Bennett", "Emily Brooks", "Sarah Campbell",
    "Amanda Carter", "Rebecca Collins", "Samantha Cooper", "Stephanie Diaz", "Rachel Evans",
    "Christine Flores", "Laura Foster", "Michelle Garcia", "Amber Gonzales", "Lisa Gray",
    "Kimberly Green", "Heather Harris", "Tiffany Henderson", "Natalie Hernandez", "Crystal Hill",
    "Victoria Hughes", "Erica Jenkins", "Nicole Johnson", "Katherine Kelly", "Danielle Lee",
    "Hannah Lewis", "Melissa Lopez", "Patricia Martin", "Brittany Moore", "Brenda Morgan",
]
# Organization names. Many entries deliberately embed a city or state name
# (e.g. "Oklahoma City University").
# NOTE(review): presumably so that place names inside an organization are tagged ORG rather than CITY/STATE — confirm.
organizations = [
    # Well-known companies, media outlets and universities
    "Google Inc.", "Apple Inc.", "Amazon.com", "Facebook Inc.", "Microsoft Corporation",
    "Tesla Motors", "Netflix Inc.", "The New York Times", "The Washington Post", "Wall Street Journal",
    "Intel Corporation", "Oracle Corporation", "IBM", "Coca-Cola Company", "PepsiCo",
    "Starbucks", "Walmart Inc.", "Target Corporation", "ExxonMobil", "Shell Oil Company",
    "Ford Motor Company", "General Motors", "Toyota Motor Corporation", "Volkswagen Group", "BMW Group",
    "American Airlines", "Delta Airlines", "United Airlines", "Boeing Company", "Lockheed Martin",
    "SpaceX", "NASA", "Harvard University", "Stanford University", "Massachusetts Institute of Technology",
    "University of California, Berkeley", "University of Oxford", "University of Cambridge", "Princeton University", "Yale University",
    "University of Chicago", "Columbia University", "Johns Hopkins University", "University of Southern California", "University of Michigan",
    "Goldman Sachs", "JPMorgan Chase", "Citibank", "Morgan Stanley", "Bank of America",
    "Deloitte", "Ernst & Young", "PricewaterhouseCoopers", "KPMG", "McKinsey & Company",
    "Boston Consulting Group", "Accenture", "BlackRock", "Fidelity Investments", "Vanguard Group",
    "Nike Inc.", "Adidas", "Under Armour", "Patagonia", "The Walt Disney Company",
    "Time Warner", "NBCUniversal", "Sony Corporation", "Warner Bros.", "Paramount Pictures",
    "Universal Music Group", "Sony Music Entertainment", "Warner Music Group", "Pfizer Inc.", "Johnson & Johnson",
    "Novartis", "Merck & Co.", "GlaxoSmithKline", "AstraZeneca", "Moderna",
    # Organizations whose names start with a place name
    "New York City Hospital", "Los Angeles County Library", "San Francisco Community College",
    "Miami International University", "Chicago Regional Bank", "Dallas Medical Center",
    "Boston Tech Solutions", "Atlanta City Bank", "Seattle Software Hub", "Phoenix Energy Solutions",
    "Denver Financial Group", "Houston General Hospital", "Portland Health Services", "Las Vegas Convention Center",
    "San Diego Software Innovations", "Philadelphia Law Firm", "Orlando Realty Group",
    "Austin Engineering Solutions", "Cleveland City Schools", "Detroit Manufacturing Hub",
    "Baltimore Technology Inc.", "Minneapolis Insurance Group", "St. Louis Transportation Services",
    "Tampa Healthcare Network", "Pittsburgh Steelworks Corporation", "Sacramento Business Ventures",
    "Indianapolis Marketing Solutions", "Columbus Financial Advisors", "Fort Worth Electric Company",
    "Charlotte Digital Marketing", "Milwaukee Industrial Solutions", "Memphis Logistics Services",
    "Washington DC Development", "Nashville Business Enterprises", "Louisville Fitness Center",
    "Kansas City Architectural Firm", "Oklahoma City University", "Virginia Beach Law Associates",
    "Raleigh Research Institute", "Salt Lake City Analytics", "Richmond Financial Group",
    "Newark Data Solutions", "Anchorage Energy Solutions", "Fresno Water Authority",
    "Omaha Financial Services", "Colorado Springs Health Institute", "Mesa Auto Parts",
    "Virginia Beach Shipping", "Sacramento Community Center", "Albuquerque Electronics Company",
    "Tucson Data Science Center", "Miami Lakes Software Solutions", "Wichita Steel Corporation",
    "Arlington Cybersecurity Group", "Bakersfield Construction Services", "Aurora Logistics Firm",
    "Anaheim Technology Hub", "Santa Ana Healthcare Services", "Riverside Manufacturing Co.",
    "St. Paul Medical Associates", "Lexington University Hospital", "Plano Technology Solutions",
    "Lincoln Manufacturing Inc.", "Greensboro Industrial Partners", "Jersey City Financial Group",
    "Chandler Electronics", "Madison Biotechnology Solutions", "Lubbock Medical Supplies",
    "Scottsdale Real Estate Group", "Reno Venture Capitalists", "Henderson Engineering Consultants",
    "Norfolk Health Services", "Chesapeake Data Systems", "Fremont Software Group",
    "Irvine Legal Services", "San Bernardino Logistics Group", "Boise Energy Technologies",
    "Spokane Steel Fabricators", "Glendale Solar Power Corporation", "Garland Medical Services",
    "Hialeah Shipping and Logistics", "Chesapeake Financial Advisors", "Frisco Software Hub",
    "McKinney Electronics Corporation", "Gilbert Transportation Group", "Baton Rouge Financial Services",
    "Shreveport Data Analytics", "Mobile Business Solutions", "Huntsville Rocket Technologies",
    "Knoxville Agricultural Partners", "Dayton Software Innovations", "Grand Rapids Healthcare Network",
    "Fort Lauderdale Construction Group", "Tempe Electric Vehicles", "Winston-Salem Marketing Firm",
    "Fayetteville Consulting Services", "Springfield Realty Group", "Yonkers Manufacturing Hub",
    "Augusta Insurance Group", "Salem Solar Energy Solutions", "Pasadena Legal Consultants",
    "Seattle Pacific University", "San Diego Zoo", "Portland Art Museum",
    "Boston Medical Group", "Chicago Tribune", "Dallas Cowboys Football Club",
    "Los Angeles Philharmonic Orchestra", "New York University", "Houston Community College",
    "Phoenix Solar Power", "Denver Public Library", "Miami International Airport",
    "Atlanta Symphony Orchestra", "San Francisco Opera", "Orlando City Soccer Club",
    "Nashville Symphony", "Baltimore Ravens Football Team", "Cleveland Clinic",
    "Pittsburgh Steelers Football Team", "Detroit Institute of Arts",
    "Tampa Bay Buccaneers Football Club", "St. Louis Cardinals Baseball Team",
    "Indianapolis Colts Football Team", "Austin Film Society", "Seattle Sounders Football Club",
    "Minneapolis Institute of Art", "Charlotte Hornets Basketball Club", "Portland Trail Blazers Basketball Team",
    "Las Vegas Convention and Visitors Authority", "New Orleans Saints Football Club",
    "San Antonio Spurs Basketball Club", "Philadelphia Eagles Football Club",
    "Kansas City Chiefs Football Team", "Cincinnati Reds Baseball Club",
    "Memphis Grizzlies Basketball Team", "Washington Wizards Basketball Club",
    "Milwaukee Bucks Basketball Club", "Sacramento Kings Basketball Team",
    "Salt Lake City Ballet", "Boise State University", "Albuquerque International Balloon Fiesta",
    "Raleigh-Durham International Airport", "Richmond Symphony", "Fresno Pacific University",
    "Spokane Transit Authority", "Henderson Engineering", "Mesa Public Schools",
    "Scottsdale Museum of Contemporary Art", "Chandler Regional Medical Center", "Glendale Unified School District",
    "Riverside Community Hospital", "Aurora Public Schools", "Anaheim Ducks Hockey Team",
    "Santa Ana College", "Stockton Unified School District", "Irvine Company", "San Bernardino Community College District",
    "Modesto Junior College", "Bakersfield Condors Hockey Team", "Fresno State University",
    "Chesapeake Energy Corporation", "Omaha World-Herald", "Tucson Medical Center",
    "Virginia Beach Public Schools", "Norfolk Naval Shipyard", "Newark Beth Israel Medical Center",
    "Fort Wayne Mad Ants Basketball Team", "Fremont High School", "Shreveport Regional Airport",
    "Mobile Public Library", "Huntsville Hospital", "Knoxville Symphony Orchestra",
    "Dayton International Airport", "Grand Rapids Symphony", "Winston-Salem Dash Baseball Team",
    "Fayetteville Technical Community College", "Springfield Cardinals Baseball Team",
    "Augusta National Golf Club", "Salem Health", "Pasadena Playhouse", "Yonkers Public Schools",
    "Boulder Community Health", "Naperville North High School", "Lansing Community College",
    "Reno-Tahoe International Airport", "Columbia University Medical Center", "Albany Law School",
    "Buffalo Sabres Hockey Team", "Syracuse University", "Toledo Museum of Art", "Akron Public Schools",
    "Daytona International Speedway", "Des Moines Public Library", "Rochester Philharmonic Orchestra",
    "Flint Institute of Arts", "Lincoln Memorial University", "Baton Rouge Community College",
    "Chattanooga Symphony and Opera", "Greenville Technical College", "Cedar Rapids Opera Theatre",
    "Pensacola Naval Air Station",
]

# Consumer product names (brand/product terms, not locations).
products = [
    "iPhone", "Samsung Galaxy", "MacBook", "PlayStation 5", "Nike shoes",
    "AirPods", "Xbox Series X", "Canon DSLR", "GoPro", "Adidas sneakers",
    "Fitbit", "Google Pixel", "Kindle", "Bose headphones", "Sony TV",
    "Dyson vacuum", "KitchenAid mixer", "Surface Pro", "Roomba", "Apple Watch",
]

# Country names; note that label_map defines no dedicated country tag.
countries = [
    "USA", "France", "Japan", "Germany", "Canada",
    "Australia", "Mexico", "China", "Brazil", "India",
    "Italy", "Spain", "South Korea", "Russia", "Netherlands",
    "United Kingdom", "Sweden", "Norway", "Switzerland", "Argentina",
]

# Names of online services and apps.
services = [
    "Netflix", "Spotify", "Uber", "Amazon Prime", "Google Drive",
    "Zoom", "Dropbox", "Slack", "LinkedIn", "Disney+",
    "YouTube Premium", "Venmo", "DoorDash", "Postmates", "Hulu",
    "Skype", "Grubhub", "Twitch", "Instacart", "Lyft",
]

# Car make/model names.
cars = [
    "Tesla Model S", "Ford Mustang", "Chevrolet Camaro", "Toyota Corolla", "Honda Civic",
    "BMW 3 Series", "Audi A4", "Mercedes-Benz C-Class", "Jeep Wrangler", "Ford F-150",
    "Hyundai Elantra", "Mazda CX-5", "Chevrolet Tahoe", "Nissan Altima", "Kia Sorento",
    "Volkswagen Golf", "Subaru Outback", "Tesla Model 3", "Dodge Charger", "Volvo XC90",
]

# Generic gadget / electronics terms.
gadgets = [
    "smartwatch", "Bluetooth headphones", "fitness tracker", "smart speaker", "tablet",
    "laptop", "gaming mouse", "wireless charger", "VR headset", "noise-canceling headphones",
    "dashcam", "e-reader", "action camera", "portable hard drive", "gaming console",
    "mechanical keyboard", "4K monitor", "digital camera", "portable power bank", "USB-C hub",
]

# Stock ticker symbols.
stocks = [
    "AAPL", "GOOGL", "AMZN", "MSFT", "TSLA",
    "NFLX", "FB", "BABA", "NVDA", "JPM",
    "V", "PYPL", "BRK.A", "DIS", "INTC",
    "PFE", "NKE", "ORCL", "VZ", "BA",
]

# Payment methods, payment services and cryptocurrencies.
moneys = [
    "cryptocurrency", "cash", "PayPal", "credit card", "Bitcoin",
    "Ethereum", "bank transfer", "wire transfer", "Western Union", "Venmo",
    "debit card", "Zelle", "Apple Pay", "Google Pay", "Coinbase",
    "Tether", "Litecoin", "Dogecoin", "cash app", "Ripple",
]

# Personal-finance products and terms.
finances = [
    "401(k)", "IRA", "mutual funds", "mortgage", "student loan",
    "savings account", "retirement fund", "bond", "annuity", "index fund",
    "Roth IRA", "tax-free savings account", "pension", "trust fund", "hedge fund",
    "credit score", "auto loan", "home equity loan", "personal loan", "debt consolidation",
]

# Travel-related search terms.
travels = [
    "flights", "hotels", "car rentals", "vacation packages", "cruise trips",
    "road trips", "train tickets", "adventure tours", "guided tours", "backpacking trips",
    "honeymoon destinations", "beach resorts", "luxury travel", "budget travel", "camping gear",
    "family vacations", "ski trips", "all-inclusive resorts", "last-minute deals", "travel insurance",
]

# Food and dish names.
# Every entry is unique: the original list repeated 'dim sum' and 'gnocchi',
# which would give those two terms double weight under uniform sampling.
foods = [
    'pizza', 'sushi', 'burgers', 'pasta', 'salads',
    'vegan food', 'barbecue', 'fried chicken', 'ramen', 'tacos',
    'sandwiches', 'noodles', 'soups', 'cakes', 'ice cream',
    'steak', 'seafood', 'breakfast food', 'brunch', 'desserts',
    'hot dogs', 'waffles', 'pancakes', 'donuts', 'cookies',
    'bagels', 'burritos', 'pho', 'fried rice', 'dim sum',
    'smoothies', 'milkshakes', 'cupcakes', 'cheesecake', 'crepes',
    'nachos', 'guacamole', 'shawarma', 'gyros', 'kebabs',
    'clam chowder', 'chili', 'mac and cheese', 'meatballs', 'lasagna',
    'quesadillas', 'falafel', 'curry', 'pork ribs', 'buffalo wings',
    'brownies', 'apple pie', 'frozen yogurt', 'churros', 'stuffed crust pizza',
    'poutine', 'pad thai', 'korean barbecue', 'bibimbap',
    'tandoori chicken', 'naan', 'samosa', 'biryani', 'dumplings',
    'bao buns', 'poke bowl', 'ceviche', 'tamales', 'empanadas',
    'shabu shabu', 'jollof rice', 'laksa', 'banh mi', 'spring rolls',
    'paella', 'gnocchi', 'risotto', 'french fries', 'croissants',
    'hummus', 'tzatziki', 'miso soup', 'kimchi', 'baklava',
    'souvlaki', 'galbi', 'arepas', 'roti', 'malai kofta',
    'sichuan chicken', 'teriyaki', 'yakitori', 'fettuccine alfredo',
    'ratatouille', 'tempura', 'onigiri', 'calamari',
    'chimichurri steak', 'goulash', 'pierogi', 'fondue', 'strudel',
    'schnitzel', 'tikka masala', 'paneer', 'plantains', 'croquettes',
    'coffee',
]

# Restaurant categories and eating-place types.
restaurants = [
    "Italian restaurants", "Mexican restaurants", "Japanese restaurants", "Chinese restaurants", "Indian restaurants",
    "fast food chains", "fine dining", "vegan restaurants", "steakhouses", "seafood restaurants",
    "barbecue joints", "sushi bars", "cafes", "pizzerias", "buffet restaurants",
    "food trucks", "family-friendly restaurants", "gastropubs", "brunch spots", "diner",
]

## Additional partial (truncated) terms, e.g. "footbal" for "football"
# Sports terms, mostly truncated by a letter or more (a few are complete words).
sports_terms_missing = [
    "footbal", "baske", "socce", "golf", "cricke", "rugby",
    "hocke", "tenni", "swimmin", "athleti", "fishi", "basebal",
    "volleybal", "badminto", "maratho", "skatin", "climbin", "racquetball",
    "bowlin", "darts", "gymnasti", "bikin", "bowling",
]

# Landmark / venue type words, some of them truncated (e.g. "cathedra").
locations_and_landmarks = [
    "statue", "museum", "plaza", "zoo", "church", "theater",
    "stadium", "mountain", "park", "lake", "beach", "river",
    "palace", "cathedra", "mansion", "monument", "temple", "observato",
    "canyon", "garden", "conservato", "boardwal", "forest", "pier",
    "lighthouse", "arena",
]

# Activity and event words, mostly truncated (e.g. "festi", "tourn").
activities_and_events = [
    "conc", "exhib", "meet", "parad", "festi", "tourn",
    "game", "sho", "even", "gala", "confere", "seminar",
    "webina", "worksho", "lectur", "symposiu", "screenin", "rall",
    "celebratio", "ceremon", "get-togethe", "perfor", "gatherin", "competitio",
    "maratho", "speec", "workout", "showcas", "bowling",
]

# Food and eatery words, many truncated (e.g. "sush", "pizz").
food_missing = [
    "sush", "pizz", "ramen", "bbq", "vega", "steak",
    "taco", "burg", "pasta", "brunc", "desse", "drink",
    "grill", "bake", "buffet", "sandwich", "noodle", "cafe",
    "taver", "gastro", "bistro", "del", "saloo", "barbecue",
    "snack", "confectio", "pub",
]

# Transport and direction words, some truncated (e.g. "direc", "subwa").
transport_and_directions = [
    "direc", "map", "bus", "train", "car", "park",
    "taxi", "subwa", "fly", "plane", "ticke", "pass",
    "ferr", "bicycl", "scoote", "shuttl", "walkin", "rideshar",
    "transi", "toll", "metr", "road", "route", "stop",
]

celebrities = [
    "Leonardo DiCaprio", "Tom Cruise", "Dwayne Johnson", "Zendaya", 
    "Timothée Chalamet", "Florence Pugh", "Margot Robbie", "Chris Hemsworth", 
    "Robert Downey Jr.", "Scarlett Johansson", "Tom Holland", "Ryan Reynolds", 
    "Gal Gadot", "Pedro Pascal", "Elizabeth Olsen", "Jenna Ortega", 
    "Millie Bobby Brown", "Finn Wolfhard", "Anya Taylor-Joy", "Jason Momoa", 
    "Chris Evans", "Natalie Portman", "Henry Cavill", "Daniel Radcliffe", 
    "Emma Watson", "Rupert Grint", "Michael B. Jordan", "Anne Hathaway", 
    "Brad Pitt", "Angelina Jolie", "Keanu Reeves", "Sandra Bullock", 
    "Jake Gyllenhaal", "Christian Bale", "Cate Blanchett", "Hugh Jackman", 
    "Jennifer Lawrence", "Will Smith", "Jada Pinkett Smith", "Viola Davis", 
    "Austin Butler", "Jamie Lee Curtis", "Paul Mescal", "Tobey Maguire", 
    "Andrew Garfield", "Harrison Ford", "Helen Mirren", "Brendan Fraser", 

    # Classic Hollywood Legends
    "Marlon Brando", "James Dean", "Audrey Hepburn", "Marilyn Monroe", 
    "Humphrey Bogart", "Clark Gable", "Bette Davis", "Elizabeth Taylor",
    "Fred Astaire", "Ginger Rogers", "Ingrid Bergman", "Greta Garbo", 
    "Katharine Hepburn", "Cary Grant", "Spencer Tracy", "Rita Hayworth",
    "Grace Kelly", "Vivien Leigh", "Judy Garland", "Henry Fonda",
    "Lauren Bacall", "Paul Newman", "Charlton Heston", "Joan Crawford",

    # Modern Hollywood Icons
    "Meryl Streep", "Tom Hanks", "Denzel Washington", "Robert De Niro", 
    "Al Pacino", "Jack Nicholson", "Julia Roberts", "Leonardo DiCaprio",
    "Brad Pitt", "Angelina Jolie", "George Clooney", "Cate Blanchett",
    "Johnny Depp", "Tom Cruise", "Sandra Bullock", "Nicole Kidman", 
    "Halle Berry", "Harrison Ford", "Sigourney Weaver", "Morgan Freeman", 
    "Michelle Pfeiffer", "Dustin Hoffman", "Robin Williams", "Will Smith",

    # Franchise and Action-Adventure Stars
    "Orlando Bloom", "Viggo Mortensen", "Ian McKellen", "Elijah Wood",
    "Sean Astin", "Dominic Monaghan", "Billy Boyd", "Liv Tyler", 
    "Hugo Weaving", "Andy Serkis", "Keira Knightley", "Geoffrey Rush",
    "Johnny Depp", "Daniel Radcliffe", "Emma Watson", "Rupert Grint",
    "Helena Bonham Carter", "Ralph Fiennes", "Alan Rickman", "Michael Gambon",
    "Ewan McGregor", "Liam Neeson", "Natalie Portman", "Hayden Christensen",
    "Mark Hamill", "Carrie Fisher", "Harrison Ford", "Daisy Ridley",
    "Adam Driver", "John Boyega", "Oscar Isaac", "Diego Luna", 
    "Felicity Jones", "Pedro Pascal", "Chris Hemsworth", "Chris Evans", 
    "Scarlett Johansson", "Robert Downey Jr.", "Mark Ruffalo", "Chris Pratt",
    "Tom Holland", "Zendaya", "Benedict Cumberbatch", "Tobey Maguire", 
    "Andrew Garfield", "Hugh Jackman", "Patrick Stewart", "Ian McKellen", 
    "Ryan Reynolds", "Gal Gadot", "Henry Cavill", "Jason Momoa", 
    "Ben Affleck", "Zoe Saldaña", "Dave Bautista", "Karen Gillan",

    # Versatile and Popular Contemporary Actors
    "Christian Bale", "Amy Adams", "Ryan Gosling", "Emma Stone",
    "Anne Hathaway", "Jennifer Lawrence", "Joaquin Phoenix", "Margot Robbie",
    "Adam Driver", "Michael B. Jordan", "Florence Pugh", "Timothée Chalamet",
    "Austin Butler", "Jessica Chastain", "Mahershala Ali", "Viola Davis", 
    "Octavia Spencer", "Toni Collette", "Rami Malek", "Lakeith Stanfield",
    "Cillian Murphy", "Matt Damon", "Ben Affleck", "Jeremy Renner", 

    # Young Rising Stars
    "Millie Bobby Brown", "Finn Wolfhard", "Sadie Sink", "Noah Schnapp", 
    "Anya Taylor-Joy", "Jenna Ortega", "Hunter Schafer", "Hailee Steinfeld", 
    "Lucas Hedges", "Elle Fanning", "Dakota Fanning", "Jacob Elordi", 
    "Sydney Sweeney", "Joey King", "Sophie Turner", "Maisie Williams",

    # Comedy and Character Actors
    "Steve Carell", "Tina Fey", "Amy Poehler", "Melissa McCarthy", 
    "Kristen Wiig", "Seth Rogen", "Will Ferrell", "Paul Rudd", 
    "Bill Hader", "Jason Bateman", "Jonah Hill", "Michael Cera",
    "Ken Jeong", "Kevin Hart", "Maya Rudolph", "Chris Rock", 

    # Iconic Action and Adventure Stars
    "Dwayne Johnson", "Arnold Schwarzenegger", "Sylvester Stallone", 
    "Bruce Willis", "Jason Statham", "Keanu Reeves", "Vin Diesel", 
    "Charlize Theron", "Emily Blunt", "John Cena", "Liam Neeson", 
    "Daniel Craig", "Idris Elba", "Pierce Brosnan", "Angelina Jolie", 
    "Kate Beckinsale", "Milla Jovovich",

    # Supporting Actors and Other Notables
    "John Goodman", "Jeff Goldblum", "J.K. Simmons", "Stanley Tucci",
    "Frances McDormand", "Allison Janney", "Angela Bassett", "Regina King",
    "Jessica Lange", "Bryan Cranston", "Aaron Paul", "Bob Odenkirk", 
    "Giancarlo Esposito", "David Harbour", "Winona Ryder", 

    # Diverse and Internationally Acclaimed Actors
    "Salma Hayek", "Antonio Banderas", "Diego Luna", "Oscar Isaac", 
    "Gael García Bernal", "Eva Longoria", "Jessica Alba", 
    "Awkwafina", "Sandra Oh", "Steven Yeun", "Simu Liu", 
    "Lucy Liu", "Gemma Chan", "Mindy Kaling", "Ali Wong", 
    "Lupita Nyong'o", "Chadwick Boseman", "Daniel Kaluuya", "Letitia Wright",
    "Dev Patel", "Riz Ahmed", "Zazie Beetz", "Mahershala Ali",

    # Sports
    "Lionel Messi", "Cristiano Ronaldo", "Neymar Jr.", "Kylian Mbappé", 
    "LeBron James", "Serena Williams", "Roger Federer", "Novak Djokovic", 
    "Rafael Nadal", "Simone Biles", "Naomi Osaka", "Stephen Curry", 
    "Kevin Durant", "Tom Brady", "Patrick Mahomes", "Virat Kohli", 
    "Rohit Sharma", "Shaquille O'Neal", "Tiger Woods", "Lewis Hamilton", 
    "Max Verstappen", "Charles Leclerc", "Usain Bolt", "Megan Rapinoe", 
    "Alex Morgan", "Katie Ledecky", "Michael Phelps", "Giannis Antetokounmpo", 
    "Damian Lillard", "Anthony Davis", "Zlatan Ibrahimović", "Harry Kane", 
    "Sadio Mané", "Karim Benzema", "Gareth Bale", "Robert Lewandowski", 
    "Erling Haaland", "Venus Williams", "Iga Świątek", "Aryna Sabalenka", 

    # Politics and Leaders
    "Joe Biden", "Kamala Harris", "Barack Obama", "Michelle Obama", 
    "Donald Trump", "Melania Trump", "Emmanuel Macron", "Olaf Scholz", 
    "Volodymyr Zelenskyy", "Rishi Sunak", "Narendra Modi", "Jacinda Ardern", 
    "Justin Trudeau", "Xi Jinping", "Vladimir Putin", "Angela Merkel", 
    "Elizabeth II", "King Charles III", "Prince William", "Prince Harry", 
    "Meghan Markle", "Queen Letizia", "Pope Francis", "Dalai Lama", 
    "Greta Thunberg", "Alexandria Ocasio-Cortez", "Bernie Sanders", 
    "Nicolas Maduro", "Jair Bolsonaro", "Fumio Kishida", "Yoon Suk-yeol",

    # Business and Technology
    "Elon Musk", "Jeff Bezos", "Mark Zuckerberg", "Bill Gates", "Tim Cook", 
    "Sundar Pichai", "Satya Nadella", "Warren Buffett", "Bernard Arnault", 
    "Larry Page", "Sergey Brin", "Steve Wozniak", "Reed Hastings", "Susan Wojcicki", 
    "Jack Ma", "Daniel Ek", "Evan Spiegel", "Andrew Ng", "Sam Altman", 
    "Sheryl Sandberg", "Peter Thiel", "Marc Benioff", "Richard Branson", 
    "Oprah Winfrey", "Howard Schultz", "Larry Ellison", "David Baszucki", 
    "Parag Agrawal", "Adam Neumann", "Kylie Jenner", "Kim Kardashian", 
    "Khloé Kardashian", "Kris Jenner", "Robert Kiyosaki", "Barbara Corcoran", 

    # Science and Innovation
    "Jane Goodall", "Neil deGrasse Tyson", "Brian Cox", "Michio Kaku", 
    "Katherine Johnson", "Jennifer Doudna", "Emmanuelle Charpentier", "Tim Berners-Lee", 
    "Mae Jemison", "Katie Bouman", "Brian Greene", "James Lovelock", 
    "Roger Penrose", "Dmitry Muratov", "Frances Arnold", "Venki Ramakrishnan", 
    "Paul Nurse", "Elizabeth Blackburn", "Carol Greider", "David Julius", 
    "Abhijit Banerjee", "Esther Duflo", "Michael Kremer", "Andrea Ghez", 
    "Reinhard Genzel", "Jennifer Hudson", "Ashoke Sen", "Subrahmanyan Chandrasekhar", 

    # Others
    "Ellen DeGeneres", "Oprah Winfrey", "Trevor Noah", "Jimmy Fallon", 
    "Stephen Colbert", "John Oliver", "James Corden", "Conan O'Brien", 
    "Dolly Parton", "Gordon Ramsay", "David Beckham", "Victoria Beckham", 
    "RuPaul", "Chris Rock", "Dave Chappelle", "Trevor Noah", "Hasan Minhaj", 
    "Ali Wong", "Bo Burnham", "Jo Koy", "Kevin Hart", "Sarah Silverman", 
    "Tiffany Haddish", "Joe Rogan", "Logan Paul", "MrBeast", "PewDiePie", 
    "Emma Chamberlain", "Charli D'Amelio", "Addison Rae", "Bella Poarch",
]

In [None]:

def get_sample_from_cities(city_info, city_weights, actual_threshold=0.7, city_partial_threshold=0.1):
    """Sample one city mention, weighting cities by ``city_weights``.

    A city is first drawn with ``random.choices`` using the per-city weights.
    A single uniform draw in [0, 1) then decides how the city is rendered:

    * draw <= ``actual_threshold`` and draw <= ``city_partial_threshold`` and
      the name is longer than 6 characters: the name with its last character
      removed (a "partially typed" city);
    * draw <= ``actual_threshold`` otherwise: the name unchanged;
    * draw > ``actual_threshold``: a random entry of ``city_info[city]``, or
      the unchanged name when that entry is empty.

    Args:
        city_info: Mapping of city name to a sequence of alternate names.
        city_weights: Mapping of city name to its sampling weight; it is
            indexed with every key of ``city_info``.
        actual_threshold: Upper bound of the draw for returning the city
            name itself (full or truncated).
        city_partial_threshold: Upper bound of the draw for returning the
            truncated name; only takes effect inside the ``actual_threshold``
            branch.

    Returns:
        The sampled city string.
    """
    cities = list(city_info)
    weights = [city_weights[city] for city in cities]
    city_random = random.choices(cities, weights=weights, k=1)[0]
    rand_val = random.random()
    if rand_val <= actual_threshold:
        if rand_val <= city_partial_threshold and len(city_random) > 6:
            return city_random[:-1]
        return city_random
    alternates = city_info[city_random]
    # random.choice raises IndexError on an empty sequence, so a city without
    # alternate names falls back to its canonical name. len() is used rather
    # than truthiness so array-like containers keep working.
    if len(alternates) == 0:
        return city_random
    return random.choice(alternates)

def get_sample_from_states(state_info, actual_threshold=0.5):
    """Sample one state mention from ``state_info``.

    A key of ``state_info`` is chosen uniformly at random. A uniform draw in
    [0, 1) then selects the form: the key itself when the draw is
    <= ``actual_threshold``, otherwise the value stored under that key.

    Args:
        state_info: Mapping whose keys and values are the two spellings of a
            state (this file builds it as state code -> state name).
        actual_threshold: Upper bound of the draw for returning the key.

    Returns:
        Either the sampled key or its mapped value.
    """
    picked_key = random.choice(list(state_info))
    draw = random.random()
    if not draw <= actual_threshold:
        # The mapped value is wrapped in a one-element list and drawn through
        # random.choice, exactly as the samplers for multi-valued entries do.
        mapped_values = [state_info[picked_key]]
        return random.choice(mapped_values)
    return picked_key

def get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8, comma_threshold=0.6):
    """Sample a combined "city + state" string from one of two frames.

    One uniform draw in [0, 1) selects the output form:

    * draw <= ``state_code_threshold`` and draw <= ``comma_threshold``:
      ``"<city_name>, <state_code>"`` from ``city_state_code_info``;
    * draw <= ``state_code_threshold`` only:
      ``"<city_name> <state_code>"`` from ``city_state_code_info``;
    * otherwise: ``"<city_name>, <state_name>"`` from
      ``city_state_name_info``.

    In every case a single row is sampled with ``DataFrame.sample`` using the
    frame's ``city_weight`` column as weights.

    Args:
        city_state_code_info: DataFrame with ``city_name``, ``state_code`` and
            ``city_weight`` columns.
        city_state_name_info: DataFrame with ``city_name``, ``state_name`` and
            ``city_weight`` columns.
        state_code_threshold: Upper bound of the draw for using state codes.
        comma_threshold: Upper bound of the draw for the comma-separated
            state-code form.

    Returns:
        The joined city/state string.
    """
    draw = random.random()
    if draw <= state_code_threshold:
        frame, state_column = city_state_code_info, 'state_code'
        separator = ', ' if draw <= comma_threshold else ' '
    else:
        frame, state_column = city_state_name_info, 'state_name'
        separator = ', '

    sampled_row = frame.sample(1, weights='city_weight', replace=True)
    city_and_state = sampled_row[['city_name', state_column]].values.tolist()[0]
    return separator.join(city_and_state)

def get_random_choice_from_list(choices_list):
    """Return one element of ``choices_list`` selected by ``random.choice``."""
    picked = random.choice(choices_list)
    return picked
    
def get_sample_fake_city():
    """Return a random entry of ``fake_cities`` (defined outside this function)."""
    return random.choice(fake_cities)

def get_sample_fake_state_code():
    """Return a random entry of ``fake_state_codes`` (defined outside this function)."""
    return random.choice(fake_state_codes)

def get_sample_fake_state_name():
    """Return a random entry of ``fake_state_names`` (defined outside this function)."""
    return random.choice(fake_state_names)

In [None]:
# for _ in range(100):
#     print(get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8))

In [None]:
templates = [
    # Simple City-Based Queries
    "weather {city}",
    "{city} temperature",
    "sushi {city}",
    "ramen {city}",
    "pizza {city}",
    "plumber {city}",
    "electrician {city}",
    "roof repair {city}",
    "physio therapy {city}",
    "hospital {city}",
    "doctor {city}",
    "nurse {city}",
    "home improvement {city}",
    "home services {city}",
    "weather forecast {city}",
    "current weather {city}",
    "best restaurants {city}",
    "top yelp reviews {city}",
    "places to visit in {city}",
    "best cafes in {city}",
    "emergency services {city}",
    "gyms in {city}",
    "car repair {city}",
    "florist {city}",
    "lawyers in {city}",
    "real estate agents {city}",
    "hiking trails {city}",
    "parks in {city}",
    "movie theaters {city}",
    "top hotels in {city}",
    "events in {city} this weekend",
    "pharmacies {city}",
    "{food} near me {city}",
    "coffee near me {city}",
    "breakfast near me {city}",
    "restaurants near me {city}",

    # State-Based Queries
    "home services in {state}",
    "best restaurants in {state}",
    "real estate agents {state}",
    "roof repair services {state}",
    "hospitals in {state}",
    "weather {state}",
    "temperature {state}",
    "physio therapy {state}",
    "doctors in {state}",
    "top-rated plumbers {state}",
    "electricians {state}",
    "emergency services {state}",
    "sushi {state}",
    "ramen {state}",
    "pizza {state}",
    "parks in {state}",
    "hiking trails {state}",
    "pharmacies in {state}",
    "best cafes {state}",
    "movie theaters {state}",

    # City-State Combination Queries (Now using {city_state})
    "weather {city_state}",
    "{city_state} temperature",
    "sushi {city_state}",
    "plumber {city_state}",
    "best restaurants in {city_state}",
    "top-rated roof repair {city_state}",
    "hospital {city_state}",
    "physio therapy {city_state}",
    "doctor {city_state}",
    "events in {city_state} this weekend",
    "lawyers in {city_state}",
    "home improvement services {city_state}",
    "florist {city_state}",
    "best cafes in {city_state}",
    "parks in {city_state}",
    "movie theaters {city_state}",
    "top hotels in {city_state}",
    "emergency services {city_state}",
    "car repair {city_state}",
    "pharmacies {city_state}",

    "sushi {city_state}",
    "ramen {city_state}",
    "pizza {city_state}",
    "parks {city_state}",
    "hiking trails {city_state}",
    "pharmacies {city_state}",
    "best cafes {city_state}",
    "movie theaters {city_state}",
    "hamburgers {city_state}",
    "burgers {city_state}",
    "pasta {city_state}",
    "salads {city_state}",
    "vegan food {city_state}",
    "fried chicken {city_state}",
    "ramen {city_state}",
    "tacos {city_state}",
    "sandwiches {city_state}",
    "noodles {city_state}",
    "soups {city_state}",
    "cakes {city_state}",
    "ice cream {city_state}",
    "steak {city_state}",
    "seafood {city_state}",
    "breakfast food {city_state}",
    "brunch {city_state}",
    "desserts {city_state}",
    
    # CITY state order swapped
    "{city_state} sushi",
    "{city_state} ramen",
    "{city_state} pizza",
    "{city_state} parks",
    "{city_state} hiking trails",
    "{city_state} pharmacies",
    "{city_state} best cafes",
    "{city_state} movie theaters",
    "{city_state} hamburgers",
    "{city_state} burgers",
    "{city_state} pasta",
    "{city_state} salads",
    "{city_state} vegan food",
    "{city_state} fried chicken",
    "{city_state} ramen",
    "{city_state} tacos",
    "{city_state} sandwiches",
    "{city_state} noodles",
    "{city_state} soups",
    "{city_state} cakes",
    "{city_state} ice cream",
    "{city_state} steak",
    "{city_state} seafood",
    "{city_state} breakfast food",
    "{city_state} brunch",
    "{city_state} desserts",
    
    # Organization-Based Queries
    "{organization} in {city_state}",
    "contact {organization} in {city}",
    "locations of {organization} in {state}",
    "does {organization} provide home repair services in {city}?",
    "can I book a doctor appointment at {organization} in {state}?",
    "does {organization} offer roof repair in {city_state}?",
    "hours of {organization} in {city}",
    "{organization} reviews in {state}",
    "best rated {organization} in {city_state}",
    "nearest branch of {organization} in {city}",
    
    # Person-Based Queries
    "Where is {person} hosting an event?",
    "Can I meet {person} in {city_state}?",
    "Is {person} available for an appointment in {city}?",
    "Is {person} traveling to {state} next week?",
    "Does {person} have a speech in {city_state}?",
    
    # Mixed and Specialized Queries
    "roof repair near {city}",
    "best sushi in {city_state}",
    "what's the weather forecast for {city}?",
    "who are the top doctors in {city_state}?",
    "restaurants near {city} with good reviews",
    "plumbing services in {city_state}",
    "upcoming events in {city} this weekend",
    "find hiking trails in {city_state}",
    "local electricians in {city_state}",
    "ramen places in {city}",
    "home improvement contractors near {city_state}",
    "best pizza near {city}",
    "does {organization} operate in {city_state}?",
    "find top-rated hospitals in {city_state}",
    "home maintenance services in {city_state}",
    "weather forecast for {city} this weekend",
    "roof repair specialists in {city}",
    "top-rated movie theaters in {city_state}",
    

    # City-State Queries
    "Best {restaurant} in {city_state}",
    "Top-rated {restaurant} in {city_state}",
    "Affordable {restaurant} in {city_state}",
    "Where to find the best {food} in {city_state}?",
    "Popular {food} places in {city_state}",
    "Top destinations for {travel} in {city_state}",
    "Best deals on {travel} in {city_state}",
    "Where to eat {food} in {city_state}?",
    "What are the most famous {restaurant} in {city_state}?",
    "Top {food} restaurants in {city_state} this weekend",

    # Non-City/State Queries
    "Best {restaurant} in the country",
    "Where to find the best {food} near me?",
    "Top destinations for {travel} this summer",
    "Best deals on {travel} packages",
    "Where to find cheap {travel} options?",
    "Popular {food} dishes in the USA",
    "Best {restaurant} chains in the country",
    "What are the healthiest {food} options?",
    "How to book affordable {travel} for families?",
    "Most popular {restaurant} for takeout",

    # Additional Templates
    "What is the best {food} to eat for dinner?",
    "Where to order {food} online?",
    "Best {restaurant} for date night",
    "Top {travel} websites for booking vacations",
    "Where to find {restaurant} reviews?",
    "What are the top-rated {travel} apps?",
    "Best {restaurant} near tourist attractions",
    "What is the most popular {food} in the USA?",
    "Best deals on {travel} for students",
    "Top {restaurant} for family gatherings",
    "Most affordable {food} delivery services",
    "What are the best {travel} insurance options?",
    "How to find luxury {restaurant} reservations",
    "Where to get authentic {food} near me?",
    "Top {restaurant} for business lunches",
    "How to plan a {travel} adventure?",
    "Best {restaurant} for weekend brunch",
    "What are the most popular {food} trends?",
    "Best {restaurant} for a large group",
    "How to get discounts on {travel} bookings?"

    # Product-Based Queries
    "Where to buy {product} online?",
    "Best deals on {product}",
    "How to repair a {product}?",
    "Latest reviews of {product}",
    "When will the next {product} be released?",
    "Top features of {product}",
    "Is {product} worth buying in 2024?",
    "User reviews of {product}",
    "Alternatives to {product}",
    "What is the price of {product}?",

    # Country-Based Queries
    "How to travel to {country}?",
    "Best tourist destinations in {country}",
    "Top hotels to stay in {country}",
    "Do I need a visa to visit {country}?",
    "Cultural traditions in {country}",
    "What is the official language of {country}?",
    "How to do business in {country}?",
    "What are the top exports of {country}?",
    "Current political situation in {country}",
    "Famous landmarks in {country}",

    # Service-Based Queries
    "How to cancel my {service} subscription?",
    "Is {service} worth the price?",
    "How does {service} compare to competitors?",
    "User reviews of {service}",
    "How to get a discount on {service}?",
    "What are the benefits of {service}?",
    "Best alternatives to {service}",
    "How to troubleshoot issues with {service}?",
    "Does {service} have a free trial?",
    "Is {service} available internationally?",

    # Cars-Based Queries
    "What is the top speed of {car}?",
    "User reviews of {car}",
    "How to finance a {car}?",
    "Fuel efficiency of {car}",
    "How to buy a second-hand {car}?",
    "What are the safety features of {car}?",
    "Maintenance costs of owning a {car}",
    "What is the resale value of {car}?",
    "Is {car} electric or gas-powered?",
    "Best upgrades for {car}",

    # Gadgets-Based Queries
    "What are the best apps for {gadget}?",
    "How to set up a {gadget}?",
    "User reviews of {gadget}",
    "Best accessories for {gadget}",
    "What are the health benefits of using a {gadget}?",
    "What is the battery life of {gadget}?",
    "How to sync {gadget} with my phone?",
    "Alternatives to {gadget}",
    "What are the best productivity apps for {gadget}?",
    "Is {gadget} waterproof?",

    # Stocks-Based Queries
    "What is the latest price of {stock}?",
    "How to buy shares of {stock}?",
    "Is {stock} a good investment in 2024?",
    "What are analysts saying about {stock}?",
    "Current stock performance of {stock}",
    "What is the market cap of {stock}?",
    "How to invest in {stock}?",
    "Latest earnings report of {stock}",
    "What are the dividend yields of {stock}?",
    "How to trade {stock} on the stock market?",

    # Money-Based Queries
    "How to convert {money} to another currency?",
    "Best ways to transfer {money} internationally",
    "What are the risks of using {money}?",
    "How to save {money} for the future?",
    "What is the best way to invest {money}?",
    "How to protect {money} from fraud?",
    "What are the fees for using {money}?",
    "Is {money} safe for online transactions?",
    "Best apps for managing {money}",
    "How to track spending with {money}?",

    # Finance-Based Queries
    "How to invest in a {finance}?",
    "What are the benefits of having a {finance}?",
    "How to calculate the returns on {finance}?",
    "What are the risks of investing in {finance}?",
    "How to get advice for managing my {finance}?",
    "How to apply for a {finance}?",
    "What are the tax benefits of {finance}?",
    "What are the best options for a {finance}?",
    "How to open a {finance} account?",
    "What is the interest rate on {finance}?",

    # sports_term, location_and_landmark, activity_and_event, food_m, transport_and_direction
    # incomplete or misspelled sport/activity names
    "{sports_term} near me", 
    "find {sports_term}", 
    "{sports_term} schedule", 
    "{sports_term} news", 
    "book {sports_term} tickets", 
    "{sports_term} team", 
    "{sports_term} game time", 
    "when is the {sports_term} game", 
    "top {sports_term} players", 
    "local {sports_term} clubs", 
    "where to play {sports_term}", 
    "best {sports_term} venues", 
    "{sports_term} tournament",
    "{sports_term}",

    # Generic landmarks and location queries
    "{location_and_landmark} nearby", 
    "famous {location_and_landmark}", 
    "{location_and_landmark} open now", 
    "visit {location_and_landmark}", 
    "{location_and_landmark} directions", 
    "how to get to {location_and_landmark}", 
    "nearest {location_and_landmark}", 
    "{location_and_landmark} address", 
    "top-rated {location_and_landmark}", 
    "{location_and_landmark} hours", 
    "find {location_and_landmark} near me", 
    "{location_and_landmark} entry fee", 
    "best {location_and_landmark} in {city}",

    # Food and dining queries
    "{food_m} place", 
    "find {food_m}", 
    "best {food_m} spot", 
    "{food_m} delivery", 
    "{food_m} open near me", 
    "order {food_m}", 
    "{food_m} deals", 
    "{food_m} options", 
    "{food_m} near me", 
    "{food_m} reservation", 
    "top-rated {food_m} restaurants", 
    "{food_m} reviews", 
    "{food_m} menu", 
    "popular {food_m} dishes", 
    "where to eat {food_m}",

    # activities_and_events
    "{activity_and_event} tickets", 
    "nearest {activity_and_event}", 
    "{activity_and_event} today", 
    "upcoming {activity_and_event}", 
    "book {activity_and_event}", 
    "{activity_and_event} in {city}", 
    "find {activity_and_event}", 
    "{activity_and_event} schedule", 
    "{activity_and_event} near me", 
    "top-rated {activity_and_event} venues", 
    "{activity_and_event} details", 
    "how to attend {activity_and_event}", 
    "{activity_and_event} location", 
    "{activity_and_event} opening hours",
    "{activity_and_event}",

    # Single-word incomplete or ambiguous queries (standalone)
    # Sports and Games (single or incomplete)
    "footbal", "baske", "golf", "sush", "pizz", "zoo", "conc", "direc", 
    "theate", "stadiu", "brunc", "tourn", "parad", "swimmin", "train", "taxi", 
    "game", "meet", "mountain", "beac", "lake", "forest", "ligh", "restauran", 
    "parki", "stor", "monumen", "aren", "boardwal",
    # Locations and Landmarks (single or incomplete)
    "statue", "museum", "plaza", "zoo", "church", "theater", "stadium", "mountain", 
    "park", "lake", "beach", "river", "palace", "cathedra", "mansion", "monument", 
    "temple", "observato", "canyon", "garden", "conservato", "boardwal", "forest", 
    "pier", "lighthouse", "arena", "campgroun", "arch", "reservoi", "dam", "fountai", 
    "waterfal", "galleri", "amphitheate", "sculptur", "trail", "cliff", "tower", "islan",
    # Activities and Events (single or incomplete)
    "conc", "exhib", "meet", "parad", "festi", "tourn", "game", "sho", "even", "gala", 
    "confere", "seminar", "webina", "worksho", "lectur", "symposiu", "screenin", 
    "rall", "celebratio", "ceremon", "get-togethe", "perfor", "gatherin", "competitio", 
    "maratho", "speec", "workout", "exercis", "demonstratio", "ceremony", "readin", 
    "daytrip", "lectur", "social", "activit", "performanc", "worksho", "openin", 
    "finale", "comedy", "poetr", "talent", "match",
    # Restaurants and Food Types (single or incomplete)
    "sush", "pizz", "ramen", "bbq", "vega", "steak", "taco", "burg", "pasta", "brunc", 
    "desse", "drink", "grill", "bake", "buffet", "sandwich", "noodle", "cafe", 
    "taver", "gastro", "bistro", "deli", "saloo", "barbecue", "snack", "confectio", 
    "pub", "salad", "cuisine", "fries", "wings", "pantr", "meatbal", "sub", "omel", 
    "crepe", "wrap", "beverag", "dessert", "smoothie", "juice", "shake", "frappe", "coffee",
    # Transport and Directions (single or incomplete)
    "direc", "map", "bus", "train", "car", "park", "taxi", "subwa", "fly", "plane", 
    "ticke", "pass", "ferr", "bicycl", "scoote", "shuttl", "walkin", "rideshar", 
    "transi", "toll", "metr", "road", "route", "stop", "junctio", "termina", "highwa", 
    "pathwa", "drivewa", "loop", "intersectio", "trailhead", "tub", "sidestro", 
    "crosswal", "rout", "navigatio", "crossing", "pave", "deck", "lane",
    # Technology and Gadgets (single or incomplete)
    "lapt", "smartphon", "comput", "tablet", "earbuds", "bluetooth", "charg", "cabl", 
    "headset", "monitor", "consol", "keyboard", "drive", "storag", "gaming", "mouse", 
    "projector", "flashdriv", "powerban", "adapter", "webcam", "router", "modem", 
    "camcorder", "printer", "copier", "recorde", "remote", "surge", "extend", "plug", 
    "portabl", "backu", "networ", "recharge", "uplo", "downlo", "strea", "screencas", 
    "googl", "apple", "micros", "andr",

    # actual city and states
    "{food} {city}", 
    "{food} {state}", 
    "{city} {food}", 
    "{state} {food}", 
    "{food} in {city}", 
    "{food} in {state}", 

    # fake cities and states
    "{food} {fake_cty}", 
    "{food} {fake_state_cd}", 
    "{food} {fake_state_nam}", 
    "{fake_cty} {food}", 
    "{fake_state_cd} {food}", 
    "{fake_state_nam} {food}", 
    "{food} in {fake_cty}", 
    "{food} in {fake_state_cd}", 
    "{food} in {fake_state_nam}", 

    # celebrities
    "{celebrity}",
    "{celebrity} age",
    "{celebrity} net worth",
    "{celebrity} movies",
    "What shows has {celebrity} been on?",
    "What awards has {celebrity} won?",
    "Where does {celebrity} live?",
    "What are {celebrity}'s upcoming projects?",
    "{celebrity} diet",
    "is {celebrity} married?",
    "does {celebrity} live in {city}",

    ## unknown random queries
    'snoozlegrip', 'shenanigans', 'kerplunk', 'clip', 'snappyy', 'spindlywhack', 'crinkly', 'pressed enter too soon', 
    'try this', 'query here', 'mistyped selection', 'smorgasbord', 'crumplify', 'snooze', 'twonkle', 'bamboozlemate', 
    'this doesn’t matter', 'zap', 'mind blank', 'hiss', 'snagged', 'splurgy', 'snagglebash', 'guess', 'zapz', 'frap', 
    'blotter', "don't even know", 'don’t know answer', 'spindletastic', 'zizzlesplat', 'jinkled', 'placeholder search', 
    'uncertain search', 'splode', 'abcxyz', 'twangleblop', 'shifty', 'bumfuzzle', 'plunge', 'thingy', 
    'swooshenator', 'quark', 'tatterblast', 'frizzlefry', 'something random', 'puff', 'blobby', 'placeholder attempt', 
    'weird example', 'wiggle', 'snortleboo', 'bouncy', 'qwerty', 'whirl', 'nix', 'idk what', 'random search', 
    'glimmering', 'guzzle', 'strange text', 'accidental hit', 'forgot keypress', 'dazzleplunk', 'snurply', 
    'confused', 'weird gibberish', 'idc either', 'test123', 'huff', 'supercalifragilistic', 'clap', 'whoopsie', 'nump', 
    'lorem ipsum', 'snuffle', 'unknown phrase', 'whizz', 'bloop', 'glitch', 'zomp', 'clappy', 'gush', 'zappletastic', 
    'hooey', 'bing', 'slap', 'ting', 'miscellaneous', 'jingle', 'idk just looking', 'twangy', 'dinglefrizzle', 
    'just clicking', 'quizzical', 'splatterdash', 'kerplunkitude', 'fizzlematic', 'piff', 'jazz', 'jib', 'random phrase', 
    'flapper', 'uhmm', 'nothing much', 'sdf', 'snub', 'confusing example', 'keyboard smash', 'randomized words', 
    'nothing useful', 'random sentence', 'placeholder input', 'splattergrip', 'zorp', 'fluffernutter', 'splopp', 
    'incomplete search', 'check this out', 'woozle', 'bananarama', 'quiz', 'spiffy', 'undefined', 'confusing term', 'sploom', 
    'randomized example', 'spliffy', 'ooze', 'blazing', 'uncertain input', 'unknown search', 'random guesses', 
    'unknown', 'concept unclear', 'accidental input', 'sporkinator', 'whats this', 'maybe', 'ignore this', 'twinkle', 
    'whatchamacallit', 'splank', 'weird thing', 'huh', 'into the unknown', 'chaos', 'wigglie', 'twistamatic', 'kerflapify', 
    'twizzletude', 'mock', 'thud', 'shrug', 'grizzed', 'jibberjabber', 'weirdness', 'anything', 'plop', 'dazzlicious', 
    'random selection', 'splatt', 'abracadabra', 'whooshenator', 'random mouse click', 'sparklefish', 'banal', 
    "what's the word", 'mistyped search', 'twinklebash', 'splush', 'splazz', 'forgot search term', 'crumplamatic', 'glee', 
    'whizzy', 'whizzlemate', 'jumpy', 'dork', 'randomxyz', 'gobsmacktastic', 'no clue what', 'zazz', 'beyond the void', 
    'weird try', 'drift', 'yank', 'yodelsnap', 'biff', 'forgot randomness', 'splatterblast', 'no idea', 'smooshify', 
    'peep', 'rick', 'splendiferous', 'squishy', 'muff', 'flabbergizmo', 'confuzzled', 'I think so', 'zing', 
    'meaningless typing', 'shush', 'zany', 'don’t need help', 'randomly chosen', 'warpydash', 'forgot words', 
    'placeholder typing', 'spunky', 'spindleplop', 'crash', 'flabbergast', 'snaggleplop', 'hootnanny', 'blurp', 
    'miff', 'snarkle', 'snookie', 'gleamitude', 'hello world', 'zag', 'accidental gibberish', 'nothing in mind', 
    'bash', 'spiv', 'rift', 'don’t know what to search', 'splong', 'no point', 'forgot attempt', 'fluttermate', 
    'flub', 'guff', 'dazzled', 'doodad', 'forgot term', 'blotchy', 'odd', 'kerplazzle', 'grubby', 'try to see', 'glop', 
    'whooshify', 'snicker', 'snuffly', 'random thought', 'mixed up stuff', 'zapper', 'sort of searching', 'slushy', 
    'blurification', 'mop', 'smit', 'splurge', 'meaningless input', 'quix', 'zapplarific', 'splang', 'zoinkalicious', 
    'unclear selection', 'splushy', 'guesstimate', 'snazzie', 'what about this', 'input fail', 'codswallop', 'dink', 'splunk', 
    'unclear', 'strange example', 'jitter', 'sploff', 'blip', 'unknown meaning', 'nope', 'gadzooks', 'odd example', 
    'zappomatic', 'janglystorm', 'ink', 'wobbled', 'wigglyy', 'typed by mistake', 'twirly', 'lurk', 'kerplottify', 
    'twizzlefang', 'muck', 'clunky', 'splatterific', 'clippy', 'oops input', 'what am I doing', 'qazwsxedc', 'does it matter', 
    'nonsensical', 'swooshinator', 'poiuuy', 'splish', 'mistyped query', 'squizzlewhack', 'what now', 'spluzz', 'glim', 
    'placeholder keypress', 'mistyped randomness', 'what is it', 'don’t know why', 'quibbleplop', 'guess what', 'snizzlezap', 
    'meaning of nothing', 'wiggles', 'zxcvbn', 'spur', 'uncertain term', 'what am I typing', 'zoodleblorp', 'floppy', 'asdfasdf', 
    'confused input', 'unclear sentence', 'snortlematic', 'smooshinator', 'random term', 'searching something', 
    'snorflemate', 'twinkly', 'skip', 'quib', 'forgotten term', 'oops', 'splodge', 'meaningless words', 'unclear input', 
    'unclear phrase', 'zoom', 'sneeze', 'cat on keyboard', 'nincompoop', 'zappification', 'warpington', 'splurty', 
    'do I know', 'splott', 'splurb', 'plink', 'dazzlematic', 'could be anything', 'lost thoughts', 'what', 'pizz', 
    'jiggles', 'splodgy', 'twang', 'i forgot', 'meaningless term', 'unclear search', 'thunderplunk', 'just pressing keys', 
    'splodgify', 'flit', 'snazzify', 'zoop', 'totally confused', 'quip', 'womp', 'wham', 'wigglyz', 'fuzzyy', 'why is this here', 
    'malarkey', 'widget', 'don’t care', 'scoff', 'randomized search', 'unclear example', 'pop', 'quash', 'uh oh', 
    'placeholder randomness', 'splatification', 'snickerplunk', 'nutterbutter', 'whisk', 'nibs', 'help', 'strange attempt', 
    'blurptacular', 'gizmo', 'forgotten query', 'spazzy', 'ding', 'lost search', 'buzzing', 'hum', 'nonsensicality', 
    'gloop', 'globby', 'lost meaning', 'plopperific', 'hard to say', 'snappy', 'don’t type this', 'blunderous', 'twizzlegrip', 
    'flappy', 'random keypress', 'zizzlewhack', 'forgot what I typed', 'zingerdoodle', 'randomized attempt', 'unsure words', 
    'strange sentence', 'asfjkl', 'frizz', 'idk', 'gobbledygook', 'flibbertigibbet', 'gadzookify', 'flabberzap', 'vroom', 
    'splitch', 'glimmerstorm', 'blurt', 'frizzle', 'meaningless search', 'thingamajig', 'murmur', 'not this', 'sploof', 
    'fiddlewhip', 'mumbojumbo', 'something strange', 'splurg', 'fake input', 'whiffle', 'forgot query', 'search mix', 
    'yapplify', 'zippy', 'splurpy', 'splat', 'zoinks', 'bizz', 'crumby', 'meaningless query', 'snickerdoodle', 'weird word', 
    'squidge', 'don’t know term', 'spangletude', 'spazzmatic', 'just testing', 'baffled', 'splurt', 'gaze', 'frizzy', 
    'bamboozling', 'slurp', 'zappertude', 'splorch', 'swooshtastic', 'dunk', 'honk', 'smudgy', 'flimmerstorm', 'tizz', 
    'uncertain randomness', 'jangletude', 'perhaps this', 'placeholder search term', 'whoosh', 'spike', 'glitterbop', 
    'idiosyncratic', 'odd typing', 'blob', 'bazzlemate', 'crumpleton', 'clutterbomb', 'whatever', 'kerfuffle', 'test input', 
    'randomized keypress', 'meaningless randomness', 'why not', 'snizzleblap', 'bonk', 'forgot search', 'zonk', 'whatsisname', 
    'doesn’t matter', 'splurgz', 'twig', 'ramblethorp', 'fake query', 'ping', 'smack', 'buzz', 'tingly', 'warpydoodle', 
    'filler words', 'buzzed', 'unclear thought', 'weird input', 'blap', 'snazzy', 'look for this', 'snorkelwhip', 'spoon', 
    'just guessing', 'glitche', 'swirl', 'snooker', 'search fail', 'random gibberish', 'abstract thought', 'spindelicious', 
    'snorple', 'fell asleep typing', 'splunge', 'twit', 'grippy', 'flip', 'whatsisface', 'maybe something', 'bamboozle', 
    'zinger', 'drizzleblip', 'splonky', 'what do I search', 'blat', 'another try', 'odd randomness', 'yarn', 'squib', 
    'confused term', 'flabbergasted', 'testing input', 'don’t know', 'thunderbop', 'blurpsational', 'janglydash', 'brouhaha', 
    'find out about', 'strange randomness', 'kerplizzle', 'meaningless attempt', 'spud', 'placeholder term', 'woof', 'splaff', 
    'jigglez', 'fuzzed', 'blahblah', 'grizzle', 'something here', 'blink', 'snuggly', 'yelp', 'chop', 'eternal question', 'splift', 
    'what do you mean', 'hullabazoo', 'cloggy', 'wrong key pressed', 'test again', 'don’t ask me', 'blur', 'twisty', 'flapperdash', 
    'crinklewhip', 'plinky', 'gobbleplop', 'I don’t understand', 'random', 'dummy text', 'blurblenator', 'try something', 'input here', 
    'thing', 'fringe', 'no answer', 'placeholder selection', 'test', 'spangleplop', 'splash', 'lost in thought', 'zest', 
    'fiddleplop', 'bunk', 'snag', 'vex', 'placeholder randomness example', 'spat', 'placeholder phrase', 'random search term', 
    'squigg', 'tinge', 'random words', 'unknown query', 'not useful', 'snuzzlefrump', 'type here', 'snuzzle', 'drip', 'gibberish', 
    'hodgepodge', 'forgot the term', 'completely random', 'doesn’t make sense', 'lost', 'splatterstorm', 'meaningless text', 
    'twizzle', 'find something', 'twinkletude', 'zine', 'spunked', 'crikey', 'mistaken input', 'no idea what this is', 'spork', 
    'glimmertastic', 'sloppy', 'twirky', 'abstract query', 'fluffytude', 'randomized selection', 'randomized randomness', 
    'nudge', 'gawk', 'buzzer', 'nonsensical search', 'i was curious', 'zapplify', 'cloppy', 'doohickey', 'snickly', 'doodle', 
    'placeholder example', 'placeholder text', 'nonsense search', 'why search this', "this doesn't work", 'splendiferific', 
    'crappy', 'what are words', 'clop', 'randomized term', 'weird', 'snazztastic', 'whizzbang', 'blaze', 'twangaloo', 
    'strange keypress', 'placeholder query', 'skew', 'splink', 'lkjhgfd', 'unclear meaning', 'flummoxify', 'lollygag', 
    'odd gibberish', 'clunk', 'snap', 'zapf', 'flummoxed', 'yawn', 'random input', 'strange word', 'zapplomatic', 
    'does this work', 'gasp', 'typing nothing', 'idk anymore', 'empty thoughts', 'pluck', 'randomized test', 
    'brain fog', 'squibbletude', 'fizzle', 'jinglyy', 'mistyped term', 'confused mind', 'random typing', 'asdfgh', 
    'infinity', 'twist', 'something typed', 'kerplunktastic', 'just trying this', 'mistaken search', 'sparklematic', 
    'woop', 'jittery', 'oopsie', 'snippy', 'splinky', 'splint', 'swooshification', 'spit', 'zinged', 'blop', 'lost words',
    'crux', 'blurbleplop', 'balderdash', 'perhaps not', 'flibber', 'snickerwhack', 'try later', 'zork', 'void', 
    'accidental query', 'fumble', 'snarked', 'don’t care search', 'just looking', 'spindling', 'snip', 'squish', 
    'blazer', 'splo', 'splunky', 'unclear randomness', 'spliff', 'not this either', 'nonsensical words', 
    'testing random', 'snigglewhap', 'odd input', 'whizzlegrip', 'dazzlegrip', 'fling', 'meaning of gibberish', 
    'weird thoughts', 'gunk', 'does this help', 'flux', 'wink', 'wonky', 'wisp', 'drizzlematic', 'another test', 
    'test search', 'just wondering', 'crumblewhack', 'spaz', 'splung', 'skid', 'quirky', 'odd search', 'accidental term', 
    'dunno', 'quizzicality', 'gleam', 'glimmer', 'don’t press enter', 'gadget', 'whizzleplop', 'don’t know exactly', 
    'odd words', 'blotty', 'thunderblop', 'maybe not', 'spludge', 'discombobulated', 'stuff', 'halfway done', 
    'sparklenator', 'zang', 'jolt', 'accidental search', 'what is going on', 'wiggler', 'mnbvcxz', 'yip', 'wriggle', 
    'hullaballoo', 'janglenut', 'zapplesmash', 'janglitude', 'what is this', 'whip', 'tiddlywinks', 'wiggly', 'weird randomness', 
    'sporkalicious', 'wriggy', 'meaningless selection', 'crumble', 'weird thought', 'splurch', 'don’t understand', 
    'sploosh', 'yap', 'nonsense', 'wobble', 'question of life', 'randomly typed', 'snuggle', 'snizzlegrip', 'oops I typed', 
    'zappy', 'twinkleplop', 'uncertain example', 'idc', 'mash', 'not sure', 'pandemonium', 'perhaps later', 'quirked', 
    'smug', 'warp', 'dash', 'could be nothing', 'unsure search', 'jumbled phrases', 'hush', 'wibble', 'weird search', 
    'quibberish', 'flop', 'discombobulate', 'this makes no sense', 'fizz', 'quirkitude', 'zingzang', 'dank', 'limitless', 
    'this is random', 'crunch', 'vibe', 'nothing specific', 'forgot', 'not important', 'slosh', 'question mark', 'zoopendous', 
    'flummify', 'splosh', 'splorp', 'splishy', 'snurkle', 'blah', 'guess answer', 'twitch', 'flap', 'snooperdoodle', 
    'janglybits', 'snizzleflap', 'slush', 'snortlemate', 'quirk', 'void query', 'fizzled', 'lollygagging', 'wonkifying', 
    'nothing', 'splunch', 'hullabaloo', 'thingamabob', 'dazzlebash', 'whizzie', 'this and that', 'shard', 'twix',
    "crumpled", "splizzle", "gargle", "mangled", "shamble", "wobblish", "drizzlepop",
    "splinker", "fiddlest", "twizzlepop", "blurzzle", "snizzlewick", "wozzle", 
    "cracklepop", "glibbish", "twezzle", "boondock", "sizzleflip", "snigglemash",
    "zazzle", "fizzlepot", "scramble", "tinglish", "sprozzle", "blimble", "zibble",
    "slapdash", "gobstork", "ziggler", "flingle", "wrangly", "twizzlebit", "brambly",
    "snubble", "splintery", "fizznack", "tibber", "quaggly", "whooshpop", "snibble",
    "plunkish", "glimflash", "wobbert", "squidgy", "kerplonk", "fobble", "blurzy",
    "scriggly", "smudgify", "tassler", "whipple", "snuzzify", "zaggle", "plonker",
    "smizzle", "quiggle", "spongle", "shizzle", "drippity", "bogglepop", "twiddly",
    "puzzleth", "flummish", "sniggleflop", "crumplish", "twiggle", "nubbish", 
    "splurkle", "whibber", "jibblish", "twonker", "fizzlewhip", "spazzle", "splorpish",
    "snuffler", "hubble", "twinkler", "crumpler", "wimbley", "twazzle", "blurbonic",
    "zapplepop", "flippery", "snuzle", "quizzwhip", "clatter", "garglunk", "splingle",
    "drabbler", "spunkly", "jumbler", "snappish", "zingify", "buzzpop", "snizzlehop",
    "plobber", "scribble", "twongle", "scrabbly", "sniggler", "bimblepop", "snorplebop",
    "wizzle", "blimpy", "splinglepop", "frizzlepop", "grizzleton", "whizbang", 
    "tinklish", "blopple", "blurbit", "wozzly", "zingpong", "splimble", "twinklypop",
    "spinkly", "snubbleton", "glozzle", "splonkle", "quizzle", "drizzlebot", "snarbly",
    "twizzleth", "whizzleton", "crumblish", "snapple", "splozzle", "glimmish", 
    "plimbish", "snuzzleblop", "twinklish", "fizzywhip", "snorblish", "drizzler", 
    "flopplish", "smizzlepop", "crumpledash", "twizzlefizz", "plumbly", "smuzzle",
    "tizzler", "gobblish", "splunkton", "jibberdash", "sproingly", "snizzler", 
    "glabble", "twinkleflip", "flobble", "twonklepop", "splittish", "grumblepop",
    "whimblish", "splingledash", "snarpish", "twinklybit", "spindlish", "grubble",
    "smarple", "twonkerish", "sniffly", "snibbleton", "grizzlepop", "tazzler", 
    "splinsh", "snazzler", "twinklepuff", "zopple", "glunkish", "crizzlepop", 
    "snarklebot", "whibblish", "flimmerdash", "splurpyton", "snuzzlepop", "wigglerish",
    "sniggleplop", "jigglish", "splurble", "buzzsnip", "plomble", "splattypop", 
    "twinklepip", "twonglish", "flobber", "grimpish", "quaggler", "sporkish", 
    "drizzleth", "squiggler", "splobber", "ploppish", "snigglerish", "splingleth",
    "grizzleblop", "sploblish", "snarbler", "smarvish", "quizzlet", "snapplish",
    "snuzleflip", "plongish", "crizzlebot", "grimpish", "twinklebot", "blurpish",
    "splopple", "gizzleth", "drizzlepuff", "twonklish", "snubbler", "blurblebot",
    "splizzy", "twinkleton", "jibbler", "splizzlepop", "splurbit", "plobblish", 
    "crumplish", "snizzlebit", "twinklishbot", "spinkler", "snibbleflip", "wigglebot",
    "twonglishbot", "snizzleton", "splongle", "blonker", "glimmerbit", "snarvish",
    "love", "anger", "hope", "dream", "thought", "courage", 
    "strength", "patience", "birthday", "anniversary", 
    "vacation", "weekend", "holiday", "winter", "summer", 
    "autumn", "spring", "success", "failure", "freedom", "peace", "wisdom", 
    "kindness", "respect", "free", "freedom", "great", "best", "worst", "last", "first", "second", 
    "next", "there", "banana", "apple",
]

In [None]:
len(templates)

In [None]:
# Placeholder strings that may appear inside a query template.
# generate_queries() checks for each placeholder with detect_entity() and,
# when present, samples a value that is substituted via str.format().

# Entities that receive NER labels in tokenize_and_label().
PERSON_ENTITY = "{person}"
ORG_ENTITY = "{organization}"
CITY_ENTITY = "{city}"
STATE_ENTITY = "{state}"
CITY_STATE_ENTITY = "{city_state}"
# Filler entities: substituted into the query text but not passed to
# tokenize_and_label(), so their tokens keep the default label 0.
PRODUCT_ENTITY = "{product}"
COUNTRY_ENTITY = "{country}"
# NOTE(review): this placeholder is plural ("services"); confirm it matches
# the spelling used in the templates and the keyword given to str.format().
SERVICE_ENTITY = "{services}"
CAR_ENTITY = "{car}"
GADGET_ENTITY = "{gadget}"
STOCK_ENTITY = "{stock}"
MONEY_ENTITY = "{money}"
FINANCE_ENTITY = "{finance}"
TRAVEL_ENTITY = "{travel}"
FOOD_ENTITY = "{food}"
RESTAURANT_ENTITY = "{restaurant}"
SPORTS_TERMS_MISSING_ENTITY = "{sports_term}"
LOCATIONS_AND_LANDMARKS_ENTITY = "{location_and_landmark}"
# (sic) "ACTIVTIES" is misspelled, but the name is referenced by
# generate_queries(); rename only together with every usage.
ACTIVTIES_AND_EVENTS_ENTITY = "{activity_and_event}"
FOOD_MISSING_ENTITY = "{food_m}"
TRANSPORT_AND_DIRECTIONS_ENTITY = "{transport_and_direction}"

# Made-up locations: substituted into the text but never labelled.
FAKE_CITY_ENTITY = "{fake_cty}"
FAKE_STATE_CODE_ENTITY = "{fake_state_cd}"
FAKE_STATE_NAME_ENTITY = "{fake_state_nam}"
# Celebrities are labelled with the same person tags (1 / 2) as {person}.
CELEBRITY_ENTITY = "{celebrity}"


def detect_entity(entity_name, template):
    """Return True when the placeholder ``entity_name`` occurs in ``template``.

    The check is a plain substring test, so ``"{city}"`` does not match a
    template that only contains ``"{city_state}"``.
    """
    is_present = entity_name in template
    return is_present

def tokenize(text):
    """Split ``text`` into word tokens and single punctuation tokens.

    Runs of word characters form one token; every other non-whitespace
    character becomes a token of its own. Whitespace is discarded.
    """
    # The pattern has no capture groups, so each match's group(0) is exactly
    # the string that re.findall would have returned.
    token_matches = re.finditer(r'\w+|[^\w\s]', text)
    return [match.group(0) for match in token_matches]

# Tokenize the query and generate corresponding NER labels
def tokenize_and_label(query, city, state, city_state, organization, person, celebrity):
    """Tokenize ``query`` and tag the tokens of each supplied entity.

    Label ids written by this function:
        0  O (outside any entity)
        1  B-PER    2  I-PER      (person and celebrity)
        3  B-ORG    4  I-ORG      (organization)
        5  CB-LOC   6  CI-LOC     (city)
        7  SB-LOC   8  SI-LOC     (state)
        9  CSB-LOC  10 CSI-LOC    (city_state)

    Each entity argument is the exact text substituted into the query, or a
    falsy value when the entity is absent. Only the first occurrence of an
    entity's token sequence is labelled.

    Returns:
        A ``(tokens, ner_labels)`` pair of equal-length lists.
    """
    tokens = tokenize(query)
    ner_labels = [0] * len(tokens)

    # (entity text, begin label, inside label), applied top to bottom.
    # Where spans overlap, an entity lower in the table overwrites the labels
    # written by an earlier one.
    labelling_plan = (
        (city_state, 9, 10),
        (city, 5, 6),
        (state, 7, 8),
        (organization, 3, 4),
        (person, 1, 2),
        (celebrity, 1, 2),
    )

    for entity_text, begin_label, inside_label in labelling_plan:
        if not entity_text:
            continue
        entity_tokens = tokenize(entity_text)
        first = find_token_index(tokens, entity_tokens)
        if first is None:
            continue
        ner_labels[first] = begin_label
        for offset in range(1, len(entity_tokens)):
            ner_labels[first + offset] = inside_label

    return tokens, ner_labels

# Function to find the starting index of an entity's tokens in the query tokens
def find_token_index(tokens, entity_tokens):
    """Return the index where ``entity_tokens`` first occurs in ``tokens``.

    Args:
        tokens: Token list of the full query.
        entity_tokens: Token list of the entity to locate.

    Returns:
        The start index of the first contiguous match, or ``None`` when there
        is no match or ``entity_tokens`` is empty.
    """
    span = len(entity_tokens)
    # An empty entity would trivially "match" at index 0 and make the caller
    # tag the first query token as an entity start, so report it as not found.
    if span == 0:
        return None
    for start in range(len(tokens) - span + 1):
        if tokens[start:start + span] == entity_tokens:
            return start
    return None

def generate_queries(templates, n_queries=10000, max_consecutive_duplicates=None):
    """Generate unique synthetic queries together with token-level NER labels.

    A template is drawn at random, every placeholder it contains is filled
    with a sampled value, and the resulting query is tokenized and labelled.
    Queries that were already produced are discarded and a new draw is made.

    Args:
        templates: Sequence of ``str.format`` style template strings.
        n_queries: Number of distinct queries to produce.
        max_consecutive_duplicates: Optional safety valve. When set, the loop
            stops early after this many duplicate draws in a row, which
            prevents an endless loop when the templates cannot yield
            ``n_queries`` distinct queries. ``None`` (default) keeps retrying
            without limit.

    Returns:
        A list of ``(query, tokens, ner_labels)`` tuples.

    Side effects:
        Prints a progress line every 10000 accepted queries and a summary of
        the fake-location, celebrity and total counts at the end.
    """
    cnt = 0
    fake_cnt = 0
    celeb_cnt = 0
    queries_with_labels = []
    seen_queries = set()
    last_reported = None
    consecutive_duplicates = 0
    while cnt < n_queries:
        # Report each milestone once; without the guard the same line would be
        # printed again for every duplicate draw while cnt stays unchanged.
        if (cnt % 10000) == 0 and cnt != last_reported:
            print(f"completed generating {cnt} queries")
            last_reported = cnt
        template = random.choice(templates)
        person, organization, city, state, city_state = (None,) * 5
        product, country, service, car, gadget, stock, money, finance, travel, food, restaurant = (None,) * 11
        sports_term, location_and_landmark, activity_and_event, food_m, transport_and_direction = (None,) * 5
        fake_cty, fake_state_cd, fake_state_nam, celebrity = (None,) * 4

        if detect_entity(PERSON_ENTITY, template):
            person = get_random_choice_from_list(persons)
        if detect_entity(ORG_ENTITY, template):
            organization = get_random_choice_from_list(organizations)
        if detect_entity(PRODUCT_ENTITY, template):
            product = get_random_choice_from_list(products)
        if detect_entity(COUNTRY_ENTITY, template):
            country = get_random_choice_from_list(countries)
        # Fix: this branch used to test COUNTRY_ENTITY, so a service was only
        # sampled for templates containing {country}, and a service
        # placeholder on its own was rendered as the text "None".
        # Both spellings are accepted because SERVICE_ENTITY is "{services}"
        # while the format() keyword has always been ``service``.
        if detect_entity(SERVICE_ENTITY, template) or detect_entity("{service}", template):
            service = get_random_choice_from_list(services)
        if detect_entity(CAR_ENTITY, template):
            car = get_random_choice_from_list(cars)
        if detect_entity(GADGET_ENTITY, template):
            gadget = get_random_choice_from_list(gadgets)
        if detect_entity(STOCK_ENTITY, template):
            stock = get_random_choice_from_list(stocks)
        if detect_entity(MONEY_ENTITY, template):
            money = get_random_choice_from_list(moneys)
        if detect_entity(FINANCE_ENTITY, template):
            finance = get_random_choice_from_list(finances)
        if detect_entity(TRAVEL_ENTITY, template):
            travel = get_random_choice_from_list(travels)
        if detect_entity(FOOD_ENTITY, template):
            food = get_random_choice_from_list(foods)
        if detect_entity(RESTAURANT_ENTITY, template):
            restaurant = get_random_choice_from_list(restaurants)
        if detect_entity(SPORTS_TERMS_MISSING_ENTITY, template):
            sports_term = get_random_choice_from_list(sports_terms_missing)
        if detect_entity(LOCATIONS_AND_LANDMARKS_ENTITY, template):
            location_and_landmark = get_random_choice_from_list(locations_and_landmarks)
        if detect_entity(ACTIVTIES_AND_EVENTS_ENTITY, template):
            activity_and_event = get_random_choice_from_list(activities_and_events)
        if detect_entity(FOOD_MISSING_ENTITY, template):
            food_m = get_random_choice_from_list(food_missing)
        if detect_entity(TRANSPORT_AND_DIRECTIONS_ENTITY, template):
            transport_and_direction = get_random_choice_from_list(transport_and_directions)

        if detect_entity(FAKE_CITY_ENTITY, template):
            fake_cty = get_sample_fake_city()
        if detect_entity(FAKE_STATE_CODE_ENTITY, template):
            fake_state_cd = get_sample_fake_state_code()
        if detect_entity(FAKE_STATE_NAME_ENTITY, template):
            fake_state_nam = get_sample_fake_state_name()

        if detect_entity(CITY_ENTITY, template):
            city = get_sample_from_cities(city_info, city_weights, actual_threshold=0.7)
        if detect_entity(STATE_ENTITY, template):
            state = get_sample_from_states(state_info, actual_threshold=0.5)
        if detect_entity(CITY_STATE_ENTITY, template):
            city_state = get_sample_from_cities_and_states(city_state_code_info, city_state_name_info, state_code_threshold=0.8)

        if detect_entity(CELEBRITY_ENTITY, template):
            celebrity = get_random_choice_from_list(celebrities)

        # str.format ignores unused keyword arguments, so passing the service
        # value under both spellings is safe for every template.
        query = template.format(person=person,
                                organization=organization,
                                city=city,
                                state=state,
                                city_state=city_state,
                                product=product,
                                country=country,
                                service=service,
                                services=service,
                                car=car,
                                gadget=gadget,
                                stock=stock,
                                money=money,
                                finance=finance,
                                travel=travel,
                                food=food,
                                restaurant=restaurant,
                                sports_term=sports_term,
                                location_and_landmark=location_and_landmark,
                                activity_and_event=activity_and_event,
                                food_m=food_m,
                                transport_and_direction=transport_and_direction,
                                fake_cty=fake_cty,
                                fake_state_cd=fake_state_cd,
                                fake_state_nam=fake_state_nam,
                                celebrity=celebrity
                               )

        if query in seen_queries:
            consecutive_duplicates += 1
            if (max_consecutive_duplicates is not None
                    and consecutive_duplicates >= max_consecutive_duplicates):
                print(f"stopping early after {consecutive_duplicates} consecutive duplicate queries")
                break
            continue
        consecutive_duplicates = 0
        seen_queries.add(query)

        # Label only accepted queries; duplicates are discarded above.
        tokens, ner_labels = tokenize_and_label(query, city, state, city_state, organization, person, celebrity)
        queries_with_labels.append((query, tokens, ner_labels))
        cnt += 1
        if (detect_entity(FAKE_CITY_ENTITY, template) or
                detect_entity(FAKE_STATE_CODE_ENTITY, template) or
                detect_entity(FAKE_STATE_NAME_ENTITY, template)):
            fake_cnt += 1
        if detect_entity(CELEBRITY_ENTITY, template):
            celeb_cnt += 1
    print(f"fake_cnt = {fake_cnt}")
    print(f"celeb_cnt = {celeb_cnt}")
    print(f"cnt = {cnt}")
    return queries_with_labels

In [None]:
queries_with_labels = generate_queries(templates, n_queries=450000) # 300000

In [None]:
len(queries_with_labels)

In [None]:
# queries_with_labels[:10]
df_ner_examples = pd.DataFrame(queries_with_labels, columns=['query', 'tokens', 'ner_tags'])
df_ner_examples

In [None]:
# Distribution of how many tokens per query carry a tag id above 4, i.e. one
# of the city / state / city_state labels (5-10) written by tokenize_and_label.
df_ner_examples['ner_tags'].apply(lambda tags: len([tag for tag in tags if tag > 4])).value_counts()

In [None]:
label_map

In [None]:
df_ner_examples['query'].value_counts()

In [None]:
# df_ner_examples.to_csv("../data/df_ner_examples_v3.csv", index=False)
# df_ner_examples.to_csv("../data/df_ner_examples_v4.csv", index=False)
# df_ner_examples.to_csv("../data/df_ner_examples_v5.csv", index=False)  # Additional partial cities
df_ner_examples.to_csv("../data/df_ner_examples_v6.csv", index=False)  # Additional partial cities

In [None]:
# useful for post processing to standardize the city names
def build_lookup(dataframe):
    """Build a dict mapping each lower-cased alternate name to its city name.

    Args:
        dataframe: DataFrame with a ``city_name`` column and an
            ``alternate_names`` column holding an iterable of strings per row.

    Returns:
        ``{alternate_name.lower(): city_name}``. When the same alternate name
        occurs in several rows, the row that comes last wins.
    """
    alias_to_city = {}
    for _, record in dataframe.iterrows():
        canonical = record['city_name']
        # Lower-case the keys so lookups can be done case-insensitively.
        alias_to_city.update(
            {alias.lower(): canonical for alias in record['alternate_names']}
        )
    return alias_to_city

city_alternate_to_city_lkp = build_lookup(city_states_data)

In [None]:
len(city_alternate_to_city_lkp)

In [None]:
# city_alternate_to_city_lkp

In [None]:
# !python -m pip install onnxruntime

In [None]:
# !python -m pip freeze| grep  onnxruntime

In [None]:
# !mkdir ../models

In [None]:
import os

import onnxruntime as ort
import numpy as np
from transformers import AutoTokenizer, BertTokenizer

# Download the ONNX model
# model_url = "https://huggingface.co/Xenova/bert-base-NER/resolve/main/onnx/model_quantized.onnx"
# model_url = "https://huggingface.co/Mozilla/distilbert-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
model_url = "https://huggingface.co/Mozilla/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
# model_url = "https://huggingface.co/chidamnat2002/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
# model_path = "../models/distilbert-NER-LoRA.onnx"
model_path = "../models/distilbert-uncased-NER-LoRA.onnx"

# Download the ONNX model if not already present.
# Delete the local file to force a fresh download (e.g. after switching
# model_url while keeping the same model_path).
if not os.path.exists(model_path):
    # The target directory does not exist on a fresh checkout.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    response = requests.get(model_url, timeout=60)
    # Fail loudly on an HTTP error instead of saving the error page as the
    # model file, which would only surface later as an ONNX load failure.
    response.raise_for_status()
    with open(model_path, 'wb') as f:
        f.write(response.content)

# Load the ONNX model using ONNX Runtime
session = ort.InferenceSession(model_path)

# Load the tokenizer that belongs to the same model repository
# tokenizer = BertTokenizer.from_pretrained("Mozilla/distilbert-NER-LoRA")
tokenizer = AutoTokenizer.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")

In [None]:
def compute_model_inputs_and_outputs(session, tokenizer, query):
    """Tokenize ``query`` and run it through the ONNX ``session``.

    The query is encoded as NumPy tensors, truncated and padded to a fixed
    length of 64. Only ``input_ids`` and ``attention_mask`` are fed to the
    model, both cast to int64; ``token_type_ids`` is not sent.

    Returns:
        ``(inputs, outputs)``: the tokenizer's encoding and the value returned
        by ``session.run``.
    """
    encoded = tokenizer(
        query,
        return_tensors="np",
        truncation=True,
        padding='max_length',
        max_length=64,
    )

    feed = {
        name: encoded[name].astype(np.int64)
        for name in ('input_ids', 'attention_mask')
    }

    # Passing None as the first argument requests every model output.
    return encoded, session.run(None, feed)


In [None]:
label_map

In [None]:
## With Xenova/bert-base-NER
# Number of examples = 349
# #hits = 135; #hit rate = 0.3868194842406877

## After finetuning the Mozilla/distilbert-NER-LoRA
#hits = 220; #hit rate = 0.6303724928366762

## After finetuning the chidamnat2002/distilbert-uncased-NER-LoRA
#hits = 207; #hit rate = 0.5931232091690545

## After finetuning the Mozilla/distilbert-uncased-NER-LoRA
#hits = 252; #hit rate = 0.7220630372492837

In [None]:
# len(missing_locations)

In [None]:
# print(missing_locations)

#### Looking into the CoNLL-2003 dataset

In [None]:
from datasets import load_dataset, Dataset
import re

# Load the CoNLL-2003 dataset; the result is indexed by split name
# (dataset['train'] is used in a later cell).
dataset = load_dataset("conll2003")

# Alias of the full dataset object: no location filtering is applied here.
loc_examples = dataset

In [None]:
# dataset['train'].to_pandas()

In [None]:
# dataset['train']

In [None]:
# Convert the synthetic examples to a datasets.Dataset, dropping the raw
# 'query' text so that only the 'tokens' and 'ner_tags' columns remain.
synthetic_loc_dataset = Dataset.from_pandas(df_ner_examples.drop('query', axis=1))
print(synthetic_loc_dataset)

# Show the first example as a sanity check.
print(synthetic_loc_dataset[0])

In [None]:
# Row filtering on tag 5 is disabled; the whole train split is used.
# loc_dataset = dataset['train'].filter(lambda example: 5 in example['ner_tags'])
loc_dataset = dataset['train']
# Despite the "_filtered" name, only columns are removed here, not rows.
loc_dataset_filtered = loc_dataset.remove_columns(['pos_tags', 'chunk_tags'])

# Display the first example to inspect the columns that remain.
loc_dataset_filtered[0]

In [None]:
# loc_dataset_filtered[-1]

In [None]:
from datasets import concatenate_datasets

from datasets import Sequence, ClassLabel, Value

# Step 1: Get the full feature schema from synthetic_loc_dataset
features = synthetic_loc_dataset.features

# Step 2: Replace the 'ner_tags' feature with a ClassLabel sequence whose names
# come from label_map (defined elsewhere in the notebook), not from the
# CoNLL label set (that variant is kept below, commented out).
# NOTE(review): presumably label_map.values() are the label names in id
# order; confirm, since the cast relies on the order of this list.
# features['ner_tags'] = Sequence(feature=ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names))
features['ner_tags'] = Sequence(feature=ClassLabel(names=list(label_map.values())))

# Step 3: Cast synthetic_loc_dataset to the updated feature schema
synthetic_loc_dataset = synthetic_loc_dataset.cast(features)

# Check the updated features to confirm
print(synthetic_loc_dataset.features)

# Concatenation with the CoNLL data is currently disabled, so only the
# synthetic dataset is carried forward.
# combined_dataset = concatenate_datasets([loc_dataset_filtered, synthetic_loc_dataset])

# Show the first synthetic example after the cast.
print(synthetic_loc_dataset[0])


In [None]:
# ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names)

In [None]:
# ClassLabel(names=list(label_map.values()))

In [None]:
len(synthetic_loc_dataset)

In [None]:
synthetic_loc_dataset[3]

In [None]:
# Add an 'id' column holding each example's position in the dataset.
synthetic_loc_dataset = synthetic_loc_dataset.map(
    lambda example, idx: {'id': idx},  # Assign running count as the new 'id'
    with_indices=True  # Ensures we get an index for each example
)

In [None]:
synthetic_loc_dataset.to_pandas()

In [None]:
synthetic_loc_dataset[-1]

In [None]:
# synthetic_loc_dataset.to_parquet("../data/synthetic_loc_dataset_v3.parquet")
# synthetic_loc_dataset.to_parquet("../data/synthetic_loc_dataset_v4.parquet")
synthetic_loc_dataset.to_parquet("../data/synthetic_loc_dataset_v6.parquet")  # some partial cities examples

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the model and tokenizer through transformers rather than the ONNX
# session. Note that this rebinds the notebook-level `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")
model = AutoModelForTokenClassification.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")

# Quick smoke test: run the NER pipeline on a single location string.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "New York"

ner_results = nlp(example)
print(ner_results)
