Explore whether the weather keywords and locations are captured correctly

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [None]:
import numpy as np
import random

In [None]:
# Wikipedia (mobile) page that lists North American TV stations by media market (DMA).
url = "https://en.m.wikipedia.org/wiki/List_of_television_stations_in_North_America_by_media_market"
# requests applies no timeout by default; bound the wait so this cell cannot hang forever.
response = requests.get(url, timeout=30)


In [None]:
# Parse the "DMAs" bullet list of the fetched page into a flat list of city names.
# `dma_data` is created up front so later cells that read it still work (with an
# empty list) when the request or the parse fails.
dma_data = []
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    dma_heading = soup.find('h4', string='DMAs')
    # The heading is missing if the page layout changes; avoid calling a method on None.
    dma_list = dma_heading.find_next('ul') if dma_heading is not None else None

    if dma_list:
        for li in dma_list.find_all('li'):
            market_name = li.get_text(strip=True)

            # Split by dash (-) or en-dash (–) to handle cases like "Dallas-Fort Worth"
            for name in re.split(r'–|-', market_name):
                # Remove the (#NUM) rank suffix
                name = re.sub(r'\s*\(#\d+\)', '', name).strip()

                # "Main City (Other City)" contributes both names
                match = re.match(r'(.+?)\s*\((.+?)\)', name)
                if match:
                    candidates = [match.group(1).strip(), match.group(2).strip()]
                else:
                    candidates = [name]

                # Skip empty fragments (e.g. produced by a leading/trailing dash)
                dma_data.extend(candidate for candidate in candidates if candidate)
    else:
        print("Could not locate the 'DMAs' list on the page; dma_data is empty")

    for index, dma in enumerate(dma_data, start=1):
        print(f"{index}. {dma}")
else:
    print(f"Request failed with status code {response.status_code}; dma_data is empty")



In [None]:
dma_data[:5]

#### Read the city and state data from `data/geonames-cities-states.json`

In [None]:
import json 

def get_geonames_city_state_data(geonames_file="../data/geonames-cities-states.json"):
    """Load the GeoNames JSON dump and return one row per city with its state.

    The JSON is read as a dict with two keys:
      * ``'cities'``: records providing ``id``, ``admin1_code``, ``name``,
        ``population`` and ``alternate_names``;
      * ``'states_by_abbr'``: a mapping whose values provide ``admin1_code``
        and ``name``.

    Args:
        geonames_file: Path to the JSON file. Defaults to the notebook's
            original relative location.

    Returns:
        A DataFrame with columns ``id``, ``state_code``, ``city_name``,
        ``city_popln``, ``alternate_names``, ``state_name`` and
        ``city_weight`` (the city's share of the total population, used as a
        sampling weight). Cities whose state code has no match keep a missing
        ``state_name`` because the join is a left merge.
    """
    # Explicit UTF-8: alternate names may be non-ASCII and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(geonames_file, 'r', encoding='utf-8') as f:
        geonames_dict = json.load(f)

    cities_data = pd.DataFrame(geonames_dict['cities']).rename(
        columns={'admin1_code': 'state_code', 'name': 'city_name', 'population': 'city_popln'}
    )
    cities_data = cities_data[['id', 'state_code', 'city_name', 'city_popln', 'alternate_names']]

    states_data = pd.DataFrame(list(geonames_dict['states_by_abbr'].values())).rename(
        columns={'admin1_code': 'state_code', 'name': 'state_name'}
    )
    states_data = states_data[['state_code', 'state_name']]

    city_states_data = cities_data.merge(states_data, how='left', on='state_code')
    total_population = city_states_data['city_popln'].sum()
    city_states_data['city_weight'] = city_states_data['city_popln'] / total_population
    return city_states_data



In [None]:
city_states_data = get_geonames_city_state_data()
print(len(city_states_data))
city_states_data

In [None]:
city_states_data.sort_values('city_weight', ascending=False)

In [None]:
# useful for post processing to standardize the city names
def build_lookup(dataframe):
    """Map every lower-cased alternate name to its canonical ``city_name``.

    Args:
        dataframe: DataFrame with a ``city_name`` column and an
            ``alternate_names`` column holding an iterable of strings per row.

    Returns:
        dict of ``alternate_name.lower() -> city_name``. When two cities share
        an alternate name, the row that appears later in ``dataframe`` wins.
        Rows whose ``alternate_names`` is missing (None/NaN) or is a bare
        string are skipped instead of raising or being split into characters.
    """
    lookup = {}

    # zip over the two columns instead of iterrows(): same row order, without
    # building a Series object per row.
    for city_name, alternate_names in zip(dataframe['city_name'], dataframe['alternate_names']):
        if isinstance(alternate_names, str) or not hasattr(alternate_names, '__iter__'):
            continue
        for alt_name in alternate_names:
            # Lower-case keys so lookups can be case-insensitive
            lookup[alt_name.lower()] = city_name

    return lookup

city_alternate_to_city_lkp = build_lookup(city_states_data)

In [None]:
len(city_alternate_to_city_lkp)

In [None]:
city_states_data['alternate_names'].apply(len).value_counts()

In [None]:
# Seed both generators: the np.random.* calls use NumPy's global state, while the
# random.choice() calls in this notebook use the stdlib generator, which
# np.random.seed() does not affect.
np.random.seed(42)
random.seed(42)

def get_alternate_or_actual_name(row):
    """Return a random alternate name for the row's city, else its ``city_name``.

    An alternate is chosen only when ``row['alternate_names']`` is a non-empty
    list; any other value (empty, missing, non-list) falls back to the
    canonical name.
    """
    alternates = row['alternate_names']
    usable = bool(alternates) and isinstance(alternates, list)
    return random.choice(alternates) if usable else row['city_name']

def combine_city_with_states(row):
    """Build the location string for a sampled row.

    Returns ``row['city']`` alone when the row has no state code (None or
    NaN). Otherwise returns ``"<city>, <state>"`` where ``<state>`` is picked
    at random between the state code and the state name; a missing state name
    (e.g. an unmatched left merge) is never picked.
    """
    city = row['city']
    # pd.isna covers both None and NaN, either of which can stand for "no state".
    if pd.isna(row['state_code']):
        return city
    state_options = [
        state for state in (row['state_code'], row['state_name']) if not pd.isna(state)
    ]
    return city + ", " + random.choice(state_options)
    
def sample_location(df, n_examples=10000, state_ratio=0.5):
    """Draw ``n_examples`` population-weighted location strings.

    Rows are sampled with replacement using ``df['city_weight']`` as weights.
    Each sampled row then gets a ``city`` column (a random alternate name or
    the canonical name) and a ``location`` column (the city, optionally
    followed by its state).

    NOTE(review): rows whose uniform draw is ``<= state_ratio`` have their
    ``state_code`` set to None, so ``state_ratio`` is effectively the fraction
    of samples *without* a state. Confirm this is the intended meaning.
    """
    kept_columns = ['id', 'city_name', 'alternate_names', 'state_code', 'state_name', 'city_popln']
    samples = df[kept_columns].sample(n=n_examples, weights=df['city_weight'], replace=True)

    # Blank out the state for a random subset of the sampled rows
    drop_state_mask = np.random.random(n_examples) <= state_ratio
    samples.loc[drop_state_mask, 'state_code'] = None

    samples['city'] = samples.apply(get_alternate_or_actual_name, axis=1)
    samples['location'] = samples.apply(combine_city_with_states, axis=1)
    return samples

In [None]:
sample_df = sample_location(city_states_data, n_examples=100000, state_ratio=0.5)

In [None]:
sample_df

In [None]:
sample_df.loc[sample_df['location'] == 'san']

In [None]:
sample_df['location'].value_counts()[:60]

In [None]:
# Flatten the sampled "location" column into a plain Python list of strings
# (duplicates are kept), then show its size and the first few entries.
geo_city_state_data = sample_df['location'].values.tolist()
print(len(geo_city_state_data))
geo_city_state_data[:10]

In [None]:
# !python -m pip install onnxruntime

In [None]:
# !python -m pip freeze| grep  onnxruntime

In [None]:
# !mkdir ../models

In [None]:
import os

import onnxruntime as ort
import numpy as np
from transformers import AutoTokenizer, BertTokenizer

# ONNX model to evaluate (alternatives kept commented for quick switching)
# model_url = "https://huggingface.co/Xenova/bert-base-NER/resolve/main/onnx/model_quantized.onnx"
# model_url = "https://huggingface.co/Mozilla/distilbert-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
# model_url = "https://huggingface.co/Mozilla/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
model_url = "https://huggingface.co/chidamnat2002/distilbert-uncased-NER-LoRA/resolve/main/onnx/model_quantized.onnx"
# model_path = "../models/distilbert-NER-LoRA.onnx"
model_path = "../models/distilbert-uncased-NER-LoRA.onnx"

# Always (re)download: several of the model URLs above map to the same model_path,
# so an existing file may belong to a different model.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
response = requests.get(model_url, timeout=120)
# Fail loudly instead of saving an HTTP error page as the "model"
response.raise_for_status()
with open(model_path, 'wb') as f:
    f.write(response.content)

# Load the ONNX model using ONNX Runtime
session = ort.InferenceSession(model_path)

# Load the tokenizer published alongside the selected model
# tokenizer = BertTokenizer.from_pretrained("Mozilla/distilbert-NER-LoRA")
# tokenizer = AutoTokenizer.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")
tokenizer = AutoTokenizer.from_pretrained("chidamnat2002/distilbert-uncased-NER-LoRA")

In [None]:
def compute_model_inputs_and_outputs(session, tokenizer, query, max_length=64):
    """Tokenize ``query`` and run it through the ONNX session.

    Args:
        session: Object exposing ``run(output_names, input_feed)`` (an ONNX
            Runtime inference session in this notebook).
        tokenizer: Callable tokenizer returning NumPy ``input_ids`` and
            ``attention_mask`` when called with ``return_tensors="np"``.
        query: Text to tokenize.
        max_length: Sequence length the input is padded/truncated to.
            Defaults to 64, the value previously hard-coded here.

    Returns:
        ``(inputs, outputs)``: the tokenizer output and the raw list returned
        by ``session.run``.
    """
    inputs = tokenizer(
        query,
        return_tensors="np",
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    # Only 'input_ids' and 'attention_mask' are fed (no 'token_type_ids');
    # both are cast to int64 before being handed to the session.
    input_feed = {
        'input_ids': inputs['input_ids'].astype(np.int64),
        'attention_mask': inputs['attention_mask'].astype(np.int64),
    }

    # None requests all model outputs
    outputs = session.run(None, input_feed)
    return inputs, outputs


In [None]:
def detect_location(inputs, outputs, tokenizer, threshold=0.5):
    """Return the first location entity found in the model output, or None.

    Args:
        inputs: Tokenizer output; ``inputs['input_ids'][0]`` is decoded back
            to tokens.
        outputs: Model outputs; ``outputs[0]`` is treated as per-token logits
            whose last axis indexes the labels in ``label_map`` below.
        tokenizer: Object exposing ``convert_ids_to_tokens``.
        threshold: Minimum softmax probability for a token's prediction to be
            considered. Tokens at or below it are ignored entirely (they
            neither extend nor close the current entity). Defaults to 0.5.

    Returns:
        The first detected location as a string, with "##" wordpiece
        continuations merged into the preceding piece, or None when no
        location is detected.
    """
    logits = outputs[0]

    # Numerically stable softmax: shifting by the row max leaves the result
    # unchanged but prevents exp() overflow (inf/inf -> NaN) on large logits.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    probabilities = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    predicted_ids = np.argmax(logits, axis=-1)
    predicted_probs = np.max(probabilities, axis=-1)

    label_map = {
        0: "O",        # Outside any named entity
        1: "B-PER",    # Beginning of a person entity
        2: "I-PER",    # Inside a person entity
        3: "B-ORG",    # Beginning of an organization entity
        4: "I-ORG",    # Inside an organization entity
        5: "B-LOC",    # Beginning of a location entity
        6: "I-LOC",    # Inside a location entity
        7: "B-MISC",   # Beginning of a miscellaneous entity
        8: "I-MISC",   # Inside a miscellaneous entity
    }
    special_tokens = {"[CLS]", "[SEP]", "[PAD]"}

    def merge_wordpieces(pieces):
        # "phila ##del ##phia" -> "philadelphia": drop the joining space together
        # with the "##" marker; a stray leading "##" is stripped as before.
        return " ".join(pieces).replace(" ##", "").replace("##", "")

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    location_entities = []
    current_location = []

    for token, predicted_id, prob in zip(tokens, predicted_ids[0], predicted_probs[0]):
        if token in special_tokens:
            continue
        # Low-confidence predictions are skipped without closing the current entity
        if prob <= threshold:
            continue

        label = label_map[int(predicted_id)]
        if label == "B-LOC":
            # A new entity starts; store the one in progress, if any
            if current_location:
                location_entities.append(merge_wordpieces(current_location))
            current_location = [token]
        elif label == "I-LOC":
            # An I-LOC with no preceding B-LOC is ignored
            if current_location:
                current_location.append(token)
        elif current_location:
            # Any non-location label closes the entity in progress
            location_entities.append(merge_wordpieces(current_location))
            current_location = []

    # Flush the entity still open at the end of the sequence
    if current_location:
        location_entities.append(merge_wordpieces(current_location))

    return location_entities[0] if location_entities else None


In [None]:
# query = "restaurants in Philadelphia"
# Smoke test: run one query through the model and show the detected location.
# The commented queries in this cell are alternatives to try by hand.
query = "weather Boston"
# query = "Boston weather"
inputs, outputs = compute_model_inputs_and_outputs(session, tokenizer, query)
detect_location(inputs, outputs, tokenizer)


In [None]:
# inputs
outputs[0].shape

In [None]:
# Evaluate the detector on the scraped DMA city names.
#   hit   = the model returned some location for the name
#   match = the returned location equals the lower-cased name exactly
num_examples = len(dma_data)
hit = 0
match = 0
missing_locations = set()
for index, dma in enumerate(dma_data, start=1):
    inputs, outputs = compute_model_inputs_and_outputs(session, tokenizer, dma)
    location = detect_location(inputs, outputs, tokenizer)
    is_exact_match = dma.lower() == location
    print(f"{index}. {dma} -> {location}, : {is_exact_match}")
    if location:
        hit += 1
        if is_exact_match:
            match += 1
    else:
        missing_locations.add(dma)

print()
print(f"Number of examples = {num_examples}")
if num_examples:
    print(f"#hits = {hit}; #hit rate = {hit/num_examples}")
    print(f"#matches = {match}; #match rate = {match/num_examples}")
else:
    # dma_data can be empty (e.g. scraping failed); avoid ZeroDivisionError
    print("No DMA names to evaluate; hit/match rates are undefined")

In [None]:
# num_examples = len(geo_city_state_data)
# hit = 0
# match = 0
# missing_locations = set()
# for index, city_data in enumerate(geo_city_state_data, start=1):
#     # location = detect_location(session, tokenizer, city_data)
#     inputs, outputs = compute_model_inputs_and_outputs(session, tokenizer, city_data)
#     location = detect_location(inputs, outputs, tokenizer)
#     print(f"{index}. {city_data} -> {location}, : {city_data == location}")
#     if location:
#         hit += 1
#         if city_data == location:
#             match += 1
#     else:
#         missing_locations.add(city_data)

# print()
# print(f"Number of examples = {num_examples}")
# print(f"#hits = {hit}; #hit rate = {hit/num_examples}")
# print(f"#matches = {match}; #match rate = {match/num_examples}")

In [None]:
## With Xenova/bert-base-NER
# Number of examples = 349
# #hits = 135; #hit rate = 0.3868194842406877

## After finetuning the Mozilla/distilbert-NER-LoRA
#hits = 220; #hit rate = 0.6303724928366762

## After finetuning the chidamnat2002/distilbert-uncased-NER-LoRA
#hits = 207; #hit rate = 0.5931232091690545

## After finetuning the Mozilla/distilbert-uncased-NER-LoRA
#hits = 252; #hit rate = 0.7220630372492837

In [None]:
len(missing_locations)

In [None]:
print(missing_locations)

#### Looking into the CoNLL-2003 dataset

In [None]:
from datasets import load_dataset, Dataset
import re

# Load the CoNLL-2003 dataset
# Load the "conll2003" dataset by name; later cells read its 'train' split
dataset = load_dataset("conll2003")

# Plain alias of the full dataset (no filtering is applied here).
# NOTE(review): `loc_examples` is not read again in the visible cells; confirm it is still needed.
loc_examples = dataset

In [None]:
dataset['train'].to_pandas()

In [None]:
dataset['train']

In [None]:
# Tag id -> BIO label name; the position in the list is the tag id.
# Same mapping as the one used when decoding model predictions in this notebook.
label_map = dict(enumerate([
    "O",       # Outside any named entity
    "B-PER",   # Beginning of a person entity
    "I-PER",   # Inside a person entity
    "B-ORG",   # Beginning of an organization entity
    "I-ORG",   # Inside an organization entity
    "B-LOC",   # Beginning of a location entity
    "I-LOC",   # Inside a location entity
    "B-MISC",  # Beginning of a miscellaneous entity
    "I-MISC",  # Inside a miscellaneous entity
]))

In [None]:
import random
import pandas as pd
from collections import Counter

# Source of location strings to substitute into the templates.
# Alternatives tried earlier are kept commented for reference.
# cities = list(missing_locations)
# cities = dma_data[:]
cities = geo_city_state_data[:]
# Number of synthetic examples to generate
NUM_EXAMPLES = 50000
# Sentence/query templates; "{}" is replaced by the location string.
# The first group is sentence-like, the second group mimics short
# weather / restaurant style search queries.
templates = [
    "John visited {} last summer.",
    "The headquarters is located in {}.",
    "My cousin moved to {} recently.",
    "{} is famous for its historical landmarks.",
    "A new park was opened in {}.",
    "The festival in {} was a great success.",
    "I am planning a trip to {} next month.",
    "The weather in {} has been wonderful this year.",
    "{} is known for its beautiful scenery.",
    "{} is home to several tech companies.",
    # "{} weather",  (disabled here; the same template is listed once further down)
    "weather {}",
    # Additional weather / Yelp-style query templates (previously a separate list)
    'The weather in {}',
     'What is the weather in {}',
     "What's the weather in {}",
     'Weather forecast in {}',
     '{} weather',
     'temperature {}',
     '{} temperature',
     'What are the best restaurants in {}',
     'Top-rated restaurants in {}',
     'Popular coffee shops in {}',
     'Best pizza places in {}',
     'Best sushi places in {}',
     'Cheap restaurants in {}',
     'Best places to eat in {}',
     'Restaurants near me in {}',
     '{} restaurants',
     '{} hotels',
     '{} food',
]

print(f"Size of templates = {len(templates)}")

# Function to create NER tags
def create_ner_tags(tokens, city):
    """Tag the occurrence(s) of ``city`` inside ``tokens`` with B-LOC / I-LOC.

    The city is matched as a contiguous run of whitespace-separated tokens, so
    a template word that merely equals one word of the city is not tagged and
    repeated words ("Walla Walla") get B-LOC followed by I-LOC. The last token
    of a match may carry trailing sentence punctuation ("Boston."), because
    the tokens come from a plain ``str.split()`` of the sentence.

    Args:
        tokens: Whitespace-split tokens of the sentence.
        city: The location string that was substituted into the sentence.

    Returns:
        A list of tag ids, one per token: 5 (B-LOC) for the first token of a
        match, 6 (I-LOC) for the remaining ones, 0 (O) everywhere else.
    """
    outside, begin_loc, inside_loc = 0, 5, 6
    trailing_punct = ".,!?;:"

    ner_tags = [outside] * len(tokens)
    city_tokens = city.split()
    span = len(city_tokens)
    if span == 0:
        return ner_tags

    city_last = city_tokens[-1]
    city_last_bare = city_last.rstrip(trailing_punct)

    start = 0
    while start + span <= len(tokens):
        window = tokens[start:start + span]
        last = window[-1]
        last_bare = last.rstrip(trailing_punct)
        # Exact match, or equal once punctuation glued onto the end is ignored
        last_matches = last == city_last or (last_bare != "" and last_bare == city_last_bare)
        if window[:-1] == city_tokens[:-1] and last_matches:
            ner_tags[start] = begin_loc
            for offset in range(1, span):
                ner_tags[start + offset] = inside_loc
            start += span  # matches never overlap
        else:
            start += 1
    return ner_tags

# Generate up to NUM_EXAMPLES unique synthetic NER examples with IDs, tokens, and ner_tags
ner_examples = []
queries_set = set()
pattern_counter = Counter()
lower_case_prob = 0.4  # probability of lower-casing the sampled location

# A rejected draw (duplicate sentence, or template already over its cap) does not
# advance `i`, so bound the number of draws: otherwise the loop never ends once
# the pool of unique sentences is exhausted.
max_attempts = NUM_EXAMPLES * 100
attempts = 0
i = 0
while i < NUM_EXAMPLES and attempts < max_attempts:
    attempts += 1
    city = random.choice(cities)
    if random.random() < lower_case_prob:
        city = city.lower()
    template = random.choice(templates)
    sentence = template.format(city)
    if sentence in queries_set:
        continue
    # Keep any single template from dominating the dataset
    if pattern_counter.get(template, 0) > NUM_EXAMPLES//6:
        continue
    queries_set.add(sentence)
    pattern_counter.update([template])
    tokens = sentence.split()
    ner_tags = create_ner_tags(tokens, city)

    # Append the example in the format of {'id', 'tokens', 'ner_tags'}
    ner_examples.append({
        'id': str(i),
        'tokens': tokens,
        'ner_tags': ner_tags
    })
    i += 1
    # Report progress once per 1000 accepted examples
    if i % 1000 == 0:
        print(f"completed {i} examples")

if i < NUM_EXAMPLES:
    print(f"WARNING: stopped after {attempts} draws with only {i} of {NUM_EXAMPLES} examples")

# Convert the examples into a pandas DataFrame
df_ner_examples = pd.DataFrame(ner_examples)
df_ner_examples

In [None]:
synthetic_loc_dataset = Dataset.from_pandas(df_ner_examples)
synthetic_loc_dataset

In [None]:
synthetic_loc_dataset[0]

In [None]:
# Use the whole training split. The commented line is an earlier variant that
# kept only examples containing tag id 5 (B-LOC in this notebook's label_map).
# loc_dataset = dataset['train'].filter(lambda example: 5 in example['ner_tags'])
loc_dataset = dataset['train']
# Drop the 'pos_tags' and 'chunk_tags' columns (the name says "filtered", but
# no rows are removed here)
loc_dataset_filtered = loc_dataset.remove_columns(['pos_tags', 'chunk_tags'])

# Show the first example to inspect the remaining columns
loc_dataset_filtered[0]

In [None]:
loc_dataset_filtered[-1]

In [None]:
from datasets import concatenate_datasets

from datasets import Sequence, ClassLabel, Value

# Goal: give the synthetic dataset the same 'ner_tags' feature type as the
# CoNLL-derived dataset before concatenating the two.

# Step 1: Get the full feature schema from synthetic_loc_dataset
# NOTE(review): this is the dataset's own features object and the next step
# assigns into it in place; confirm that is intended rather than a copy.
features = synthetic_loc_dataset.features

# Step 2: Replace the 'ner_tags' feature with a Sequence of ClassLabel that
# reuses the label names from loc_dataset_filtered
features['ner_tags'] = Sequence(feature=ClassLabel(names=loc_dataset_filtered.features['ner_tags'].feature.names))

# Step 3: Cast synthetic_loc_dataset to the updated feature schema
synthetic_loc_dataset = synthetic_loc_dataset.cast(features)

# Check the updated features to confirm
print(synthetic_loc_dataset.features)

# Now concatenate the datasets (CoNLL-derived rows first, synthetic rows after)
combined_dataset = concatenate_datasets([loc_dataset_filtered, synthetic_loc_dataset])

# Verify the combined dataset
print(combined_dataset[0])


In [None]:
len(combined_dataset)

In [None]:
combined_dataset[3]

In [None]:
# Re-number every example with its position in the combined dataset, so ids
# from the two source datasets no longer overlap.
# NOTE(review): the synthetic ids were created as strings (str(i)) while the
# new ids are integer indices; confirm downstream code accepts integer ids.
combined_dataset = combined_dataset.map(
    lambda example, idx: {'id': idx},  # Assign running count as the new 'id'
    with_indices=True  # Ensures we get an index for each example
)

In [None]:
combined_dataset.to_pandas()

In [None]:
combined_dataset[-1]

In [None]:
combined_dataset.to_parquet("../data/combined_ner_examples_v3.parquet")

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Sanity-check a published checkpoint through the transformers "ner" pipeline.
# NOTE: this rebinds the notebook-level `tokenizer` to the
# "Mozilla/distilbert-uncased-NER-LoRA" tokenizer; any cell run afterwards that
# uses `tokenizer` gets this one.
tokenizer = AutoTokenizer.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")
model = AutoModelForTokenClassification.from_pretrained("Mozilla/distilbert-uncased-NER-LoRA")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "New York"

# Print the raw pipeline predictions for the example text
ner_results = nlp(example)
print(ner_results)


In [None]:
len(ner_examples)

In [None]:
example = ' '.join( ner_examples[1]['tokens'])
example

In [None]:
sample_inputs = tokenizer(example, return_tensors="np", truncation=True, padding='max_length', max_length=64)
sample_inputs['input_ids']

In [None]:
tokenizer.decode(tokenizer(example, return_tensors="np", truncation=True, padding='max_length', max_length=64)['input_ids'][0])

In [None]:
tokenizer.vocab['land']

In [None]:
df_ner_examples