notebooks/ner_evaluation.ipynb (1,423 lines of code) (raw):

{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "247555cc-db4a-44a4-909d-4c951896763e", "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import sys\n", "\n", "import pandas as pd\n", "from tqdm import tqdm\n", "\n", "# Add the project root directory to the Python path (only once, even if this cell is re-run)\n", "project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", "if project_root not in sys.path:\n", "    sys.path.append(project_root)\n", "\n", "# Now you can import the LocationFinder from src/infer_location.py\n", "from src.infer_location import LocationFinder" ] }, { "cell_type": "code", "execution_count": 2, "id": "383dff8c-3382-4c69-8609-711ae6a2c0a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading ONNX model...\n", "ONNX model downloaded.\n" ] } ], "source": [ "cls = LocationFinder()" ] }, { "cell_type": "markdown", "id": "dc678c1f-9e9f-4050-a164-5e533fde6e50", "metadata": {}, "source": [ "#### Validate NER on the generated data with pre- and post-modifiers" ] }, { "cell_type": "code", "execution_count": 3, "id": "da4d187b-7989-402d-90ef-1cd3680ccd15", "metadata": {}, "outputs": [], "source": [ "tqdm.pandas()" ] },
{ "cell_type": "code", "execution_count": 4, "id": "e1b1f45d-5664-4b6b-9cd7-ecf3f70a0481", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 78.53it/s]\n" ] } ], "source": [ "ner_val_generated_data = pd.read_csv(\"../data/named_entity_val_generated_data.csv\")\n", "# Replace missing labels with '' so they compare equal to empty predictions\n", "ner_val_generated_data['city'] = ner_val_generated_data['city'].fillna('')\n", "ner_val_generated_data['state'] = ner_val_generated_data['state'].fillna('')\n", "\n", "# One find_location() call per query, with a tqdm progress bar\n", "city_state_preds = ner_val_generated_data['queries'].progress_apply(cls.find_location)\n", "ner_val_generated_data['city_pred'] = city_state_preds.apply(lambda cs: cs.get('city', '')).fillna('')\n", "ner_val_generated_data['state_pred'] = city_state_preds.apply(lambda cs: cs.get('state', '')).fillna('')" ] },
{ "cell_type": "code", "execution_count": 5, "id": "36116aaa-cc4d-4bc4-8ec6-fe919ccb96d2", "metadata": {}, "outputs": [], "source": [ "## Prepare the same queries in JSON format for FX ML inference\n", "queries_array = ner_val_generated_data['queries'].values\n", "\n", "# Create a dictionary structure\n", "data = {\n", "    \"queries\": queries_array.tolist()\n", "}\n", "\n", "# Define the output JSON file path\n", "output_file_path = \"../data/named_entity_val_generated_data.json\"\n", "\n", "# The dataset is almost static: set WRITE_QUERIES_JSON to False to skip rewriting the file\n", "WRITE_QUERIES_JSON = True\n", "if WRITE_QUERIES_JSON:\n", "    with open(output_file_path, \"w\") as json_file:\n", "        json.dump(data, json_file, indent=2)\n" ] },
{ "cell_type": "code", "execution_count": 6, "id": "2856d1b7-88e1-4f28-bd67-602a4ed4df58", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>queries</th>\n", " <th>city</th>\n", " <th>state</th>\n", " <th>city_pred</th>\n", " <th>state_pred</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>train stations near detroit</td>\n", " <td>detroit</td>\n", " <td></td>\n", " <td>detroit</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>voter registration in wa</td>\n", " <td></td>\n", " <td>wa</td>\n", " <td></td>\n", " <td>wa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>weather report for los angeles, california</td>\n", " <td>los angeles</td>\n", " <td>california</td>\n", " 
<td>los angeles</td>\n", " <td>california</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>city hall in boston</td>\n", " <td>boston</td>\n", " <td></td>\n", " <td>boston</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>bike rentals in baltimore, maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>995</th>\n", " <td>tax offices in las vegas</td>\n", " <td>las vegas</td>\n", " <td></td>\n", " <td>las vegas</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>996</th>\n", " <td>homes for rent in baltimore, maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>997</th>\n", " <td>top universities in san diego, california</td>\n", " <td>san diego</td>\n", " <td>california</td>\n", " <td>san diego</td>\n", " <td>california</td>\n", " </tr>\n", " <tr>\n", " <th>998</th>\n", " <td>top universities in tampa, florida</td>\n", " <td>tampa</td>\n", " <td>florida</td>\n", " <td>tampa</td>\n", " <td>florida</td>\n", " </tr>\n", " <tr>\n", " <th>999</th>\n", " <td>how far is san diego from ca</td>\n", " <td>san diego</td>\n", " <td>ca</td>\n", " <td>san diego</td>\n", " <td>ca</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1000 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " queries city state \\\n", "0 train stations near detroit detroit \n", "1 voter registration in wa wa \n", "2 weather report for los angeles, california los angeles california \n", "3 city hall in boston boston \n", "4 bike rentals in baltimore, maryland baltimore maryland \n", ".. ... ... ... 
\n", "995 tax offices in las vegas las vegas \n", "996 homes for rent in baltimore, maryland baltimore maryland \n", "997 top universities in san diego, california san diego california \n", "998 top universities in tampa, florida tampa florida \n", "999 how far is san diego from ca san diego ca \n", "\n", " city_pred state_pred \n", "0 detroit \n", "1 wa \n", "2 los angeles california \n", "3 boston \n", "4 baltimore maryland \n", ".. ... ... \n", "995 las vegas \n", "996 baltimore maryland \n", "997 san diego california \n", "998 tampa florida \n", "999 san diego ca \n", "\n", "[1000 rows x 5 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ner_val_generated_data" ] }, { "cell_type": "markdown", "id": "cf14eec0-ae6d-40f7-81d8-7604acfde457", "metadata": {}, "source": [ "#### Partial city & state matches" ] }, { "cell_type": "code", "execution_count": 7, "id": "8ed95306-52b4-4944-92b5-5978b9fd5fc5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "partial NER (City / state) accuracy = 0.987\n" ] } ], "source": [ "# A row is a partial match when the city OR the state prediction equals its label.\n", "# NOTE: an empty label compared with an empty prediction ('' == '') also counts as a match.\n", "city_matches = ner_val_generated_data['city'] == ner_val_generated_data['city_pred']\n", "state_matches = ner_val_generated_data['state'] == ner_val_generated_data['state_pred']\n", "\n", "# The mean of a boolean Series is the fraction of True rows; unlike\n", "# value_counts(normalize=True)[True] it does not raise KeyError when no row matches.\n", "partial_match_rate = float((city_matches | state_matches).mean())\n", "\n", "print(f\"partial NER (City / state) accuracy = {partial_match_rate}\")" ] }, { "cell_type": "markdown", "id": "90c4e08a-bdac-46bb-bb02-3c4824a1483d", "metadata": {}, "source": [ "#### Full city & state matches (using Python)" ] }, { "cell_type": "code", "execution_count": 8, "id": "1b79a3ea-f9a6-46d0-9657-6a0b9c57bb94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NER accuracy = 0.931\n" ] } ], "source": [ "# A row is a full match only when BOTH the city and the state prediction equal their labels.\n", "city_matches = ner_val_generated_data['city'] == ner_val_generated_data['city_pred']\n", "state_matches = ner_val_generated_data['state'] == ner_val_generated_data['state_pred']\n", "\n", "# Fraction of True rows; safe even when no row matches\n", "full_match_rate = float((city_matches & state_matches).mean())\n", "\n", "print(f\"NER accuracy = {full_match_rate}\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "ee469045-0144-4522-b05d-18e696b4becf", "metadata": {}, "outputs": [], "source": [ "# NER accuracy = 0.9\n", "# NER accuracy = 0.931 # v0.1.6" ] }, { "cell_type": "code", "execution_count": 10, "id": "c80cafd0-7edf-42aa-9225-aa05b4dffb41", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>queries</th>\n", " <th>city</th>\n", " <th>state</th>\n", " <th>city_pred</th>\n", " <th>state_pred</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>19</th>\n", " <td>pool cleaning services in st. louis, missouri</td>\n", " <td>st. louis</td>\n", " <td>missouri</td>\n", " <td>st . louis</td>\n", " <td>missouri</td>\n", " </tr>\n", " <tr>\n", " <th>32</th>\n", " <td>detroit, mi public schools</td>\n", " <td>detroit</td>\n", " <td>mi</td>\n", " <td>detroit</td>\n", " <td>mi public</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>las vegas, nv public schools</td>\n", " <td>las vegas</td>\n", " <td>nv</td>\n", " <td>las vegas</td>\n", " <td>nv schools</td>\n", " </tr>\n", " <tr>\n", " <th>67</th>\n", " <td>miami, fl public schools</td>\n", " <td>miami</td>\n", " <td>fl</td>\n", " <td>miami</td>\n", " <td>fl public</td>\n", " </tr>\n", " <tr>\n", " <th>117</th>\n", " <td>humidity levels in st. louis</td>\n", " <td>st. louis</td>\n", " <td></td>\n", " <td>st . 
louis</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>911</th>\n", " <td>plumbing services in st. louis, missouri</td>\n", " <td>st. louis</td>\n", " <td>missouri</td>\n", " <td>st . louis</td>\n", " <td>missouri</td>\n", " </tr>\n", " <tr>\n", " <th>937</th>\n", " <td>nv licensed contractors</td>\n", " <td></td>\n", " <td>nv</td>\n", " <td></td>\n", " <td>n</td>\n", " </tr>\n", " <tr>\n", " <th>944</th>\n", " <td>real estate agents in st. louis, mo</td>\n", " <td>st. louis</td>\n", " <td>mo</td>\n", " <td>st . louis</td>\n", " <td>mo</td>\n", " </tr>\n", " <tr>\n", " <th>968</th>\n", " <td>new york housing prices</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td>new york</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>993</th>\n", " <td>hvac repair in st. louis</td>\n", " <td>st. louis</td>\n", " <td></td>\n", " <td>st . louis</td>\n", " <td></td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>69 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " queries city state \\\n", "19 pool cleaning services in st. louis, missouri st. louis missouri \n", "32 detroit, mi public schools detroit mi \n", "39 las vegas, nv public schools las vegas nv \n", "67 miami, fl public schools miami fl \n", "117 humidity levels in st. louis st. louis \n", ".. ... ... ... \n", "911 plumbing services in st. louis, missouri st. louis missouri \n", "937 nv licensed contractors nv \n", "944 real estate agents in st. louis, mo st. louis mo \n", "968 new york housing prices new york \n", "993 hvac repair in st. louis st. louis \n", "\n", " city_pred state_pred \n", "19 st . louis missouri \n", "32 detroit mi public \n", "39 las vegas nv schools \n", "67 miami fl public \n", "117 st . louis \n", ".. ... ... \n", "911 st . louis missouri \n", "937 n \n", "944 st . louis mo \n", "968 new york \n", "993 st . 
louis \n", "\n", "[69 rows x 5 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows where the city or the state prediction differs from its label\n", "city_mismatch = ner_val_generated_data['city'] != ner_val_generated_data['city_pred']\n", "state_mismatch = ner_val_generated_data['state'] != ner_val_generated_data['state_pred']\n", "ner_val_generated_data.loc[city_mismatch | state_mismatch]" ] }, { "cell_type": "markdown", "id": "d1697e10-b73a-4f31-ac7c-aa5b408c2b02", "metadata": {}, "source": [ "#### Full city & state matches (using Fx ML js)" ] }, { "cell_type": "code", "execution_count": 11, "id": "565124a9-a0de-469e-a262-2436c805ebf6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1000" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load ML_output_NER_VAL_DATA.json, rename its columns to the *_pred names used in this\n", "# notebook, and replace missing city/state predictions with ''\n", "fx_ml_ner_val = (\n", "    pd.read_json(\"../data/ML_output_NER_VAL_DATA.json\")\n", "    .rename(columns={'query': 'queries', 'city': 'city_pred', 'state': 'state_pred', 'intent': 'intent_pred'})\n", "    .assign(\n", "        city_pred=lambda frame: frame['city_pred'].fillna(''),\n", "        state_pred=lambda frame: frame['state_pred'].fillna(''),\n", "    )\n", ")\n", "len(fx_ml_ner_val)" ] }, { "cell_type": "code", "execution_count": 12, "id": "bc2ee33a-a683-4f9e-9876-4bc1f9f16e11", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>queries</th>\n", " <th>intent_pred</th>\n", " <th>city_pred</th>\n", " <th>state_pred</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>train stations near detroit</td>\n", " <td>yelp_intent</td>\n", " <td>detroit</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " 
<th>1</th>\n", " <td>voter registration in wa</td>\n", " <td>navigation_intent</td>\n", " <td></td>\n", " <td>wa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>weather report for los angeles, california</td>\n", " <td>weather_intent</td>\n", " <td>los angeles</td>\n", " <td>california</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>city hall in boston</td>\n", " <td>yelp_intent</td>\n", " <td>boston</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>bike rentals in baltimore, maryland</td>\n", " <td>yelp_intent</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>995</th>\n", " <td>tax offices in las vegas</td>\n", " <td>yelp_intent</td>\n", " <td>las vegas</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>996</th>\n", " <td>homes for rent in baltimore, maryland</td>\n", " <td>yelp_intent</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>997</th>\n", " <td>top universities in san diego, california</td>\n", " <td>yelp_intent</td>\n", " <td>san diego</td>\n", " <td>california</td>\n", " </tr>\n", " <tr>\n", " <th>998</th>\n", " <td>top universities in tampa, florida</td>\n", " <td>yelp_intent</td>\n", " <td>tampa</td>\n", " <td>florida</td>\n", " </tr>\n", " <tr>\n", " <th>999</th>\n", " <td>how far is san diego from ca</td>\n", " <td>information_intent</td>\n", " <td>san diego</td>\n", " <td>ca</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1000 rows × 4 columns</p>\n", "</div>" ], "text/plain": [ " queries intent_pred \\\n", "0 train stations near detroit yelp_intent \n", "1 voter registration in wa navigation_intent \n", "2 weather report for los angeles, california weather_intent \n", "3 city hall in boston yelp_intent \n", "4 bike rentals in baltimore, maryland yelp_intent \n", ".. ... ... 
\n", "995 tax offices in las vegas yelp_intent \n", "996 homes for rent in baltimore, maryland yelp_intent \n", "997 top universities in san diego, california yelp_intent \n", "998 top universities in tampa, florida yelp_intent \n", "999 how far is san diego from ca information_intent \n", "\n", " city_pred state_pred \n", "0 detroit \n", "1 wa \n", "2 los angeles california \n", "3 boston \n", "4 baltimore maryland \n", ".. ... ... \n", "995 las vegas \n", "996 baltimore maryland \n", "997 san diego california \n", "998 tampa florida \n", "999 san diego ca \n", "\n", "[1000 rows x 4 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fx_ml_ner_val" ] }, { "cell_type": "code", "execution_count": 13, "id": "1f830672-4844-40e1-b2f3-849f5a3715b1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>queries</th>\n", " <th>city</th>\n", " <th>state</th>\n", " <th>intent_pred</th>\n", " <th>city_pred</th>\n", " <th>state_pred</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>train stations near detroit</td>\n", " <td>detroit</td>\n", " <td></td>\n", " <td>yelp_intent</td>\n", " <td>detroit</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>voter registration in wa</td>\n", " <td></td>\n", " <td>wa</td>\n", " <td>navigation_intent</td>\n", " <td></td>\n", " <td>wa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>weather report for los angeles, california</td>\n", " <td>los angeles</td>\n", " <td>california</td>\n", " <td>weather_intent</td>\n", " <td>los angeles</td>\n", " 
<td>california</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>city hall in boston</td>\n", " <td>boston</td>\n", " <td></td>\n", " <td>yelp_intent</td>\n", " <td>boston</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>bike rentals in baltimore, maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " <td>yelp_intent</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>1053</th>\n", " <td>tax offices in las vegas</td>\n", " <td>las vegas</td>\n", " <td></td>\n", " <td>yelp_intent</td>\n", " <td>las vegas</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1054</th>\n", " <td>homes for rent in baltimore, maryland</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " <td>yelp_intent</td>\n", " <td>baltimore</td>\n", " <td>maryland</td>\n", " </tr>\n", " <tr>\n", " <th>1055</th>\n", " <td>top universities in san diego, california</td>\n", " <td>san diego</td>\n", " <td>california</td>\n", " <td>yelp_intent</td>\n", " <td>san diego</td>\n", " <td>california</td>\n", " </tr>\n", " <tr>\n", " <th>1056</th>\n", " <td>top universities in tampa, florida</td>\n", " <td>tampa</td>\n", " <td>florida</td>\n", " <td>yelp_intent</td>\n", " <td>tampa</td>\n", " <td>florida</td>\n", " </tr>\n", " <tr>\n", " <th>1057</th>\n", " <td>how far is san diego from ca</td>\n", " <td>san diego</td>\n", " <td>ca</td>\n", " <td>information_intent</td>\n", " <td>san diego</td>\n", " <td>ca</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1058 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ " queries city state \\\n", "0 train stations near detroit detroit \n", "1 voter registration in wa wa \n", "2 weather report for los angeles, california los angeles california \n", "3 city hall in boston boston \n", "4 bike rentals in baltimore, maryland baltimore 
maryland \n", "... ... ... ... \n", "1053 tax offices in las vegas las vegas \n", "1054 homes for rent in baltimore, maryland baltimore maryland \n", "1055 top universities in san diego, california san diego california \n", "1056 top universities in tampa, florida tampa florida \n", "1057 how far is san diego from ca san diego ca \n", "\n", " intent_pred city_pred state_pred \n", "0 yelp_intent detroit \n", "1 navigation_intent wa \n", "2 weather_intent los angeles california \n", "3 yelp_intent boston \n", "4 yelp_intent baltimore maryland \n", "... ... ... ... \n", "1053 yelp_intent las vegas \n", "1054 yelp_intent baltimore maryland \n", "1055 yelp_intent san diego california \n", "1056 yelp_intent tampa florida \n", "1057 information_intent san diego ca \n", "\n", "[1058 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The validation set contains repeated queries, so a plain left join on 'queries'\n", "# fans out (1000 rows -> 1058) and counts those queries several times in the accuracy.\n", "# Keep one prediction per query so that every validation row appears exactly once.\n", "fx_ml_preds_unique = fx_ml_ner_val.drop_duplicates(subset='queries')\n", "ner_fx_results = pd.merge(\n", "    ner_val_generated_data[['queries', 'city', 'state']],\n", "    fx_ml_preds_unique,\n", "    on='queries',\n", "    how='left',\n", "    validate='many_to_one',  # fail loudly if the prediction side still has duplicate keys\n", ")\n", "assert len(ner_fx_results) == len(ner_val_generated_data)\n", "ner_fx_results" ] }, { "cell_type": "code", "execution_count": 14, "id": "648d10ca-75f6-4d10-8c01-f7046e4eedc8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fx ML NER accuracy = 0.9640831758034026\n" ] } ], "source": [ "# A row is a full match only when BOTH the city and the state prediction equal their labels.\n", "fx_city_matches = ner_fx_results['city'] == ner_fx_results['city_pred']\n", "fx_state_matches = ner_fx_results['state'] == ner_fx_results['state_pred']\n", "\n", "# The mean of a boolean Series is the fraction of True rows; unlike\n", "# value_counts(normalize=True)[True] it does not raise KeyError when no row matches.\n", "fx_ml_full_match_rate = float((fx_city_matches & fx_state_matches).mean())\n", "\n", "print(f\"Fx ML NER accuracy = {fx_ml_full_match_rate}\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "9ab7e59e-b877-4ae5-85ab-002678165945", "metadata": {}, "outputs": [], "source": [ "# Fx ML NER accuracy = 0.9073724007561437\n", "# Fx ML NER accuracy = 0.9640831758034026 # v0.1.6\n", "# NOTE: both figures above were measured on the 1058-row merge (before de-duplication); re-run to refresh them" ] }, { "cell_type": "code", "execution_count": 16, "id": "a3a3a5f2-c075-47c6-ac1d-b03a51d0bc0d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr 
th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>queries</th>\n", " <th>city</th>\n", " <th>state</th>\n", " <th>intent_pred</th>\n", " <th>city_pred</th>\n", " <th>state_pred</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>130</th>\n", " <td>chicago weather radar</td>\n", " <td>chicago</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>chicago radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>227</th>\n", " <td>las vegas weather radar</td>\n", " <td>las vegas</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>las vegas radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>284</th>\n", " <td>denver weather radar</td>\n", " <td>denver</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>denver radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>294</th>\n", " <td>oh dmv locations</td>\n", " <td></td>\n", " <td>oh</td>\n", " <td>navigation_intent</td>\n", " <td>##m</td>\n", " <td>oh</td>\n", " </tr>\n", " <tr>\n", " <th>331</th>\n", " <td>chicago, il public schools</td>\n", " <td>chicago</td>\n", " <td>il</td>\n", " <td></td>\n", " <td>chicago</td>\n", " <td>il public</td>\n", " </tr>\n", " <tr>\n", " <th>356</th>\n", " <td>new york weather radar</td>\n", " <td>new york</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>new radar</td>\n", " <td>york</td>\n", " </tr>\n", " <tr>\n", " <th>379</th>\n", " <td>st. louis, mo public schools</td>\n", " <td>st. louis</td>\n", " <td>mo</td>\n", " <td></td>\n", " <td>st. 
louis</td>\n", " <td>mo public</td>\n", " </tr>\n", " <tr>\n", " <th>405</th>\n", " <td>portland, or public schools</td>\n", " <td>portland</td>\n", " <td>or</td>\n", " <td>information_intent</td>\n", " <td>portland</td>\n", " <td>or public</td>\n", " </tr>\n", " <tr>\n", " <th>431</th>\n", " <td>mortgage rates in new york</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td>yelp_intent</td>\n", " <td>new york</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>437</th>\n", " <td>new york high school rankings</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td>information_intent</td>\n", " <td>new york</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>453</th>\n", " <td>theater performances in new york</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td>translation_intent</td>\n", " <td>new</td>\n", " <td>york</td>\n", " </tr>\n", " <tr>\n", " <th>454</th>\n", " <td>nv dmv locations</td>\n", " <td></td>\n", " <td>nv</td>\n", " <td>navigation_intent</td>\n", " <td></td>\n", " <td>n</td>\n", " </tr>\n", " <tr>\n", " <th>527</th>\n", " <td>washington housing prices</td>\n", " <td></td>\n", " <td>washington</td>\n", " <td>yelp_intent</td>\n", " <td>washington</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>531</th>\n", " <td>portland, or locksmith services</td>\n", " <td>portland</td>\n", " <td>or</td>\n", " <td>yelp_intent</td>\n", " <td>portland</td>\n", " <td>or locksmith</td>\n", " </tr>\n", " <tr>\n", " <th>555</th>\n", " <td>how far is pittsburgh from pa</td>\n", " <td>pittsburgh</td>\n", " <td>pa</td>\n", " <td>information_intent</td>\n", " <td>pittsburgh pa</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>564</th>\n", " <td>new york airport information</td>\n", " <td>new york</td>\n", " <td></td>\n", " <td>navigation_intent</td>\n", " <td>new york information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>588</th>\n", " <td>new york marathon registration</td>\n", " <td>new york</td>\n", " <td></td>\n", " 
<td>navigation_intent</td>\n", " <td></td>\n", " <td>new york</td>\n", " </tr>\n", " <tr>\n", " <th>617</th>\n", " <td>how far is dallas from tx</td>\n", " <td>dallas</td>\n", " <td>tx</td>\n", " <td>information_intent</td>\n", " <td>dallas tx</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>640</th>\n", " <td>how far is atlanta from ga</td>\n", " <td>atlanta</td>\n", " <td>ga</td>\n", " <td>information_intent</td>\n", " <td></td>\n", " <td>ga</td>\n", " </tr>\n", " <tr>\n", " <th>641</th>\n", " <td>boston, ma locksmith services</td>\n", " <td>boston</td>\n", " <td>ma</td>\n", " <td>yelp_intent</td>\n", " <td>boston</td>\n", " <td>mamith</td>\n", " </tr>\n", " <tr>\n", " <th>654</th>\n", " <td>pittsburgh weather radar</td>\n", " <td>pittsburgh</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>pittsburgh radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>721</th>\n", " <td>denver, co public schools</td>\n", " <td>denver</td>\n", " <td>co</td>\n", " <td>information_intent</td>\n", " <td>denver</td>\n", " <td>co public</td>\n", " </tr>\n", " <tr>\n", " <th>725</th>\n", " <td>san jose airport information</td>\n", " <td>san jose</td>\n", " <td></td>\n", " <td>travel_intent</td>\n", " <td>san jose information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>733</th>\n", " <td>cleveland, oh public schools</td>\n", " <td>cleveland</td>\n", " <td>oh</td>\n", " <td></td>\n", " <td>cleveland</td>\n", " <td>oh public</td>\n", " </tr>\n", " <tr>\n", " <th>740</th>\n", " <td>mortgage rates in washington</td>\n", " <td></td>\n", " <td>washington</td>\n", " <td>yelp_intent</td>\n", " <td>washington</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>752</th>\n", " <td>san antonio airport information</td>\n", " <td>san antonio</td>\n", " <td></td>\n", " <td>navigation_intent</td>\n", " <td>san antonio information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>798</th>\n", " <td>how far is san jose from ca</td>\n", " <td>san jose</td>\n", " 
<td>ca</td>\n", " <td>information_intent</td>\n", " <td>san</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>813</th>\n", " <td>st. louis parking information</td>\n", " <td>st. louis</td>\n", " <td></td>\n", " <td>navigation_intent</td>\n", " <td>st. louis information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>830</th>\n", " <td>how far is charlotte from nc</td>\n", " <td>charlotte</td>\n", " <td>nc</td>\n", " <td>information_intent</td>\n", " <td></td>\n", " <td>nc</td>\n", " </tr>\n", " <tr>\n", " <th>833</th>\n", " <td>dallas weather radar</td>\n", " <td>dallas</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>dallas radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>852</th>\n", " <td>st. louis airport information</td>\n", " <td>st. louis</td>\n", " <td></td>\n", " <td>navigation_intent</td>\n", " <td>st. louis information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>886</th>\n", " <td>san jose parking information</td>\n", " <td>san jose</td>\n", " <td></td>\n", " <td>navigation_intent</td>\n", " <td>san jose information</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>924</th>\n", " <td>portland weather radar</td>\n", " <td>portland</td>\n", " <td></td>\n", " <td>weather_intent</td>\n", " <td>portland radar</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>927</th>\n", " <td>how far is nashville from tn</td>\n", " <td>nashville</td>\n", " <td>tn</td>\n", " <td>information_intent</td>\n", " <td>nashville tn</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>993</th>\n", " <td>nv licensed contractors</td>\n", " <td></td>\n", " <td>nv</td>\n", " <td>yelp_intent</td>\n", " <td></td>\n", " <td>n</td>\n", " </tr>\n", " <tr>\n", " <th>1023</th>\n", " <td>texas high school rankings</td>\n", " <td></td>\n", " <td>texas</td>\n", " <td>information_intent</td>\n", " <td>texas</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1026</th>\n", " <td>new york housing prices</td>\n", " <td></td>\n", " <td>new 
york</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1033</th>\n", " <td>mental health services in new york</td>\n", " <td></td>\n", " <td>new york</td>\n", " <td>yelp_intent</td>\n", " <td>new</td>\n", " <td>york</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " queries city state \\\n", "130 chicago weather radar chicago \n", "227 las vegas weather radar las vegas \n", "284 denver weather radar denver \n", "294 oh dmv locations oh \n", "331 chicago, il public schools chicago il \n", "356 new york weather radar new york \n", "379 st. louis, mo public schools st. louis mo \n", "405 portland, or public schools portland or \n", "431 mortgage rates in new york new york \n", "437 new york high school rankings new york \n", "453 theater performances in new york new york \n", "454 nv dmv locations nv \n", "527 washington housing prices washington \n", "531 portland, or locksmith services portland or \n", "555 how far is pittsburgh from pa pittsburgh pa \n", "564 new york airport information new york \n", "588 new york marathon registration new york \n", "617 how far is dallas from tx dallas tx \n", "640 how far is atlanta from ga atlanta ga \n", "641 boston, ma locksmith services boston ma \n", "654 pittsburgh weather radar pittsburgh \n", "721 denver, co public schools denver co \n", "725 san jose airport information san jose \n", "733 cleveland, oh public schools cleveland oh \n", "740 mortgage rates in washington washington \n", "752 san antonio airport information san antonio \n", "798 how far is san jose from ca san jose ca \n", "813 st. louis parking information st. louis \n", "830 how far is charlotte from nc charlotte nc \n", "833 dallas weather radar dallas \n", "852 st. louis airport information st. 
louis \n", "886 san jose parking information san jose \n", "924 portland weather radar portland \n", "927 how far is nashville from tn nashville tn \n", "993 nv licensed contractors nv \n", "1023 texas high school rankings texas \n", "1026 new york housing prices new york \n", "1033 mental health services in new york new york \n", "\n", " intent_pred city_pred state_pred \n", "130 weather_intent chicago radar \n", "227 weather_intent las vegas radar \n", "284 weather_intent denver radar \n", "294 navigation_intent ##m oh \n", "331 chicago il public \n", "356 weather_intent new radar york \n", "379 st. louis mo public \n", "405 information_intent portland or public \n", "431 yelp_intent new york \n", "437 information_intent new york \n", "453 translation_intent new york \n", "454 navigation_intent n \n", "527 yelp_intent washington \n", "531 yelp_intent portland or locksmith \n", "555 information_intent pittsburgh pa \n", "564 navigation_intent new york information \n", "588 navigation_intent new york \n", "617 information_intent dallas tx \n", "640 information_intent ga \n", "641 yelp_intent boston mamith \n", "654 weather_intent pittsburgh radar \n", "721 information_intent denver co public \n", "725 travel_intent san jose information \n", "733 cleveland oh public \n", "740 yelp_intent washington \n", "752 navigation_intent san antonio information \n", "798 information_intent san \n", "813 navigation_intent st. louis information \n", "830 information_intent nc \n", "833 weather_intent dallas radar \n", "852 navigation_intent st. 
louis information \n", "886 navigation_intent san jose information \n", "924 weather_intent portland radar \n", "927 information_intent nashville tn \n", "993 yelp_intent n \n", "1023 information_intent texas \n", "1026 new york \n", "1033 yelp_intent new york " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows where the Fx ML city or state prediction differs from its label\n", "fx_city_mismatch = ner_fx_results['city'] != ner_fx_results['city_pred']\n", "fx_state_mismatch = ner_fx_results['state'] != ner_fx_results['state_pred']\n", "ner_fx_results.loc[fx_city_mismatch | fx_state_mismatch]" ] }, { "cell_type": "code", "execution_count": null, "id": "5f2d0ec9-7fa7-415b-9ede-c915835698bb", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 5 }