notebooks/intent_evaluation.ipynb (3,319 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "247555cc-db4a-44a4-909d-4c951896763e",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import pandas as pd\n",
"import json\n",
"\n",
"# Add the project root directory to the Python path\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
"sys.path.append(project_root)\n",
"\n",
"# Now you can import the IntentClassifier from src/infer_intent.py\n",
"from src.infer_intent import IntentClassifier\n",
"from tqdm import tqdm\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "412924e2-5042-4f15-a4f1-f05fc7a06e8b",
"metadata": {},
"outputs": [],
"source": [
"# with open('../data/internal/search_examples.txt', 'r', encoding='utf-8') as f:\n",
"# yelp_texts = f.read().split('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "dc46d20e-6ed9-4606-b0c7-7f9128f7b70c",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# yelp_texts[:10]"
]
},
{
"cell_type": "markdown",
"id": "80c69759-97a4-4d3b-8fe7-013a1c042129",
"metadata": {},
"source": [
"## Using yelp Keywords for validation"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f1716546-a51b-4f73-adf1-6c94ea8eeef8",
"metadata": {},
"outputs": [],
"source": [
"tmp2 = pd.read_json(\"https://firefox-settings-attachments.cdn.mozilla.net/main-workspace/quicksuggest/33987d71-9e87-4b7e-86d3-6f292b89e8bf.json\")\n",
"yelp_texts = tmp2['subjects'][0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e74e7cb-6795-45d4-8e64-13d289035d7f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2870"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tmp2['subjects'][0])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b4d515ee-9a95-4e08-85a0-4046cc417035",
"metadata": {},
"outputs": [],
"source": [
"# !ls -ltrsh models/"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "383dff8c-3382-4c69-8609-711ae6a2c0a1",
"metadata": {},
"outputs": [],
"source": [
"cls = IntentClassifier()"
]
},
{
"cell_type": "markdown",
"id": "aa3f52e3-80c9-477a-aa46-0beafa20b9ea",
"metadata": {},
"source": [
"#### Yelp internal data evaluation (python)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "2d921e12-2d87-405f-9bc0-c655d2e61ff0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 2%|█▏ | 52/2870 [00:00<00:42, 65.60it/s]/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: overflow encountered in exp\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: invalid value encountered in divide\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"100%|█████████████████████████████████████████████████████████████| 2870/2870 [00:43<00:00, 66.38it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"yelp hit rate (accuracy) = 0.9275261324041811\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"yelp_hit_count = 0\n",
"yelp_queries_misclassified = []\n",
"yelp_queries_pred_res = []\n",
"for query in tqdm(yelp_texts):\n",
" pred_result, pred_proba = cls.find_intent(query)\n",
" if pred_result == 'yelp_intent' and pred_proba[pred_result] > 0.5:\n",
" yelp_hit_count += 1\n",
" else:\n",
" yelp_queries_misclassified.append({'query': query, 'pred_result': pred_result, 'pred_proba': pred_proba[pred_result]})\n",
" yelp_queries_pred_res.append({'query': query, 'pred_result': pred_result, 'pred_proba': pred_proba[pred_result]})\n",
"print(f\"yelp hit rate (accuracy) = {yelp_hit_count/len(yelp_texts)}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3d8d68a4-7478-47da-ac23-e56053f6570d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>pred_proba</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>24 hour cleaning services</td>\n",
" <td>yelp_intent</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>24 hour maid service</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.975</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>24 hour pharmacy</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>24 hour restaurants</td>\n",
" <td>yelp_intent</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>24 hour walmart</td>\n",
" <td>navigation_intent</td>\n",
" <td>0.761</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2865</th>\n",
" <td>yoga</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2866</th>\n",
" <td>yogurstory</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2867</th>\n",
" <td>you move me</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2868</th>\n",
" <td>yume wo katare</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2869</th>\n",
" <td>zara charlotte nc</td>\n",
" <td>weather_intent</td>\n",
" <td>0.397</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2870 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" query pred_result pred_proba\n",
"0 24 hour cleaning services yelp_intent 1.000\n",
"1 24 hour maid service yelp_intent 0.975\n",
"2 24 hour pharmacy yelp_intent 0.999\n",
"3 24 hour restaurants yelp_intent 1.000\n",
"4 24 hour walmart navigation_intent 0.761\n",
"... ... ... ...\n",
"2865 yoga yelp_intent 0.901\n",
"2866 yogurstory yelp_intent 0.786\n",
"2867 you move me yelp_intent 0.614\n",
"2868 yume wo katare yelp_intent 0.676\n",
"2869 zara charlotte nc weather_intent 0.397\n",
"\n",
"[2870 rows x 3 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yelp_queries_pred_res_df = pd.DataFrame(yelp_queries_pred_res)\n",
"yelp_queries_pred_res_df\n",
"# .groupby('pred_result')['pred_proba'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "df8ba85b-5436-4477-bde1-825610bf2f40",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>5%</th>\n",
" <th>10%</th>\n",
" <th>20%</th>\n",
" <th>25%</th>\n",
" <th>30%</th>\n",
" <th>40%</th>\n",
" <th>50%</th>\n",
" <th>60%</th>\n",
" <th>70%</th>\n",
" <th>75%</th>\n",
" <th>80%</th>\n",
" <th>90%</th>\n",
" <th>95%</th>\n",
" <th>98%</th>\n",
" <th>99%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pred_result</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>information_intent</th>\n",
" <td>75.0</td>\n",
" <td>0.734067</td>\n",
" <td>0.165696</td>\n",
" <td>0.345</td>\n",
" <td>0.42590</td>\n",
" <td>0.4862</td>\n",
" <td>0.6062</td>\n",
" <td>0.62400</td>\n",
" <td>0.6584</td>\n",
" <td>0.6996</td>\n",
" <td>0.7550</td>\n",
" <td>0.8048</td>\n",
" <td>0.8372</td>\n",
" <td>0.86750</td>\n",
" <td>0.8936</td>\n",
" <td>0.9296</td>\n",
" <td>0.96260</td>\n",
" <td>0.98472</td>\n",
" <td>0.99026</td>\n",
" <td>0.991</td>\n",
" </tr>\n",
" <tr>\n",
" <th>navigation_intent</th>\n",
" <td>8.0</td>\n",
" <td>0.684125</td>\n",
" <td>0.201880</td>\n",
" <td>0.390</td>\n",
" <td>0.43585</td>\n",
" <td>0.4817</td>\n",
" <td>0.5346</td>\n",
" <td>0.54650</td>\n",
" <td>0.5577</td>\n",
" <td>0.5766</td>\n",
" <td>0.6715</td>\n",
" <td>0.7706</td>\n",
" <td>0.8042</td>\n",
" <td>0.82600</td>\n",
" <td>0.8498</td>\n",
" <td>0.9073</td>\n",
" <td>0.94265</td>\n",
" <td>0.96386</td>\n",
" <td>0.97093</td>\n",
" <td>0.978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>purchase_intent</th>\n",
" <td>10.0</td>\n",
" <td>0.531500</td>\n",
" <td>0.189461</td>\n",
" <td>0.265</td>\n",
" <td>0.29245</td>\n",
" <td>0.3199</td>\n",
" <td>0.3892</td>\n",
" <td>0.40700</td>\n",
" <td>0.4106</td>\n",
" <td>0.4484</td>\n",
" <td>0.4960</td>\n",
" <td>0.5572</td>\n",
" <td>0.6499</td>\n",
" <td>0.70525</td>\n",
" <td>0.7396</td>\n",
" <td>0.7597</td>\n",
" <td>0.78535</td>\n",
" <td>0.80074</td>\n",
" <td>0.80587</td>\n",
" <td>0.811</td>\n",
" </tr>\n",
" <tr>\n",
" <th>translation_intent</th>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>travel_intent</th>\n",
" <td>21.0</td>\n",
" <td>0.688714</td>\n",
" <td>0.190144</td>\n",
" <td>0.398</td>\n",
" <td>0.41100</td>\n",
" <td>0.4580</td>\n",
" <td>0.5280</td>\n",
" <td>0.53700</td>\n",
" <td>0.5500</td>\n",
" <td>0.6340</td>\n",
" <td>0.6680</td>\n",
" <td>0.6820</td>\n",
" <td>0.8040</td>\n",
" <td>0.86500</td>\n",
" <td>0.9120</td>\n",
" <td>0.9550</td>\n",
" <td>0.96800</td>\n",
" <td>0.97520</td>\n",
" <td>0.97760</td>\n",
" <td>0.980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>22.0</td>\n",
" <td>0.606636</td>\n",
" <td>0.214391</td>\n",
" <td>0.228</td>\n",
" <td>0.32735</td>\n",
" <td>0.3547</td>\n",
" <td>0.4006</td>\n",
" <td>0.48225</td>\n",
" <td>0.4923</td>\n",
" <td>0.5140</td>\n",
" <td>0.5755</td>\n",
" <td>0.6664</td>\n",
" <td>0.7280</td>\n",
" <td>0.76175</td>\n",
" <td>0.8340</td>\n",
" <td>0.9240</td>\n",
" <td>0.93000</td>\n",
" <td>0.93696</td>\n",
" <td>0.93948</td>\n",
" <td>0.942</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weather_intent</th>\n",
" <td>4.0</td>\n",
" <td>0.626000</td>\n",
" <td>0.220675</td>\n",
" <td>0.397</td>\n",
" <td>0.40900</td>\n",
" <td>0.4210</td>\n",
" <td>0.4450</td>\n",
" <td>0.45700</td>\n",
" <td>0.4690</td>\n",
" <td>0.5442</td>\n",
" <td>0.6450</td>\n",
" <td>0.7458</td>\n",
" <td>0.8134</td>\n",
" <td>0.81400</td>\n",
" <td>0.8146</td>\n",
" <td>0.8158</td>\n",
" <td>0.81640</td>\n",
" <td>0.81676</td>\n",
" <td>0.81688</td>\n",
" <td>0.817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yelp_intent</th>\n",
" <td>2683.0</td>\n",
" <td>0.963171</td>\n",
" <td>0.095206</td>\n",
" <td>0.339</td>\n",
" <td>0.73610</td>\n",
" <td>0.8890</td>\n",
" <td>0.9734</td>\n",
" <td>0.98600</td>\n",
" <td>0.9920</td>\n",
" <td>0.9970</td>\n",
" <td>0.9990</td>\n",
" <td>0.9990</td>\n",
" <td>1.0000</td>\n",
" <td>1.00000</td>\n",
" <td>1.0000</td>\n",
" <td>1.0000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 5% 10% \\\n",
"pred_result \n",
"information_intent 75.0 0.734067 0.165696 0.345 0.42590 0.4862 \n",
"navigation_intent 8.0 0.684125 0.201880 0.390 0.43585 0.4817 \n",
"purchase_intent 10.0 0.531500 0.189461 0.265 0.29245 0.3199 \n",
"translation_intent 0.0 NaN NaN NaN NaN NaN \n",
"travel_intent 21.0 0.688714 0.190144 0.398 0.41100 0.4580 \n",
"unknown 22.0 0.606636 0.214391 0.228 0.32735 0.3547 \n",
"weather_intent 4.0 0.626000 0.220675 0.397 0.40900 0.4210 \n",
"yelp_intent 2683.0 0.963171 0.095206 0.339 0.73610 0.8890 \n",
"\n",
" 20% 25% 30% 40% 50% 60% 70% \\\n",
"pred_result \n",
"information_intent 0.6062 0.62400 0.6584 0.6996 0.7550 0.8048 0.8372 \n",
"navigation_intent 0.5346 0.54650 0.5577 0.5766 0.6715 0.7706 0.8042 \n",
"purchase_intent 0.3892 0.40700 0.4106 0.4484 0.4960 0.5572 0.6499 \n",
"translation_intent NaN NaN NaN NaN NaN NaN NaN \n",
"travel_intent 0.5280 0.53700 0.5500 0.6340 0.6680 0.6820 0.8040 \n",
"unknown 0.4006 0.48225 0.4923 0.5140 0.5755 0.6664 0.7280 \n",
"weather_intent 0.4450 0.45700 0.4690 0.5442 0.6450 0.7458 0.8134 \n",
"yelp_intent 0.9734 0.98600 0.9920 0.9970 0.9990 0.9990 1.0000 \n",
"\n",
" 75% 80% 90% 95% 98% 99% max \n",
"pred_result \n",
"information_intent 0.86750 0.8936 0.9296 0.96260 0.98472 0.99026 0.991 \n",
"navigation_intent 0.82600 0.8498 0.9073 0.94265 0.96386 0.97093 0.978 \n",
"purchase_intent 0.70525 0.7396 0.7597 0.78535 0.80074 0.80587 0.811 \n",
"translation_intent NaN NaN NaN NaN NaN NaN NaN \n",
"travel_intent 0.86500 0.9120 0.9550 0.96800 0.97520 0.97760 0.980 \n",
"unknown 0.76175 0.8340 0.9240 0.93000 0.93696 0.93948 0.942 \n",
"weather_intent 0.81400 0.8146 0.8158 0.81640 0.81676 0.81688 0.817 \n",
"yelp_intent 1.00000 1.0000 1.0000 1.00000 1.00000 1.00000 1.000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yelp_queries_pred_res_df.groupby('pred_result')['pred_proba'].describe(percentiles=[.05, .1,.2,.25,.3, .4,.5,.6, .7, .75, .8, .9, .95, .98, .99])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ff3a2f1c-cd7e-44e1-ae5d-f7a53baa7eb9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"48"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(yelp_queries_pred_res_df.loc[yelp_queries_pred_res_df['pred_proba'] < 0.5])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46edb91e-4134-4466-9485-769820f94389",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 12,
"id": "541325d3-a508-481b-b4aa-b798c04d8791",
"metadata": {},
"outputs": [],
"source": [
"## old\n",
"# yelp hit rate (accuracy) = 0.3256021409455843\n",
"\n",
"## new (Mozilla/mobilebert-uncased-finetuned-LoRA-intent-classifier)\n",
"# yelp hit rate (accuracy) = 0.6703835860838537\n",
"# yelp hit rate (accuracy) = 0.7832292595896521\n",
"# yelp hit rate (accuracy) = 0.8862622658340767\n",
"# yelp hit rate (accuracy) = 0.8742160278745644\n",
"# yelp hit rate (accuracy) = 0.9243902439024391"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c73f4c56-deea-41de-a32a-8f27de46bf5d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of yelp queries misclassified = 208 out of 2870 examples\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>pred_proba</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>24 hour walmart</td>\n",
" <td>navigation_intent</td>\n",
" <td>0.761</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>absolute bagels</td>\n",
" <td>information_intent</td>\n",
" <td>0.796</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>all that shabu</td>\n",
" <td>information_intent</td>\n",
" <td>0.893</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>aloha mamacita</td>\n",
" <td>information_intent</td>\n",
" <td>0.840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>american cruise lines</td>\n",
" <td>travel_intent</td>\n",
" <td>0.968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>203</th>\n",
" <td>where to find a painter</td>\n",
" <td>information_intent</td>\n",
" <td>0.532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>204</th>\n",
" <td>where to hire a painter</td>\n",
" <td>navigation_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205</th>\n",
" <td>wicked spoon</td>\n",
" <td>unknown</td>\n",
" <td>0.493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>206</th>\n",
" <td>wings</td>\n",
" <td>information_intent</td>\n",
" <td>0.669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>207</th>\n",
" <td>zara charlotte nc</td>\n",
" <td>weather_intent</td>\n",
" <td>0.397</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>208 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" query pred_result pred_proba\n",
"0 24 hour walmart navigation_intent 0.761\n",
"1 absolute bagels information_intent 0.796\n",
"2 all that shabu information_intent 0.893\n",
"3 aloha mamacita information_intent 0.840\n",
"4 american cruise lines travel_intent 0.968\n",
".. ... ... ...\n",
"203 where to find a painter information_intent 0.532\n",
"204 where to hire a painter navigation_intent NaN\n",
"205 wicked spoon unknown 0.493\n",
"206 wings information_intent 0.669\n",
"207 zara charlotte nc weather_intent 0.397\n",
"\n",
"[208 rows x 3 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yelp_queries_misclassified_df = pd.DataFrame(yelp_queries_misclassified)\n",
"print(f\"Number of yelp queries misclassified = {len(yelp_queries_misclassified_df)} out of {len(yelp_texts)} examples\")\n",
"yelp_queries_misclassified_df"
]
},
{
"cell_type": "markdown",
"id": "e7ac91e1-7e9e-45e1-bfb5-4b0ff18eed31",
"metadata": {},
"source": [
"#### Load the Fx ML inference output (using Fx js)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "35fbb3af-bf21-455c-b226-0a8ab2e50732",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fx ML yelp intent hit rate = 0.926829268292683\n"
]
}
],
"source": [
"fx_ml_intent = pd.read_json(\"../data/ML_output_YELP_KEYWORDS_DATA.json\")\n",
"fx_ml_hit_rate_for_yelp_intent = float(fx_ml_intent['intent'].value_counts(normalize=True)['yelp_intent'])\n",
"print(f\"Fx ML yelp intent hit rate = {fx_ml_hit_rate_for_yelp_intent}\")"
]
},
{
"cell_type": "markdown",
"id": "dc678c1f-9e9f-4050-a164-5e533fde6e50",
"metadata": {},
"source": [
"## Using Yelp additional validation dataset"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "da4d187b-7989-402d-90ef-1cd3680ccd15",
"metadata": {},
"outputs": [],
"source": [
"tqdm.pandas()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e1b1f45d-5664-4b6b-9cd7-ecf3f70a0481",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 2%|█ | 32/2000 [00:00<00:27, 71.10it/s]/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: overflow encountered in exp\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: invalid value encountered in divide\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:30<00:00, 65.80it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"yelp generated val data hit rate (accuracy) = 0.956\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"yelp_val_generated_data = pd.read_csv(\"../data/yelp_val_generated_data.csv\")\n",
"yelp_val_generated_data['pred_result'] = yelp_val_generated_data['queries'].progress_apply(lambda query: cls.find_intent(query)[0])\n",
"hit_rate = float(yelp_val_generated_data['pred_result'].value_counts(normalize=True)['yelp_intent'])\n",
"print(f\"yelp generated val data hit rate (accuracy) = {hit_rate}\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d704924f-e350-4034-808e-7083b35f972f",
"metadata": {},
"outputs": [],
"source": [
"# yelp generated val data hit rate (accuracy) = 0.982"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "3b03051e-90d2-46ef-bba2-840f3d999dee",
"metadata": {},
"outputs": [],
"source": [
"queries_array = yelp_val_generated_data['queries'].values\n",
"\n",
"# Create a dictionary structure\n",
"data = {\n",
" \"queries\": queries_array.tolist() \n",
"}\n",
"\n",
"# Define the output JSON file path\n",
"output_file_path = \"../data/yelp_val_generated_data.json\"\n",
"\n",
"## Uncomment to write it else its almost static dataset\n",
"# # Write to a JSON file\n",
"# with open(output_file_path, \"w\") as json_file:\n",
"# json.dump(data, json_file, indent=2)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "f18639e1-1fa8-44a0-9eb3-b63ce6007f2f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fx ML yelp intent2 hit rate = 0.959\n"
]
}
],
"source": [
"## yelp validation from Fx MLSuggest\n",
"\n",
"fx_ml_intent_val = pd.read_json(\"../data/ML_output_YELP_VAL_DATA.json\")\n",
"fx_ml_hit_rate_for_yelp_intent_2 = float(fx_ml_intent_val['intent'].value_counts(normalize=True)['yelp_intent'])\n",
"print(f\"Fx ML yelp intent2 hit rate = {fx_ml_hit_rate_for_yelp_intent_2}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f158b9d4-e5b7-4383-ae9d-eaeb490bf821",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "fb8eba2d-fcf2-471e-a7c0-9f815145c97a",
"metadata": {},
"source": [
"#### Weather internal data evaluation"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "04b547ef-854e-4127-8280-66198a1ba1d4",
"metadata": {},
"outputs": [],
"source": [
"with open('../data/internal/weather_search_examples.txt', 'r', encoding='utf-8') as f:\n",
" weather_texts = f.read().split('\\n')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "d8b64037-063c-428b-be5d-769f5e9bec13",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 17%|███████████▎ | 7/41 [00:00<00:00, 63.55it/s]/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: overflow encountered in exp\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: invalid value encountered in divide\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"100%|█████████████████████████████████████████████████████████████████| 41/41 [00:00<00:00, 65.72it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"weather hit rate (accuracy) = 0.7317073170731707\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"weather_hit_count = 0\n",
"weather_queries_misclassified = []\n",
"weather_queries_pred_res = []\n",
"for query in tqdm(weather_texts):\n",
" pred_result, pred_proba = cls.find_intent(query)\n",
" if pred_result == 'weather_intent' and pred_proba[pred_result] > 0.5:\n",
" weather_hit_count += 1\n",
" else:\n",
" weather_queries_misclassified.append({'query': query, 'pred_result': pred_result, 'pred_proba': pred_proba[pred_result]})\n",
" weather_queries_pred_res.append({'query': query, 'pred_result': pred_result, 'pred_proba': pred_proba[pred_result]})\n",
"print(f\"weather hit rate (accuracy) = {weather_hit_count/len(weather_texts)}\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "4389844a-bd81-4bd2-a6b4-5deb76387ba8",
"metadata": {},
"outputs": [],
"source": [
"## old\n",
"# weather hit rate (accuracy) = 0.3902439024390244\n",
"\n",
"## new (Mozilla/mobilebert-uncased-finetuned-LoRA-intent-classifier)\n",
"# weather hit rate (accuracy) = 0.4878048780487805\n",
"# weather hit rate (accuracy) = 0.6341463414634146\n",
"# weather hit rate (accuracy) = 0.6585365853658537\n",
"# weather hit rate (accuracy) = 0.8048780487804879\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c8c6804a-a79d-4287-b32d-6420e72698df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of weather queries misclassified = 11 out of 41 examples\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>pred_proba</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>forcast</td>\n",
" <td>translation_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>wather</td>\n",
" <td>unknown</td>\n",
" <td>0.280</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>vindy</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.795</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>sunny</td>\n",
" <td>translation_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>air quality</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.730</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>pollen</td>\n",
" <td>information_intent</td>\n",
" <td>0.736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>tiempo</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.959</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>clima</td>\n",
" <td>unknown</td>\n",
" <td>0.367</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>doppler radar</td>\n",
" <td>information_intent</td>\n",
" <td>0.711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>map</td>\n",
" <td>information_intent</td>\n",
" <td>0.775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>radar near me</td>\n",
" <td>yelp_intent</td>\n",
" <td>0.831</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query pred_result pred_proba\n",
"0 forcast translation_intent NaN\n",
"1 wather unknown 0.280\n",
"2 vindy yelp_intent 0.795\n",
"3 sunny translation_intent NaN\n",
"4 air quality yelp_intent 0.730\n",
"5 pollen information_intent 0.736\n",
"6 tiempo yelp_intent 0.959\n",
"7 clima unknown 0.367\n",
"8 doppler radar information_intent 0.711\n",
"9 map information_intent 0.775\n",
"10 radar near me yelp_intent 0.831"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weather_queries_misclassified_df = pd.DataFrame(weather_queries_misclassified)\n",
"print(f\"Number of weather queries misclassified = {len(weather_queries_misclassified)} out of {len(weather_texts)} examples\")\n",
"weather_queries_misclassified_df"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "45b63df3-964e-4f1b-b672-bb5fb1e081ea",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 39.000000\n",
"mean 0.872385\n",
"std 0.171236\n",
"min 0.280000\n",
"10% 0.681000\n",
"20% 0.759400\n",
"25% 0.804500\n",
"30% 0.837400\n",
"40% 0.874200\n",
"50% 0.959000\n",
"60% 0.971800\n",
"70% 0.995800\n",
"75% 0.997000\n",
"80% 0.997400\n",
"90% 0.999200\n",
"95% 1.000000\n",
"98% 1.000000\n",
"99% 1.000000\n",
"max 1.000000\n",
"Name: pred_proba, dtype: float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(weather_queries_pred_res)['pred_proba'].describe(percentiles=[.1,.2,.25,.3, .4,.5,.6, .7, .75, .8, .9, .95, .98, .99])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d73149b-1095-496a-8034-21b9f309d89e",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4a33983b-370d-44e6-809b-658a34b2c09c",
"metadata": {},
"source": [
"#### Weather validate the generated data with pre and post modifiers"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "fe4ce915-b9f5-4c8a-aacb-bfe82cbbd8ec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 7%|████ | 132/2000 [00:01<00:28, 65.96it/s]/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: overflow encountered in exp\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: invalid value encountered in divide\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"100%|█████████████████████████████████████████████████████████████| 2000/2000 [00:30<00:00, 65.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"weather generated val data hit rate (accuracy) = 0.7205\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"weather_val_generated_data = pd.read_csv(\"../data/weather_val_generated_data.csv\")\n",
"weather_val_generated_data['pred_result'] = weather_val_generated_data['queries'].progress_apply(lambda query: cls.find_intent(query)[0])\n",
"hit_rate = float(weather_val_generated_data['pred_result'].value_counts(normalize=True)['weather_intent'])\n",
"print(f\"weather generated val data hit rate (accuracy) = {hit_rate}\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "ca8e932c-7488-45f0-b123-7fac2c01204d",
"metadata": {},
"outputs": [],
"source": [
"# weather generated val data hit rate (accuracy) = 0.643\n",
"# weather generated val data hit rate (accuracy) = 0.79"
]
},
{
"cell_type": "markdown",
"id": "32b0aa13-0558-4515-bd21-35dcdd9f4bc3",
"metadata": {},
"source": [
"#### Orcas golden dataset for intent validation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "471b09c8-fa99-4d39-b442-5e69f91ec4d3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000\n"
]
}
],
"source": [
"orcas_data = pd.read_csv(\"../data/ORCAS_golden_dataset.tsv\", sep='\\t')\n",
"print(len(orcas_data))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "632b43a2-d39f-4f3a-af32-f7645084e671",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>url</th>\n",
" <th>label_manual</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>best reads</td>\n",
" <td>http://thegreatestbooks.org/</td>\n",
" <td>Abstain</td>\n",
" <td>information_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>tamerind</td>\n",
" <td>https://en.wikipedia.org/wiki/Tamarind</td>\n",
" <td>Factual</td>\n",
" <td>information_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>show mi ip</td>\n",
" <td>http://showip.net/</td>\n",
" <td>Instrumental</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>do carpenter ants eat wood</td>\n",
" <td>https://doyourownpestcontrol.com/carp.htm</td>\n",
" <td>Factual</td>\n",
" <td>information_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>rheumatoid arthritis in children</td>\n",
" <td>https://www.webmd.com/rheumatoid-arthritis/und...</td>\n",
" <td>Abstain</td>\n",
" <td>information_intent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query \\\n",
"0 best reads \n",
"1 tamerind \n",
"2 show mi ip \n",
"3 do carpenter ants eat wood \n",
"4 rheumatoid arthritis in children \n",
"\n",
" url label_manual \\\n",
"0 http://thegreatestbooks.org/ Abstain \n",
"1 https://en.wikipedia.org/wiki/Tamarind Factual \n",
"2 http://showip.net/ Instrumental \n",
"3 https://doyourownpestcontrol.com/carp.htm Factual \n",
"4 https://www.webmd.com/rheumatoid-arthritis/und... Abstain \n",
"\n",
" target \n",
"0 information_intent \n",
"1 information_intent \n",
"2 navigation_intent \n",
"3 information_intent \n",
"4 information_intent "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows (columns: query, url, label_manual, target).\n",
"orcas_data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "d4e2f712-b0a7-4b98-a314-eba7b0eaaf72",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target\n",
"information_intent 591\n",
"navigation_intent 265\n",
"unknown 78\n",
"purchase_intent 27\n",
"yelp_intent 15\n",
"weather_intent 14\n",
"travel_intent 9\n",
"translation_intent 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class balance of the gold intent labels in the golden set.\n",
"orcas_data.loc[:, 'target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "91e30e51-b14a-4c31-8ecc-985a951a700a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/1000 [00:00<?, ?it/s]/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: overflow encountered in exp\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"/Users/cgopal/work/smart_intent/src/infer_intent.py:52: RuntimeWarning: invalid value encountered in divide\n",
" probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)\n",
"100%|█████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 65.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"intent hit rate (accuracy) = 0.517\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Score every ORCAS query and bucket the outcome by prediction confidence.\n",
"# A prediction is accepted only when the probability of the predicted label is\n",
"# strictly above INTENT_CONFIDENCE_THRESHOLD; everything else is recorded as skipped.\n",
"# TODO (carried over from the original cell, which left it unspecified):\n",
"# presumably the cut-off below still needs tuning -- confirm.\n",
"INTENT_CONFIDENCE_THRESHOLD = 0.5\n",
"\n",
"intent_hit_count = 0\n",
"intent_queries_misclassified = []\n",
"intent_queries_classified = []\n",
"intent_queries_below_thresholds = []\n",
"skipped_for_threshold = 0\n",
"nan_proba_count = 0  # skipped predictions whose probability is NaN rather than merely low\n",
"\n",
"for query, target in tqdm(orcas_data[['query', 'target']].values.tolist()):\n",
"    pred_result, pred_proba = cls.find_intent(query)\n",
"    confidence = pred_proba[pred_result]  # probability of the predicted label, looked up once\n",
"    record = {'query': query, 'pred_result': pred_result, 'target': target, 'pred_proba': confidence}\n",
"    if confidence > INTENT_CONFIDENCE_THRESHOLD:\n",
"        intent_queries_classified.append(record)\n",
"        if pred_result == target:\n",
"            intent_hit_count += 1\n",
"        else:\n",
"            # Keep pred_proba here too, so wrong-but-confident cases can be ranked later.\n",
"            intent_queries_misclassified.append(dict(record))\n",
"    else:\n",
"        # NaN never compares greater than the threshold, so NaN probabilities end up\n",
"        # in this branch as well; count them so they are not read as low confidence.\n",
"        if pd.isna(confidence):\n",
"            nan_proba_count += 1\n",
"        skipped_for_threshold += 1\n",
"        intent_queries_below_thresholds.append(record)\n",
"\n",
"total_queries = len(orcas_data)\n",
"# Hit rate is over ALL queries (skipped ones count as misses); guard the empty-dataset case.\n",
"intent_hit_rate = intent_hit_count / total_queries if total_queries else float('nan')\n",
"print(f\"intent hit rate (accuracy) = {intent_hit_rate}\")\n",
"accepted_count = len(intent_queries_classified)\n",
"if accepted_count:\n",
"    print(f\"accuracy on the {accepted_count} predictions above threshold = {intent_hit_count / accepted_count:.3f}\")\n",
"if nan_proba_count:\n",
"    print(f\"WARNING: {nan_proba_count} skipped predictions have a NaN probability\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "31821c8b-a87b-4102-baae-07fb29ad725a",
"metadata": {},
"outputs": [],
"source": [
"# Reference value: presumably an earlier run of the evaluation cell above reported intent hit rate (accuracy) = 0.521"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "bdbae18c-3b37-41f7-995c-60a6795e8796",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"188"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of queries whose top prediction did not clear the confidence threshold\n",
"# (one record is appended to this list for every skipped query).\n",
"len(intent_queries_below_thresholds)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "7d4fc02c-e3d1-4311-b06e-1b6878a0af14",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>target</th>\n",
" <th>pred_proba</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>tamerind</td>\n",
" <td>travel_intent</td>\n",
" <td>information_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>rheumatoid arthritis in children</td>\n",
" <td>translation_intent</td>\n",
" <td>information_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>outlamder</td>\n",
" <td>unknown</td>\n",
" <td>unknown</td>\n",
" <td>0.346</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>is richard gere dead</td>\n",
" <td>unknown</td>\n",
" <td>information_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>cdczika</td>\n",
" <td>unknown</td>\n",
" <td>navigation_intent</td>\n",
" <td>0.441</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>usaa san antonio</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" <td>0.338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>184</th>\n",
" <td>news sport football</td>\n",
" <td>translation_intent</td>\n",
" <td>information_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>185</th>\n",
" <td>alveoli structure and function</td>\n",
" <td>navigation_intent</td>\n",
" <td>information_intent</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" <td>first health insurance reviews</td>\n",
" <td>information_intent</td>\n",
" <td>information_intent</td>\n",
" <td>0.325</td>\n",
" </tr>\n",
" <tr>\n",
" <th>187</th>\n",
" <td>sylane</td>\n",
" <td>unknown</td>\n",
" <td>unknown</td>\n",
" <td>0.484</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>188 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" query pred_result target \\\n",
"0 tamerind travel_intent information_intent \n",
"1 rheumatoid arthritis in children translation_intent information_intent \n",
"2 outlamder unknown unknown \n",
"3 is richard gere dead unknown information_intent \n",
"4 cdczika unknown navigation_intent \n",
".. ... ... ... \n",
"183 usaa san antonio information_intent navigation_intent \n",
"184 news sport football translation_intent information_intent \n",
"185 alveoli structure and function navigation_intent information_intent \n",
"186 first health insurance reviews information_intent information_intent \n",
"187 sylane unknown unknown \n",
"\n",
" pred_proba \n",
"0 NaN \n",
"1 NaN \n",
"2 0.346 \n",
"3 NaN \n",
"4 0.441 \n",
".. ... \n",
"183 0.338 \n",
"184 NaN \n",
"185 NaN \n",
"186 0.325 \n",
"187 0.484 \n",
"\n",
"[188 rows x 4 columns]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tabulate the skipped predictions (query, predicted intent, gold target, top probability).\n",
"pd.DataFrame(data=intent_queries_below_thresholds)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "df13e884-9789-4e47-8157-8c0a99f6f42c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>5%</th>\n",
" <th>10%</th>\n",
" <th>20%</th>\n",
" <th>25%</th>\n",
" <th>30%</th>\n",
" <th>40%</th>\n",
" <th>50%</th>\n",
" <th>60%</th>\n",
" <th>70%</th>\n",
" <th>75%</th>\n",
" <th>80%</th>\n",
" <th>90%</th>\n",
" <th>95%</th>\n",
" <th>98%</th>\n",
" <th>99%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pred_result</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>information_intent</th>\n",
" <td>38.0</td>\n",
" <td>0.425658</td>\n",
" <td>0.056925</td>\n",
" <td>0.325</td>\n",
" <td>0.33665</td>\n",
" <td>0.3557</td>\n",
" <td>0.3614</td>\n",
" <td>0.37175</td>\n",
" <td>0.3874</td>\n",
" <td>0.4046</td>\n",
" <td>0.4285</td>\n",
" <td>0.4682</td>\n",
" <td>0.4719</td>\n",
" <td>0.47825</td>\n",
" <td>0.4800</td>\n",
" <td>0.4906</td>\n",
" <td>0.49375</td>\n",
" <td>0.49852</td>\n",
" <td>0.49926</td>\n",
" <td>0.500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>navigation_intent</th>\n",
" <td>9.0</td>\n",
" <td>0.404444</td>\n",
" <td>0.046763</td>\n",
" <td>0.325</td>\n",
" <td>0.33900</td>\n",
" <td>0.3530</td>\n",
" <td>0.3648</td>\n",
" <td>0.36800</td>\n",
" <td>0.3768</td>\n",
" <td>0.3934</td>\n",
" <td>0.4070</td>\n",
" <td>0.4342</td>\n",
" <td>0.4428</td>\n",
" <td>0.44400</td>\n",
" <td>0.4440</td>\n",
" <td>0.4474</td>\n",
" <td>0.45420</td>\n",
" <td>0.45828</td>\n",
" <td>0.45964</td>\n",
" <td>0.461</td>\n",
" </tr>\n",
" <tr>\n",
" <th>purchase_intent</th>\n",
" <td>10.0</td>\n",
" <td>0.389600</td>\n",
" <td>0.075851</td>\n",
" <td>0.269</td>\n",
" <td>0.27440</td>\n",
" <td>0.2798</td>\n",
" <td>0.3210</td>\n",
" <td>0.34125</td>\n",
" <td>0.3597</td>\n",
" <td>0.3810</td>\n",
" <td>0.4050</td>\n",
" <td>0.4242</td>\n",
" <td>0.4344</td>\n",
" <td>0.44700</td>\n",
" <td>0.4584</td>\n",
" <td>0.4761</td>\n",
" <td>0.47655</td>\n",
" <td>0.47682</td>\n",
" <td>0.47691</td>\n",
" <td>0.477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>translation_intent</th>\n",
" <td>3.0</td>\n",
" <td>0.405333</td>\n",
" <td>0.087848</td>\n",
" <td>0.304</td>\n",
" <td>0.31880</td>\n",
" <td>0.3336</td>\n",
" <td>0.3632</td>\n",
" <td>0.37800</td>\n",
" <td>0.3928</td>\n",
" <td>0.4224</td>\n",
" <td>0.4520</td>\n",
" <td>0.4536</td>\n",
" <td>0.4552</td>\n",
" <td>0.45600</td>\n",
" <td>0.4568</td>\n",
" <td>0.4584</td>\n",
" <td>0.45920</td>\n",
" <td>0.45968</td>\n",
" <td>0.45984</td>\n",
" <td>0.460</td>\n",
" </tr>\n",
" <tr>\n",
" <th>travel_intent</th>\n",
" <td>10.0</td>\n",
" <td>0.398400</td>\n",
" <td>0.058525</td>\n",
" <td>0.302</td>\n",
" <td>0.30830</td>\n",
" <td>0.3146</td>\n",
" <td>0.3576</td>\n",
" <td>0.37075</td>\n",
" <td>0.3757</td>\n",
" <td>0.3976</td>\n",
" <td>0.4140</td>\n",
" <td>0.4200</td>\n",
" <td>0.4236</td>\n",
" <td>0.42450</td>\n",
" <td>0.4300</td>\n",
" <td>0.4543</td>\n",
" <td>0.47365</td>\n",
" <td>0.48526</td>\n",
" <td>0.48913</td>\n",
" <td>0.493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>9.0</td>\n",
" <td>0.405444</td>\n",
" <td>0.057090</td>\n",
" <td>0.334</td>\n",
" <td>0.33480</td>\n",
" <td>0.3356</td>\n",
" <td>0.3420</td>\n",
" <td>0.34600</td>\n",
" <td>0.3640</td>\n",
" <td>0.3946</td>\n",
" <td>0.4090</td>\n",
" <td>0.4346</td>\n",
" <td>0.4422</td>\n",
" <td>0.44300</td>\n",
" <td>0.4518</td>\n",
" <td>0.4688</td>\n",
" <td>0.47640</td>\n",
" <td>0.48096</td>\n",
" <td>0.48248</td>\n",
" <td>0.484</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weather_intent</th>\n",
" <td>3.0</td>\n",
" <td>0.456000</td>\n",
" <td>0.017059</td>\n",
" <td>0.442</td>\n",
" <td>0.44290</td>\n",
" <td>0.4438</td>\n",
" <td>0.4456</td>\n",
" <td>0.44650</td>\n",
" <td>0.4474</td>\n",
" <td>0.4492</td>\n",
" <td>0.4510</td>\n",
" <td>0.4558</td>\n",
" <td>0.4606</td>\n",
" <td>0.46300</td>\n",
" <td>0.4654</td>\n",
" <td>0.4702</td>\n",
" <td>0.47260</td>\n",
" <td>0.47404</td>\n",
" <td>0.47452</td>\n",
" <td>0.475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yelp_intent</th>\n",
" <td>25.0</td>\n",
" <td>0.433320</td>\n",
" <td>0.043974</td>\n",
" <td>0.340</td>\n",
" <td>0.36480</td>\n",
" <td>0.3788</td>\n",
" <td>0.3994</td>\n",
" <td>0.40600</td>\n",
" <td>0.4084</td>\n",
" <td>0.4216</td>\n",
" <td>0.4360</td>\n",
" <td>0.4490</td>\n",
" <td>0.4646</td>\n",
" <td>0.47100</td>\n",
" <td>0.4722</td>\n",
" <td>0.4894</td>\n",
" <td>0.49460</td>\n",
" <td>0.49760</td>\n",
" <td>0.49880</td>\n",
" <td>0.500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 5% 10% 20% \\\n",
"pred_result \n",
"information_intent 38.0 0.425658 0.056925 0.325 0.33665 0.3557 0.3614 \n",
"navigation_intent 9.0 0.404444 0.046763 0.325 0.33900 0.3530 0.3648 \n",
"purchase_intent 10.0 0.389600 0.075851 0.269 0.27440 0.2798 0.3210 \n",
"translation_intent 3.0 0.405333 0.087848 0.304 0.31880 0.3336 0.3632 \n",
"travel_intent 10.0 0.398400 0.058525 0.302 0.30830 0.3146 0.3576 \n",
"unknown 9.0 0.405444 0.057090 0.334 0.33480 0.3356 0.3420 \n",
"weather_intent 3.0 0.456000 0.017059 0.442 0.44290 0.4438 0.4456 \n",
"yelp_intent 25.0 0.433320 0.043974 0.340 0.36480 0.3788 0.3994 \n",
"\n",
" 25% 30% 40% 50% 60% 70% 75% \\\n",
"pred_result \n",
"information_intent 0.37175 0.3874 0.4046 0.4285 0.4682 0.4719 0.47825 \n",
"navigation_intent 0.36800 0.3768 0.3934 0.4070 0.4342 0.4428 0.44400 \n",
"purchase_intent 0.34125 0.3597 0.3810 0.4050 0.4242 0.4344 0.44700 \n",
"translation_intent 0.37800 0.3928 0.4224 0.4520 0.4536 0.4552 0.45600 \n",
"travel_intent 0.37075 0.3757 0.3976 0.4140 0.4200 0.4236 0.42450 \n",
"unknown 0.34600 0.3640 0.3946 0.4090 0.4346 0.4422 0.44300 \n",
"weather_intent 0.44650 0.4474 0.4492 0.4510 0.4558 0.4606 0.46300 \n",
"yelp_intent 0.40600 0.4084 0.4216 0.4360 0.4490 0.4646 0.47100 \n",
"\n",
" 80% 90% 95% 98% 99% max \n",
"pred_result \n",
"information_intent 0.4800 0.4906 0.49375 0.49852 0.49926 0.500 \n",
"navigation_intent 0.4440 0.4474 0.45420 0.45828 0.45964 0.461 \n",
"purchase_intent 0.4584 0.4761 0.47655 0.47682 0.47691 0.477 \n",
"translation_intent 0.4568 0.4584 0.45920 0.45968 0.45984 0.460 \n",
"travel_intent 0.4300 0.4543 0.47365 0.48526 0.48913 0.493 \n",
"unknown 0.4518 0.4688 0.47640 0.48096 0.48248 0.484 \n",
"weather_intent 0.4654 0.4702 0.47260 0.47404 0.47452 0.475 \n",
"yelp_intent 0.4722 0.4894 0.49460 0.49760 0.49880 0.500 "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-predicted-intent distribution of the top probability among skipped predictions.\n",
"below_threshold_df = pd.DataFrame(intent_queries_below_thresholds)\n",
"quantile_levels = [.05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .98, .99]\n",
"below_threshold_df.groupby('pred_result')['pred_proba'].describe(percentiles=quantile_levels)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "af55a1a0-d705-4612-b42b-ce7133760e91",
"metadata": {},
"outputs": [],
"source": [
"# Materialise the accepted predictions, and the wrong ones among them, as DataFrames.\n",
"intent_queries_classified_df, intent_queries_misclassified_df = (\n",
"    pd.DataFrame(records)\n",
"    for records in (intent_queries_classified, intent_queries_misclassified)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "687a7ce6-6ff5-4a1c-a1ad-3116e27fb5b8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target\n",
"information_intent 122\n",
"navigation_intent 99\n",
"unknown 57\n",
"purchase_intent 10\n",
"yelp_intent 3\n",
"travel_intent 3\n",
"translation_intent 1\n",
"Name: count, dtype: int64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# How many accepted-but-wrong predictions fall under each gold target.\n",
"intent_queries_misclassified_df.loc[:, 'target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "12e7a3b6-0156-4843-9538-fba3850272d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>pred_result</th>\n",
" <th>information_intent</th>\n",
" <th>navigation_intent</th>\n",
" <th>purchase_intent</th>\n",
" <th>translation_intent</th>\n",
" <th>travel_intent</th>\n",
" <th>unknown</th>\n",
" <th>weather_intent</th>\n",
" <th>yelp_intent</th>\n",
" </tr>\n",
" <tr>\n",
" <th>target</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>information_intent</th>\n",
" <td>354</td>\n",
" <td>32</td>\n",
" <td>8</td>\n",
" <td>2</td>\n",
" <td>8</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>navigation_intent</th>\n",
" <td>44</td>\n",
" <td>117</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>purchase_intent</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>14</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>translation_intent</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>travel_intent</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>33</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weather_intent</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yelp_intent</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"pred_result information_intent navigation_intent purchase_intent \\\n",
"target \n",
"information_intent 354 32 8 \n",
"navigation_intent 44 117 4 \n",
"purchase_intent 5 1 14 \n",
"translation_intent 1 0 0 \n",
"travel_intent 1 0 0 \n",
"unknown 33 5 3 \n",
"weather_intent 0 0 0 \n",
"yelp_intent 3 0 0 \n",
"\n",
"pred_result translation_intent travel_intent unknown \\\n",
"target \n",
"information_intent 2 8 8 \n",
"navigation_intent 0 6 4 \n",
"purchase_intent 0 0 0 \n",
"translation_intent 0 0 0 \n",
"travel_intent 0 4 0 \n",
"unknown 0 1 2 \n",
"weather_intent 0 0 0 \n",
"yelp_intent 0 0 0 \n",
"\n",
"pred_result weather_intent yelp_intent \n",
"target \n",
"information_intent 3 61 \n",
"navigation_intent 1 40 \n",
"purchase_intent 0 4 \n",
"translation_intent 0 0 \n",
"travel_intent 0 2 \n",
"unknown 0 15 \n",
"weather_intent 14 0 \n",
"yelp_intent 0 12 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confusion matrix over accepted predictions: rows are gold targets, columns are predicted intents.\n",
"pd.crosstab(\n",
"    index=intent_queries_classified_df['target'],\n",
"    columns=intent_queries_classified_df['pred_result'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "6fc89f45-5f49-42fb-aa01-135c3996dd4a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>almond butter</td>\n",
" <td>information_intent</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>172</th>\n",
" <td>health insurance plans</td>\n",
" <td>information_intent</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183</th>\n",
" <td>gordo's</td>\n",
" <td>information_intent</td>\n",
" <td>yelp_intent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query pred_result target\n",
"9 almond butter information_intent yelp_intent\n",
"172 health insurance plans information_intent yelp_intent\n",
"183 gordo's information_intent yelp_intent"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Accepted-but-wrong predictions whose gold label is yelp_intent.\n",
"intent_queries_misclassified_df.query(\"target == 'yelp_intent'\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "5de170eb-8ce9-4a48-94a4-9bc416b009db",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>show mi ip</td>\n",
" <td>yelp_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>apple phone replacement</td>\n",
" <td>yelp_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>paycvor</td>\n",
" <td>unknown</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>background sound effects free download</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>dell support assistant repair</td>\n",
" <td>yelp_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>286</th>\n",
" <td>windows 10 turn off screen lock</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>288</th>\n",
" <td>starwars disneyland</td>\n",
" <td>travel_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>289</th>\n",
" <td>ncdps forms</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>292</th>\n",
" <td>riverside county ca property tax search</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" <tr>\n",
" <th>293</th>\n",
" <td>bank of america abuse reporting</td>\n",
" <td>information_intent</td>\n",
" <td>navigation_intent</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>99 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" query pred_result \\\n",
"1 show mi ip yelp_intent \n",
"8 apple phone replacement yelp_intent \n",
"11 paycvor unknown \n",
"13 background sound effects free download information_intent \n",
"15 dell support assistant repair yelp_intent \n",
".. ... ... \n",
"286 windows 10 turn off screen lock information_intent \n",
"288 starwars disneyland travel_intent \n",
"289 ncdps forms information_intent \n",
"292 riverside county ca property tax search information_intent \n",
"293 bank of america abuse reporting information_intent \n",
"\n",
" target \n",
"1 navigation_intent \n",
"8 navigation_intent \n",
"11 navigation_intent \n",
"13 navigation_intent \n",
"15 navigation_intent \n",
".. ... \n",
"286 navigation_intent \n",
"288 navigation_intent \n",
"289 navigation_intent \n",
"292 navigation_intent \n",
"293 navigation_intent \n",
"\n",
"[99 rows x 3 columns]"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Accepted-but-wrong predictions whose gold label is navigation_intent.\n",
"intent_queries_misclassified_df.query(\"target == 'navigation_intent'\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "4a0275f9-c7a5-401d-bdfc-5df76fe82d97",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>query</th>\n",
" <th>pred_result</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>chilie</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>mr hans</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>what is this ip</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>rmb and cny</td>\n",
" <td>navigation_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>dcm</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>i make mistakes</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>pho delivery</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>who sang it</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>j</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>no till food plot seed</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>system mechanic download bought already</td>\n",
" <td>purchase_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>media creator</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>chinese first name</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>great golf</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>hhn</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>mt vernon ky</td>\n",
" <td>travel_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>history drama</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>puerto penasco</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>punctuality in the workplace</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>6 months pregnant</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>124</th>\n",
" <td>#NAME?</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>letter end</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>129</th>\n",
" <td>krotan</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>dd 441</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>bradley il</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>9 digit zip code</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>birth certificate</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151</th>\n",
" <td>reforms</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>california magazine capacity</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>159</th>\n",
" <td>ms patches</td>\n",
" <td>purchase_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>163</th>\n",
" <td>bengals</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>179</th>\n",
" <td>death index search</td>\n",
" <td>navigation_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>high school equivalent</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>ninda</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>192</th>\n",
" <td>edby</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>projector screen</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>hoise</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>ppo</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>external display</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>212</th>\n",
" <td>james v reyes</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>218</th>\n",
" <td>pc support scam</td>\n",
" <td>navigation_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>history of trolls</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>severe abdominal pain</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>237</th>\n",
" <td>bpa</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>246</th>\n",
" <td>navigate</td>\n",
" <td>navigation_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>247</th>\n",
" <td>edit access</td>\n",
" <td>navigation_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>tender package</td>\n",
" <td>yelp_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>table numbers and stands</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>define pons</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>solar screen</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>agb whatsapp</td>\n",
" <td>purchase_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>psychology graduate jobs</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>12 lakhs</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>281</th>\n",
" <td>duplicate word page</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>287</th>\n",
" <td>samples of contracts</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>290</th>\n",
" <td>micro anti-snoring device</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" <tr>\n",
" <th>291</th>\n",
" <td>cvs market</td>\n",
" <td>information_intent</td>\n",
" <td>unknown</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" query pred_result target\n",
"44 chilie yelp_intent unknown\n",
"45 mr hans information_intent unknown\n",
"47 what is this ip information_intent unknown\n",
"49 rmb and cny navigation_intent unknown\n",
"65 dcm yelp_intent unknown\n",
"74 i make mistakes information_intent unknown\n",
"76 pho delivery yelp_intent unknown\n",
"77 who sang it information_intent unknown\n",
"83 j yelp_intent unknown\n",
"87 no till food plot seed information_intent unknown\n",
"90 system mechanic download bought already purchase_intent unknown\n",
"93 media creator information_intent unknown\n",
"99 chinese first name information_intent unknown\n",
"100 great golf yelp_intent unknown\n",
"106 hhn yelp_intent unknown\n",
"113 mt vernon ky travel_intent unknown\n",
"114 history drama information_intent unknown\n",
"117 puerto penasco yelp_intent unknown\n",
"122 punctuality in the workplace information_intent unknown\n",
"123 6 months pregnant information_intent unknown\n",
"124 #NAME? information_intent unknown\n",
"125 letter end information_intent unknown\n",
"129 krotan yelp_intent unknown\n",
"134 dd 441 information_intent unknown\n",
"141 bradley il yelp_intent unknown\n",
"145 9 digit zip code information_intent unknown\n",
"146 birth certificate information_intent unknown\n",
"151 reforms yelp_intent unknown\n",
"158 california magazine capacity information_intent unknown\n",
"159 ms patches purchase_intent unknown\n",
"163 bengals yelp_intent unknown\n",
"179 death index search navigation_intent unknown\n",
"182 high school equivalent information_intent unknown\n",
"189 ninda information_intent unknown\n",
"192 edby information_intent unknown\n",
"196 projector screen information_intent unknown\n",
"197 hoise yelp_intent unknown\n",
"198 ppo yelp_intent unknown\n",
"199 external display information_intent unknown\n",
"212 james v reyes information_intent unknown\n",
"218 pc support scam navigation_intent unknown\n",
"222 history of trolls information_intent unknown\n",
"224 severe abdominal pain information_intent unknown\n",
"237 bpa yelp_intent unknown\n",
"246 navigate navigation_intent unknown\n",
"247 edit access navigation_intent unknown\n",
"251 tender package yelp_intent unknown\n",
"256 table numbers and stands information_intent unknown\n",
"257 define pons information_intent unknown\n",
"264 solar screen information_intent unknown\n",
"265 agb whatsapp purchase_intent unknown\n",
"266 psychology graduate jobs information_intent unknown\n",
"274 12 lakhs information_intent unknown\n",
"281 duplicate word page information_intent unknown\n",
"287 samples of contracts information_intent unknown\n",
"290 micro anti-snoring device information_intent unknown\n",
"291 cvs market information_intent unknown"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"intent_queries_misclassified_df.loc[intent_queries_misclassified_df['target'] == 'unknown']"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "c610465c-f436-4354-a598-2a9e61f5cfef",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "55488e4b-a5c5-4b34-a877-9ca171f7ce5e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"information_intent 0.80 0.74 0.77 476\n",
" navigation_intent 0.75 0.54 0.63 216\n",
" purchase_intent 0.48 0.58 0.53 24\n",
"translation_intent 0.00 0.00 0.00 1\n",
" travel_intent 0.21 0.57 0.31 7\n",
" unknown 0.14 0.03 0.05 59\n",
" weather_intent 0.78 1.00 0.88 14\n",
" yelp_intent 0.09 0.80 0.16 15\n",
"\n",
" accuracy 0.64 812\n",
" macro avg 0.41 0.53 0.42 812\n",
" weighted avg 0.71 0.64 0.66 812\n",
"\n"
]
}
],
"source": [
"print(classification_report(intent_queries_classified_df['target'],\n",
" intent_queries_classified_df['pred_result']))"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "c5c99699-c338-4d1e-8e31-5ba2f52c72b1",
"metadata": {},
"outputs": [],
"source": [
"# print(classification_report(intent_queries_classified_df['target'],\n",
"# intent_queries_classified_df['pred_result']))\n",
"\n",
"### with no thresholds\n",
"\n",
"# precision recall f1-score support\n",
"\n",
"# information_intent 0.80 0.70 0.75 591\n",
"# navigation_intent 0.69 0.55 0.61 265\n",
"# purchase_intent 0.45 0.52 0.48 27\n",
"# translation_intent 0.50 1.00 0.67 1\n",
"# travel_intent 0.09 0.56 0.16 9\n",
"# unknown 0.21 0.08 0.11 78\n",
"# weather_intent 0.57 0.93 0.70 14\n",
"# yelp_intent 0.09 0.80 0.16 15\n",
"\n",
"# accuracy 0.61 1000\n",
"# macro avg 0.43 0.64 0.46 1000\n",
"# weighted avg 0.70 0.61 0.64 1000"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "60a3ee53-c3ba-494c-bd64-be72c7f26a4e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"target\n",
"information_intent 61\n",
"navigation_intent 40\n",
"unknown 15\n",
"purchase_intent 4\n",
"travel_intent 2\n",
"Name: count, dtype: int64"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"intent_queries_misclassified_df.loc[intent_queries_misclassified_df['pred_result'] == 'yelp_intent']['target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "23924f46-6846-40b4-9447-191af428d5ae",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>5%</th>\n",
" <th>10%</th>\n",
" <th>20%</th>\n",
" <th>25%</th>\n",
" <th>30%</th>\n",
" <th>40%</th>\n",
" <th>50%</th>\n",
" <th>60%</th>\n",
" <th>70%</th>\n",
" <th>75%</th>\n",
" <th>80%</th>\n",
" <th>90%</th>\n",
" <th>95%</th>\n",
" <th>98%</th>\n",
" <th>99%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>pred_result</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>information_intent</th>\n",
" <td>441.0</td>\n",
" <td>0.891952</td>\n",
" <td>0.138047</td>\n",
" <td>0.501</td>\n",
" <td>0.58600</td>\n",
" <td>0.6580</td>\n",
" <td>0.7810</td>\n",
" <td>0.83000</td>\n",
" <td>0.8690</td>\n",
" <td>0.9250</td>\n",
" <td>0.9580</td>\n",
" <td>0.9800</td>\n",
" <td>0.9930</td>\n",
" <td>0.99600</td>\n",
" <td>0.9980</td>\n",
" <td>1.0000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>navigation_intent</th>\n",
" <td>155.0</td>\n",
" <td>0.894232</td>\n",
" <td>0.135502</td>\n",
" <td>0.512</td>\n",
" <td>0.60700</td>\n",
" <td>0.6750</td>\n",
" <td>0.7734</td>\n",
" <td>0.84050</td>\n",
" <td>0.8746</td>\n",
" <td>0.9362</td>\n",
" <td>0.9640</td>\n",
" <td>0.9798</td>\n",
" <td>0.9930</td>\n",
" <td>0.99600</td>\n",
" <td>0.9980</td>\n",
" <td>0.9996</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>purchase_intent</th>\n",
" <td>29.0</td>\n",
" <td>0.734448</td>\n",
" <td>0.154549</td>\n",
" <td>0.518</td>\n",
" <td>0.52020</td>\n",
" <td>0.5284</td>\n",
" <td>0.5742</td>\n",
" <td>0.59400</td>\n",
" <td>0.6300</td>\n",
" <td>0.6576</td>\n",
" <td>0.7760</td>\n",
" <td>0.8042</td>\n",
" <td>0.8176</td>\n",
" <td>0.82600</td>\n",
" <td>0.8888</td>\n",
" <td>0.9350</td>\n",
" <td>0.96920</td>\n",
" <td>0.98084</td>\n",
" <td>0.98392</td>\n",
" <td>0.987</td>\n",
" </tr>\n",
" <tr>\n",
" <th>translation_intent</th>\n",
" <td>2.0</td>\n",
" <td>0.698500</td>\n",
" <td>0.036062</td>\n",
" <td>0.673</td>\n",
" <td>0.67555</td>\n",
" <td>0.6781</td>\n",
" <td>0.6832</td>\n",
" <td>0.68575</td>\n",
" <td>0.6883</td>\n",
" <td>0.6934</td>\n",
" <td>0.6985</td>\n",
" <td>0.7036</td>\n",
" <td>0.7087</td>\n",
" <td>0.71125</td>\n",
" <td>0.7138</td>\n",
" <td>0.7189</td>\n",
" <td>0.72145</td>\n",
" <td>0.72298</td>\n",
" <td>0.72349</td>\n",
" <td>0.724</td>\n",
" </tr>\n",
" <tr>\n",
" <th>travel_intent</th>\n",
" <td>19.0</td>\n",
" <td>0.794053</td>\n",
" <td>0.160364</td>\n",
" <td>0.552</td>\n",
" <td>0.57180</td>\n",
" <td>0.5764</td>\n",
" <td>0.6404</td>\n",
" <td>0.66750</td>\n",
" <td>0.6746</td>\n",
" <td>0.7324</td>\n",
" <td>0.8310</td>\n",
" <td>0.8594</td>\n",
" <td>0.9232</td>\n",
" <td>0.94100</td>\n",
" <td>0.9596</td>\n",
" <td>0.9934</td>\n",
" <td>0.99900</td>\n",
" <td>0.99900</td>\n",
" <td>0.99900</td>\n",
" <td>0.999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unknown</th>\n",
" <td>14.0</td>\n",
" <td>0.753143</td>\n",
" <td>0.139711</td>\n",
" <td>0.552</td>\n",
" <td>0.58125</td>\n",
" <td>0.6033</td>\n",
" <td>0.6276</td>\n",
" <td>0.63975</td>\n",
" <td>0.6547</td>\n",
" <td>0.7106</td>\n",
" <td>0.7300</td>\n",
" <td>0.7494</td>\n",
" <td>0.8313</td>\n",
" <td>0.86575</td>\n",
" <td>0.8966</td>\n",
" <td>0.9419</td>\n",
" <td>0.96330</td>\n",
" <td>0.97812</td>\n",
" <td>0.98306</td>\n",
" <td>0.988</td>\n",
" </tr>\n",
" <tr>\n",
" <th>weather_intent</th>\n",
" <td>18.0</td>\n",
" <td>0.905556</td>\n",
" <td>0.134964</td>\n",
" <td>0.637</td>\n",
" <td>0.66675</td>\n",
" <td>0.6839</td>\n",
" <td>0.7648</td>\n",
" <td>0.79975</td>\n",
" <td>0.8771</td>\n",
" <td>0.9754</td>\n",
" <td>0.9990</td>\n",
" <td>0.9992</td>\n",
" <td>1.0000</td>\n",
" <td>1.00000</td>\n",
" <td>1.0000</td>\n",
" <td>1.0000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.00000</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>yelp_intent</th>\n",
" <td>134.0</td>\n",
" <td>0.781007</td>\n",
" <td>0.163846</td>\n",
" <td>0.508</td>\n",
" <td>0.52265</td>\n",
" <td>0.5486</td>\n",
" <td>0.5934</td>\n",
" <td>0.61400</td>\n",
" <td>0.6620</td>\n",
" <td>0.7564</td>\n",
" <td>0.8095</td>\n",
" <td>0.8650</td>\n",
" <td>0.9053</td>\n",
" <td>0.93900</td>\n",
" <td>0.9536</td>\n",
" <td>0.9827</td>\n",
" <td>0.99400</td>\n",
" <td>0.99734</td>\n",
" <td>0.99934</td>\n",
" <td>1.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 5% 10% 20% \\\n",
"pred_result \n",
"information_intent 441.0 0.891952 0.138047 0.501 0.58600 0.6580 0.7810 \n",
"navigation_intent 155.0 0.894232 0.135502 0.512 0.60700 0.6750 0.7734 \n",
"purchase_intent 29.0 0.734448 0.154549 0.518 0.52020 0.5284 0.5742 \n",
"translation_intent 2.0 0.698500 0.036062 0.673 0.67555 0.6781 0.6832 \n",
"travel_intent 19.0 0.794053 0.160364 0.552 0.57180 0.5764 0.6404 \n",
"unknown 14.0 0.753143 0.139711 0.552 0.58125 0.6033 0.6276 \n",
"weather_intent 18.0 0.905556 0.134964 0.637 0.66675 0.6839 0.7648 \n",
"yelp_intent 134.0 0.781007 0.163846 0.508 0.52265 0.5486 0.5934 \n",
"\n",
" 25% 30% 40% 50% 60% 70% 75% \\\n",
"pred_result \n",
"information_intent 0.83000 0.8690 0.9250 0.9580 0.9800 0.9930 0.99600 \n",
"navigation_intent 0.84050 0.8746 0.9362 0.9640 0.9798 0.9930 0.99600 \n",
"purchase_intent 0.59400 0.6300 0.6576 0.7760 0.8042 0.8176 0.82600 \n",
"translation_intent 0.68575 0.6883 0.6934 0.6985 0.7036 0.7087 0.71125 \n",
"travel_intent 0.66750 0.6746 0.7324 0.8310 0.8594 0.9232 0.94100 \n",
"unknown 0.63975 0.6547 0.7106 0.7300 0.7494 0.8313 0.86575 \n",
"weather_intent 0.79975 0.8771 0.9754 0.9990 0.9992 1.0000 1.00000 \n",
"yelp_intent 0.61400 0.6620 0.7564 0.8095 0.8650 0.9053 0.93900 \n",
"\n",
" 80% 90% 95% 98% 99% max \n",
"pred_result \n",
"information_intent 0.9980 1.0000 1.00000 1.00000 1.00000 1.000 \n",
"navigation_intent 0.9980 0.9996 1.00000 1.00000 1.00000 1.000 \n",
"purchase_intent 0.8888 0.9350 0.96920 0.98084 0.98392 0.987 \n",
"translation_intent 0.7138 0.7189 0.72145 0.72298 0.72349 0.724 \n",
"travel_intent 0.9596 0.9934 0.99900 0.99900 0.99900 0.999 \n",
"unknown 0.8966 0.9419 0.96330 0.97812 0.98306 0.988 \n",
"weather_intent 1.0000 1.0000 1.00000 1.00000 1.00000 1.000 \n",
"yelp_intent 0.9536 0.9827 0.99400 0.99734 0.99934 1.000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"intent_queries_classified_df.groupby('pred_result')['pred_proba'].describe(percentiles=[.05, .1,.2,.25,.3, .4,.5,.6, .7, .75, .8, .9, .95, .98, .99])"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "a25bf495-8907-49b9-9dbf-d3c43fcd5d7c",
"metadata": {},
"outputs": [],
"source": [
"# intent_queries_misclassified_df.loc[(intent_queries_misclassified_df['target'] == 'information_intent') &\n",
"# (intent_queries_misclassified_df['pred_result'] == 'yelp_intent')]['query'].values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adebaf6b-d590-48b2-8e9c-c9e244882a63",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c547dcf-58b4-4d80-b162-da29f8f9fe70",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}