notebooks/explore_semantic_search.ipynb (582 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"id": "7d9ac78b-577d-4df3-ba83-2772e72aa44e",
"metadata": {},
"source": [
"The purpose of this notebook is to explore the semantic search use case with browsing history in mind\n",
"- An important caveat is to explore the support for multiple languages\n",
"\n",
"Reference link -> https://data.firefox.com/dashboard/usage-behavior\n",
"\n",
" Worldwide, English (US) remains the most common, at about 40% of the population, with German (11%) and French (8.1%) coming 2nd and 3rd. Simplified Chinese is the 4th most common language (6.7%), and Spanish (Spain) is the 5th most common language (5%)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75ed6f87-125e-4b61-8038-1447fe5fefb9",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import onnxruntime as ort\n",
"from transformers import AutoTokenizer\n",
"import numpy as np\n",
"import requests\n",
"import os\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df6bb164-1bed-4b58-96c9-19e7070c2037",
"metadata": {},
"outputs": [],
"source": [
"# Add the project root directory (the parent of the current working directory)\n",
"# to the Python path so that the `src` package can be imported.\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n",
"# Guard the append so re-running this cell does not pile up duplicate entries.\n",
"if project_root not in sys.path:\n",
"    sys.path.append(project_root)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "082f0f21-7a18-4e17-a9e5-36571b06679f",
"metadata": {},
"outputs": [],
"source": [
"from src.constants import EMBEDDING_MODELS_DICT\n",
"from src.feature_extractor import FeatureExtractor"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fd90da7-afc6-4c89-bf4e-057cd29e7ee9",
"metadata": {},
"outputs": [],
"source": [
"# !cp /tmp/output_file.txt /Users/cgopal/Downloads/places_output_file_v1.txt"
]
},
{
"cell_type": "markdown",
"id": "f926249d-62f9-4d50-a40a-9b3599122b4e",
"metadata": {},
"source": [
"#### Let's try reading browsing history"
]
},
{
"cell_type": "markdown",
"id": "7b0e403e-5e3e-4fbd-af8a-41a9a9783231",
"metadata": {},
"source": [
"Download browsing history:\n",
"\n",
"1) cp \"/Users/<username>/Library/Application Support/Firefox/Profiles/<profilename>/places.sqlite\" /tmp/places.sqlite\n",
"2) sqlite3 /tmp/places.sqlite\n",
"3) within sqlite run below commands one by one\n",
"```\n",
".mode csv\n",
".headers on\n",
".output temp_data.csv\n",
"SELECT url,title,description,preview_image_url,frecency,last_visit_date\n",
"FROM moz_places\n",
"WHERE title NOTNULL\n",
"AND url not like '%google.com/search?%'\n",
"ORDER BY frecency DESC\n",
"LIMIT 1000;\n",
"```\n",
"<!-- 4) copy the file output_file_v2 to ~/Downloads/places_output_file_v2.txt -->\n",
"4) cp temp_data.csv ../data/history_output_file.csv\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2eaa29f-ff0f-453f-af8e-e82aab89c051",
"metadata": {},
"outputs": [],
"source": [
"# history = pd.read_csv(\"/Users/cgopal/Downloads/places_output_file_v2.txt\",\n",
"# sep=\"~\\\\|\", engine=\"python\", header=None, encoding=\"utf-8\", on_bad_lines=\"skip\", index_col=False,\n",
"# names=['url', 'title', 'description', 'preview_image_url', 'frecency', 'last_visit_date'])\n",
"\n",
"# print(len(history))\n",
"# history.head().T\n",
"history = pd.read_csv(\"../data/history_output_file.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "201d017d-9ddb-4674-81f0-b1f83710061d",
"metadata": {},
"outputs": [],
"source": [
"# history['last_visit_date'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "358f5a7d-6353-4ac8-bb44-a2ede8a7084a",
"metadata": {},
"outputs": [],
"source": [
"history['last_visit_date'] = pd.to_datetime(history['last_visit_date'], unit='us')\n",
"\n",
"# fill empty last_visit_date with default value \"1970-01-01\"\n",
"history['last_visit_date'] = history['last_visit_date'].fillna(pd.to_datetime(\"1970-01-01\"))\n",
"\n",
"# Join title and description, then strip: when both are missing the join yields just\n",
"# the \" \" separator, which never equals '' and so would slip past the filter below.\n",
"history['combined_text'] = (\n",
"    history['title'].fillna('') + \" \" + history['description'].fillna('')\n",
").str.strip()\n",
"# Drop rows that carry no text at all; reset the index so positions stay contiguous.\n",
"history = history.loc[history['combined_text'] != ''].reset_index(drop=True)\n",
"\n",
"print(len(history))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "001c3a82-ea77-4dac-af08-0c7a7ccb6bec",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"history"
]
},
{
"cell_type": "markdown",
"id": "8484e2cc-b101-44ff-87b0-1f7b5f5ac732",
"metadata": {},
"source": [
"#### find appropriate max token length"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71b131ae-6bfb-4dbc-893f-5dff571e2c5b",
"metadata": {},
"outputs": [],
"source": [
"!python -V"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0fe80a99-d54c-40fb-bbd1-4ba72a78c9d8",
"metadata": {},
"outputs": [],
"source": [
"# !python -m pip install tiktoken\n",
"# !python -m pip freeze| grep tiktoken"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ccdbf2-582a-4d43-9b85-af8926ed1f5c",
"metadata": {},
"outputs": [],
"source": [
"# print(tiktoken.list_encoding_names())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3495f161-de13-4fff-b9e1-e6bda4d0c3a7",
"metadata": {},
"outputs": [],
"source": [
"# # import pandas as pd\n",
"# import tiktoken\n",
"# # import numpy as np\n",
"\n",
"# # Sample data\n",
"# # history\n",
"\n",
"# # Initialize the tokenizer\n",
"# # Replace 'gpt-3.5-turbo' with the model/tokenizer you want to use\n",
"# tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
"\n",
"# # Tokenize each text and count tokens\n",
"# history['token_count'] = history['combined_text'].apply(lambda x: len(tokenizer.encode(x)))\n",
"\n",
"# # Compute statistics\n",
"# max_length = history['token_count'].max()\n",
"# percentile_95 = np.percentile(history['token_count'], 95)\n",
"# percentile_99 = np.percentile(history['token_count'], 99)\n",
"\n",
"# print(f\"Maximum token count: {max_length}\")\n",
"# print(f\"95th percentile token count: {percentile_95}\")\n",
"# print(f\"99th percentile token count: {percentile_99}\")\n",
"\n",
"# # Decide on an appropriate max_length based on these statistics\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5976192a-a663-4ad7-b5e5-f795abb30c3a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2295900-4902-4f6c-b1af-0dd391f46037",
"metadata": {},
"outputs": [],
"source": [
"EMBEDDING_MODELS_DICT"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64da1867-5f42-49cd-adc9-d60a5cc1e6c0",
"metadata": {},
"outputs": [],
"source": [
"# Embed the combined title/description text once per configured model, keeping\n",
"# both the embedding matrix and its width (second dimension) keyed by model name.\n",
"texts = history['combined_text'].tolist()\n",
"embeddings_dict = {}\n",
"embeddings_sizes = {}\n",
"\n",
"for model in EMBEDDING_MODELS_DICT:\n",
"    fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model)\n",
"    model_embeddings = fe.get_embeddings(texts)\n",
"    embeddings_dict[model] = model_embeddings\n",
"    print(model, model_embeddings.shape)\n",
"    embeddings_sizes[model] = model_embeddings.shape[1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6ecf32f-af2a-4fe5-bb3e-d1a0f0d39e13",
"metadata": {},
"outputs": [],
"source": [
"embeddings_sizes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62f4f1e5-fd70-4ec9-9644-c6f68b577b0c",
"metadata": {},
"outputs": [],
"source": [
"embeddings_dict.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cbae0da7-da91-413d-9da4-db3cbefb42af",
"metadata": {},
"outputs": [],
"source": [
"embeddings_dict['nomic-ai/modernbert-embed-base'].shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "055a6158-d681-4aa6-b66c-b4c1eb4d0e13",
"metadata": {},
"outputs": [],
"source": [
"# embeddings_dict['answerdotai/ModernBERT-base'][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0043dcc0-8373-4288-8b0c-3df00546dc53",
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p ../data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df588b4d-61d4-4e30-9cb9-001cb9747ece",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"with open(\"../data/embeddings_dict.pkl\", \"wb\") as f:\n",
" pickle.dump(embeddings_dict, f)\n",
"\n",
"with open(\"../data/embeddings_sizes.pkl\", \"wb\") as f:\n",
" pickle.dump(embeddings_sizes, f)\n",
"\n",
"history.to_csv(\"../data/history.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"id": "61265f7e-aff7-4893-b3c8-9a8c1b666738",
"metadata": {},
"source": [
"#### Explore sqlite vector DB"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80ed6d29-8887-4b10-b21d-3a02b1b78362",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import sqlite3\n",
"import sqlite_vec\n",
"\n",
"from typing import List\n",
"import struct"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9836b48c-77ed-40d3-b0df-a542df31630d",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def serialize_f32(vector: List[float]) -> bytes:\n",
"    \"\"\"Pack a sequence of floats into raw bytes using struct's `f` (float) format, one entry per element.\"\"\"\n",
"    float_count = len(vector)\n",
"    return struct.pack(f\"{float_count}f\", *vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "012041ff-936b-4163-bc21-fa34012f7b6f",
"metadata": {},
"outputs": [],
"source": [
"db = sqlite3.connect(\":memory:\")\n",
"db.enable_load_extension(True)\n",
"sqlite_vec.load(db)\n",
"db.enable_load_extension(False)\n",
"\n",
"sqlite_version, vec_version = db.execute(\n",
" \"select sqlite_version(), vec_version()\"\n",
").fetchone()\n",
"print(f\"sqlite_version={sqlite_version}, vec_version={vec_version}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "572d7447-4cd5-4467-8c48-cbdaa1f1e611",
"metadata": {},
"outputs": [],
"source": [
"path = \"../data/embeddings_dict.pkl\"\n",
"\n",
"with open(path, \"rb\") as f:\n",
" embeddings_dict = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a42a2215-5c2e-4784-82f0-d71da4fddf55",
"metadata": {},
"outputs": [],
"source": [
"embeddings_dict.keys()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0936cdb-c75a-4dd4-8bc5-8659729c69f0",
"metadata": {},
"outputs": [],
"source": [
"# model_name = \"Xenova/paraphrase-multilingual-MiniLM-L12-v2\"\n",
"# model_name = \"Xenova/distiluse-base-multilingual-cased-v1\"\n",
"# model_name = \"Xenova/all-MiniLM-L6-v2\"\n",
"# model_name = \"nomic-ai/nomic-embed-text-v1.5\"\n",
"model_name = \"nomic-ai/modernbert-embed-base\"\n",
"# Read the width from the reloaded `embeddings_dict` rather than `embeddings_sizes`:\n",
"# only `embeddings_dict` is loaded back from disk, so relying on `embeddings_sizes`\n",
"# raises a NameError when this section runs in a fresh kernel.\n",
"EMBEDDING_SIZE = embeddings_dict[model_name].shape[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f36de95c-ff05-4175-8cde-8b5003f77caf",
"metadata": {},
"outputs": [],
"source": [
"items = []\n",
"for idx, vec in enumerate(embeddings_dict[model_name]):\n",
" items.append((idx, list(vec)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b286fac-0924-42fe-aa5b-2910311f0f68",
"metadata": {},
"outputs": [],
"source": [
"model_name_normalized = model_name.replace(\"/\",\"_\").replace(\"-\",\"_\").replace(\".\",\"_\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f8d21e2-51ba-41b0-86f0-f13178902d67",
"metadata": {},
"outputs": [],
"source": [
"# Create one vec0 virtual table per model, sized to that model's embedding width.\n",
"db.execute(f\"CREATE VIRTUAL TABLE vec_items_{model_name_normalized} USING vec0(embedding float[{EMBEDDING_SIZE}])\")\n",
"\n",
"# Insert all rows inside a single transaction: the connection context manager\n",
"# commits on success and rolls back if an insert raises.\n",
"with db:\n",
"    db.executemany(\n",
"        f\"INSERT INTO vec_items_{model_name_normalized}(rowid, embedding) VALUES (?, ?)\",\n",
"        ([row_id, serialize_f32(vector)] for row_id, vector in items),\n",
"    )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c2adb94b-e78e-43a4-a23a-905a9d72657d",
"metadata": {},
"outputs": [],
"source": [
"history = pd.read_csv(\"../data/history.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "18496f88-d953-4c44-8a89-a6d58eb40ef3",
"metadata": {},
"outputs": [],
"source": [
"query = \"quantization\"\n",
"\n",
"fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model_name)\n",
"query_embedding = fe.get_embeddings([query])[0]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fabd843c-fa3d-48df-a7ed-3bee83e636ed",
"metadata": {},
"outputs": [],
"source": [
"query_embedding.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c6c6098-5206-463d-8315-262edc5ae3d9",
"metadata": {},
"outputs": [],
"source": [
"# using cosine distance\n",
"rows = db.execute(\n",
" f\"\"\"\n",
" SELECT\n",
" rowid,\n",
" vec_distance_cosine(embedding, ?) AS cosine_distance\n",
" FROM vec_items_{model_name_normalized}\n",
" ORDER BY cosine_distance\n",
" LIMIT 3\n",
" \"\"\",\n",
" [serialize_f32(query_embedding)],\n",
").fetchall()\n",
"\n",
"print(rows)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9df3ae87-fa0b-4528-993a-4bb8ea03ee51",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', 200)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6148508-cd63-4483-9c7f-5839e16fba50",
"metadata": {},
"outputs": [],
"source": [
"print(f\"query = {query}\")\n",
"# history.iloc[[row for row, score in rows]]\n",
"row_indices = [row for row, score in rows]\n",
"distance = [score for row, score in rows]\n",
"\n",
"selected_rows = history.iloc[row_indices].copy()\n",
"selected_rows[\"distance\"] = distance\n",
"selected_rows"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "616f7b84-a967-42ae-8a2d-3e9bdbf2075b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "243ce38c-de8e-47aa-a977-8e188f19de8f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}