notebooks/explore_semantic_search.ipynb (582 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "id": "7d9ac78b-577d-4df3-ba83-2772e72aa44e", "metadata": {}, "source": [ "The purpose of this notebook is to explore the semantic search use case with browsing history in mind.\n", "- An important caveat is to explore the support for multiple languages.\n", "\n", "Reference link -> https://data.firefox.com/dashboard/usage-behavior\n", "\n", " Worldwide, English (US) remains the most common, at about 40% of the population, with German (11%) and French (8.1%) coming 2nd and 3rd. Simplified Chinese is the 4th most common language (6.7%), and Spanish (Spain) is the 5th most common language (5%)." ] }, { "cell_type": "code", "execution_count": null, "id": "75ed6f87-125e-4b61-8038-1447fe5fefb9", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import numpy as np\n", "import requests\n", "import os\n", "import sys" ] }, { "cell_type": "code", "execution_count": null, "id": "df6bb164-1bed-4b58-96c9-19e7070c2037", "metadata": {}, "outputs": [], "source": [ "# Add the project root directory to the Python path\n", "project_root = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", "sys.path.append(project_root)" ] }, { "cell_type": "code", "execution_count": null, "id": "082f0f21-7a18-4e17-a9e5-36571b06679f", "metadata": {}, "outputs": [], "source": [ "from src.constants import EMBEDDING_MODELS_DICT\n", "from src.feature_extractor import FeatureExtractor" ] }, { "cell_type": "code", "execution_count": null, "id": "7fd90da7-afc6-4c89-bf4e-057cd29e7ee9", "metadata": {}, "outputs": [], "source": [ "# !cp /tmp/output_file.txt /Users/cgopal/Downloads/places_output_file_v1.txt" ] }, { "cell_type": "markdown", "id": "f926249d-62f9-4d50-a40a-9b3599122b4e", "metadata": {}, "source": [ "#### Let's try reading browsing history" ] }, { "cell_type": "markdown", "id": "7b0e403e-5e3e-4fbd-af8a-41a9a9783231", "metadata": {}, "source": [ "Download browsing 
history:\n", "\n", "1) cp \"/Users/<username>/Library/Application Support/Firefox/Profiles/<profilename>/places.sqlite\" /tmp/places.sqlite\n", "2) sqlite3 /tmp/places.sqlite\n", "3) within sqlite run below commands one by one\n", "```\n", ".mode csv\n", ".headers on\n", ".output temp_data.csv\n", "SELECT url,title,description,preview_image_url,frecency,last_visit_date\n", "FROM moz_places\n", "WHERE title NOTNULL\n", "AND url not like '%google.com/search?%'\n", "ORDER BY frecency DESC\n", "LIMIT 1000;\n", "```\n", "<!-- 4) copy the file output_file_v2 to ~/Downloads/places_output_file_v2.txt -->\n", "4) cp temp_data.csv ../data/history_output_file.csv\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b2eaa29f-ff0f-453f-af8e-e82aab89c051", "metadata": {}, "outputs": [], "source": [ "# history = pd.read_csv(\"/Users/cgopal/Downloads/places_output_file_v2.txt\",\n", "# sep=\"~\\\\|\", engine=\"python\", header=None, encoding=\"utf-8\", on_bad_lines=\"skip\", index_col=False,\n", "# names=['url', 'title', 'description', 'preview_image_url', 'frecency', 'last_visit_date'])\n", "\n", "# print(len(history))\n", "# history.head().T\n", "history = pd.read_csv(\"../data/history_output_file.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "201d017d-9ddb-4674-81f0-b1f83710061d", "metadata": {}, "outputs": [], "source": [ "# history['last_visit_date'].fillna(0)" ] }, { "cell_type": "code", "execution_count": null, "id": "358f5a7d-6353-4ac8-bb44-a2ede8a7084a", "metadata": {}, "outputs": [], "source": [ "# last_visit_date is interpreted as microseconds since the Unix epoch\n", "history['last_visit_date'] = pd.to_datetime(history['last_visit_date'], unit='us')\n", "\n", "# fill empty last_visit_date with default value \"1970-01-01\"\n", "history['last_visit_date'] = history['last_visit_date'].fillna(pd.to_datetime(\"1970-01-01\"))\n", "\n", "# text that gets embedded: title and description joined by a single space\n", "history['combined_text'] = history['title'].fillna('') + \" \" + history['description'].fillna('')\n", "\n", "# combined_text always contains at least the joining space, so a plain != '' test can never\n", "# drop anything; compare the stripped text to remove rows with no title and no description\n", "history = history.loc[history['combined_text'].str.strip() != ''].reset_index(drop=True)\n", "\n", 
"print(len(history))" ] }, { "cell_type": "code", "execution_count": null, "id": "001c3a82-ea77-4dac-af08-0c7a7ccb6bec", "metadata": { "scrolled": true }, "outputs": [], "source": [ "history" ] }, { "cell_type": "markdown", "id": "8484e2cc-b101-44ff-87b0-1f7b5f5ac732", "metadata": {}, "source": [ "#### find appropriate max token length" ] }, { "cell_type": "code", "execution_count": null, "id": "71b131ae-6bfb-4dbc-893f-5dff571e2c5b", "metadata": {}, "outputs": [], "source": [ "!python -V" ] }, { "cell_type": "code", "execution_count": null, "id": "0fe80a99-d54c-40fb-bbd1-4ba72a78c9d8", "metadata": {}, "outputs": [], "source": [ "# !python -m pip install tiktoken\n", "# !python -m pip freeze| grep tiktoken" ] }, { "cell_type": "code", "execution_count": null, "id": "a4ccdbf2-582a-4d43-9b85-af8926ed1f5c", "metadata": {}, "outputs": [], "source": [ "# print(tiktoken.list_encoding_names())" ] }, { "cell_type": "code", "execution_count": null, "id": "3495f161-de13-4fff-b9e1-e6bda4d0c3a7", "metadata": {}, "outputs": [], "source": [ "# # import pandas as pd\n", "# import tiktoken\n", "# # import numpy as np\n", "\n", "# # Sample data\n", "# # history\n", "\n", "# # Initialize the tokenizer\n", "# # Replace 'gpt-3.5-turbo' with the model/tokenizer you want to use\n", "# tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", "# # Tokenize each text and count tokens\n", "# history['token_count'] = history['combined_text'].apply(lambda x: len(tokenizer.encode(x)))\n", "\n", "# # Compute statistics\n", "# max_length = history['token_count'].max()\n", "# percentile_95 = np.percentile(history['token_count'], 95)\n", "# percentile_99 = np.percentile(history['token_count'], 99)\n", "\n", "# print(f\"Maximum token count: {max_length}\")\n", "# print(f\"95th percentile token count: {percentile_95}\")\n", "# print(f\"99th percentile token count: {percentile_99}\")\n", "\n", "# # Decide on an appropriate max_length based on these statistics\n" ] }, { "cell_type": "code", 
"execution_count": null, "id": "5976192a-a663-4ad7-b5e5-f795abb30c3a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "e2295900-4902-4f6c-b1af-0dd391f46037", "metadata": {}, "outputs": [], "source": [ "EMBEDDING_MODELS_DICT" ] }, { "cell_type": "code", "execution_count": null, "id": "64da1867-5f42-49cd-adc9-d60a5cc1e6c0", "metadata": {}, "outputs": [], "source": [ "texts = history['combined_text'].values.tolist()\n", "embeddings_dict = {}\n", "embeddings_sizes = {}\n", "\n", "# embed the full history once per configured model and record each model's vector width\n", "for model in EMBEDDING_MODELS_DICT:\n", "    fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model)\n", "    embeddings_dict[model] = fe.get_embeddings(texts)\n", "    print(model, embeddings_dict[model].shape)\n", "    embeddings_sizes[model] = embeddings_dict[model].shape[1]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c6ecf32f-af2a-4fe5-bb3e-d1a0f0d39e13", "metadata": {}, "outputs": [], "source": [ "embeddings_sizes" ] }, { "cell_type": "code", "execution_count": null, "id": "62f4f1e5-fd70-4ec9-9644-c6f68b577b0c", "metadata": {}, "outputs": [], "source": [ "embeddings_dict.keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "cbae0da7-da91-413d-9da4-db3cbefb42af", "metadata": {}, "outputs": [], "source": [ "embeddings_dict['nomic-ai/modernbert-embed-base'].shape" ] }, { "cell_type": "code", "execution_count": null, "id": "055a6158-d681-4aa6-b66c-b4c1eb4d0e13", "metadata": {}, "outputs": [], "source": [ "# embeddings_dict['answerdotai/ModernBERT-base'][0]" ] }, { "cell_type": "code", "execution_count": null, "id": "0043dcc0-8373-4288-8b0c-3df00546dc53", "metadata": {}, "outputs": [], "source": [ "!mkdir -p ../data" ] }, { "cell_type": "code", "execution_count": null, "id": "df588b4d-61d4-4e30-9cb9-001cb9747ece", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "\n", "with open(\"../data/embeddings_dict.pkl\", \"wb\") as f:\n", " pickle.dump(embeddings_dict, f)\n", "\n", "with 
open(\"../data/embeddings_sizes.pkl\", \"wb\") as f:\n", " pickle.dump(embeddings_sizes, f)\n", "\n", "history.to_csv(\"../data/history.csv\", index=False)" ] }, { "cell_type": "markdown", "id": "61265f7e-aff7-4893-b3c8-9a8c1b666738", "metadata": {}, "source": [ "#### Explore sqlite vector DB" ] }, { "cell_type": "code", "execution_count": null, "id": "80ed6d29-8887-4b10-b21d-3a02b1b78362", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import sqlite3\n", "import sqlite_vec\n", "\n", "from typing import List\n", "import struct" ] }, { "cell_type": "code", "execution_count": null, "id": "9836b48c-77ed-40d3-b0df-a542df31630d", "metadata": {}, "outputs": [], "source": [ "\n", "def serialize_f32(vector: List[float]) -> bytes:\n", "    \"\"\"Pack a sequence of floats into one contiguous bytes object.\n", "\n", "    Every element is written with the struct \"f\" format code, in the order given;\n", "    an empty sequence yields empty bytes.\n", "    \"\"\"\n", "    float_count = len(vector)\n", "    return struct.pack(f\"{float_count}f\", *vector)" ] }, { "cell_type": "code", "execution_count": null, "id": "012041ff-936b-4163-bc21-fa34012f7b6f", "metadata": {}, "outputs": [], "source": [ "# in-memory database; extension loading is switched on only while sqlite-vec is loaded\n", "db = sqlite3.connect(\":memory:\")\n", "db.enable_load_extension(True)\n", "sqlite_vec.load(db)\n", "db.enable_load_extension(False)\n", "\n", "sqlite_version, vec_version = db.execute(\n", " \"select sqlite_version(), vec_version()\"\n", ").fetchone()\n", "print(f\"sqlite_version={sqlite_version}, vec_version={vec_version}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "572d7447-4cd5-4467-8c48-cbdaa1f1e611", "metadata": {}, "outputs": [], "source": [ "path = \"../data/embeddings_dict.pkl\"\n", "\n", "# pickle.load can execute arbitrary code: only point this at a file written by this notebook\n", "with open(path, \"rb\") as f:\n", "    embeddings_dict = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": null, "id": "a42a2215-5c2e-4784-82f0-d71da4fddf55", "metadata": {}, "outputs": [], "source": [ "embeddings_dict.keys()" ] }, { "cell_type": "code", "execution_count": null, "id": "a0936cdb-c75a-4dd4-8bc5-8659729c69f0", "metadata": {}, "outputs": [], "source": [ "# model_name = 
\"Xenova/paraphrase-multilingual-MiniLM-L12-v2\"\n", "# model_name = \"Xenova/distiluse-base-multilingual-cased-v1\"\n", "# model_name = \"Xenova/all-MiniLM-L6-v2\"\n", "# model_name = \"nomic-ai/nomic-embed-text-v1.5\"\n", "model_name = \"nomic-ai/modernbert-embed-base\"\n", "# take the width from the embeddings loaded from disk rather than the in-memory\n", "# embeddings_sizes dict, so this section also runs on a kernel that skipped the embedding step\n", "EMBEDDING_SIZE = embeddings_dict[model_name].shape[1]" ] }, { "cell_type": "code", "execution_count": null, "id": "f36de95c-ff05-4175-8cde-8b5003f77caf", "metadata": {}, "outputs": [], "source": [ "# (rowid, vector) pairs; the rowid is the vector's position in the embedding matrix\n", "items = [(idx, list(vec)) for idx, vec in enumerate(embeddings_dict[model_name])]" ] }, { "cell_type": "code", "execution_count": null, "id": "8b286fac-0924-42fe-aa5b-2910311f0f68", "metadata": {}, "outputs": [], "source": [ "# make the model name usable as part of an unquoted SQL table name\n", "model_name_normalized = model_name.replace(\"/\",\"_\").replace(\"-\",\"_\").replace(\".\",\"_\")" ] }, { "cell_type": "code", "execution_count": null, "id": "6f8d21e2-51ba-41b0-86f0-f13178902d67", "metadata": {}, "outputs": [], "source": [ "db.execute(f\"CREATE VIRTUAL TABLE vec_items_{model_name_normalized} USING vec0(embedding float[{EMBEDDING_SIZE}])\")\n", "\n", "# one transaction for all inserts: the connection context manager commits on success\n", "# and rolls back if any insert raises\n", "with db:\n", "    for item in items:\n", "        db.execute(\n", "            f\"INSERT INTO vec_items_{model_name_normalized}(rowid, embedding) VALUES (?, ?)\",\n", "            [item[0], serialize_f32(item[1])],\n", "        )\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c2adb94b-e78e-43a4-a23a-905a9d72657d", "metadata": {}, "outputs": [], "source": [ "history = pd.read_csv(\"../data/history.csv\")" ] }, { "cell_type": "code", "execution_count": null, "id": "18496f88-d953-4c44-8a89-a6d58eb40ef3", "metadata": {}, "outputs": [], "source": [ "query = \"quantization\"\n", "\n", "fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model_name)\n", "query_embedding = fe.get_embeddings([query])[0]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fabd843c-fa3d-48df-a7ed-3bee83e636ed", "metadata": {}, "outputs": [], "source": [ "query_embedding.shape" ] }, { "cell_type": "code", "execution_count": 
null, "id": "2c6c6098-5206-463d-8315-262edc5ae3d9", "metadata": {}, "outputs": [], "source": [ "# using cosine distance\n", "rows = db.execute(\n", " f\"\"\"\n", " SELECT\n", " rowid,\n", " vec_distance_cosine(embedding, ?) AS cosine_distance\n", " FROM vec_items_{model_name_normalized}\n", " ORDER BY cosine_distance\n", " LIMIT 3\n", " \"\"\",\n", " [serialize_f32(query_embedding)],\n", ").fetchall()\n", "\n", "print(rows)" ] }, { "cell_type": "code", "execution_count": null, "id": "9df3ae87-fa0b-4528-993a-4bb8ea03ee51", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_colwidth', 200)" ] }, { "cell_type": "code", "execution_count": null, "id": "b6148508-cd63-4483-9c7f-5839e16fba50", "metadata": {}, "outputs": [], "source": [ "print(f\"query = {query}\")\n", "\n", "# split the (rowid, cosine_distance) pairs returned by the query into parallel lists\n", "row_indices = [rowid for rowid, _ in rows]\n", "distance = [cosine_distance for _, cosine_distance in rows]\n", "\n", "# rowids are used as positional indices into history; attach each match's distance\n", "selected_rows = history.iloc[row_indices].assign(distance=distance)\n", "selected_rows" ] }, { "cell_type": "code", "execution_count": null, "id": "616f7b84-a967-42ae-8a2d-3e9bdbf2075b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "243ce38c-de8e-47aa-a977-8e188f19de8f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" } }, "nbformat": 4, "nbformat_minor": 5 }