The purpose of this notebook is to explore the semantic search use case with browsing history in mind.
- An important consideration is to explore the support for multiple languages.

Reference link -> https://data.firefox.com/dashboard/usage-behavior

  Worldwide, English (US) remains the most common, at about 40% of the population, with German (11%) and French (8.1%) coming 2nd and 3rd. Simplified Chinese is the 4th most common language (6.7%), and Spanish (Spain) is the 5th most common language (5%).

In [None]:
import pandas as pd
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import requests
import os
import sys

In [None]:
# Add the project root directory (the parent of the current working directory)
# to the Python path so the `src.*` modules can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
from src.constants import EMBEDDING_MODELS_DICT
from src.feature_extractor import FeatureExtractor

In [None]:
# !cp /tmp/output_file.txt /Users/cgopal/Downloads/places_output_file_v1.txt

#### Let's try reading browsing history

Download browsing history:

1) cp "/Users/<username>/Library/Application Support/Firefox/Profiles/<profilename>/places.sqlite" /tmp/places.sqlite
2) sqlite3 /tmp/places.sqlite
3) Within the sqlite3 shell, run the commands below one by one:
```
.mode csv
.headers on
.output temp_data.csv
SELECT url,title,description,preview_image_url,frecency,last_visit_date
FROM moz_places
WHERE title NOTNULL
AND url not like '%google.com/search?%'
ORDER BY frecency DESC
LIMIT 1000;
```
<!-- 4) copy the file output_file_v2 to ~/Downloads/places_output_file_v2.txt -->
4) cp temp_data.csv ../data/history_output_file.csv


In [None]:
# Load the browsing-history CSV exported from places.sqlite (see the sqlite3 steps above).
# Columns, per the export query: url, title, description, preview_image_url,
# frecency, last_visit_date.
# NOTE: an earlier version of this cell read a "~|"-delimited text export without a
# header row (places_output_file_v2.txt); it was replaced by the CSV export with headers.
history = pd.read_csv("../data/history_output_file.csv")

In [None]:
# history['last_visit_date'].fillna(0)

In [None]:
# Interpret last_visit_date as microseconds since the Unix epoch.
history['last_visit_date'] = pd.to_datetime(history['last_visit_date'], unit='us')

# Missing timestamps get the epoch ("1970-01-01") as a default value.
history['last_visit_date'] = history['last_visit_date'].fillna(pd.to_datetime("1970-01-01"))

# Text that gets embedded: title followed by description. Strip the result so a row
# with neither field becomes '' (not the lone " " separator); otherwise the
# emptiness filter below can never remove anything.
history['combined_text'] = (
    history['title'].fillna('') + " " + history['description'].fillna('')
).str.strip()
history = history.loc[history['combined_text'] != ''].reset_index(drop=True)

print(len(history))

In [None]:
# Preview only the first rows: the frame holds personal browsing history, so avoid
# rendering (and saving) the whole table in the notebook output.
history.head()

#### Find an appropriate max token length

In [None]:
!python -V

In [None]:
# !python -m pip install tiktoken
# !python -m pip freeze| grep tiktoken

In [None]:
# print(tiktoken.list_encoding_names())

In [None]:
# # import pandas as pd
# import tiktoken
# # import numpy as np

# # Sample data
# # history

# # Initialize the tokenizer
# # Replace 'gpt-3.5-turbo' with the model/tokenizer you want to use
# tokenizer = tiktoken.get_encoding("gpt2")

# # Tokenize each text and count tokens
# history['token_count'] = history['combined_text'].apply(lambda x: len(tokenizer.encode(x)))

# # Compute statistics
# max_length = history['token_count'].max()
# percentile_95 = np.percentile(history['token_count'], 95)
# percentile_99 = np.percentile(history['token_count'], 99)

# print(f"Maximum token count: {max_length}")
# print(f"95th percentile token count: {percentile_95}")
# print(f"99th percentile token count: {percentile_99}")

# # Decide on an appropriate max_length based on these statistics


In [None]:
EMBEDDING_MODELS_DICT

In [None]:
# Embed every history entry once per candidate model, keeping both the embedding
# matrix and its width (second dimension) keyed by model name.
texts = history['combined_text'].tolist()
embeddings_dict, embeddings_sizes = {}, {}

for model in EMBEDDING_MODELS_DICT:
    fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model)
    vectors = fe.get_embeddings(texts)
    embeddings_dict[model] = vectors
    print(model, vectors.shape)
    embeddings_sizes[model] = vectors.shape[1]


In [None]:
embeddings_sizes

In [None]:
embeddings_dict.keys()

In [None]:
embeddings_dict['nomic-ai/modernbert-embed-base'].shape

In [None]:
# embeddings_dict['answerdotai/ModernBERT-base'][0]

In [None]:
# Create the output directory in pure Python (portable, unlike the shell `mkdir -p`);
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("../data", exist_ok=True)

In [None]:
import pickle

# Persist the embeddings and their sizes as pickles, then the cleaned history as CSV,
# so the vector-DB section can reload them from ../data.
for pkl_path, payload in (
    ("../data/embeddings_dict.pkl", embeddings_dict),
    ("../data/embeddings_sizes.pkl", embeddings_sizes),
):
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)

history.to_csv("../data/history.csv", index=False)

#### Explore sqlite vector DB

In [None]:
import numpy as np
import sqlite3
import sqlite_vec

from typing import List
import struct

In [None]:

def serialize_f32(vector: List[float]) -> bytes:
    """Pack a sequence of floats into raw bytes, one 32-bit float per element.

    Uses the platform's native byte order (no explicit byte-order prefix in the
    struct format). An empty sequence produces ``b""``.
    """
    layout = f"{len(vector)}f"
    return struct.pack(layout, *vector)

In [None]:
# In-memory SQLite database with the sqlite-vec extension loaded. Extension loading
# is switched on only around the load call and switched off again right after.
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

# Sanity check: query the SQLite and sqlite-vec versions through this connection.
version_cursor = db.execute("select sqlite_version(), vec_version()")
sqlite_version, vec_version = version_cursor.fetchone()
print(f"sqlite_version={sqlite_version}, vec_version={vec_version}")

In [None]:
# Reload the embeddings pickle written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created.
path = "../data/embeddings_dict.pkl"

with open(path, "rb") as pkl_file:
    embeddings_dict = pickle.loads(pkl_file.read())

In [None]:
embeddings_dict.keys()

In [None]:
# Model whose embeddings get indexed and queried below. Other candidate names:
# model_name = "Xenova/paraphrase-multilingual-MiniLM-L12-v2"
# model_name = "Xenova/distiluse-base-multilingual-cased-v1"
# model_name = "Xenova/all-MiniLM-L6-v2"
# model_name = "nomic-ai/nomic-embed-text-v1.5"
model_name = "nomic-ai/modernbert-embed-base"
# Take the vector width from the embeddings just loaded from disk rather than from
# `embeddings_sizes`, which only exists if the embedding cells ran in this kernel
# and could be out of sync with the pickle.
EMBEDDING_SIZE = embeddings_dict[model_name].shape[1]

In [None]:
# (rowid, vector) pairs; the rowid is the row's position in the embedding matrix.
items = [(idx, list(vec)) for idx, vec in enumerate(embeddings_dict[model_name])]

In [None]:
# The model name becomes part of a SQL table name, so map "/", "-" and "." to "_".
model_name_normalized = model_name.translate(str.maketrans("/-.", "___"))

In [None]:
# Name of the vec0 virtual table that stores this model's embeddings.
vec_table = f"vec_items_{model_name_normalized}"

# Drop any table left over from a previous run of this cell: a bare CREATE raises
# "table already exists", which made the cell impossible to re-run.
db.execute(f"DROP TABLE IF EXISTS {vec_table}")
db.execute(f"CREATE VIRTUAL TABLE {vec_table} USING vec0(embedding float[{EMBEDDING_SIZE}])")

# Insert all vectors in a single transaction; each vector is stored as raw float32 bytes.
with db:
    db.executemany(
        f"INSERT INTO {vec_table}(rowid, embedding) VALUES (?, ?)",
        ((rowid, serialize_f32(vector)) for rowid, vector in items),
    )



In [None]:
# Reload the cleaned history saved next to the embeddings. CSV does not preserve
# dtypes, so parse last_visit_date back into datetimes instead of leaving strings.
history = pd.read_csv("../data/history.csv", parse_dates=["last_visit_date"])

In [None]:
# Embed the search query with the same model that produced the indexed vectors.
query = "quantization"

fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model_name)
query_vectors = fe.get_embeddings([query])
query_embedding = query_vectors[0]


In [None]:
query_embedding.shape

In [None]:
# using cosine distance
rows = db.execute(
    f"""
      SELECT
        rowid,
        vec_distance_cosine(embedding, ?) AS cosine_distance
      FROM vec_items_{model_name_normalized}
      ORDER BY cosine_distance
      LIMIT 3
    """,
    [serialize_f32(query_embedding)],
).fetchall()

print(rows)

In [None]:
pd.set_option('display.max_colwidth', 200)

In [None]:
print(f"query = {query}")

# Split the (rowid, cosine_distance) pairs. The rowids were assigned by enumerating
# the embeddings, so they are used here as positional indices into `history`.
row_indices, distance = [], []
for rowid, cosine_distance in rows:
    row_indices.append(rowid)
    distance.append(cosine_distance)

selected_rows = history.iloc[row_indices].assign(distance=distance)
selected_rows