Explore binary quantization

https://alexgarcia.xyz/sqlite-vec/guides/binary-quant.html

In [None]:
import os
import sys
import pandas as pd
import numpy as np

In [None]:
import sqlite3
import sqlite_vec
from typing import List
import struct

In [None]:
# Add the project root directory (the parent of the current working
# directory) to the Python path so the `src.*` modules can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
from src.constants import EMBEDDING_MODELS_DICT
from src.feature_extractor import FeatureExtractor
from src.metrics import run_traditional_eval

In [None]:
!python -V

In [None]:
!python -m pip freeze| grep sqlite

In [None]:
# !export LDFLAGS="-L/opt/homebrew/opt/sqlite/lib"
# !export CPPFLAGS="-I/opt/homebrew/opt/sqlite/include"


In [None]:
# Open a throwaway in-memory database and register the sqlite-vec extension.
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
# Extension loading is switched back off as soon as sqlite-vec is registered.
db.enable_load_extension(False)

# Sanity check: report which SQLite / sqlite-vec versions are in use.
version_row = db.execute("select sqlite_version(), vec_version()").fetchone()
sqlite_version, vec_version = version_row
print(f"sqlite_version={sqlite_version}, vec_version={vec_version}")

In [None]:
# Run vec_quantize_binary on a small 8-element example vector.
quantize_sql = """select vec_quantize_binary(
  '[-0.73, -0.80, 0.12, -0.73, 0.79, -0.11, 0.23, 0.97]'
);"""
res = db.execute(quantize_sql).fetchall()

In [None]:
res

In [None]:
type(res)

In [None]:
# The single result row holds the quantized vector as a bytes blob.
byte_value = res[0][0]
# Zero-pad to the blob's full width: bin() drops leading zero bits, which
# would make the printed pattern shorter than the number of stored bits.
bit_width = 8 * len(byte_value)
blob_as_int = int.from_bytes(byte_value, "big")
binary_representation = f"0b{blob_as_int:0{bit_width}b}"
print(f"Binary Representation: {binary_representation}")


In [None]:
# Upper bound on the number of history rows loaded by the cells below.
row_limit = 10_000

In [None]:
from contextlib import closing

# Fetch the highest-frecency titled pages from a Firefox places database,
# skipping Google search result URLs.  The row cap is passed as a bound
# parameter instead of being formatted into the SQL text.
places_query = """
WITH TOP_FRECENT_PLACES AS
(SELECT p.url, p.title, COALESCE(p.description, '') AS description, p.id AS place_id, p.frecency, p.origin_id, p.url_hash,
        p.last_visit_date
FROM moz_places p
WHERE p.title NOTNULL
AND url not like '%google.com/search?%'
ORDER BY frecency DESC
LIMIT ?
)

SELECT * FROM TOP_FRECENT_PLACES;
"""

# sqlite3 connections are not closed by their own context manager, so wrap
# the connection in `closing` to release the file once the rows are fetched.
with closing(sqlite3.connect("../data/places.sqlite")) as firefox_conn:
    firefox_cursor = firefox_conn.cursor()
    input_data = firefox_cursor.execute(places_query, (row_limit,)).fetchall()

In [None]:
# Load the exported browsing history (the commented lines build the same
# frame from `input_data` instead of the CSV export).
history = pd.read_csv("../data/history_output_file.csv")
# history = pd.DataFrame(input_data, 
#                        columns=['url', 'title', 'description', 'place_id', 'frecency', 'origin_id', 'url_hash', 'last_visit_date'])
# last_visit_date is parsed as microseconds since the Unix epoch.
history['last_visit_date'] = pd.to_datetime(history['last_visit_date'], unit='us')

# fill empty last_visit_date with default value "1970-01-01"
history['last_visit_date'] = history['last_visit_date'].fillna(pd.to_datetime("1970-01-01"))
history['combined_text'] = history['title'].fillna('') + " " + history['description'].fillna('')
# Because of the " " separator, a row with neither title nor description
# yields " " rather than '', so compare the stripped text; otherwise the
# filter never drops anything.  The stored text itself is left untouched.
has_text = history['combined_text'].str.strip() != ''
history = history.loc[has_text].reset_index(drop=True).head(row_limit)

In [None]:
history

In [None]:
EMBEDDING_MODELS_DICT['Xenova/all-MiniLM-L6-v2']

In [None]:
EMBEDDING_MODELS_DICT

In [None]:
# Embedding model under test and the vector size used for the sqlite-vec
# columns.  Swap in one of the commented pairs to try another model.
model_name = 'Xenova/all-MiniLM-L6-v2'
embeddings_size = 384
# model_name, embeddings_size = 'nomic-ai/nomic-embed-text-v1.5', 768
# model_name, embeddings_size = "Xenova/all-mpnet-base-v2", 768
# model_name, embeddings_size = 'Xenova/paraphrase-mpnet-base-v2', 768
# model_name, embeddings_size = 'Xenova/all-MiniLM-L12-v2', 384
# model_name, embeddings_size = 'nomic-ai/modernbert-embed-base', 768
fe = FeatureExtractor(EMBEDDING_MODELS_DICT, model_name=model_name)

# Embed the combined title/description text of every history row.
texts = history['combined_text'].tolist()
embeddings = fe.get_embeddings(texts)
embeddings.shape

In [None]:
# The model name is embedded in table names, so map "/", "-" and "." to "_".
model_name_normalized = model_name.translate(str.maketrans("/-.", "___"))

# Function to convert float vectors to binary format for SQLite
def serialize_f32_from_np(vector: np.ndarray) -> bytes:
    """Serialize a one-dimensional vector into raw float32 bytes for SQLite.

    Args:
        vector: One-dimensional array (or array-like) of numbers. Values are
            cast to float32 before serialization.

    Returns:
        ``4 * len(vector)`` bytes containing the float32 values back to back
        in native byte order.

    Raises:
        ValueError: If ``vector`` is not one-dimensional.
    """
    as_f32 = np.asarray(vector, dtype=np.float32)
    if as_f32.ndim != 1:
        raise ValueError(
            f"expected a 1-D vector, got an array with shape {as_f32.shape}"
        )
    # tobytes() copies the buffer in one pass instead of unpacking every
    # element into a separate Python object for struct.pack.
    return as_f32.tobytes()


# Pair every embedding with its positional index.
items = list(enumerate(embeddings))

# Peek at the scalar type of the first element of the first few embeddings.
for _, vec in items[:5]:
    print(type(vec[0]))

#### Approach 1: binary quantization only

In [None]:
# Approach 1 storage: a vec0 virtual table holding only a bit column.
table_1 = f"vec_items_{model_name_normalized}_1"
db.execute(f"CREATE VIRTUAL TABLE {table_1} USING vec0(embedding bit[{embeddings_size}])")
# db.execute(f"CREATE VIRTUAL TABLE vec_items_{model_name_normalized}_1 USING vec0(embedding bit[768])")
# db.execute(f"CREATE VIRTUAL TABLE vec_items_{model_name_normalized}_1 USING vec0(embedding bit[128])")

insert_sql_1 = f"INSERT INTO {table_1}(rowid, embedding) VALUES (?, vec_quantize_binary(?))"

# One transaction for the whole load; the rowid is the embedding's position
# and the float32 blob is quantized inside SQLite by vec_quantize_binary.
with db:
    db.executemany(
        insert_sql_1,
        ((idx, serialize_f32_from_np(vec)) for idx, vec in enumerate(embeddings)),
    )



In [None]:

def predict_with_bin_quantized(query, k=2):
    """Return the history rows closest to ``query`` using only the bit index.

    The query is embedded with ``fe``, serialized to float32 bytes, quantized
    by ``vec_quantize_binary`` inside SQLite and matched against the
    ``vec_items_<model>_1`` table.

    Args:
        query: Search text to embed.
        k: Number of nearest rows to return. Defaults to 2, the value that
            was previously hard-coded in the SQL.

    Returns:
        The matching rows of ``history``, ordered by ascending distance as
        reported by sqlite-vec.
    """
    # int() guarantees that only an integer literal is formatted into the SQL.
    top_k = int(k)
    query_serialized_vec = serialize_f32_from_np(fe.get_embeddings([query])[0])

    retrieved_results = db.execute(f"""
    select
      rowid,
      distance
    from vec_items_{model_name_normalized}_1
    where embedding match vec_quantize_binary(:query_serialized_vec)
    order by distance
    limit {top_k};
    """, {"query_serialized_vec": query_serialized_vec}).fetchall()

    # rowids were assigned from each embedding's position at insert time, so
    # they double as positional indices into `history`.
    return history.iloc[[rowid for rowid, _ in retrieved_results]]

In [None]:
%timeit predict_with_bin_quantized(query="mail box")

In [None]:
predict_with_bin_quantized(query="canada news")

#### Approach 2: binary quantization with re-scoring

In [None]:
# Approach 2 storage: the full float vector next to its bit-quantized copy,
# so coarse matches can be re-scored against the original embedding.
table_2 = f"vec_items_{model_name_normalized}_2"
db.execute(f"CREATE VIRTUAL TABLE {table_2} USING vec0(embedding float[{embeddings_size}], embedding_coarse bit[{embeddings_size}])")
# db.execute(f"CREATE VIRTUAL TABLE vec_items_{model_name_normalized}_2 USING vec0(embedding float[768], embedding_coarse bit[768])")
# db.execute(f"CREATE VIRTUAL TABLE vec_items_{model_name_normalized}_2 USING vec0(embedding float[128], embedding_coarse bit[128])")

insert_sql_2 = f"INSERT INTO {table_2}(rowid, embedding, embedding_coarse) VALUES (?, ?, vec_quantize_binary(?))"

# The same float32 blob is bound twice: stored as-is in `embedding` and
# quantized by SQLite for `embedding_coarse`.
with db:
    db.executemany(
        insert_sql_2,
        (
            (idx, blob, blob)
            for idx, blob in enumerate(map(serialize_f32_from_np, embeddings))
        ),
    )



In [None]:


def predict_coarse(query, k=2, coarse_k=200):
    """Return the history rows closest to ``query`` via coarse match + re-score.

    Stage 1 selects ``coarse_k`` candidates from ``vec_items_<model>_2`` by
    matching the bit-quantized query against ``embedding_coarse``. Stage 2
    re-scores those candidates with ``vec_distance_cosine`` on the stored
    float embedding and keeps the best ``k``.

    Args:
        query: Search text to embed.
        k: Number of rows to return after re-scoring. Defaults to 2, the
            value that was previously hard-coded in the SQL.
        coarse_k: Number of candidates taken from the bit index before
            re-scoring. Defaults to 200, the previously hard-coded value.

    Returns:
        The matching rows of ``history``, ordered by ascending cosine
        distance to the query embedding.
    """
    # int() guarantees that only integer literals are formatted into the SQL.
    top_k = int(k)
    coarse_limit = int(coarse_k)
    query_serialized_vec = serialize_f32_from_np(fe.get_embeddings([query])[0])

    retrieved_results = db.execute(f"""
    with coarse_matches as (
      select
        rowid,
        embedding
      from vec_items_{model_name_normalized}_2
      where embedding_coarse match vec_quantize_binary(:query_serialized_vec)
      order by distance
      limit {coarse_limit}
    )
    select
      rowid,
      vec_distance_cosine(embedding, :query_serialized_vec)
    from coarse_matches
    order by 2
    limit {top_k};
    """, {"query_serialized_vec": query_serialized_vec}).fetchall()

    # rowids were assigned from each embedding's position at insert time, so
    # they double as positional indices into `history`.
    # NOTE: the second tuple element is the cosine distance; to expose it,
    # attach `[dist for _, dist in retrieved_results]` as a column on a copy
    # of the returned frame.
    return history.iloc[[rowid for rowid, _ in retrieved_results]]

In [None]:
%timeit predict_coarse(query="scheduler")

In [None]:
predict_coarse(query="usa news")

In [None]:
# Size estimate: number of pages times bytes per page.
page_count = db.execute("PRAGMA page_count;").fetchone()[0]
page_size = db.execute("PRAGMA page_size;").fetchone()[0]
db_size = page_count * page_size
print(f"Estimated in-memory SQLite DB size: {db_size / (1024)**2} mb")


In [None]:
DISK_DB_PATH = "temp_semantic_vec.db"

# Save the in-memory database to disk
disk_db = sqlite3.connect(DISK_DB_PATH)
try:
    db.backup(disk_db)  # Copy in-memory DB to file
finally:
    # Close even if the backup raises, so the connection is not left open.
    disk_db.close()

# Get file size
db_size = os.path.getsize(DISK_DB_PATH)
print(f"Size of SQLite database file: {db_size / (1024)**2} mb")


#### Validation

In [None]:
# Golden set: search queries paired with the URL expected to be retrieved.
golden_columns = ['search_query', 'url']
golden_data = pd.read_csv("../data/chidam_golden_query.csv", usecols=golden_columns)
print(len(golden_data))
golden_data.head()

In [None]:
def validate(pred_fn):
    """Evaluate a retrieval function against the golden query set.

    For every ``(search_query, url)`` pair in ``golden_data`` the query is
    passed to ``pred_fn``; the URLs of the returned rows are compared with
    the expected URL and scored with ``run_traditional_eval`` at ``k=2``.
    The hit count and the hit rate are printed.

    Args:
        pred_fn: Callable taking a query string and returning a DataFrame
            with a ``url`` column.

    Returns:
        A DataFrame with one row per golden query, built from the values
        returned by ``run_traditional_eval``.
    """
    eval_rows = []
    print(f"Validating approach `{pred_fn.__name__}`:")
    correct = 0
    # Select the columns by name: read_csv's `usecols` keeps the file's own
    # column order, so positional unpacking of rows could silently swap the
    # query and the expected URL.
    labelled = golden_data[['search_query', 'url']]
    for idx, query, actual in labelled.itertuples(name=None):
        retrieved = pred_fn(query)['url'].values.tolist()
        if actual in retrieved:
            correct += 1
        eval_rows.append(
            run_traditional_eval(idx, query, [actual], retrieved, retrieved_distances=None, k=2)
        )
    total = len(labelled)
    # An empty golden set has no meaningful recall; avoid ZeroDivisionError.
    recall = correct / total if total else float("nan")
    print(f"correct count = {correct}")
    print(f"recall = {recall}")
    print("\n")
    return pd.DataFrame(eval_rows)



In [None]:
# Score approach 1 (bit index only) and average the per-query metrics.
metric_columns = ['precision@2', 'recall@2', 'ndcg@2', 'reciprocal_rank', 'average_precision']
eval_df = validate(predict_with_bin_quantized)
eval_df[metric_columns].mean()

In [None]:
# Score approach 2 (coarse match + re-scoring) and average the per-query metrics.
metric_columns = ['precision@2', 'recall@2', 'ndcg@2', 'reciprocal_rank', 'average_precision']
eval_df = validate(predict_coarse)
eval_df[metric_columns].mean()

In [None]:
# Validating approach `predict_coarse`:
# correct count = 14
# recall = 0.2857142857142857


# precision@2          0.142857
# recall@2             0.285714
# ndcg@2               0.263118
# reciprocal_rank      0.255102
# average_precision    0.183673
# dtype: float64


