# Simplified Vector Search (kNN) Implementation Guide


# Loading the Embedding Model
Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)

Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)


In [None]:
# install packages
!pip install -qU eland elasticsearch transformers sentence-transformers==2.7.0 torch==1.13

In [None]:
# import modules
import pandas as pd, json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from getpass import getpass
from urllib.request import urlopen
from pprint import pprint

In [None]:
API_KEY = getpass("Elastic deployment API Key")
CLOUD_ID = getpass("Elastic deployment Cloud ID")
HUB_MODEL_ID = getpass(
    "Hugging Face Model Hub ID"
)  # eg sentence-transformers/all-distilroberta-v1

es = Elasticsearch(cloud_id=CLOUD_ID, api_key=API_KEY)
es.info()  # should return cluster info

In [None]:
!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id $HUB_MODEL_ID --task-type text_embedding --es-api-key $API_KEY --start

# Ingest pipeline setup

In [48]:
pipeline = {
    "processors": [
        {
            "inference": {
                "field_map": {"my_text": "text_field"},
                "model_id": "sentence-transformers__all-distilroberta-v1",
                "target_field": "ml.inference.my_vector",
                "on_failure": [
                    {
                        "append": {
                            "field": "_source._ingest.inference_errors",
                            "value": [
                                {
                                    "message": "Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'",
                                    "pipeline": "ml-inference-title-vector",
                                    "timestamp": "{{{ _ingest.timestamp }}}",
                                }
                            ],
                        }
                    }
                ],
            }
        },
        {
            "set": {
                "field": "my_vector",
                "if": "ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null",
                "copy_from": "ml.inference.my_vector.predicted_value",
                "description": "Copy the predicted_value to 'my_vector'",
            }
        },
        {"remove": {"field": "ml.inference.my_vector", "ignore_missing": True}},
    ]
}

pipeline_id = "vector_embedding_demo"
response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)

# Print the response
print(response)

{'acknowledged': True}


# Index Mapping / Template setup

In [49]:
index_patterns = ["my_vector_index-*"]

priority = 1

settings = {
    "index.default_pipeline": pipeline_id,
}

mappings = {
    "properties": {
        "my_vector": {"type": "dense_vector", "dims": 768},
        "my_text": {"type": "text"},
    },
    "_source": {"excludes": ["my_vector"]},
}

# Exclude `my_vector` from `_source` explicitly
source_exclusions = {"_source": {"excludes": ["my_vector"]}}

# Create the index template using put_index_template
response = es.indices.put_index_template(
    name="my_vector_index_template",  # Template name
    index_patterns=index_patterns,
    priority=priority,
    template={
        "settings": settings,
        "mappings": mappings,
    },
)

# Print the response
print(response)

{'acknowledged': True}


# Indexing Data


In [50]:
index_name = "my_vector_index-01"

In [51]:
data = [
    ("Hey, careful, man, there's a beverage here!", "The Dude"),
    (
        "I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing",
        "The Dude",
    ),
    (
        "You don't go out looking for a job dressed like that? On a weekday?",
        "The Big Lebowski",
    ),
    ("What do you mean brought it bowling, Dude?", "Walter Sobchak"),
    (
        "Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo",
        "Walter Sobchak",
    ),
]

actions = [
    {
        "_op_type": "index",
        "_index": "my_vector_index-01",
        "_source": {"my_text": text, "my_metadata": metadata},
    }
    for text, metadata in data
]

bulk(es, actions)

# Refresh the index to make sure all data is searchable
es.indices.refresh(index="my_vector_index-01")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

# Querying Data


Approximate k-nearest neighbor (kNN)

In [52]:
knn = {
    "field": "my_vector",
    "k": 1,
    "num_candidates": 5,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-distilroberta-v1",
            "model_text": "Watchout I have a drink",
        }
    },
}

response = es.search(index=index_name, knn=knn, source=True)

pprint(response["hits"]["hits"])

[{'_id': 'PoHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 0.7825787,
  '_source': {'ml': {'inference': {}},
              'my_metadata': 'The Dude',
              'my_text': "Hey, careful, man, there's a beverage here!"}}]


## Hybrid Searching (kNN + BM25) with RRF

In [53]:
query = {"match": {"my_text": "bowling"}}

knn = {
    "field": "my_vector",
    "k": 3,
    "num_candidates": 5,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-distilroberta-v1",
            "model_text": "He enjoyed the game",
        }
    },
}

rank: {"rrf": {}}

fields = ["my_text", "my_metadata"]


response = es.search(
    index=index_name, fields=fields, knn=knn, query=query, size=2, source=False
)

pprint(response["hits"]["hits"])

[{'_id': 'QYHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 1.8082356,
  'fields': {'my_metadata': ['Walter Sobchak'],
             'my_text': ['What do you mean brought it bowling, Dude?']}},
 {'_id': 'QoHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 1.2366624,
  'fields': {'my_metadata': ['Walter Sobchak'],
             'my_text': ['Donny was a good bowler, and a good man. He was one '
                         'of us. He was a man who loved the outdoors... and '
                         'bowling, and as a surfer he explored the beaches of '
                         'Southern California, from La Jolla to Leo Carrillo '
                         'and... up to... Pismo']}}]


## Filtering

In [55]:
knn = {
    "field": "my_vector",
    "k": 1,
    "num_candidates": 5,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-distilroberta-v1",
            "model_text": "Did you bring the dog?",
        }
    },
    "filter": {"term": {"my_metadata.keyword": "The Dude"}},
}

fields = ["my_text", "my_metadata"]

response = es.search(index=index_name, fields=fields, knn=knn, source=False)

pprint(response["hits"]["hits"])

[{'_id': 'PoHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 0.59394693,
  'fields': {'my_metadata': ['The Dude'],
             'my_text': ["Hey, careful, man, there's a beverage here!"]}}]


# Aggregrations
and Select fields returned

In [56]:
knn = {
    "field": "my_vector",
    "k": 2,
    "num_candidates": 5,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-distilroberta-v1",
            "model_text": "did you bring it?",
        }
    },
}

aggs = {"metadata": {"terms": {"field": "my_metadata.keyword"}}}

fields = ["my_text", "my_metadata"]

response = es.search(index=index_name, fields=fields, aggs=aggs, knn=knn, source=False)

pprint(response["hits"]["hits"])

[{'_id': 'QYHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 0.7433834,
  'fields': {'my_metadata': ['Walter Sobchak'],
             'my_text': ['What do you mean brought it bowling, Dude?']}},
 {'_id': 'PoHEcpIB5JwEUwVjEs6E',
  '_index': 'my_vector_index-01',
  '_score': 0.6028075,
  'fields': {'my_metadata': ['The Dude'],
             'my_text': ["Hey, careful, man, there's a beverage here!"]}}]
