supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb (548 lines of code) (raw):
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "XU4UjiHpYdDT"
},
"source": [
"# Simplified Vector Search (kNN) Implementation Guide\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5lV5UN90l4YN"
},
"source": [
"# Loading the Embedding Model\n",
"Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)\n",
"\n",
"Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Z0TiDltHkebY"
},
"outputs": [],
"source": [
"# Install the required packages. The %pip magic installs into the environment\n",
"# of the running kernel, so the imports in the next cell see these packages.\n",
"%pip install -qU eland elasticsearch transformers sentence-transformers==2.7.0 torch==1.13"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Riwvd3CHO9qU"
},
"outputs": [],
"source": [
"# import modules\n",
"import pandas as pd, json\n",
"from elasticsearch import Elasticsearch\n",
"from elasticsearch.helpers import bulk\n",
"from getpass import getpass\n",
"from urllib.request import urlopen\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "So9bJJDVNzgF"
},
"outputs": [],
"source": [
"# Collect the connection details interactively; getpass does not echo what is typed\n",
"API_KEY = getpass(\"Elastic deployment API Key\")\n",
"CLOUD_ID = getpass(\"Elastic deployment Cloud ID\")\n",
"# e.g. sentence-transformers/all-distilroberta-v1\n",
"HUB_MODEL_ID = getpass(\"Hugging Face Model Hub ID\")\n",
"\n",
"# Create the client for the deployment; the last line should display the cluster info\n",
"es = Elasticsearch(cloud_id=CLOUD_ID, api_key=API_KEY)\n",
"es.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dsFsmzZwpujb"
},
"outputs": [],
"source": [
"# Import the Hugging Face model into the deployment as a text_embedding model and start it (--start).\n",
"# The interpolated values are quoted so the shell passes each one as a single argument.\n",
"!eland_import_hub_model --cloud-id \"$CLOUD_ID\" --hub-model-id \"$HUB_MODEL_ID\" --task-type text_embedding --es-api-key \"$API_KEY\" --start"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "71wNrH0vl4zi"
},
"source": [
"# Ingest pipeline setup"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SL47BJNyl3-r",
"outputId": "fa707db7-b6ec-47b4-c802-2d14c346e7bd"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'acknowledged': True}\n"
]
}
],
"source": [
"# Id under which the ingest pipeline is registered; also used in the failure record below\n",
"pipeline_id = \"vector_embedding_demo\"\n",
"\n",
"# Ingest pipeline: run the embedding model on `my_text`, copy the predicted\n",
"# vector into `my_vector`, then drop the intermediate inference output.\n",
"pipeline = {\n",
"    \"processors\": [\n",
"        {\n",
"            \"inference\": {\n",
"                # Map the document field `my_text` to `text_field` for the inference processor\n",
"                \"field_map\": {\"my_text\": \"text_field\"},\n",
"                \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
"                \"target_field\": \"ml.inference.my_vector\",\n",
"                # On failure, append an error record to the document. The record is\n",
"                # built from `pipeline_id` so it always names the pipeline registered here.\n",
"                \"on_failure\": [\n",
"                    {\n",
"                        \"append\": {\n",
"                            \"field\": \"_source._ingest.inference_errors\",\n",
"                            \"value\": [\n",
"                                {\n",
"                                    \"message\": (\n",
"                                        \"Processor 'inference' in pipeline '\"\n",
"                                        + pipeline_id\n",
"                                        + \"' failed with message '{{ _ingest.on_failure_message }}'\"\n",
"                                    ),\n",
"                                    \"pipeline\": pipeline_id,\n",
"                                    \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n",
"                                }\n",
"                            ],\n",
"                        }\n",
"                    }\n",
"                ],\n",
"            }\n",
"        },\n",
"        {\n",
"            # Only copy when the inference step actually produced a result\n",
"            \"set\": {\n",
"                \"field\": \"my_vector\",\n",
"                \"if\": \"ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null\",\n",
"                \"copy_from\": \"ml.inference.my_vector.predicted_value\",\n",
"                \"description\": \"Copy the predicted_value to 'my_vector'\",\n",
"            }\n",
"        },\n",
"        {\"remove\": {\"field\": \"ml.inference.my_vector\", \"ignore_missing\": True}},\n",
"    ]\n",
"}\n",
"\n",
"response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n",
"\n",
"# Print the response\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "TgBeEw_Ql5I5"
},
"source": [
"# Index Mapping / Template setup"
]
},
{
"cell_type": "code",
"source": [
"# Indices whose name matches this pattern pick up the template\n",
"index_patterns = [\"my_vector_index-*\"]\n",
"\n",
"priority = 1\n",
"\n",
"# Use the embedding pipeline as the default pipeline for matching indices\n",
"settings = {\n",
"    \"index.default_pipeline\": pipeline_id,\n",
"}\n",
"\n",
"mappings = {\n",
"    \"properties\": {\n",
"        \"my_vector\": {\"type\": \"dense_vector\", \"dims\": 768},\n",
"        \"my_text\": {\"type\": \"text\"},\n",
"    },\n",
"    # Exclude `my_vector` from `_source` explicitly\n",
"    \"_source\": {\"excludes\": [\"my_vector\"]},\n",
"}\n",
"\n",
"# Create the index template using put_index_template\n",
"response = es.indices.put_index_template(\n",
"    name=\"my_vector_index_template\",  # Template name\n",
"    index_patterns=index_patterns,\n",
"    priority=priority,\n",
"    template={\n",
"        \"settings\": settings,\n",
"        \"mappings\": mappings,\n",
"    },\n",
")\n",
"\n",
"# Print the response\n",
"print(response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "I5F6DR8jroEM",
"outputId": "f1222091-cd17-4d8a-d811-2ac8e55d944e"
},
"execution_count": 49,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'acknowledged': True}\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bztQcxbll5cs"
},
"source": [
"# Indexing Data\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"id": "XbapSs1c-hkd"
},
"outputs": [],
"source": [
"# Name of the index used by the cells below; it matches the `my_vector_index-*` template pattern\n",
"index_name = \"my_vector_index-01\""
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bSIJ-AngVmUi",
"outputId": "c5cdd475-132d-4410-83e8-3557f4e05bb5"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})"
]
},
"metadata": {},
"execution_count": 51
}
],
"source": [
"# Sample documents as (text, metadata) pairs\n",
"data = [\n",
"    (\"Hey, careful, man, there's a beverage here!\", \"The Dude\"),\n",
"    (\n",
"        \"I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing\",\n",
"        \"The Dude\",\n",
"    ),\n",
"    (\n",
"        \"You don't go out looking for a job dressed like that? On a weekday?\",\n",
"        \"The Big Lebowski\",\n",
"    ),\n",
"    (\"What do you mean brought it bowling, Dude?\", \"Walter Sobchak\"),\n",
"    (\n",
"        \"Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo\",\n",
"        \"Walter Sobchak\",\n",
"    ),\n",
"]\n",
"\n",
"# One bulk `index` action per pair, all targeting `index_name` so this cell\n",
"# writes to the same index that the search cells query\n",
"actions = [\n",
"    {\n",
"        \"_op_type\": \"index\",\n",
"        \"_index\": index_name,\n",
"        \"_source\": {\"my_text\": text, \"my_metadata\": metadata},\n",
"    }\n",
"    for text, metadata in data\n",
"]\n",
"\n",
"bulk(es, actions)\n",
"\n",
"# Refresh the index to make sure all data is searchable\n",
"es.indices.refresh(index=index_name)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ENlZ3Ndjl5yl"
},
"source": [
"# Querying Data\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Xk4CBDpimfDH"
},
"source": [
"## Approximate k-nearest neighbor (kNN)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xl76_rM4l3iC",
"outputId": "5d3b4c44-ff3c-4489-b850-e2e1bfc4880a"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[{'_id': 'PoHEcpIB5JwEUwVjEs6E',\n",
" '_index': 'my_vector_index-01',\n",
" '_score': 0.7825787,\n",
" '_source': {'ml': {'inference': {}},\n",
" 'my_metadata': 'The Dude',\n",
" 'my_text': \"Hey, careful, man, there's a beverage here!\"}}]\n"
]
}
],
"source": [
"# The query vector is built from this text by the named embedding model\n",
"query_vector_builder = {\n",
"    \"text_embedding\": {\n",
"        \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
"        \"model_text\": \"Watchout I have a drink\",\n",
"    }\n",
"}\n",
"\n",
"# kNN clause on `my_vector`: k=1 with num_candidates=5\n",
"knn = dict(\n",
"    field=\"my_vector\",\n",
"    k=1,\n",
"    num_candidates=5,\n",
"    query_vector_builder=query_vector_builder,\n",
")\n",
"\n",
"response = es.search(\n",
"    index=index_name,\n",
"    knn=knn,\n",
"    source=True,\n",
")\n",
"\n",
"# Show only the list of hits from the response\n",
"hits = response[\"hits\"][\"hits\"]\n",
"pprint(hits)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vhefCRd-mjk8"
},
"source": [
"## Hybrid Searching (kNN + BM25) with RRF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wLY8Q6tEmk06",
"outputId": "3f1cc630-6e65-42b8-82eb-b83222fd43ce"
},
"outputs": [],
"source": [
"# Lexical (BM25) part of the hybrid search\n",
"query = {\"match\": {\"my_text\": \"bowling\"}}\n",
"\n",
"# Vector (kNN) part of the hybrid search\n",
"knn = {\n",
"    \"field\": \"my_vector\",\n",
"    \"k\": 3,\n",
"    \"num_candidates\": 5,\n",
"    \"query_vector_builder\": {\n",
"        \"text_embedding\": {\n",
"            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
"            \"model_text\": \"He enjoyed the game\",\n",
"        }\n",
"    },\n",
"}\n",
"\n",
"# Combine the BM25 and kNN result lists with Reciprocal Rank Fusion (RRF).\n",
"# `rank` has to be passed to `es.search` below for RRF to be applied.\n",
"# NOTE(review): newer Elasticsearch releases also expose RRF through the\n",
"# `retriever` API - confirm which form the target deployment expects.\n",
"rank = {\"rrf\": {}}\n",
"\n",
"fields = [\"my_text\", \"my_metadata\"]\n",
"\n",
"response = es.search(\n",
"    index=index_name,\n",
"    fields=fields,\n",
"    knn=knn,\n",
"    query=query,\n",
"    rank=rank,\n",
"    size=2,\n",
"    source=False,\n",
")\n",
"\n",
"pprint(response[\"hits\"][\"hits\"])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HDBHn_kamlIL"
},
"source": [
"## Filtering"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yVDMHuM3mla7",
"outputId": "b39c13de-a97b-4112-b733-a246cdc7f364"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[{'_id': 'PoHEcpIB5JwEUwVjEs6E',\n",
" '_index': 'my_vector_index-01',\n",
" '_score': 0.59394693,\n",
" 'fields': {'my_metadata': ['The Dude'],\n",
" 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n"
]
}
],
"source": [
"# Term filter on the `my_metadata.keyword` field, attached to the kNN clause below\n",
"metadata_filter = {\"term\": {\"my_metadata.keyword\": \"The Dude\"}}\n",
"\n",
"# Text that the embedding model turns into the query vector\n",
"text_embedding = {\n",
"    \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
"    \"model_text\": \"Did you bring the dog?\",\n",
"}\n",
"\n",
"knn = {\n",
"    \"field\": \"my_vector\",\n",
"    \"k\": 1,\n",
"    \"num_candidates\": 5,\n",
"    \"query_vector_builder\": {\"text_embedding\": text_embedding},\n",
"    \"filter\": metadata_filter,\n",
"}\n",
"\n",
"# Fields requested for each hit; `_source` is switched off in the search call\n",
"fields = [\"my_text\", \"my_metadata\"]\n",
"\n",
"response = es.search(\n",
"    index=index_name,\n",
"    knn=knn,\n",
"    fields=fields,\n",
"    source=False,\n",
")\n",
"\n",
"hits = response[\"hits\"][\"hits\"]\n",
"pprint(hits)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "N_Msyv4-m5ow"
},
"source": [
"# Aggregations\n",
"and selecting which fields are returned"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jbwinE0fm5-I",
"outputId": "7ae0af99-3260-475b-98fe-2b5d8d165645"
},
"outputs": [],
"source": [
"knn = {\n",
"    \"field\": \"my_vector\",\n",
"    \"k\": 2,\n",
"    \"num_candidates\": 5,\n",
"    \"query_vector_builder\": {\n",
"        \"text_embedding\": {\n",
"            \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n",
"            \"model_text\": \"did you bring it?\",\n",
"        }\n",
"    },\n",
"}\n",
"\n",
"# Terms aggregation named `metadata` over the `my_metadata.keyword` field\n",
"aggs = {\"metadata\": {\"terms\": {\"field\": \"my_metadata.keyword\"}}}\n",
"\n",
"# Fields requested for each hit; `_source` is switched off in the search call\n",
"fields = [\"my_text\", \"my_metadata\"]\n",
"\n",
"response = es.search(index=index_name, fields=fields, aggs=aggs, knn=knn, source=False)\n",
"\n",
"pprint(response[\"hits\"][\"hits\"])\n",
"\n",
"# Also show the aggregation results, which are returned alongside the hits\n",
"pprint(response[\"aggregations\"])"
]
}
],
"metadata": {
"colab": {
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}