supporting-blog-content/vector-search-implementation-guide-api/vector_search_implementation_guide_api.ipynb (548 lines of code) (raw):

{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "XU4UjiHpYdDT" }, "source": [ "# Simplified Vector Search (kNN) Implementation Guide\n" ] }, { "cell_type": "markdown", "metadata": { "id": "5lV5UN90l4YN" }, "source": [ "# Loading the Embedding Model\n", "Loading embedding model: [sentence-transformers/all-distilroberta-v1](https://huggingface.co/sentence-transformers/all-distilroberta-v1)\n", "\n", "Loading code borrowed from [elasticsearch-labs](https://www.elastic.co/search-labs) NLP text search [example notebook](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/hugging-face/loading-model-from-hugging-face.ipynb)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Z0TiDltHkebY" }, "outputs": [], "source": [ "# install packages\n", "!pip install -qU eland elasticsearch transformers sentence-transformers==2.7.0 torch==1.13" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Riwvd3CHO9qU" }, "outputs": [], "source": [ "# import modules\n", "import pandas as pd, json\n", "from elasticsearch import Elasticsearch\n", "from elasticsearch.helpers import bulk\n", "from getpass import getpass\n", "from urllib.request import urlopen\n", "from pprint import pprint" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "So9bJJDVNzgF" }, "outputs": [], "source": [ "API_KEY = getpass(\"Elastic deployment API Key\")\n", "CLOUD_ID = getpass(\"Elastic deployment Cloud ID\")\n", "HUB_MODEL_ID = getpass(\n", " \"Hugging Face Model Hub ID\"\n", ") # eg sentence-transformers/all-distilroberta-v1\n", "\n", "es = Elasticsearch(cloud_id=CLOUD_ID, api_key=API_KEY)\n", "es.info() # should return cluster info" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dsFsmzZwpujb" }, "outputs": [], "source": [ "!eland_import_hub_model --cloud-id $CLOUD_ID --hub-model-id $HUB_MODEL_ID --task-type text_embedding --es-api-key $API_KEY --start" ] }, { 
"cell_type": "markdown", "metadata": { "id": "71wNrH0vl4zi" }, "source": [ "# Ingest pipeline setup" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SL47BJNyl3-r", "outputId": "fa707db7-b6ec-47b4-c802-2d14c346e7bd" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{'acknowledged': True}\n" ] } ], "source": [ "pipeline = {\n", " \"processors\": [\n", " {\n", " \"inference\": {\n", " \"field_map\": {\"my_text\": \"text_field\"},\n", " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", " \"target_field\": \"ml.inference.my_vector\",\n", " \"on_failure\": [\n", " {\n", " \"append\": {\n", " \"field\": \"_source._ingest.inference_errors\",\n", " \"value\": [\n", " {\n", " \"message\": \"Processor 'inference' in pipeline 'ml-inference-title-vector' failed with message '{{ _ingest.on_failure_message }}'\",\n", " \"pipeline\": \"ml-inference-title-vector\",\n", " \"timestamp\": \"{{{ _ingest.timestamp }}}\",\n", " }\n", " ],\n", " }\n", " }\n", " ],\n", " }\n", " },\n", " {\n", " \"set\": {\n", " \"field\": \"my_vector\",\n", " \"if\": \"ctx?.ml?.inference != null && ctx.ml.inference['my_vector'] != null\",\n", " \"copy_from\": \"ml.inference.my_vector.predicted_value\",\n", " \"description\": \"Copy the predicted_value to 'my_vector'\",\n", " }\n", " },\n", " {\"remove\": {\"field\": \"ml.inference.my_vector\", \"ignore_missing\": True}},\n", " ]\n", "}\n", "\n", "pipeline_id = \"vector_embedding_demo\"\n", "response = es.ingest.put_pipeline(id=pipeline_id, body=pipeline)\n", "\n", "# Print the response\n", "print(response)" ] }, { "cell_type": "markdown", "metadata": { "id": "TgBeEw_Ql5I5" }, "source": [ "# Index Mapping / Template setup" ] }, { "cell_type": "code", "source": [ "index_patterns = [\"my_vector_index-*\"]\n", "\n", "priority = 1\n", "\n", "settings = {\n", " \"index.default_pipeline\": pipeline_id,\n", "}\n", "\n", "mappings = {\n", " 
\"properties\": {\n", " \"my_vector\": {\"type\": \"dense_vector\", \"dims\": 768},\n", " \"my_text\": {\"type\": \"text\"},\n", " },\n", " \"_source\": {\"excludes\": [\"my_vector\"]},\n", "}\n", "\n", "# Exclude `my_vector` from `_source` explicitly\n", "source_exclusions = {\"_source\": {\"excludes\": [\"my_vector\"]}}\n", "\n", "# Create the index template using put_index_template\n", "response = es.indices.put_index_template(\n", " name=\"my_vector_index_template\", # Template name\n", " index_patterns=index_patterns,\n", " priority=priority,\n", " template={\n", " \"settings\": settings,\n", " \"mappings\": mappings,\n", " },\n", ")\n", "\n", "# Print the response\n", "print(response)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "I5F6DR8jroEM", "outputId": "f1222091-cd17-4d8a-d811-2ac8e55d944e" }, "execution_count": 49, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "{'acknowledged': True}\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "bztQcxbll5cs" }, "source": [ "# Indexing Data\n" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "id": "XbapSs1c-hkd" }, "outputs": [], "source": [ "index_name = \"my_vector_index-01\"" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bSIJ-AngVmUi", "outputId": "c5cdd475-132d-4410-83e8-3557f4e05bb5" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})" ] }, "metadata": {}, "execution_count": 51 } ], "source": [ "data = [\n", " (\"Hey, careful, man, there's a beverage here!\", \"The Dude\"),\n", " (\n", " \"I’m The Dude. So, that’s what you call me. You know, that or, uh, His Dudeness, or, uh, Duder, or El Duderino, if you’re not into the whole brevity thing\",\n", " \"The Dude\",\n", " ),\n", " (\n", " \"You don't go out looking for a job dressed like that? 
On a weekday?\",\n", " \"The Big Lebowski\",\n", " ),\n", " (\"What do you mean brought it bowling, Dude?\", \"Walter Sobchak\"),\n", " (\n", " \"Donny was a good bowler, and a good man. He was one of us. He was a man who loved the outdoors... and bowling, and as a surfer he explored the beaches of Southern California, from La Jolla to Leo Carrillo and... up to... Pismo\",\n", " \"Walter Sobchak\",\n", " ),\n", "]\n", "\n", "# One bulk `index` action per (text, metadata) pair, written to `index_name`\n", "# so the data lands in the same index the search cells query.\n", "actions = [\n", " {\n", " \"_op_type\": \"index\",\n", "        \"_index\": index_name,\n", " \"_source\": {\"my_text\": text, \"my_metadata\": metadata},\n", " }\n", " for text, metadata in data\n", "]\n", "\n", "bulk(es, actions)\n", "\n", "# Refresh the index to make sure all data is searchable\n", "es.indices.refresh(index=index_name)" ] }, { "cell_type": "markdown", "metadata": { "id": "ENlZ3Ndjl5yl" }, "source": [ "# Querying Data\n" ] }, { "cell_type": "markdown", "metadata": { "id": "Xk4CBDpimfDH" }, "source": [ "Approximate k-nearest neighbor (kNN)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xl76_rM4l3iC", "outputId": "5d3b4c44-ff3c-4489-b850-e2e1bfc4880a" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[{'_id': 'PoHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 0.7825787,\n", " '_source': {'ml': {'inference': {}},\n", " 'my_metadata': 'The Dude',\n", " 'my_text': \"Hey, careful, man, there's a beverage here!\"}}]\n" ] } ], "source": [ "knn = {\n", " \"field\": \"my_vector\",\n", " \"k\": 1,\n", " \"num_candidates\": 5,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", " \"model_text\": \"Watchout I have a drink\",\n", " }\n", " },\n", "}\n", "\n", "response = es.search(index=index_name, knn=knn, source=True)\n", "\n", "pprint(response[\"hits\"][\"hits\"])" ] }, { "cell_type": "markdown", "metadata": { 
"id": "vhefCRd-mjk8" }, "source": [ "## Hybrid Searching (kNN + BM25) with RRF" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wLY8Q6tEmk06", "outputId": "3f1cc630-6e65-42b8-82eb-b83222fd43ce" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[{'_id': 'QYHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 1.8082356,\n", " 'fields': {'my_metadata': ['Walter Sobchak'],\n", " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", " {'_id': 'QoHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 1.2366624,\n", " 'fields': {'my_metadata': ['Walter Sobchak'],\n", " 'my_text': ['Donny was a good bowler, and a good man. He was one '\n", " 'of us. He was a man who loved the outdoors... and '\n", " 'bowling, and as a surfer he explored the beaches of '\n", " 'Southern California, from La Jolla to Leo Carrillo '\n", " 'and... up to... Pismo']}}]\n" ] } ], "source": [ "query = {\"match\": {\"my_text\": \"bowling\"}}\n", "\n", "knn = {\n", " \"field\": \"my_vector\",\n", " \"k\": 3,\n", " \"num_candidates\": 5,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", " \"model_text\": \"He enjoyed the game\",\n", " }\n", " },\n", "}\n", "\n", "rank: {\"rrf\": {}}\n", "\n", "fields = [\"my_text\", \"my_metadata\"]\n", "\n", "\n", "response = es.search(\n", " index=index_name, fields=fields, knn=knn, query=query, size=2, source=False\n", ")\n", "\n", "pprint(response[\"hits\"][\"hits\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "HDBHn_kamlIL" }, "source": [ "## Filtering" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yVDMHuM3mla7", "outputId": "b39c13de-a97b-4112-b733-a246cdc7f364" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[{'_id': 
'PoHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 0.59394693,\n", " 'fields': {'my_metadata': ['The Dude'],\n", " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" ] } ], "source": [ "knn = {\n", " \"field\": \"my_vector\",\n", " \"k\": 1,\n", " \"num_candidates\": 5,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", " \"model_text\": \"Did you bring the dog?\",\n", " }\n", " },\n", " \"filter\": {\"term\": {\"my_metadata.keyword\": \"The Dude\"}},\n", "}\n", "\n", "fields = [\"my_text\", \"my_metadata\"]\n", "\n", "response = es.search(index=index_name, fields=fields, knn=knn, source=False)\n", "\n", "pprint(response[\"hits\"][\"hits\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "N_Msyv4-m5ow" }, "source": [ "# Aggregations\n", "and selecting which fields are returned" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jbwinE0fm5-I", "outputId": "7ae0af99-3260-475b-98fe-2b5d8d165645" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[{'_id': 'QYHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 0.7433834,\n", " 'fields': {'my_metadata': ['Walter Sobchak'],\n", " 'my_text': ['What do you mean brought it bowling, Dude?']}},\n", " {'_id': 'PoHEcpIB5JwEUwVjEs6E',\n", " '_index': 'my_vector_index-01',\n", " '_score': 0.6028075,\n", " 'fields': {'my_metadata': ['The Dude'],\n", " 'my_text': [\"Hey, careful, man, there's a beverage here!\"]}}]\n" ] } ], "source": [ "knn = {\n", " \"field\": \"my_vector\",\n", " \"k\": 2,\n", " \"num_candidates\": 5,\n", " \"query_vector_builder\": {\n", " \"text_embedding\": {\n", " \"model_id\": \"sentence-transformers__all-distilroberta-v1\",\n", " \"model_text\": \"did you bring it?\",\n", " }\n", " },\n", "}\n", "\n", "aggs = {\"metadata\": {\"terms\": {\"field\": 
\"my_metadata.keyword\"}}}\n", "\n", "fields = [\"my_text\", \"my_metadata\"]\n", "\n", "response = es.search(index=index_name, fields=fields, aggs=aggs, knn=knn, source=False)\n", "\n", "pprint(response[\"hits\"][\"hits\"])" ] } ], "metadata": { "colab": { "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }