0_basic-rag/1_basic-rag.ipynb (1,039 lines of code) (raw):

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create basic RAG application with Azure AI Search\n",
    "\n",
    "This sample demonstrates how to create a basic RAG application with Azure AI Search. It is forked from [azure-search-vector-samples](https://github.com/Azure/azure-search-vector-samples/tree/main).\n",
    "\n",
    "> ✨ **_Note_** <br>\n",
    "> Please check the regional support for Azure AI Search before you get started - https://learn.microsoft.com/en-us/azure/search/search-region-support\n",
    "> In order to use the Semantic Search feature, check your region availability and pricing tier. Make sure it is at least Standard S3.\n",
    "\n",
    "Create an `.env` file based on the `sample.env` file. Copy the new `.env` file to the folder containing your notebook and update the variables.\n",
    "\n",
    "\n",
    "### References\n",
    "- https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "==== Environment Variables ====\n",
      "azure_search_endpoint=https://rag-innovator-search-svc.search.windows.net\n",
      "azure_ai_inference_endpoint=https://<your-ai-inference>.services.ai.azure.com/models\n",
      "azure_openai_endpoint=https://aoai-services1.openai.azure.com/\n",
      "azure_openai_deployment_name=gpt-4o-mini\n",
      "azure_openai_embedding_deployment_name=text-embedding-3-small\n",
      "azure_openai_embedding_dimensions=1536\n",
      "index_name=hotels-sample-index\n",
      "All environment variables are valid.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import requests\n",
    "\n",
    "from openai import AzureOpenAI\n",
    "from dotenv import load_dotenv\n",
    "from azure.identity import DefaultAzureCredential\n",
    "from azure.core.credentials import AzureKeyCredential\n",
    "from azure.core.exceptions import ResourceNotFoundError\n",
    "from azure.search.documents.indexes import SearchIndexClient\n",
    "from azure.search.documents import SearchClient\n",
    "from azure.search.documents import SearchIndexingBufferedSender\n",
    "from azure.search.documents.models import (\n",
    "    VectorizedQuery,\n",
    "    VectorizableTextQuery,\n",
    "    VectorFilterMode,\n",
    ")\n",
    "from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType\n",
    "from azure.search.documents.indexes.models import (\n",
    "    SimpleField,\n",
    "    SearchFieldDataType,\n",
    "    SearchableField,\n",
    "    SearchField,\n",
    "    HnswAlgorithmConfiguration,\n",
    "    VectorSearch,\n",
    "    VectorSearchProfile,\n",
    "    SemanticConfiguration,\n",
    "    SemanticPrioritizedFields,\n",
    "    SemanticField,\n",
    "    SemanticSearch,\n",
    "    ComplexField,\n",
    "    SearchIndex,\n",
    "    AzureOpenAIVectorizer,\n",
    "    AzureOpenAIVectorizerParameters,\n",
    ")\n",
    "\n",
    "# Load environment variables from .env, overriding values already set in the process\n",
    "load_dotenv(override=True)\n",
    "\n",
    "\n",
    "def get_env_var(name, required=True):\n",
    "    \"\"\"Return the value of the environment variable ``name``.\n",
    "\n",
    "    Args:\n",
    "        name: Name of the environment variable to read.\n",
    "        required: When true, an unset or empty variable is an error.\n",
    "\n",
    "    Returns:\n",
    "        The value from ``os.getenv``; may be ``None`` or empty when ``required`` is false.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``required`` is true and the variable is unset or empty.\n",
    "    \"\"\"\n",
    "    value = os.getenv(name)\n",
    "    if required and not value:\n",
    "        raise ValueError(f\"Environment variable {name} is missing or empty.\")\n",
    "    return value\n",
    "\n",
    "\n",
    "try:\n",
    "    # The search endpoint is required: every search client below is built from it.\n",
    "    azure_ai_search_endpoint = get_env_var(\"AZURE_AI_SEARCH_ENDPOINT\")\n",
    "    azure_ai_search_admin_key = get_env_var(\"AZURE_AI_SEARCH_API_KEY\", required=False)\n",
    "\n",
    "    azure_ai_inference_endpoint = get_env_var(\"AZURE_AI_INFERENCE_ENDPOINT\")\n",
    "    azure_ai_inference_key = get_env_var(\"AZURE_AI_INFERENCE_KEY\", required=False)\n",
    "\n",
    "    azure_openai_endpoint = get_env_var(\"AZURE_OPENAI_ENDPOINT\")\n",
    "    azure_openai_key = get_env_var(\"AZURE_OPENAI_API_KEY\")\n",
    "    azure_openai_deployment_name = get_env_var(\"AZURE_OPENAI_CHAT_DEPLOYMENT_NAME\")\n",
    "    azure_openai_embedding_deployment_name = (\n",
    "        get_env_var(\"AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME\", required=False)\n",
    "        or \"text-embedding-ada-002\"\n",
    "    )\n",
    "    # int() raises ValueError on a non-numeric value, which the handler below reports.\n",
    "    azure_openai_embedding_dimensions = int(\n",
    "        get_env_var(\"AZURE_OPENAI_EMBEDDING_DIMENSIONS\", required=False) or 1536\n",
    "    )\n",
    "    azure_openai_api_version = (\n",
    "        get_env_var(\"AZURE_OPENAI_API_VERSION\", required=False) or \"2024-12-01-preview\"\n",
    "    )\n",
    "\n",
    "    index_name = (\n",
    "        get_env_var(\"AZURE_SEARCH_INDEX_NAME\", required=False)\n",
    "        or \"hotel_quickstart_vector\"\n",
    "    )\n",
    "\n",
    "    # Only non-secret settings are echoed; API keys are never printed.\n",
    "    print(\"==== Environment Variables ====\")\n",
    "    print(f\"azure_search_endpoint={azure_ai_search_endpoint}\")\n",
    "    print(f\"azure_ai_inference_endpoint={azure_ai_inference_endpoint}\")\n",
    "    print(f\"azure_openai_endpoint={azure_openai_endpoint}\")\n",
    "    print(f\"azure_openai_deployment_name={azure_openai_deployment_name}\")\n",
    "    print(\n",
    "        f\"azure_openai_embedding_deployment_name={azure_openai_embedding_deployment_name}\"\n",
    "    )\n",
    "    print(f\"azure_openai_embedding_dimensions={azure_openai_embedding_dimensions}\")\n",
    "    print(f\"index_name={index_name}\")\n",
    "\n",
    "    # Prefer the admin key when one is provided; otherwise use Microsoft Entra ID.\n",
    "    if azure_ai_search_admin_key:\n",
    "        search_credential = AzureKeyCredential(azure_ai_search_admin_key)\n",
    "    else:\n",
    "        search_credential = DefaultAzureCredential()\n",
    "        # Fail fast if no token can be obtained for the Azure AI Search scope.\n",
    "        try:\n",
    "            search_credential.get_token(\"https://search.azure.com/.default\")\n",
    "        except Exception as e:\n",
    "            raise ValueError(\n",
    "                \"DefaultAzureCredential authentication failed. Ensure you are logged in using `az login`.\"\n",
    "            ) from e\n",
    "\n",
    "    print(\"All environment variables are valid.\")\n",
    "\n",
    "except ValueError as e:\n",
    "    print(f\"[ERROR] {e}\")\n",
    "    # Re-raise instead of calling exit(): the cell fails visibly and Run All stops here.\n",
    "    raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create vector index\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Client for index-management operations (create, get and delete indexes).\n",
    "index_client = SearchIndexClient(\n",
    "    endpoint=azure_ai_search_endpoint, credential=search_credential\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index 'hotels-sample-index' does not exist, creating a new one.\n"
     ]
    }
   ],
   "source": [
    "# Start from a clean slate: drop any existing index with this name.\n",
    "# get_index raises ResourceNotFoundError when the index does not exist.\n",
    "try:\n",
    "    index_client.get_index(index_name)\n",
    "    index_client.delete_index(index_name)\n",
    "    print(f\"Index '{index_name}' deleted.\")\n",
    "except ResourceNotFoundError:\n",
    "    print(f\"Index '{index_name}' does not exist, creating a new one.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hotels-sample-index created\n"
     ]
    }
   ],
   "source": [
    "# Index schema. Vector fields: hotelNameVector, descriptionVector, descriptionKOVector\n",
    "fields = [\n",
    "    SimpleField(name=\"HotelId\", type=SearchFieldDataType.String, key=True),\n",
    "    SearchableField(name=\"HotelName\", type=SearchFieldDataType.String, sortable=True),\n",
    "    SearchableField(\n",
    "        name=\"Description\", type=SearchFieldDataType.String, analyzer_name=\"en.lucene\"\n",
    "    ),\n",
    "    # Language analyzers: https://learn.microsoft.com/en-us/azure/search/search-language-support\n",
    "    SearchableField(\n",
    "        name=\"Description_kr\",\n",
    "        type=SearchFieldDataType.String,\n",
    "        analyzer_name=\"ko.microsoft\",\n",
    "    ),\n",
    "    SearchableField(\n",
    "        name=\"Description_fr\",\n",
    "        type=SearchFieldDataType.String,\n",
    "        analyzer_name=\"fr.lucene\",\n",
    "    ),\n",
    "    SearchableField(\n",
    "        name=\"Category\",\n",
    " 
type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SearchableField(\n", " name=\"Tags\",\n", " collection=True,\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SimpleField(\n", " name=\"ParkingIncluded\",\n", " type=SearchFieldDataType.Boolean,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SimpleField(\n", " name=\"LastRenovationDate\",\n", " type=SearchFieldDataType.DateTimeOffset,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SimpleField(\n", " name=\"Rating\",\n", " type=SearchFieldDataType.Double,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " # Address is a single complex (nested) field; its sub-fields are listed in `fields`.\n", " ComplexField(\n", " name=\"Address\",\n", " fields=[\n", " SearchableField(name=\"StreetAddress\", type=SearchFieldDataType.String),\n", " SearchableField(\n", " name=\"City\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SearchableField(\n", " name=\"StateProvince\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SearchableField(\n", " name=\"PostalCode\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " SearchableField(\n", " name=\"Country\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " ],\n", " ),\n", " SimpleField(\n", " name=\"Location\",\n", " type=SearchFieldDataType.GeographyPoint,\n", " filterable=True,\n", " sortable=True,\n", " ),\n", " # Rooms is a collection (collection=True) of complex objects.\n", " ComplexField(\n", " name=\"Rooms\",\n", " collection=True,\n", " fields=[\n", " SearchableField(\n", " name=\"Description\",\n", " type=SearchFieldDataType.String,\n", " analyzer_name=\"en.lucene\",\n", " ),\n", " SearchableField(\n", " name=\"Description_kr\",\n", " 
type=SearchFieldDataType.String,\n", " analyzer_name=\"ko.microsoft\",\n", " ),\n", " SearchableField(\n", " name=\"Description_fr\",\n", " type=SearchFieldDataType.String,\n", " analyzer_name=\"fr.lucene\",\n", " ),\n", " SearchableField(\n", " name=\"Type\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SimpleField(\n", " name=\"BaseRate\",\n", " type=SearchFieldDataType.Double,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SearchableField(\n", " name=\"BedOptions\",\n", " type=SearchFieldDataType.String,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SimpleField(\n", " name=\"SleepsCount\",\n", " type=SearchFieldDataType.Int32,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SimpleField(\n", " name=\"SmokingAllowed\",\n", " type=SearchFieldDataType.Boolean,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " SearchableField(\n", " name=\"Tags\",\n", " type=SearchFieldDataType.String,\n", " collection=True,\n", " facetable=True,\n", " filterable=True,\n", " ),\n", " ],\n", " ),\n", " # Vector fields: each one references the \"myHnswProfile\" vector search profile and\n", " # takes its dimension count from azure_openai_embedding_dimensions.\n", " SearchField(\n", " name=\"hotelNameVector\",\n", " type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", " searchable=True,\n", " vector_search_dimensions=azure_openai_embedding_dimensions,\n", " vector_search_profile_name=\"myHnswProfile\",\n", " ),\n", " SearchField(\n", " name=\"descriptionVector\",\n", " type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", " searchable=True,\n", " vector_search_dimensions=azure_openai_embedding_dimensions,\n", " vector_search_profile_name=\"myHnswProfile\",\n", " ),\n", " SearchField(\n", " name=\"descriptionKOVector\",\n", " type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", " searchable=True,\n", " vector_search_dimensions=azure_openai_embedding_dimensions,\n", " vector_search_profile_name=\"myHnswProfile\",\n", " ),\n", "]\n", "\n", "\n", "# Define the vector search configuration\n", 
"# The vectorizer's model_name must be the embedding *model* name, which can differ from\n",
    "# the deployment name. Read it from an optional variable and fall back to the deployment\n",
    "# name, which is what this notebook passed before.\n",
    "azure_openai_embedding_model_name = (\n",
    "    os.getenv(\"AZURE_OPENAI_EMBEDDING_MODEL_NAME\")\n",
    "    or azure_openai_embedding_deployment_name\n",
    ")\n",
    "\n",
    "# One HNSW algorithm and one profile; the profile also attaches the Azure OpenAI\n",
    "# vectorizer so the service can embed query text itself (used by VectorizableTextQuery).\n",
    "vector_search = VectorSearch(\n",
    "    profiles=[\n",
    "        VectorSearchProfile(\n",
    "            name=\"myHnswProfile\",\n",
    "            algorithm_configuration_name=\"myHnsw\",\n",
    "            vectorizer_name=\"myVectorizer\",\n",
    "        )\n",
    "    ],\n",
    "    algorithms=[HnswAlgorithmConfiguration(name=\"myHnsw\")],\n",
    "    vectorizers=[\n",
    "        AzureOpenAIVectorizer(\n",
    "            vectorizer_name=\"myVectorizer\",\n",
    "            kind=\"azureOpenAI\",\n",
    "            parameters=AzureOpenAIVectorizerParameters(\n",
    "                resource_url=azure_openai_endpoint,\n",
    "                deployment_name=azure_openai_embedding_deployment_name,\n",
    "                model_name=azure_openai_embedding_model_name,\n",
    "                api_key=azure_openai_key,\n",
    "            ),\n",
    "        )\n",
    "    ],\n",
    ")\n",
    "\n",
    "# Fields the semantic ranker uses as title, keywords and content.\n",
    "semantic_config = SemanticConfiguration(\n",
    "    name=\"my-semantic-config\",\n",
    "    prioritized_fields=SemanticPrioritizedFields(\n",
    "        title_field=SemanticField(field_name=\"HotelName\"),\n",
    "        keywords_fields=[SemanticField(field_name=\"Category\")],\n",
    "        content_fields=[SemanticField(field_name=\"Description\")],\n",
    "    ),\n",
    ")\n",
    "\n",
    "# Create the semantic search with the configuration\n",
    "semantic_search = SemanticSearch(configurations=[semantic_config])\n",
    "\n",
    "# Create the search index\n",
    "index = SearchIndex(\n",
    "    name=index_name,\n",
    "    fields=fields,\n",
    "    vector_search=vector_search,\n",
    "    semantic_search=semantic_search,\n",
    ")\n",
    "result = index_client.create_or_update_index(index)\n",
    "\n",
    "print(f\"{result.name} created\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create embedding\n",
    "\n",
    "Reads the documents to index, embeds selected fields (HotelName, Description, Description_kr), and indexes them." 
] },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "if azure_openai_endpoint is None:\n",
    "    raise ValueError(\"The Azure OpenAI endpoint is not set.\")\n",
    "\n",
    "# Client used for embedding documents and queries.\n",
    "openai_client = AzureOpenAI(\n",
    "    api_version=azure_openai_api_version,\n",
    "    azure_endpoint=azure_openai_endpoint,\n",
    "    api_key=azure_openai_key,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read hotels_data_ko.json, embed selected text fields, and save the documents with\n",
    "# their vectors to ./output/docVectors.json.\n",
    "# You don't need to run this code if you have already run it once.\n",
    "\n",
    "INDEX_JSON_DATA = True\n",
    "\n",
    "\n",
    "def embed_texts(texts):\n",
    "    \"\"\"Embed a list of strings with the configured embedding deployment.\n",
    "\n",
    "    Args:\n",
    "        texts: List of strings to embed in a single request.\n",
    "\n",
    "    Returns:\n",
    "        A list with the ``embedding`` of every item in the response data.\n",
    "        Callers assume the service returns embeddings in input order.\n",
    "    \"\"\"\n",
    "    response = openai_client.embeddings.create(\n",
    "        model=azure_openai_embedding_deployment_name, input=texts\n",
    "    )\n",
    "    return [item.embedding for item in response.data]\n",
    "\n",
    "\n",
    "if INDEX_JSON_DATA:\n",
    "    hotels_data_file_path = \"../sample-docs/hotels_data_ko.json\"\n",
    "\n",
    "    # utf-8-sig also accepts a file that starts with a byte-order mark.\n",
    "    with open(file=hotels_data_file_path, mode=\"r\", encoding=\"utf-8-sig\") as file:\n",
    "        hotel_documents = json.load(file)[\"value\"]\n",
    "\n",
    "    # HotelName, Description and Description_kr embeddings (one request per field)\n",
    "    hotel_name_embeddings = embed_texts([doc[\"HotelName\"] for doc in hotel_documents])\n",
    "    description_embeddings = embed_texts(\n",
    "        [doc[\"Description\"] for doc in hotel_documents]\n",
    "    )\n",
    "    description_kr_embeddings = embed_texts(\n",
    "        [doc[\"Description_kr\"] for doc in hotel_documents]\n",
    "    )\n",
    "\n",
    "    # Attach each vector under the matching vector field name of the index.\n",
    "    for i, doc in enumerate(hotel_documents):\n",
    "        doc[\"hotelNameVector\"] = hotel_name_embeddings[i]\n",
    "        doc[\"descriptionVector\"] = description_embeddings[i]\n",
    "        doc[\"descriptionKOVector\"] = description_kr_embeddings[i]\n",
    "\n",
    "    # save the result in docVectors.json\n",
    "    output_path = os.path.join(\".\", \"output\", \"docVectors.json\")\n",
    "    os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "    with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(hotel_documents, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "document indexing: 50\n"
     ]
    }
   ],
   "source": [
    "### input vector index\n",
    "# Load the documents (with vectors) written by the embedding cell.\n",
    "output_path = os.path.join(\".\", \"output\", \"docVectors.json\")\n",
    "with open(output_path, \"r\", encoding=\"utf-8\") as file:\n",
    "    documents = json.load(file)\n",
    "\n",
    "if azure_ai_search_endpoint is None:\n",
    "    raise ValueError(\"The Azure AI Search endpoint is not set.\")\n",
    "\n",
    "search_client = SearchClient(\n",
    "    endpoint=azure_ai_search_endpoint,\n",
    "    index_name=index_name,\n",
    "    credential=search_credential,\n",
    ")\n",
    "\n",
    "# upload_documents returns one result per document; surface failures instead of\n",
    "# reporting success unconditionally.\n",
    "upload_results = search_client.upload_documents(documents)\n",
    "failed_keys = [item.key for item in upload_results if not item.succeeded]\n",
    "if failed_keys:\n",
    "    raise RuntimeError(\n",
    "        f\"{len(failed_keys)} document(s) failed to index: {failed_keys}\"\n",
    "    )\n",
    "print(f\"document indexing: {len(documents)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### VectorizedQuery, VectorizableTextQuery\n",
    "\n",
    "- [VectorizedQuery](https://learn.microsoft.com/ko-kr/python/api/azure-search-documents/azure.search.documents.models.vectorizedquery?view=azure-python): The user supplies the embedding vector directly. In other words, the query is vectorized on the client with an embedding model (OpenAI, Hugging Face, Sentence Transformers, etc.) and the resulting vector is passed to VectorizedQuery." 
] },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "========== Result with VectorizedQuery ==========\n",
      "Time taken for VectorizedQuery search: 0.4038126468658447 seconds\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# Vector search. Alternative query to try:\n",
    "# query = \"WiFi가 제공되는 전통적인 호텔\"\n",
    "query = \"뉴욕에 중심부의의 WiFi가 제공되는 역사있고 전통적인 호텔\"\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "# Embed the query on the client with the same deployment used for the documents.\n",
    "embedding_response = openai_client.embeddings.create(\n",
    "    input=query, model=azure_openai_embedding_deployment_name\n",
    ")\n",
    "embedding = embedding_response.data[0].embedding\n",
    "\n",
    "# Search the descriptionVector field with the query embedding for the 3 closest items.\n",
    "vector_query = VectorizedQuery(\n",
    "    vector=embedding, k_nearest_neighbors=3, fields=\"descriptionVector\"\n",
    ")\n",
    "\n",
    "# search_text=None makes this a pure vector query (no lexical part).\n",
    "results = search_client.search(\n",
    "    search_text=None,\n",
    "    vector_queries=[vector_query],\n",
    "    select=[\"HotelName\", \"Description_kr\", \"Category\"],\n",
    ")\n",
    "\n",
    "print(\"========== Result with VectorizedQuery ==========\")\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    print(f\"Description_kr: {hit['Description_kr']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")\n",
    "\n",
    "# The measured time covers embedding, search and printing.\n",
    "end_time = time.time()\n",
    "print(f\"Time taken for VectorizedQuery search: {end_time - start_time} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- [VectorizableTextQuery](https://learn.microsoft.com/ko-kr/python/api/azure-search-documents/azure.search.documents.models.vectorizabletextquery?view=azure-python): If the user directly enters text, Azure AI Search internally embeds (vectorizes) it and performs vector search. 
" ] },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "========== Result with VectorizableTextQuery ==========\n",
      "HotelName: Stay-Kay City Hotel\n",
      "Score: 0.64250153\n",
      "Description: 이 고전적인 호텔은 완전히 리모델링되었으며 뉴욕의 중심부에 있는 주요 상업 거리의 이상적인 위치에 있습니다. 몇 분 거리에 타임스 스퀘어와 도시의 역사적인 중심지, 그리고 뉴욕을 미국에서 가장 매력적이고 세계적인 도시 중 하나로 만드는 다른 명소들이 있습니다.\n",
      "Category: Boutique\n",
      "\n",
      "HotelName: Treehouse Hotel\n",
      "Score: 0.6366472\n",
      "Description: 활기찬 도심과 분주한 비즈니스 지구의 중심 가까이에 위치한 저희 호텔에서 따뜻한 환대를 경험해 보세요. 무료 WiFi, 지역 교통편, 그리고 우유와 쿠키를 즐기실 수 있습니다.\n",
      "Category: Budget\n",
      "\n",
      "HotelName: Friendly Motor Inn\n",
      "Score: 0.61995727\n",
      "Description: 역사적인 명소, 지역 관광지, 도시 공원과 가까운 곳에 위치해 있습니다. 공항과 카지노까지 무료 셔틀 서비스가 제공됩니다. 무료 아침 식사와 WiFi가 제공됩니다.\n",
      "Category: Budget\n",
      "\n",
      "Time taken for VectorizableTextQuery search: 1.3934569358825684 seconds\n"
     ]
    }
   ],
   "source": [
    "# Reuses `query` and the `time` import from the VectorizedQuery cell above.\n",
    "start_time = time.time()\n",
    "\n",
    "# Pass the raw text; the service embeds it with the vectorizer attached to the\n",
    "# field's vector search profile.\n",
    "vector_query = VectorizableTextQuery(\n",
    "    fields=\"descriptionVector\", k_nearest_neighbors=3, text=query\n",
    ")\n",
    "\n",
    "search_options = {\n",
    "    \"search_text\": None,\n",
    "    \"vector_queries\": [vector_query],\n",
    "    \"select\": [\"HotelName\", \"Description_kr\", \"Category\"],\n",
    "}\n",
    "results = search_client.search(**search_options)\n",
    "\n",
    "print(\"========== Result with VectorizableTextQuery ==========\")\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    # The Korean description is printed under the generic \"Description\" label.\n",
    "    print(f\"Description: {hit['Description_kr']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")\n",
    "\n",
    "end_time = time.time()\n",
    "print(f\"Time taken for VectorizableTextQuery search: {end_time - start_time} seconds\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi-Vector Search\n",
    "- Cross-field vector search, which allows you to pass multiple query vectors to query multiple vector fields simultaneously.\n",
    "- In this case, you can pass query vectors from two different 
embedding models to the corresponding vector fields in the index.\n", "- For each vector field, you can also give different search settings, such as performing a vector search, weights, exhaustive KNN, etc." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "HotelName: Good Business Hotel\n", "Score: 0.01666666753590107\n", "Description: 1 Mile from the airport. Free WiFi, Outdoor Pool, Complimentary Airport Shuttle, 6 miles from Lake Lanier & 10 miles from downtown. Our business center includes printers, a copy machine, fax, and a work area.\n", "Category: Suite\n", "\n", "HotelName: Treehouse Hotel\n", "Score: 0.016393441706895828\n", "Description: Near the beating heart of our vibrant downtown and bustling business district. Experience the warmth of our hotel. Enjoy free WiFi, local transportation and Milk & Cookies.\n", "Category: Budget\n", "\n", "HotelName: Twin Vortex Hotel\n", "Score: 0.016129031777381897\n", "Description: New experience in the making. Be the first to experience the luxury of the Twin Vortex. Reserve one of our newly-renovated guest rooms today.\n", "Category: Luxury\n", "\n", "HotelName: Countryside Hotel\n", "Score: 0.011666666716337204\n", "Description: Save up to 50% off traditional hotels. Free WiFi, great location near downtown, full kitchen, washer & dryer, 24/7 support, bowling alley, fitness center and more.\n", "Category: Extended-Stay\n", "\n", "HotelName: Double Sanctuary Resort\n", "Score: 0.011475409381091595\n", "Description: 5 star Luxury Hotel - Biggest Rooms in the city. #1 Hotel in the area listed by Traveler magazine. Free WiFi, Flexible check in/out, Fitness Center & espresso in room.\n", "Category: Resort and Spa\n", "\n", "HotelName: Friendly Motor Inn\n", "Score: 0.011290322057902813\n", "Description: Close to historic sites, local attractions, and urban parks. Free Shuttle to the airport and casinos. 
Free breakfast and WiFi.\n",
      "Category: Budget\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"traditional hotels with free wifi\"\n",
    "\n",
    "# Both vector queries share the same text and neighbour count.\n",
    "shared_query_args = {\"text\": query, \"k_nearest_neighbors\": 3}\n",
    "\n",
    "# Hotel-name vectors: full weight, exhaustive KNN.\n",
    "vector_query_1 = VectorizableTextQuery(\n",
    "    fields=\"hotelNameVector\",\n",
    "    weight=1,\n",
    "    exhaustive=True,\n",
    "    **shared_query_args,\n",
    ")\n",
    "# Description vectors: weighted at 0.7.\n",
    "vector_query_2 = VectorizableTextQuery(\n",
    "    fields=\"descriptionVector\", weight=0.7, **shared_query_args\n",
    ")\n",
    "\n",
    "results = search_client.search(\n",
    "    search_text=None,\n",
    "    vector_queries=[vector_query_1, vector_query_2],\n",
    "    select=[\"HotelName\", \"Description\", \"Category\"],\n",
    ")\n",
    "\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    print(f\"Description: {hit['Description']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using filters in Vector search\n",
    "\n",
    "- Shows how to apply filters to your search.\n",
    "- You can choose whether to use pre-filtering (the default) or post-filtering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HotelName: Friendly Motor Inn\n",
      "Score: 0.6590744\n",
      "Description: Close to historic sites, local attractions, and urban parks. Free Shuttle to the airport and casinos. Free breakfast and WiFi.\n",
      "Category: Budget\n",
      "\n",
      "HotelName: Thunderbird Motel\n",
      "Score: 0.6488486\n",
      "Description: Book Now & Save. Clean, Comfortable rooms at the lowest price. Enjoy complimentary coffee and tea in common areas.\n",
      "Category: Budget\n",
      "\n",
      "HotelName: Treehouse Hotel\n",
      "Score: 0.6311266\n",
      "Description: Near the beating heart of our vibrant downtown and bustling business district. Experience the warmth of our hotel. 
Enjoy free WiFi, local transportation and Milk & Cookies.\n",
      "Category: Budget\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"traditional hotels with free wifi\"\n",
    "\n",
    "vector_query = VectorizableTextQuery(\n",
    "    fields=\"descriptionVector\", k_nearest_neighbors=3, text=query\n",
    ")\n",
    "\n",
    "# Restrict the vector search to Budget hotels; PRE_FILTER is requested explicitly.\n",
    "category_filter = \"Category eq 'Budget'\"\n",
    "results = search_client.search(\n",
    "    search_text=None,\n",
    "    vector_queries=[vector_query],\n",
    "    vector_filter_mode=VectorFilterMode.PRE_FILTER,\n",
    "    filter=category_filter,\n",
    "    select=[\"HotelName\", \"Description\", \"Category\"],\n",
    ")\n",
    "\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    print(f\"Description: {hit['Description']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hybrid search\n",
    "- Performs a combination of Lexical and Vector searches and returns results.\n",
    "- In the case of Vector search, you can improve the quality of your search by performing a lexical exact search along with a search using similarity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HotelName: Luxury Lion Resort\n",
      "Score: 0.03131881356239319\n",
      "Description: Unmatched Luxury. Visit our downtown hotel to indulge in luxury accommodations. Moments from the stadium and transportation hubs, we feature the best in convenience and comfort.\n",
      "Category: Luxury\n",
      "\n",
      "HotelName: Foot Happy Suites\n",
      "Score: 0.029462365433573723\n",
      "Description: Downtown in the heart of the business district. Close to everything. Leave your car behind and walk to the park, shopping, and restaurants. Or grab one of our bikes and take your explorations a little further.\n",
      "Category: Suite\n",
      "\n",
      "HotelName: Smile Up Hotel\n",
      "Score: 0.02916666865348816\n",
      "Description: Experience the fresh, modern downtown. 
Enjoy updated rooms, bold style & prime location. Don't miss our weekend live music series featuring who's new/next on the scene.\n",
      "Category: Suite\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"near downtown hotels\"\n",
    "\n",
    "vector_query = VectorizableTextQuery(\n",
    "    fields=\"descriptionVector\", k_nearest_neighbors=3, text=query\n",
    ")\n",
    "\n",
    "# Hybrid search: lexical search (search_text=query) combined with vector search\n",
    "# (vector_queries=[vector_query]) in a single request.\n",
    "hybrid_options = {\n",
    "    \"search_text\": query,\n",
    "    \"vector_queries\": [vector_query],\n",
    "    \"select\": [\"HotelName\", \"Description\", \"Category\"],\n",
    "    \"top\": 3,\n",
    "}\n",
    "results = search_client.search(**hybrid_options)\n",
    "\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    print(f\"Description: {hit['Description']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HotelName: Smile Up Hotel\n",
      "Score: 0.69187903\n",
      "Description: Experience the fresh, modern downtown. Enjoy updated rooms, bold style & prime location. Don't miss our weekend live music series featuring who's new/next on the scene.\n",
      "Category: Suite\n",
      "\n",
      "HotelName: Luxury Lion Resort\n",
      "Score: 0.6909753\n",
      "Description: Unmatched Luxury. Visit our downtown hotel to indulge in luxury accommodations. Moments from the stadium and transportation hubs, we feature the best in convenience and comfort.\n",
      "Category: Luxury\n",
      "\n",
      "HotelName: Foot Happy Suites\n",
      "Score: 0.6904679\n",
      "Description: Downtown in the heart of the business district. Close to everything. Leave your car behind and walk to the park, shopping, and restaurants. 
Or grab one of our bikes and take your explorations a little further.\n",
      "Category: Suite\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Same query as the hybrid cell above (`query` is reused), but vector search only\n",
    "# (search_text=None). Comparing the two outputs shows that the results differ.\n",
    "vector_query = VectorizableTextQuery(\n",
    "    fields=\"descriptionVector\", k_nearest_neighbors=3, text=query\n",
    ")\n",
    "\n",
    "vector_only_options = {\n",
    "    \"search_text\": None,\n",
    "    \"vector_queries\": [vector_query],\n",
    "    \"select\": [\"HotelName\", \"Description\", \"Category\"],\n",
    "    \"top\": 3,\n",
    "}\n",
    "results = search_client.search(**vector_only_options)\n",
    "\n",
    "for hit in results:\n",
    "    print(f\"HotelName: {hit['HotelName']}\")\n",
    "    print(f\"Score: {hit['@search.score']}\")\n",
    "    print(f\"Description: {hit['Description']}\")\n",
    "    print(f\"Category: {hit['Category']}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Semantic Hybrid Search\n",
    "\n",
    "Hybrid search returns results that are captioned using the semantic ranker.\n",
    "\n",
    "When creating the vector index, we added a semantic configuration and set the title, content, and keyword fields to use for semantic ranking, captions, highlighting, and answers.\n",
    "\n",
    "``` python\n",
    "semantic_config = SemanticConfiguration(\n",
    "    name=\"my-semantic-config\",\n",
    "    prioritized_fields=SemanticPrioritizedFields(\n",
    "        title_field=SemanticField(field_name=\"HotelName\"),\n",
    "        keywords_fields=[SemanticField(field_name=\"Category\")],\n",
    "        content_fields=[SemanticField(field_name=\"Description\")]\n",
    "    )\n",
    ")\n",
    "```\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Semantic Answer: This<em> classic hotel </em>is<em> fully-refurbished and ideally located on the main commercial artery of the city in the heart of New York.</em> A few minutes away is Times Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\n",
      "Semantic Answer Score: 
0.9679999947547913\n", "\n", "HotelName: Stay-Kay City Hotel\n", "Reranker Score: 2.817405939102173\n", "Description: This classic hotel is fully-refurbished and ideally located on the main commercial artery of the city in the heart of New York. A few minutes away is Times Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\n", "Category: Boutique\n", "Caption: This<em> classic hotel </em>is<em> fully-refurbished and ideally located on the main commercial artery of the city in the heart of New York.</em> A few minutes away is Times Square and the historic centre of the city, as well as other places of interest that make New York one of America's most attractive and cosmopolitan cities.\n", "\n", "HotelName: By the Market Hotel\n", "Reranker Score: 2.7280795574188232\n", "Description: Book now and Save up to 30%. Central location. Walking distance from the Empire State Building & Times Square, in the Chelsea neighborhood. Brand new rooms. Impeccable service.\n", "Category: Budget\n", "Caption: <em>Book now and Save up to 30%.</em> Central location. Walking distance from the<em> Empire State Building & Times Square, </em>in the<em> Chelsea neighborhood.</em> <em>Brand new rooms. Impeccable service.</em>\n", "\n", "HotelName: City Skyline Antiquity Hotel\n", "Reranker Score: 2.4852683544158936\n", "Description: In vogue since 1888, the Antiquity Hotel takes you back to bygone era. From the crystal chandeliers that adorn the Green Room, to the arched ceilings of the Grand Hall, the elegance of old New York beckons. Elevate Your Experience. Upgrade to a premiere city skyline view for less, where old world charm combines with dramatic views of the city, local cathedral and midtown.\n", "Category: Boutique\n", "Caption: In vogue since 1888, the<em> Antiquity Hotel </em>takes you back to bygone era. 
From<em> the crystal chandeliers </em>that<em> adorn the Green Room, </em>to the arched ceilings of the Grand Hall, the elegance of<em> old New York beckons.</em> Elevate Your Experience. <em>Upgrade </em>to<em> a premiere city skyline view for less, </em>where<em> old world charm </em>combines with dramatic<em> views </em>of the<em> city,.</em>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query = \"Good hotels for times square\"\n",
    "\n",
    "vector_query = VectorizableTextQuery(\n",
    "    text=query, k_nearest_neighbors=3, fields=\"descriptionVector\", exhaustive=True\n",
    ")\n",
    "\n",
    "# Hybrid & Semantic Search: lexical + vector retrieval, re-ranked by the semantic\n",
    "# ranker using \"my-semantic-config\", with extractive captions and answers requested.\n",
    "results = search_client.search(\n",
    "    search_text=query,\n",
    "    vector_queries=[vector_query],\n",
    "    select=[\"HotelName\", \"Description\", \"Category\"],\n",
    "    query_type=QueryType.SEMANTIC,\n",
    "    semantic_configuration_name=\"my-semantic-config\",\n",
    "    query_caption=QueryCaptionType.EXTRACTIVE,\n",
    "    query_answer=QueryAnswerType.EXTRACTIVE,\n",
    "    top=3,\n",
    ")\n",
    "\n",
    "# get_answers() may return None when the service produces no semantic answer, so\n",
    "# fall back to an empty list instead of iterating over None.\n",
    "semantic_answers = results.get_answers() or []\n",
    "for answer in semantic_answers:\n",
    "    # Prefer the highlighted form of the answer when the service provides one.\n",
    "    if answer.highlights:\n",
    "        print(f\"Semantic Answer: {answer.highlights}\")\n",
    "    else:\n",
    "        print(f\"Semantic Answer: {answer.text}\")\n",
    "    print(f\"Semantic Answer Score: {answer.score}\\n\")\n",
    "\n",
    "for result in results:\n",
    "    print(f\"HotelName: {result['HotelName']}\")\n",
    "    print(f\"Reranker Score: {result['@search.reranker_score']}\")\n",
    "    print(f\"Description: {result['Description']}\")\n",
    "    print(f\"Category: {result['Category']}\")\n",
    "\n",
    "    # Only the first caption is shown; results without captions print none.\n",
    "    captions = result[\"@search.captions\"]\n",
    "    if captions:\n",
    "        caption = captions[0]\n",
    "        if caption.highlights:\n",
    "            print(f\"Caption: {caption.highlights}\\n\")\n",
    "        else:\n",
    "            print(f\"Caption: {caption.text}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py312-dev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": 
{ "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }