components/llm_service/notebooks/Embeddings.ipynb (359 lines of code) (raw):

{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "cdf14a05-aa5e-4388-b622-a4e3168d9955", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "PROJECT_ID = \"genie-ui-dev\"\n", "REGION = \"us-central1\"\n", "\n", "# Export both settings through the process environment.\n", "os.environ.update({\"PROJECT_ID\": PROJECT_ID, \"PG_HOST\": \"\"})" ] }, { "cell_type": "code", "execution_count": 2, "id": "7b93aff9-aafb-4d76-8ce8-7781b623e393", "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "# Append both relative source directories to the module search path.\n", "for src_dir in (\"../../common/src\", \"../src\"):\n", "    sys.path.append(src_dir)" ] }, { "cell_type": "code", "execution_count": 3, "id": "b3a9c5a0-8416-43f9-b569-6ce8a9ee97d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: [config/config.py:61 - <module>()] Namespace File not found, setting job namespace as default\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Text] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-V2] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Gemini-Pro] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status 
[True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Gemini-Pro-Vision] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2V2-Langchain] enablement status [True]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n", " warn_deprecated(\n", "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. 
To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n", " warn_deprecated(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-32k] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-32k-Langchain] enablement status [True]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n", " warn_deprecated(\n", "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. 
To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n", " warn_deprecated(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Gemini-Pro-Langchain] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Truss] enablement status [False]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [Truss-Llama2-Chat] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [vLLM] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [vLLM-Gemma-Chat] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [ModelGarden] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-ModelGarden-LLAMA2-Chat] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. 
To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n", " warn_deprecated(\n", "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n", " warn_deprecated(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT4] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT4-latest] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT3.5] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [Cohere] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [LLMService] enablement status [False]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [Llama2cpp] enablement status [False]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Embedding] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement 
status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Embedding-Vision] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-Embedding] enablement status [True]\n", "INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n", "INFO: [config/model_config.py:361 - set_model_config()] Model [HuggingFaceEmbeddings] enablement status [True]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n", " warn_deprecated(\n", "/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. 
To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n", " warn_deprecated(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "INFO: [config/config.py:112 - <module>()] ENABLE_GOOGLE_LLM = True\n", "INFO: [config/config.py:113 - <module>()] ENABLE_OPENAI_LLM = False\n", "INFO: [config/config.py:114 - <module>()] ENABLE_COHERE_LLM = False\n", "INFO: [config/config.py:115 - <module>()] ENABLE_GOOGLE_MODEL_GARDEN = True\n", "INFO: [config/config.py:116 - <module>()] ENABLE_TRUSS_LLAMA2 = False\n", "INFO: [config/config.py:117 - <module>()] ENABLE_VLLM_GEMMA = True\n", "INFO: [config/config.py:180 - <module>()] Loaded default manifest from /Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/../src/config/document_manifest.json\n", "INFO: [config/vector_store_config.py:44 - <module>()] PG_HOST = [None]\n", "INFO: [config/vector_store_config.py:45 - <module>()] PG_DBNAME = [pgvector]\n", "INFO: [config/vector_store_config.py:74 - <module>()] PG_HOST is set to [None], not connecting to pgvector\n", "INFO: [config/vector_store_config.py:81 - <module>()] Default vector store = [matching_engine]\n", "INFO: [config/onedrive_config.py:30 - <module>()] ONEDRIVE_CLIENT_ID = [None]\n", "INFO: [config/onedrive_config.py:31 - <module>()] ONEDRIVE_TENANT_ID = [None]\n", "WARNING: [config/onedrive_config.py:42 - <module>()] Can't access onedrive client secret\n", "WARNING: [config/onedrive_config.py:48 - <module>()] Can't access onedrive principle name\n", "INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n", "INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n", "INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n", "INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n", 
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n", "INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n", "INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n", "INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n", "INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n", "INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 7 chunk(s) embedding model text-embedding-004\n", "INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n", "INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n", "INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n", "INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n", "INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n" ] } ], "source": [ "from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,\n", " QueryReference, QueryDocumentChunk, BatchJobModel)\n", "from services import llm_generate, embeddings" ] }, { "cell_type": "code", "execution_count": 16, "id": "cb40a4ab-1b26-44fc-9256-66c3bdddcee0", "metadata": {}, "outputs": [], "source": [ "from google.cloud import storage\n", "\n", "from config import DEFAULT_QUERY_EMBEDDING_MODEL\n", "from services.query.data_source import DataSource\n", "\n", "# Storage client for the project named by PROJECT_ID.\n", "storage_client = storage.Client(project=PROJECT_ID)" ] }, { "cell_type": "code", "execution_count": 7, "id": "160f81a7-1659-4a27-bb78-6cb4cbdcd414", "metadata": {}, "outputs": [], "source": [ "data_source = DataSource(storage_client)" ] }, { "cell_type": "code", "execution_count": 17, "id": "636716d4-368d-4d62-bf1c-01c44a0c6e81", 
"metadata": {}, "outputs": [], "source": [ "filename = \"stz003.pdf\"\n", "filepath = f\"/Users/lramsey/work/ailp/nasa-demo-docs/{filename}\"\n", "# Build the URL from filepath so both refer to the same file; filepath is\n", "# absolute, so this yields a file:/// URL with the file name interpolated.\n", "doc_url = f\"file://{filepath}\"" ] }, { "cell_type": "code", "execution_count": 18, "id": "03227295-ef8b-4f98-a303-27e5b9fdc967", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: [query/data_source.py:183 - chunk_document()] generating index data for stz003.pdf\n", "INFO: [query/data_source.py:344 - read_doc()] Reading pdf file stz003.pdf with 21 pages\n", "INFO: [query/data_source.py:347 - read_doc()] Finished reading pdf file stz003.pdf\n", "INFO: [query/data_source.py:219 - chunk_document()] generated 511 text chunks for stz003.pdf\n" ] } ], "source": [ "text_chunks, embed_chunks = data_source.chunk_document(filename, doc_url, filepath)" ] }, { "cell_type": "code", "execution_count": 19, "id": "6efaa5b4-6657-4151-95d7-3477ebd6baf9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO: [services/embeddings.py:65 - get_embeddings()] generating embeddings with VertexAI-Embedding\n", "INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n", "INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n", "INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n", "INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n", "INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n", "INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n", "INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n", "ERROR: [services/embeddings.py:206 - get_vertex_embeddings()] chunk exceeds model VertexAI-Embedding token limit 2000\n", "INFO: [config/model_config.py:490 - 
get_provider_value()] model_id=VertexAI-Embedding\n", "INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n", "INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 11 chunk(s) embedding model text-embedding-004\n" ] } ], "source": [ "# Request embeddings for embed_chunks using the default query embedding model.\n", "embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL\n", "embedding_result = await embeddings.get_embeddings(embed_chunks, embedding_type)\n", "is_successful, chunk_embeddings = embedding_result" ] }, { "cell_type": "code", "execution_count": 20, "id": "9f3d7631-73d1-4acb-9c99-9491b39ade5c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "768" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(chunk_embeddings[0])" ] }, { "cell_type": "code", "execution_count": 21, "id": "3302e667-8fc5-4baa-88e0-04f382d0b587", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.00389346, 0.08078138, -0.05831575, ..., 0.01274224,\n", " 0.05898353, -0.0638032 ],\n", " [ 0.0112146 , 0.02839652, -0.04258197, ..., 0.02797478,\n", " 0.02603022, -0.00930778],\n", " [-0.03601784, 0.04554968, -0.01303454, ..., -0.03209574,\n", " -0.01342771, 0.00710446],\n", " ...,\n", " [-0.00905537, 0.00639326, -0.0510649 , ..., 0.03592524,\n", " 0.04868424, -0.02064117],\n", " [ 0.06447197, 0.08079967, -0.03770304, ..., 0.00549912,\n", " 0.08661247, -0.03501692],\n", " [ 0.04232629, 0.07304442, -0.01165721, ..., 0.00893772,\n", " 0.09279561, -0.04788226]])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_embeddings" ] }, { "cell_type": "code", "execution_count": null, "id": "6df08c46-55f5-4da9-86ca-f1017d760ff4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }