components/llm_service/notebooks/Embeddings.ipynb (359 lines of code) (raw):
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "cdf14a05-aa5e-4388-b622-a4e3168d9955",
"metadata": {},
"outputs": [],
"source": [
"PROJECT_ID = \"genie-ui-dev\"\n",
"REGION = \"us-central1\"\n",
"import os\n",
"os.environ[\"PROJECT_ID\"] = PROJECT_ID\n",
"os.environ[\"PG_HOST\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7b93aff9-aafb-4d76-8ce8-7781b623e393",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../../common/src\")\n",
"sys.path.append(\"../src\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b3a9c5a0-8416-43f9-b569-6ce8a9ee97d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [config/config.py:61 - <module>()] Namespace File not found, setting job namespace as default\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Text] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-V2] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Gemini-Pro] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Gemini-Pro-Vision] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2V2-Langchain] enablement status [True]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n",
" warn_deprecated(\n",
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-32k] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Palm2-32k-Langchain] enablement status [True]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n",
" warn_deprecated(\n",
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Chat-Gemini-Pro-Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Truss] enablement status [False]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [Truss-Llama2-Chat] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [vLLM] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [vLLM-Gemma-Chat] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [ModelGarden] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-ModelGarden-LLAMA2-Chat] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n",
" warn_deprecated(\n",
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT4] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT4-latest] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-GPT3.5] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [Cohere] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [LLMService] enablement status [False]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [Llama2cpp] enablement status [False]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Embedding] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Vertex] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [VertexAI-Embedding-Vision] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [OpenAI-Embedding] enablement status [True]\n",
"INFO: [config/model_config.py:341 - set_model_config()] Provider [Langchain] enablement status [True]\n",
"INFO: [config/model_config.py:361 - set_model_config()] Model [HuggingFaceEmbeddings] enablement status [True]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:173: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatDatabricks`.\n",
" warn_deprecated(\n",
"/Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/.venv/lib/python3.9/site-packages/langchain_community/llms/__init__.py:343: LangChainDeprecationWarning: `` was deprecated in LangChain 0.0.22 and will be removed in 0.2. An updated version of the exists in the langchain-community package and should be used instead. To use it run `pip install -U langchain-community` and import as `from langchain_community.chat_models import ChatMlflow`.\n",
" warn_deprecated(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [config/config.py:112 - <module>()] ENABLE_GOOGLE_LLM = True\n",
"INFO: [config/config.py:113 - <module>()] ENABLE_OPENAI_LLM = False\n",
"INFO: [config/config.py:114 - <module>()] ENABLE_COHERE_LLM = False\n",
"INFO: [config/config.py:115 - <module>()] ENABLE_GOOGLE_MODEL_GARDEN = True\n",
"INFO: [config/config.py:116 - <module>()] ENABLE_TRUSS_LLAMA2 = False\n",
"INFO: [config/config.py:117 - <module>()] ENABLE_VLLM_GEMMA = True\n",
"INFO: [config/config.py:180 - <module>()] Loaded default manifest from /Users/lramsey/work/ailp/lukmanr-gps-core-solution-services/components/llm_service/notebooks/../src/config/document_manifest.json\n",
"INFO: [config/vector_store_config.py:44 - <module>()] PG_HOST = [None]\n",
"INFO: [config/vector_store_config.py:45 - <module>()] PG_DBNAME = [pgvector]\n",
"INFO: [config/vector_store_config.py:74 - <module>()] PG_HOST is set to [None], not connecting to pgvector\n",
"INFO: [config/vector_store_config.py:81 - <module>()] Default vector store = [matching_engine]\n",
"INFO: [config/onedrive_config.py:30 - <module>()] ONEDRIVE_CLIENT_ID = [None]\n",
"INFO: [config/onedrive_config.py:31 - <module>()] ONEDRIVE_TENANT_ID = [None]\n",
"WARNING: [config/onedrive_config.py:42 - <module>()] Can't access onedrive client secret\n",
"WARNING: [config/onedrive_config.py:48 - <module>()] Can't access onedrive principle name\n",
"INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n",
"INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n",
"INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n",
"INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n",
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n",
"INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n",
"INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n",
"INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n",
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n",
"INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 7 chunk(s) embedding model text-embedding-004\n",
"INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n",
"INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n",
"INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n",
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n",
"INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n"
]
}
],
"source": [
"from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,\n",
" QueryReference, QueryDocumentChunk, BatchJobModel)\n",
"from services import llm_generate, embeddings"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "cb40a4ab-1b26-44fc-9256-66c3bdddcee0",
"metadata": {},
"outputs": [],
"source": [
"from google.cloud import storage\n",
"storage_client = storage.Client(project=PROJECT_ID)\n",
"from config import DEFAULT_QUERY_EMBEDDING_MODEL\n",
"from services.query.data_source import DataSource"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "160f81a7-1659-4a27-bb78-6cb4cbdcd414",
"metadata": {},
"outputs": [],
"source": [
"data_source = DataSource(storage_client)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "636716d4-368d-4d62-bf1c-01c44a0c6e81",
"metadata": {},
"outputs": [],
"source": [
"filename = \"stz003.pdf\"\n",
"filepath = f\"/Users/lramsey/work/ailp/nasa-demo-docs/{filename}\"\n",
"doc_url = f\"file:///Users/lramsey/work/ailp/nasa-demo-docs/{filename}\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "03227295-ef8b-4f98-a303-27e5b9fdc967",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [query/data_source.py:183 - chunk_document()] generating index data for stz003.pdf\n",
"INFO: [query/data_source.py:344 - read_doc()] Reading pdf file stz003.pdf with 21 pages\n",
"INFO: [query/data_source.py:347 - read_doc()] Finished reading pdf file stz003.pdf\n",
"INFO: [query/data_source.py:219 - chunk_document()] generated 511 text chunks for stz003.pdf\n"
]
}
],
"source": [
"text_chunks, embed_chunks = data_source.chunk_document(filename, doc_url, filepath)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6efaa5b4-6657-4151-95d7-3477ebd6baf9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO: [services/embeddings.py:65 - get_embeddings()] generating embeddings with VertexAI-Embedding\n",
"INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n",
"INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding\n",
"INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n",
"INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:\n",
"INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n",
"INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex\n",
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n",
"ERROR: [services/embeddings.py:206 - get_vertex_embeddings()] chunk exceeds model VertexAI-Embedding token limit 2000\n",
"INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding\n",
"INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 250 chunk(s) embedding model text-embedding-004\n",
"INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 11 chunk(s) embedding model text-embedding-004\n"
]
}
],
"source": [
"embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL\n",
"is_successful, chunk_embeddings = await embeddings.get_embeddings(\n",
" embed_chunks,\n",
" embedding_type)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "9f3d7631-73d1-4acb-9c99-9491b39ade5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"768"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(chunk_embeddings[0])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "3302e667-8fc5-4baa-88e0-04f382d0b587",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.00389346, 0.08078138, -0.05831575, ..., 0.01274224,\n",
" 0.05898353, -0.0638032 ],\n",
" [ 0.0112146 , 0.02839652, -0.04258197, ..., 0.02797478,\n",
" 0.02603022, -0.00930778],\n",
" [-0.03601784, 0.04554968, -0.01303454, ..., -0.03209574,\n",
" -0.01342771, 0.00710446],\n",
" ...,\n",
" [-0.00905537, 0.00639326, -0.0510649 , ..., 0.03592524,\n",
" 0.04868424, -0.02064117],\n",
" [ 0.06447197, 0.08079967, -0.03770304, ..., 0.00549912,\n",
" 0.08661247, -0.03501692],\n",
" [ 0.04232629, 0.07304442, -0.01165721, ..., 0.00893772,\n",
" 0.09279561, -0.04788226]])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chunk_embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6df08c46-55f5-4da9-86ca-f1017d760ff4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}