packages/blueprints/gen-ai-chatbot/static-assets/chatbot-genai-components/backend/python/embedding/wrapper.py (31 lines of code) (raw):
import logging
from app.bedrock import calculate_document_embeddings
from embedding.loaders.base import BaseLoader, Document
from llama_index.core.node_parser import TextSplitter
logger = logging.getLogger(__name__)
class DocumentSplitter:
"""Thin wrapper for `llama_index.TextSplitter` to split documents."""
def __init__(self, splitter: TextSplitter):
self.splitter = splitter
def split_documents(self, documents: list[Document]) -> list[Document]:
res = []
for document in documents:
splitted_content = self.splitter.split_text(document.page_content)
for content in splitted_content:
res.append(Document(page_content=content, metadata=document.metadata))
return res
class Embedder:
"""Thin wrapper class to calculate embeddings by Bedrock API."""
def __init__(self, verbose=False):
self.verbose = verbose
def print_documents_summary(self, documents: list[Document]):
for i, d in enumerate(documents):
logger.info(f"{i}th document metadata: {d.metadata}")
logger.info(f"{i}th document content length: {len(d.page_content)}")
logger.info(f"{i}th document head of content: {d.page_content[:30]}")
def embed_documents(self, documents: list[Document]) -> list[list[float]]:
if self.verbose:
logger.info(f"Embedding {len(documents)} documents.")
self.print_documents_summary(documents)
embeddings = calculate_document_embeddings([d.page_content for d in documents])
if self.verbose:
logger.info("Done embedding.")
return embeddings