in src/databao_context_engine/services/chunk_embedding_service.py [0:0]
def embed_chunks(self, *, datasource_run_id: int, chunks: list[EmbeddableChunk], result: str) -> None:
"""
Turn plugin chunks into persisted chunks and embeddings
Flow:
1) Embed each chunk into an embedded vector
2) Get or create embedding table for the appropriate model and embedding dimensions
3) Persist chunks and embeddings vectors in a single transaction
"""
if not chunks:
return
logger.debug(
f"Embedding {len(chunks)} chunks for datasource run {datasource_run_id}, with chunk_embedding_mode={self._chunk_embedding_mode}"
)
enriched_embeddings: list[ChunkEmbedding] = []
for chunk in chunks:
chunk_display_text = to_yaml_string(chunk.content)
generated_description = ""
match self._chunk_embedding_mode:
case ChunkEmbeddingMode.EMBEDDABLE_TEXT_ONLY:
embedding_text = chunk.embeddable_text
case ChunkEmbeddingMode.GENERATED_DESCRIPTION_ONLY:
generated_description = cast(DescriptionProvider, self._description_provider).describe(
text=chunk_display_text, context=result
)
embedding_text = generated_description
case ChunkEmbeddingMode.EMBEDDABLE_TEXT_AND_GENERATED_DESCRIPTION:
generated_description = cast(DescriptionProvider, self._description_provider).describe(
text=chunk_display_text, context=result
)
embedding_text = generated_description + "\n" + chunk.embeddable_text
vec = self._embedding_provider.embed(embedding_text)
enriched_embeddings.append(
ChunkEmbedding(
chunk=chunk,
vec=vec,
display_text=chunk_display_text,
generated_description=generated_description,
)
)
table_name = self._shard_resolver.resolve_or_create(
embedder=self._embedding_provider.embedder,
model_id=self._embedding_provider.model_id,
dim=self._embedding_provider.dim,
)
self._persistence_service.write_chunks_and_embeddings(
datasource_run_id=datasource_run_id,
chunk_embeddings=enriched_embeddings,
table_name=table_name,
)