# use-cases/rag-pipeline/embedding-models/multimodal-embedding/src/blip2_server.py
def get_multimodal_embedding(image, caption):
    """Generate a multimodal embedding for an image/caption pair.

    Args:
        image: A PIL.Image object.
        caption: The input caption as a string.

    Returns:
        A torch.Tensor containing the multimodal embeddings.

    Raises:
        ValueError: If the multimodal embedding cannot be generated.
    """
    try:
        # Preprocess the inputs with the model's eval-time processors.
        image = vis_processors["eval"](image).unsqueeze(0).to(device)
        text_input = txt_processors["eval"](caption)
        sample = {"image": image, "text_input": [text_input]}
        # Extract joint image-text features; BLIP-2 exposes them as
        # `multimodal_embeds` on the returned feature object.
        features_multimodal = model.extract_features(sample)
        return features_multimodal.multimodal_embeds
    except Exception as e:
        # Chain the original exception so the full traceback is preserved.
        raise ValueError(f"Error generating multimodal embedding: {e}") from e
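

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original file: the function above
# relies on module-level globals (`model`, `vis_processors`, `txt_processors`,
# `device`). With Salesforce LAVIS these are typically created via
# `load_model_and_preprocess`; the model name, checkpoint variant, and image
# path below are assumptions for demonstration only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Assumed LAVIS model name and checkpoint variant for BLIP-2 feature
    # extraction; the real server may load a different configuration.
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="blip2_feature_extractor",
        model_type="pretrain",
        is_eval=True,
        device=device,
    )

    # Hypothetical input image; any RGB PIL image works here.
    raw_image = Image.open("example.jpg").convert("RGB")
    embedding = get_multimodal_embedding(raw_image, "a photo of a cat")
    print(embedding.shape)  # e.g. torch.Size([1, 32, 768]) for this checkpoint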