def get_multimodal_embedding(image, caption)

in use-cases/rag-pipeline/embedding-models/multimodal-embedding/src/blip2_server.py


def get_multimodal_embedding(image, caption):
    """Generates multimodal embeddings for a given image and caption.

    Args:
        image: A PIL.Image object.
        caption: The input caption as a string.

    Returns:
        A torch.Tensor containing the multimodal embeddings.

    Raises:
        ValueError: If there is an error generating the multimodal embedding.
    """
    try:
        # Preprocess the image: apply the eval-time transform, add a batch
        # dimension, and move the tensor to the target device.
        image = vis_processors["eval"](image).unsqueeze(0).to(device)
        text_input = txt_processors["eval"](caption)
        # Extract joint image-text features and return the multimodal embeddings.
        sample = {"image": image, "text_input": [text_input]}
        features_multimodal = model.extract_features(sample)
        return features_multimodal.multimodal_embeds
    except Exception as e:
        # Chain the original exception for easier debugging.
        raise ValueError(f"Error generating multimodal embedding: {e}") from e
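
For context, a minimal sketch of how the module-level globals might be initialized and the function called, assuming the Salesforce LAVIS loader `load_model_and_preprocess` with the `blip2_feature_extractor` model; the image path and caption are placeholders:

import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BLIP-2 feature extractor together with its matching
# image and text processors (standard LAVIS loading API).
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_feature_extractor",
    model_type="pretrain",
    is_eval=True,
    device=device,
)

# Placeholder inputs for illustration.
image = Image.open("example.jpg").convert("RGB")
embedding = get_multimodal_embedding(image, "a photo of a cat on a couch")
print(embedding.shape)  # e.g. torch.Size([1, 32, 768]) for this model

The returned tensor holds one embedding per Q-Former query token, so downstream code typically pools or indexes these before storing them in a vector database.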