def generate_multimodal_embeddings()

in use-cases/rag-pipeline/embedding-models/multimodal-embedding/src/blip2_server.py [0:0]


def generate_multimodal_embeddings():
    """Generate a multimodal embedding for an image and its caption.

    Handles a POST request supplied in one of two forms:

    * A JSON body with ``image_uri`` (the GCS URI of the image, fetched via
      ``download_image_from_gcs``) and ``caption`` (the caption text).
    * A form upload with a file part named ``image`` (opened and converted
      to RGB) and a form field named ``text`` (the caption text).

    The inputs are read from the request, so the function takes no arguments.

    Returns:
        On success, a JSON response ``{"multimodal_embeds": ...}`` holding
        element ``[0][0]`` of ``get_multimodal_embedding(...).tolist()``.
        On failure, a ``(response, status)`` tuple whose JSON body is
        ``{"error": message}``: 400 when an input is missing or invalid or
        when processing fails, 405 when the request method is not POST.
    """
    # Guard clause: reject anything other than POST up front.
    if request.method != "POST":
        return jsonify({"error": "Invalid request method"}), 405

    try:
        if request.is_json:
            try:
                json_req = request.get_json()
            except Exception as e:
                return jsonify({"error": f"Invalid JSON payload: {e}"}), 400
            # A JSON body may be any JSON value (null, list, string, ...).
            # Without this check the membership tests below would raise a
            # TypeError or pass by accident on a list/string.
            if not isinstance(json_req, dict):
                return jsonify({"error": "JSON payload must be an object"}), 400
            if "image_uri" not in json_req:
                return jsonify({"error": "No image_uri provided"}), 400
            if "caption" not in json_req:
                return jsonify({"error": "No caption provided"}), 400
            try:
                image = download_image_from_gcs(json_req["image_uri"])
            except Exception as e:
                logger.exception("Failed to load image from image_uri.")
                return jsonify({"error": str(e)}), 400
            caption = json_req["caption"]
        else:
            if "image" not in request.files:
                return jsonify({"error": "No image provided"}), 400
            if "text" not in request.form:
                return jsonify({"error": "No text provided"}), 400
            try:
                image_file = request.files["image"]
                image = Image.open(image_file).convert("RGB")
            except Exception as e:
                logger.exception("Failed to process uploaded image file.")
                return jsonify({"error": f"Error processing image file: {e}"}), 400
            caption = request.form["text"]
    except Exception as e:
        # Safety net for anything unexpected while reading the request.
        logger.exception("Unexpected error while reading the request.")
        return jsonify({"error": str(e)}), 400

    try:
        multimodal_features = get_multimodal_embedding(image, caption)
    except Exception as e:
        # NOTE(review): an embedding failure is reported as 400 to stay
        # compatible with existing clients; a 5xx status may be more
        # accurate for server-side errors -- confirm before changing.
        logger.exception("Failed to generate multimodal embeddings.")
        return jsonify({"error": str(e)}), 400

    logger.info("Multimodal embeddings generated successfully.")
    return jsonify(
        {
            "multimodal_embeds": multimodal_features.tolist()[0][0],
        }
    )