# query_instruction_tuned_gemma
# Extracted from: use-cases/rag-pipeline/backend/src/rerank.py

def query_instruction_tuned_gemma(prompt):
    """
    Sends a request to the instruction-tuned model endpoint for text completion.

    Posts an OpenAI-style chat-completion payload to the vLLM server at the
    module-level ``URL`` and extracts the first choice's message content.

    Args:
        prompt: The text prompt for the model.

    Returns:
        The generated text response from the vLLM model, or an
        "Error: ..." sentinel string if the request or parsing fails.
    """
    try:
        data = {
            "model": "google/gemma-2-2b-it",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7,
            "max_tokens": 384,
            "top_p": 1.0,
            # vLLM's top_k sampling parameter is an integer (-1 disables it);
            # the previous float value 1.0 could be rejected by request validation.
            "top_k": 1,
        }
        response = requests.post(
            URL,
            json=data,
            headers={"Content-Type": "application/json"},
            timeout=100,
        )
        # Log (rather than print) the raw response for debugging; lazy %s args
        # avoid building the message when the level is disabled.
        logger.debug("Response from the instruction tuned model: %s", response.text)
        response.raise_for_status()  # Raise an exception for HTTP errors

        return response.json()["choices"][0]["message"]["content"]

    except requests.exceptions.RequestException as e:
        logger.error("Error communicating with instruction model endpoint: %s", e)
        return "Error: Could not generate a response."
    except (KeyError, IndexError) as e:
        # IndexError covers an empty "choices" list, which the original code
        # let fall through to the generic handler with a misleading message.
        logger.error("Unexpected response format from instruction model endpoint: %s", e)
        return "Error: Invalid response format."
    except Exception as e:
        logger.exception("An unexpected error occurred: %s", e)
        return "Error: An unexpected error occurred."