# use-cases/rag-pipeline/backend/src/rerank.py

import logging

import requests

logger = logging.getLogger(__name__)

# Assumption: URL is the vLLM OpenAI-compatible chat completions endpoint,
# normally configured elsewhere in this module; a placeholder is shown here
# so the snippet is self-contained.
URL = "http://localhost:8000/v1/chat/completions"

def query_instruction_tuned_gemma(prompt):
    """
    Sends a chat completion request to the instruction-tuned model endpoint.

    Args:
        prompt: The text prompt for the model.

    Returns:
        The generated text response from the vLLM model, or an error message.
    """
    try:
        data = {
            "model": "google/gemma-2-2b-it",
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.7,
            "max_tokens": 384,
            "top_p": 1.0,
            "top_k": 1,  # integer: keep only the single most likely token at each step
        }
        response = requests.post(
            URL,
            json=data,
            headers={"Content-Type": "application/json"},
            timeout=100,
        )
        logger.debug("Response from the instruction-tuned model: %s", response.text)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json()["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        logger.error(f"Error communicating with instruction model endpoint: {e}")
        return "Error: Could not generate a response."
    except KeyError as e:
        logger.error(f"Unexpected response format from instruction model endpoint: {e}")
        return "Error: Invalid response format."
    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")
        return "Error: An unexpected error occurred."