llm_demo/evaluation/eval_golden.py (288 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
from pytz import timezone
class ToolCall(BaseModel):
"""
Represents tool call by orchestration.
"""
name: str
arguments: Dict[str, Any] = Field(
default={}, description="Query arguments for tool call"
)
class EvalData(BaseModel):
"""
Evaluation data model.
This model represents the information needed for running rapid evaluation with Vertex AI.
"""
category: Optional[str] = Field(default=None, description="Evaluation category")
query: Optional[str] = Field(default=None, description="User query")
instruction: Optional[str] = Field(
default="",
description="Part of the input user prompt. It refers to the inference instruction that is sent to you llm",
)
content: Optional[str] = Field(
default=None,
description="Used in tool call evaluation. Content value is the text output from the model.",
)
tool_calls: List[ToolCall] = Field(
default=[], description="Golden tool call for evaluation"
)
prompt: Optional[str] = Field(
default="",
description="User input for the Gen AI model or application. It's optional in some cases.",
)
context: Optional[List[Dict[str, Any] | List[Dict[str, Any]]]] = Field(
default=None, description="Context given to llm in order to answer user query"
)
output: Optional[str] = Field(
default=None, description="Golden output for evaluation"
)
llm_tool_calls: List[ToolCall] = Field(
default=[], description="Tool call output from LLM"
)
llm_output: str = Field(default="", description="Final output from LLM")
reset: bool = Field(
default=True, description="Determine to reset the chat after invoke"
)
def get_date(day_delta: int):
DATE_FORMATTER = "%Y-%m-%d"
retrieved_date = datetime.now(timezone("US/Pacific")) + timedelta(days=day_delta)
return retrieved_date.strftime(DATE_FORMATTER)
goldens = [
EvalData(
category="Search Airport Tool",
query="What is the airport located in San Francisco?",
tool_calls=[
ToolCall(
name="Search Airport",
arguments={"country": "United States", "city": "San Francisco"},
),
],
),
EvalData(
category="Search Airport Tool",
query="Tell me more about Denver International Airport?",
tool_calls=[
ToolCall(
name="Search Airport",
arguments={
"country": "United States",
"city": "Denver",
"name": "Denver International Airport",
},
),
],
),
EvalData(
category="Search Flights By Flight Number Tool",
query="What is the departure gate for flight CY 922?",
tool_calls=[
ToolCall(
name="Search Flights By Flight Number",
arguments={
"airline": "CY",
"flight_number": "922",
},
),
],
),
EvalData(
category="Search Flights By Flight Number Tool",
query="What is flight CY 888 flying to?",
tool_calls=[
ToolCall(
name="Search Flights By Flight Number",
arguments={
"airline": "CY",
"flight_number": "888",
},
),
],
),
EvalData(
category="List Flights Tool",
query="What flights are headed to JFK tomorrow?",
tool_calls=[
ToolCall(
name="List Flights",
arguments={
"arrival_airport": "JFK",
"date": f"{get_date(1)}",
},
),
],
),
EvalData(
category="List Flights Tool",
query="Is there any flight from SFO to DEN?",
output="I will need the date to retrieve relevant flights.",
),
EvalData(
category="Search Amenities Tool",
query="Are there any luxury shops?",
tool_calls=[
ToolCall(
name="Search Amenities",
arguments={
"query": "luxury shops",
},
),
],
),
EvalData(
category="Search Amenities Tool",
query="Where can I get coffee near gate A6?",
tool_calls=[
ToolCall(
name="Search Amenities",
arguments={
"query": "coffee near gate A6",
},
),
],
),
EvalData(
category="Search Policies Tool",
query="What is the flight cancellation policy?",
tool_calls=[
ToolCall(
name="Search Policies",
arguments={
"query": "flight cancellation policy",
},
),
],
),
EvalData(
category="Search Policies Tool",
query="How many checked bags can I bring?",
tool_calls=[
ToolCall(
name="Search Policies",
arguments={
"query": "checked baggage allowance",
},
),
],
),
EvalData(
category="Insert Ticket",
query="I would like to book flight CY 922 departing from SFO on 2025-01-01 at 6:38am.",
tool_calls=[
ToolCall(
name="Insert Ticket",
arguments={
"airline": "CY",
"flight_number": "922",
"departure_airport": "SFO",
"departure_time": "2025-01-01 06:38:00",
},
),
],
),
EvalData(
category="Insert Ticket",
query="What flights are headed from SFO to DEN on January 1 2025?",
tool_calls=[
ToolCall(
name="List Flights",
arguments={
"departure_airport": "SFO",
"arrival_airport": "DEN",
"date": "2025-01-01",
},
),
],
reset=False,
),
EvalData(
category="Insert Ticket",
query="I would like to book the first flight.",
tool_calls=[
ToolCall(
name="Insert Ticket",
arguments={
"airline": "UA",
"flight_number": "1532",
"departure_airport": "SFO",
"arrival_airport": "DEN",
"departure_time": "2025-01-01 05:50:00",
"arrival_time": "2025-01-01 09:23:00",
},
),
],
),
EvalData(
category="List Tickets",
query="Do I have any tickets?",
tool_calls=[ToolCall(name="List Tickets")],
),
EvalData(
category="List Tickets",
query="When is my next flight?",
tool_calls=[ToolCall(name="List Tickets")],
),
EvalData(
category="Airline Related Question",
query="What is Cymbal Air?",
output="Cymbal Air is a passenger airline offering convenient flights to many cities around the world from its hub in San Francisco.",
),
EvalData(
category="Airline Related Question",
query="Where is the hub of cymbal air?",
output="The hub of Cymbal Air is in San Francisco.",
),
EvalData(
category="Assistant Related Question",
query="What can you help me with?",
output="I can help to book flights and answer a wide range of questions pertaining to travel on Cymbal Air, as well as amenities of San Francisco Airport.",
),
EvalData(
category="Assistant Related Question",
query="Can you help me book tickets?",
output="Yes, I can help with several tools such as search airports, list tickets, book tickets.",
),
EvalData(
category="Out-Of-Context Question",
query="Can you help me solve math problems?",
output="Sorry, I am not given the tools for this.",
),
EvalData(
category="Out-Of-Context Question",
query="Who is the CEO of Google?",
output="Sorry, I am not given the tools for this.",
),
EvalData(
category="Multitool Selections",
query="Where can I get a snack near the gate for flight CY 352?",
tool_calls=[
ToolCall(
name="Search Flights By Flight Number",
arguments={
"airline": "CY",
"flight_number": "352",
},
),
ToolCall(
name="Search Amenities",
arguments={
"query": "snack near gate A2.",
},
),
],
),
EvalData(
category="Multitool Selections",
query="What are some flights from SFO to Chicago tomorrow?",
tool_calls=[
ToolCall(
name="Search Airport",
arguments={
"city": "Chicago",
},
),
ToolCall(
name="List Flights",
arguments={
"departure_airport": "SFO",
"arrival_airport": "ORD",
"date": f"{get_date(1)}",
},
),
],
),
]