evaluations/qa_quality_eval.py
import os
import json
from datetime import datetime
from promptflow.client import PFClient
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator, FluencyEvaluator, GroundednessEvaluator, CoherenceEvaluator
from azure_config import AzureConfig


def main():
    # Read configuration
    azure_config = AzureConfig()

    # Set required environment variables
    os.environ['AZURE_OPENAI_ENDPOINT'] = azure_config.aoai_endpoint
    os.environ['AZURE_OPENAI_API_KEY'] = azure_config.aoai_api_key

    ##################################
    ## Base Run
    ##################################
    pf = PFClient()

    flow = "./src"  # path to the flow
    data = "./evaluations/test-dataset.jsonl"  # path to the data file
    # base run
    base_run = pf.run(
        flow=flow,
        data=data,
        column_mapping={
            "question": "${data.question}",
            "chat_history": []
        },
        stream=True,
    )

    responses = pf.get_details(base_run)
    print(responses.head(10))
    # Convert to jsonl
    relevant_columns = responses[['inputs.question', 'inputs.chat_history', 'outputs.answer', 'outputs.context']]
    relevant_columns.columns = ['question', 'chat_history', 'answer', 'context']
    data_list = relevant_columns.to_dict(orient='records')
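    # Each record written below should look roughly like:
    #   {"question": "...", "chat_history": [], "answer": "...", "context": "..."}
    # which covers the question/answer/context fields the quality evaluators consume.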
    with open('responses.jsonl', 'w') as f:
        for item in data_list:
            f.write(json.dumps(item) + '\n')
    ##################################
    ## Evaluation
    ##################################
    # Initialize Azure OpenAI Connection with your environment variables
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=azure_config.aoai_endpoint,
        api_key=azure_config.aoai_api_key,
        azure_deployment=os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT"),
        api_version=azure_config.aoai_api_version,
    )
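
    # AZURE_OPENAI_CHAT_DEPLOYMENT is expected to be set in the environment;
    # os.environ.get returns None if it is missing, so export the deployment
    # name before running this script or the evaluator calls will fail.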
    azure_ai_project = {
        "subscription_id": azure_config.subscription_id,
        "resource_group_name": azure_config.resource_group,
        "project_name": azure_config.workspace_name,
    }

    # https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/flow-evaluate-sdk
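    # Each built-in quality evaluator calls the chat deployment configured above
    # to score one response per row (typically on a 1-5 scale).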
    fluency_evaluator = FluencyEvaluator(model_config=model_config)
    groundedness_evaluator = GroundednessEvaluator(model_config=model_config)
    relevance_evaluator = RelevanceEvaluator(model_config=model_config)
    coherence_evaluator = CoherenceEvaluator(model_config=model_config)

    data = "./responses.jsonl"  # path to the data file

    prefix = os.getenv("PREFIX", datetime.now().strftime("%y%m%d%H%M%S"))[:14]
    evaluation_name = f"{prefix} Quality Evaluation"
    print(f"Executing evaluation: {evaluation_name}.")
    try:
        result = evaluate(
            evaluation_name=evaluation_name,
            data=data,
            evaluators={
                "Fluency": fluency_evaluator,
                "Groundedness": groundedness_evaluator,
                "Relevance": relevance_evaluator,
                "Coherence": coherence_evaluator
            },
            azure_ai_project=azure_ai_project,
            output_path="./qa_flow_quality_eval.json"
        )
    except Exception as e:
        print(f"An error occurred during evaluation: {e}. Retrying without reporting results to Azure AI Project.")
        result = evaluate(
            evaluation_name=evaluation_name,
            data=data,
            evaluators={
                "Fluency": fluency_evaluator,
                "Groundedness": groundedness_evaluator,
                "Relevance": relevance_evaluator,
                "Coherence": coherence_evaluator
            },
            output_path="./qa_flow_quality_eval.json"
        )

    print(f"Check QA evaluation result {evaluation_name} in the 'Evaluation' section of your project: {azure_config.workspace_name}.")


if __name__ == '__main__':
    main()