supporting-blog-content/github-assistant/evaluation.py:
import argparse
import logging
import sys
import textwrap
import traceback

import pandas as pd
from dotenv import load_dotenv
from httpx import ReadTimeout
from tabulate import tabulate

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
from llama_index.core.evaluation import (
    DatasetGenerator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    EvaluationResult,
)
from llama_index.llms.openai import OpenAI
# Log to stdout; basicConfig already installs a stdout handler, so adding a second one would duplicate every message
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
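
# Command-line options control which documents are indexed and which generated questions are evaluated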
parser = argparse.ArgumentParser(
description="Process documents and questions for evaluation."
)
parser.add_argument(
"--num_documents",
type=int,
default=None,
help="Number of documents to process (default: all)",
)
parser.add_argument(
"--skip_documents",
type=int,
default=0,
help="Number of documents to skip at the beginning (default: 0)",
)
parser.add_argument(
"--num_questions",
type=int,
default=None,
help="Number of questions to process (default: all)",
)
parser.add_argument(
"--skip_questions",
type=int,
default=0,
help="Number of questions to skip at the beginning (default: 0)",
)
parser.add_argument(
"--process_last_questions",
action="store_true",
help="Process last N questions instead of first N",
)
args = parser.parse_args()
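
# Load environment variables (e.g. the OpenAI API key) from .env, then read the source documents from disk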
load_dotenv(".env")
reader = SimpleDirectoryReader("/tmp/elastic/production-readiness-review")
documents = reader.load_data()
print(f"First document: {documents[0].text}")
print(f"Second document: {documents[1].text}")
print(f"Thrid document: {documents[2].text}")
if args.skip_documents > 0:
documents = documents[args.skip_documents :]
if args.num_documents is not None:
documents = documents[: args.num_documents]
print(f"Number of documents loaded: {len(documents)}")
llm = OpenAI(model="gpt-4o", request_timeout=120)
data_generator = DatasetGenerator.from_documents(documents, llm=llm)
try:
eval_questions = data_generator.generate_questions_from_nodes()
if isinstance(eval_questions, str):
eval_questions_list = eval_questions.strip().split("\n")
else:
eval_questions_list = eval_questions
    eval_questions_list = [q for q in eval_questions_list if q.strip()]
    all_questions = list(eval_questions_list)  # keep the full question set before any slicing
if args.skip_questions > 0:
eval_questions_list = eval_questions_list[args.skip_questions :]
if args.num_questions is not None:
if args.process_last_questions:
eval_questions_list = eval_questions_list[-args.num_questions :]
else:
eval_questions_list = eval_questions_list[: args.num_questions]
print("\All available questions generated:")
for idx, q in enumerate(eval_questions):
print(f"{idx}. {q}")
print("\nGenerated questions:")
for idx, q in enumerate(eval_questions_list, start=1):
print(f"{idx}. {q}")
except ReadTimeout:
    print(
        "Request to OpenAI timed out during question generation. Please check the connection or increase the timeout duration."
    )
traceback.print_exc()
sys.exit(1)
except Exception as e:
print(f"An error occurred while generating questions: {e}")
traceback.print_exc()
sys.exit(1)
print(f"\nTotal number of questions generated: {len(eval_questions_list)}")
evaluator_relevancy = RelevancyEvaluator(llm=llm)
evaluator_faith = FaithfulnessEvaluator(llm=llm)
vector_index = VectorStoreIndex.from_documents(documents)
def display_eval_df(
query: str,
response: Response,
eval_result_relevancy: EvaluationResult,
eval_result_faith: EvaluationResult,
) -> None:
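    """Render one evaluation result as a single-row table: query, response, source, and both verdicts."""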
relevancy_feedback = getattr(eval_result_relevancy, "feedback", "")
relevancy_passing = getattr(eval_result_relevancy, "passing", False)
relevancy_passing_str = "Pass" if relevancy_passing else "Fail"
relevancy_score = 1.0 if relevancy_passing else 0.0
faithfulness_feedback = getattr(eval_result_faith, "feedback", "")
faithfulness_passing_bool = getattr(eval_result_faith, "passing", False)
faithfulness_passing = "Pass" if faithfulness_passing_bool else "Fail"
def wrap_text(text, width=50):
if text is None:
return ""
text = str(text)
text = text.replace("\r", "")
lines = text.split("\n")
wrapped_lines = []
for line in lines:
wrapped_lines.extend(textwrap.wrap(line, width=width))
wrapped_lines.append("")
return "\n".join(wrapped_lines)
if response.source_nodes:
source_content = wrap_text(response.source_nodes[0].node.get_content())
else:
source_content = ""
eval_data = {
"Query": wrap_text(query),
"Response": wrap_text(str(response)),
"Source": source_content,
"Relevancy Response": relevancy_passing_str,
"Relevancy Feedback": wrap_text(relevancy_feedback),
"Relevancy Score": wrap_text(str(relevancy_score)),
"Faith Response": faithfulness_passing,
"Faith Feedback": wrap_text(faithfulness_feedback),
}
eval_df = pd.DataFrame([eval_data])
print("\nEvaluation Result:")
print(
tabulate(
eval_df, headers="keys", tablefmt="grid", showindex=False, stralign="left"
)
)
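
# Query each question against the index and evaluate the response for relevancy and faithfulness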
query_engine = vector_index.as_query_engine(llm=llm)
total_questions = len(eval_questions_list)
for idx, question in enumerate(eval_questions_list, start=1):
try:
response_vector = query_engine.query(question)
eval_result_relevancy = evaluator_relevancy.evaluate_response(
query=question, response=response_vector
)
eval_result_faith = evaluator_faith.evaluate_response(response=response_vector)
print(f"\nProcessing Question {idx} of {total_questions}:")
display_eval_df(
question, response_vector, eval_result_relevancy, eval_result_faith
)
    except ReadTimeout:
print(f"Request to OpenAI timed out while processing question {idx}.")
traceback.print_exc()
continue
except Exception as e:
print(f"An error occurred while processing question {idx}: {e}")
traceback.print_exc()
continue