in deploy_and_monitor/sm-model_monitor_byoc_llm_monitor/src/components/evaluator.py
def _evaluate_relevance_and_accuracy(self, dataset_uri: str):
"""
Evaluates the data for relevance and accuracy using the FMEval library.
:param dataset_uri: The path to the dataset file.
:raises: ValueError if the dataset_uri is not a valid string.
:return: A dictionary containing the evaluation results. If data is empty/malformed, returns an empty dictionary.
"""
    if not isinstance(dataset_uri, str):
        raise ValueError("dataset_uri must be a valid string")
    fileDownloader.retrieve_model(ANSWER_RELEVANCY_MODEL)  # downloads/loads a 4.66GB LLM
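    # n_threads: assume the monitoring container provides plenty of vCPUs (36
    # here); outside the container, let GPT4All pick its own default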
    model = GPT4All(
        model=ANSWER_RELEVANCY_MODEL,
        verbose=False,
        n_batch=128,
        n_threads=36 if 'DOCKER_CONTAINER' in os.environ else None,
    )
    evaluator_model = ScoreStringEvalChain.from_llm(llm=model, verbose=False)
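    # ScoreStringEvalChain asks the judge model to grade each response on a
    # 1-10 scale; evaluate_strings() returns a dict with free-text "reasoning"
    # and a numeric "score"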
    line_count = 0
    try:
        with open(dataset_uri, 'r') as file:
            lines = file.readlines()
    except OSError as e:
        logger.error(f"Could not read file: {e}")
        return {}
    if not lines:
        logger.info("No data to evaluate")
        return {}
    # Initialize the list of individual response scores and the running totals
    # (for later averaging)
    results = []
    totals = {field: 0 for field in RELEVANCE_AND_ACCURACY_EVALUATIONS}
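    # RELEVANCE_AND_ACCURACY_EVALUATIONS is assumed to list the numeric score
    # fields to aggregate (e.g. "relevance_and_accuracy_score")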
    # Randomly sample up to 10 prompt/response pairs for evaluation
    if len(lines) <= 10:
        sample_lines = lines
    else:
        sample_lines = random.sample(lines, 10)
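    # Sampling caps cost: every evaluated line is a full generation pass on the
    # local judge LLM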
logger.info("Starting evaluation")
    for line in sample_lines:
        try:
            data = json.loads(line)
            logger.info(f"Evaluating line: {line_count + 1}")
            accuracy_relevance_eval_result = evaluator_model.evaluate_strings(
                prediction=data["answer"],
                input=data["content"],
            )
            result_dict = {
                "prompt": data["content"],
                "response": data["answer"],
                "relevance_and_accuracy_analysis": accuracy_relevance_eval_result["reasoning"],
                "relevance_and_accuracy_score": accuracy_relevance_eval_result["score"],
            }
            # Record this response's scores and add them to the running totals
            results.append(result_dict)
            for key, value in result_dict.items():
                if key in totals:
                    totals[key] += value
            # Count only successfully scored lines so the averages are not skewed
            line_count += 1
        except (KeyError, JSONDecodeError) as e:
            logger.error(f"Data malformed: {e}")
            return {}
        except ValueError as e:
            # JSONDecodeError is a subclass of ValueError, so this clause must
            # come after the one above; it catches per-line evaluation errors
            logger.warning(f"Error evaluating line, continuing: {e}")
            continue
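    # Persist the per-line results so the monitoring job can emit them as a report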
    report_filepath = os.path.join(REPORT_PATH, RELEVANCE_AND_ACCURACY_REPORT_FILENAME)
    write_eval_result_file(report_filepath, results)
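    # Illustrative return value (assuming "relevance_and_accuracy_score" is the
    # only aggregated field): {"relevance_and_accuracy_score": 7.4}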
    # Return the average of each score across all successfully evaluated lines
    return {key: value / max(line_count, 1) for key, value in totals.items()}