skills/summarization/evaluation/custom_evals/rouge_eval.py

import numpy as np
from typing import Dict, Union, Any

from rouge_score import rouge_scorer


def rouge_eval(summary, ground_truth, threshold=0.3) -> float:
    """
    Evaluate a summary against a reference using ROUGE scores.

    Args:
        summary (str): The summary to evaluate.
        ground_truth (str): The reference (ground-truth) summary.
        threshold (float): Unused here; the pass/fail threshold is applied in get_assert (default: 0.3).

    Returns:
        float: The average F1 score over ROUGE-1, ROUGE-2, and ROUGE-L.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # RougeScorer.score expects (target, prediction); only the F1 measures are used.
    scores = scorer.score(ground_truth, summary)

    # Calculate the average ROUGE F1 score across ROUGE-1, ROUGE-2, and ROUGE-L.
    avg_rouge = np.mean([
        scores['rouge1'].fmeasure,
        scores['rouge2'].fmeasure,
        scores['rougeL'].fmeasure,
    ])
    return float(avg_rouge)


def get_assert(output: str, context, threshold=0.3) -> Union[bool, float, Dict[str, Any]]:
    """Custom assertion: pass if the average ROUGE score meets the threshold."""
    ground_truth = context['vars']['ground_truth']
    score = rouge_eval(output, ground_truth)

    if score >= threshold:
        return {
            "pass": True,
            "score": score,
            "reason": "Average ROUGE score meets the threshold",
        }
    else:
        return {
            "pass": False,
            "score": score,
            "reason": "Average ROUGE score is below the threshold",
        }
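

# Minimal usage sketch (not part of the evaluation harness): a quick local check of
# rouge_eval and get_assert. Assumes `rouge_score` and `numpy` are installed; the
# example strings and the stubbed promptfoo-style context dict below are hypothetical.
if __name__ == "__main__":
    candidate = "The cat sat on the mat."
    reference = "A cat was sitting on the mat."

    # Print the raw average ROUGE F1 score and the assertion result.
    print("avg ROUGE F1:", rouge_eval(candidate, reference))
    print(get_assert(candidate, {"vars": {"ground_truth": reference}}))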