def nltk_bleu_eval()

in skills/summarization/evaluation/custom_evals/bleu_eval.py


import numpy as np
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu


def nltk_bleu_eval(output: str, ground_truth: str) -> float:
    """
    Calculate BLEU score using NLTK and evaluate against a threshold.
    
    Args:
    output (str): The output to evaluate.
    ground_truth (str): The ground_truth output.
    threshold (float): The threshold for the BLEU score (default: 0.5).
    
    Returns:
    tuple: (float, bool) - The BLEU score and whether it passes the threshold.
    """
    # Tokenize the summaries
    output_tokens = word_tokenize(output.lower())
    ground_truth_tokens = word_tokenize(ground_truth.lower())
    
    try:
        # Calculate BLEU score
        # Note: sentence_bleu expects a list of references, so we wrap ground_truth_tokens in a list
        bleu_score = sentence_bleu([ground_truth_tokens], output_tokens, weights=(0.25, 0.25, 0.25, 0.25))
        
        # Ensure bleu_score is a float
        if isinstance(bleu_score, (int, float)):
            bleu_score_float = float(bleu_score)
        elif isinstance(bleu_score, (list, np.ndarray)):
            # If it's a list or array, take the mean
            bleu_score_float = float(np.mean(bleu_score))
        else:
            # If it's neither a number nor a list, default to 0
            print(f"Warning: Unexpected BLEU score type: {type(bleu_score)}. Defaulting to 0.")
            bleu_score_float = 0.0
    except Exception as e:
        print(f"Error calculating BLEU score: {e}. Defaulting to 0.")
        bleu_score_float = 0.0
    
    # Return the BLEU score; any threshold check is left to the caller
    return bleu_score_float
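
A minimal usage sketch, assuming NLTK's `punkt` tokenizer data has already been downloaded (e.g. via `nltk.download('punkt')`); the candidate/reference strings and the 0.5 threshold are hypothetical values chosen only for illustration:

if __name__ == "__main__":
    # Hypothetical example strings for illustration only.
    candidate = "the cat sat on the mat"
    reference = "a cat was sitting on the mat"

    score = nltk_bleu_eval(candidate, reference)
    print(f"BLEU score: {score:.4f}")

    # Apply a pass/fail threshold externally if needed (0.5 is an arbitrary example).
    passed = score >= 0.5
    print(f"Passes threshold: {passed}")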