in ultravox/evaluation/string_metrics.py [0:0]
def wer(samples: List[eval_types.Sample], args: Dict[str, Any]) -> eval_types.WerResult:
    """Compute WER or CER using Whisper's text normalization.

    The ``"wer"`` metric from ``evaluate`` is always used. For the languages
    listed in ``cer_languages`` below, every normalized string is first
    expanded into space-separated characters, so the word-level metric
    effectively operates on characters (CER).

    Args:
        samples: Samples providing ``expected_answer`` (reference) and
            ``generated_answer`` (hypothesis).
        args: Metric options. Only ``"lang_id"`` is read; it is compared
            case-insensitively. A missing, ``None`` or empty value is treated
            as ``"<undefined>"``, which selects the basic normalizer and
            word-level scoring.

    Returns:
        A ``WerResult`` whose ``score`` is the metric value multiplied by 100.
    """
    # `or` (instead of a .get default) also covers an explicit
    # `"lang_id": None`, which would otherwise fail on `.lower()`.
    lang_id = (args.get("lang_id") or "<undefined>").lower()

    # English has a dedicated normalizer; everything else uses the basic one.
    if lang_id == "en":
        normalizer = whisper_english.EnglishTextNormalizer()
    else:
        normalizer = whisper_basic.BasicTextNormalizer()

    references = [sample.expected_answer for sample in samples]
    hypotheses = [sample.generated_answer for sample in samples]

    # Arabic text goes through `remove_diacritics` before normalization.
    if lang_id == "ar":
        references = [remove_diacritics(ref) for ref in references]
        hypotheses = [remove_diacritics(hyp) for hyp in hypotheses]

    # Normalize both reference and hypothesis.
    references = [normalizer(ref) for ref in references]
    hypotheses = [normalizer(hyp) for hyp in hypotheses]

    # Languages scored at character level: splitting each string into
    # space-separated characters lets the word-level metric act as CER.
    cer_languages = {"zh", "ja", "th", "lo", "my"}
    if lang_id in cer_languages:
        references = [" ".join(ref) for ref in references]
        hypotheses = [" ".join(hyp) for hyp in hypotheses]

    # Compute WER over the space-separated tokens.
    wer_metric = evaluate.load("wer")
    wer_score = wer_metric.compute(predictions=hypotheses, references=references)
    return eval_types.WerResult(score=wer_score * 100)