in ultravox/data/text_proc.py [0:0]
def format_asr_text(text: str) -> str:
    """
    Normalize an ASR transcript so it can be used as training text.

    Steps, in order:
    - Split the input on whitespace.
    - Reject the utterance if any token is one of
      GIGASPEECH_GARBAGE_UTTERANCE_TAGS.
    - Replace every token that is a key of GIGASPEECH_PUNCTUATIONS with the
      value it maps to; all other tokens are kept unchanged.
    - Re-join the tokens with single spaces and run the result through
      truecase.get_true_case (best-effort casing, not perfect).
    - Strip leading/trailing whitespace.

    The tag rejection and punctuation replacement target GigaSpeech-style
    transcripts; the true-casing step is useful for all-caps corpora such as
    LibriSpeech as well.

    NOTE(review): nothing in this function removes filler words such as
    "AH"; confirm whether that is expected to happen elsewhere.

    Raises:
        FormatASRError: if a garbage utterance tag is present, or if the
            text is empty once processed.
    """
    tokens = text.split()

    # A single garbage tag invalidates the whole utterance; report the first
    # one encountered, in transcript order.
    bad_tag = next(
        (tok for tok in tokens if tok in GIGASPEECH_GARBAGE_UTTERANCE_TAGS),
        None,
    )
    if bad_tag is not None:
        raise FormatASRError(f"Garbage utterance tag found: {bad_tag}")

    # Swap punctuation tags for their mapped replacement; pass everything
    # else through untouched.
    converted = [
        GIGASPEECH_PUNCTUATIONS[tok] if tok in GIGASPEECH_PUNCTUATIONS else tok
        for tok in tokens
    ]

    cased = truecase.get_true_case(" ".join(converted))
    result = cased.strip()
    if not result:
        raise FormatASRError("Empty text after processing")
    return result