def format_asr

def format_asr_text()

in ultravox/data/text_proc.py [0:0]

14 lines of code
5 McCabe index (conditional complexity)


def format_asr_text(text: str) -> str:
    """
    Normalize a raw ASR transcript so it can be used for training.

    Steps, in order:
    - Split the input on whitespace and reject the utterance if any token is
      listed in GIGASPEECH_GARBAGE_UTTERANCE_TAGS.
    - Replace every token that is a key of GIGASPEECH_PUNCTUATIONS with its
      mapped value (Gigaspeech-specific).
    - Re-join the tokens with single spaces and convert to true case
      (also useful for LibriSpeech).
        - True-casing is not perfect, but it's better than nothing.
    - Strip leading/trailing spaces.

    Example:
        "I SEE LOTS OF PEOPLE HAVE AH DRONES HERE <COMMA> AH MAVERICK AH AS WELL <PERIOD>  "
        --> "I see lots of people have drones here, maverick as well."

    NOTE(review): the example shows the filler "AH" being dropped; this
    function only substitutes tokens found in GIGASPEECH_PUNCTUATIONS, so
    that only holds if the lookup tables cover it -- confirm against them.
    The final spacing around punctuation is whatever
    truecase.get_true_case returns.

    Raises:
        FormatASRError: if a garbage utterance tag is present, or if nothing
            is left after processing.
    """
    tokens = text.split()

    # A single garbage tag invalidates the whole utterance; report the first one.
    for token in tokens:
        if token in GIGASPEECH_GARBAGE_UTTERANCE_TAGS:
            raise FormatASRError(f"Garbage utterance tag found: {token}")

    # Swap punctuation placeholders for their mapped value; keep other tokens as-is.
    converted = [
        GIGASPEECH_PUNCTUATIONS[token] if token in GIGASPEECH_PUNCTUATIONS else token
        for token in tokens
    ]

    cleaned = truecase.get_true_case(" ".join(converted)).strip()
    if not cleaned:
        raise FormatASRError("Empty text after processing")
    return cleaned