ultravox/data/configs/gigaspeech.py (24 lines of code) (raw):

from ultravox.data import types

# Dataset configurations for the "xl-empty-audio-removed" subset of
# fixie-ai/gigaspeech: one root config plus two task-specific variants that
# name the root config as their `base`.

# Name of the root config; the derived configs below pass it as `base`.
_BASE_NAME = "gigaspeech-xl"

# Template applied to each sample's `text` field for both the transcript and
# the default assistant reply (routes the text through
# `text_proc.format_asr_text`).
_ASR_TEXT_TEMPLATE = "{{text_proc.format_asr_text(text)}}"

# Root config: a single "train" split of 8,266,422 samples.
GS_XL_CONFIG = types.DatasetConfig(
    name=_BASE_NAME,
    path="fixie-ai/gigaspeech",
    subset="xl-empty-audio-removed",
    splits=[
        types.DatasetSplitConfig(name="train", num_samples=8_266_422),
    ],
    transcript_template=_ASR_TEXT_TEMPLATE,
    assistant_template=_ASR_TEXT_TEMPLATE,
)

# Transcription variant: uses the transcription user prompt and is evaluated
# with the "wer" metric, passing lang_id="en" as a metric argument.
GS_XL_TRANS_CONFIG = types.DatasetConfig(
    name="gigaspeech-xl-transcription",
    base=_BASE_NAME,
    user_template=types.TRANSCRIPTION_USER_TEMPLATE,
    eval_config=types.EvalConfig(metric="wer", args={"lang_id": "en"}),
)

# Continuation variant: overrides both the user and the assistant templates
# with the continuation templates exported by `types`.
GS_XL_CONT_CONFIG = types.DatasetConfig(
    name="gigaspeech-xl-continuation",
    base=_BASE_NAME,
    user_template=types.CONTINUATION_USER_TEMPLATE,
    assistant_template=types.CONTINUATION_ASSISTANT_TEMPLATE,
)

# Every config declared by this module, root config first.
configs = [
    GS_XL_CONFIG,
    GS_XL_TRANS_CONFIG,
    GS_XL_CONT_CONFIG,
]