def make_demo()

in ultravox/tools/gradio_voice.py [0:0]


def make_demo(args, inference):
    """Build the Gradio Blocks app for the WebRTC voice demo.

    Args:
        args: Object whose ``max_new_tokens`` and ``temperature`` attributes
            supply the initial values of the two sliders.
        inference: Inference backend. It is cast to
            ``ultravox_infer.UltravoxInference`` and its ``infer_stream``
            method is called once per utterance.

    Returns:
        The ``gr.Blocks`` instance containing the chatbot transcript, the
        WebRTC audio component, and the generation sliders.
    """

    def transcribe(
        audio: tuple[int, np.ndarray],
        conversation: list[dict],
        max_new_tokens: int = 200,
        temperature: float = 0,
    ):
        """Stream the model's reply to one recorded utterance.

        The ``conversation`` list is mutated in place: a user message holding
        the recorded audio is appended first, followed by an assistant message
        whose text grows as chunks arrive. The list is yielded wrapped in
        ``AdditionalOutputs`` after the user message is added and again after
        every text chunk.

        Args:
            audio: ``(sampling_rate, samples)`` pair for the utterance.
            conversation: Message dicts with ``"role"`` and ``"content"`` keys.
            max_new_tokens: Forwarded to ``infer_stream`` as ``max_tokens``.
            temperature: Forwarded to ``infer_stream`` as ``temperature``.

        Yields:
            ``AdditionalOutputs`` wrapping the updated ``conversation`` list.
        """
        sampling_rate, audio_array = audio
        waveform = audio_array.squeeze()

        # Put the user's recording into the transcript before inference runs.
        user_turn = {
            "role": "user",
            "content": gr.Audio(value=(sampling_rate, waveform)),
        }
        conversation.append(user_turn)
        yield AdditionalOutputs(conversation)

        voice_sample = datasets.VoiceSample.from_prompt_and_raw(
            "<|audio|>", waveform, sampling_rate
        )

        engine = cast(ultravox_infer.UltravoxInference, inference)
        token_stream = engine.infer_stream(
            voice_sample,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )

        # Start with an empty assistant message and extend it chunk by chunk.
        conversation.append({"role": "assistant", "content": ""})
        for piece in token_stream:
            # Anything that is not an InferenceChunk is skipped.
            if not isinstance(piece, infer_base.InferenceChunk):
                continue
            conversation[-1]["content"] += piece.text
            yield AdditionalOutputs(conversation)

    # HTML passed to the chatbot as its placeholder.
    intro_html = """
<h1 style='text-align: center'>
    Talk to Ultravox Llama 3.1 8b (Powered by WebRTC ⚡️)
</h1>
<p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Ultravox.
    When you stop talking, the audio will be sent for processing.
</p>
<p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
</p>
"""

    with gr.Blocks() as voice_demo:
        # Top row: the running transcript.
        with gr.Row():
            chatbot = gr.Chatbot(
                type="messages",
                label="transcript",
                placeholder=intro_html,
            )

        # Bottom row: audio stream on the left, generation settings on the right.
        with gr.Row():
            with gr.Column(scale=4):
                stream = WebRTC(
                    label="Stream",
                    modality="audio",
                    mode="send",
                    rtc_configuration=rtc_configuration,
                )
            with gr.Column(scale=1):
                max_tokens_slider = gr.Slider(
                    label="max_new_tokens",
                    value=args.max_new_tokens,
                    minimum=50,
                    maximum=2000,
                    step=10,
                    interactive=True,
                )
                temperature_slider = gr.Slider(
                    label="temperature",
                    value=args.temperature,
                    minimum=0,
                    maximum=5.0,
                    step=0.1,
                    interactive=True,
                )

        # time_limit matches the 90-second limit stated in the placeholder text.
        stream.stream(
            ReplyOnPause(transcribe, input_sample_rate=16000),
            inputs=[stream, chatbot, max_tokens_slider, temperature_slider],
            outputs=[stream],
            time_limit=90,
        )
        # Pass each conversation yielded by `transcribe` straight to the chatbot.
        stream.on_additional_outputs(
            lambda updated: updated,
            outputs=[chatbot],
            queue=False,
            show_progress="hidden",
        )

    return voice_demo