lib/client.js:
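// Overview (descriptive comment, added for orientation): RealtimeClient wraps
// the low-level RealtimeAPI websocket transport and the RealtimeConversation
// state store, exposing a single event-driven client for sessions, input audio
// buffers, tool calling, and response generation.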

import { RealtimeEventHandler } from './event_handler.js';
import { RealtimeAPI } from './api.js';
import { RealtimeConversation } from './conversation.js';
import { RealtimeUtils } from './utils.js';

/**
 * Valid audio formats
 * @typedef {"pcm16"|"g711_ulaw"|"g711_alaw"} AudioFormatType
 */
/**
 * @typedef {Object} AudioTranscriptionType
 * @property {"whisper-1"} model
 */
/**
 * @typedef {Object} TurnDetectionServerVadType
 * @property {"server_vad"} type
 * @property {number} [threshold]
 * @property {number} [prefix_padding_ms]
 * @property {number} [silence_duration_ms]
 */
/**
 * Tool definitions
 * @typedef {Object} ToolDefinitionType
 * @property {"function"} [type]
 * @property {string} name
 * @property {string} description
 * @property {{[key: string]: any}} parameters
 */
/**
 * @typedef {Object} SessionResourceType
 * @property {string} [model]
 * @property {string[]} [modalities]
 * @property {string} [instructions]
 * @property {"alloy"|"ash"|"ballad"|"coral"|"echo"|"sage"|"shimmer"|"verse"} [voice]
 * @property {AudioFormatType} [input_audio_format]
 * @property {AudioFormatType} [output_audio_format]
 * @property {AudioTranscriptionType|null} [input_audio_transcription]
 * @property {TurnDetectionServerVadType|null} [turn_detection]
 * @property {ToolDefinitionType[]} [tools]
 * @property {"auto"|"none"|"required"|{type:"function",name:string}} [tool_choice]
 * @property {number} [temperature]
 * @property {number|"inf"} [max_response_output_tokens]
 */
/**
 * @typedef {"in_progress"|"completed"|"incomplete"} ItemStatusType
 */
/**
 * @typedef {Object} InputTextContentType
 * @property {"input_text"} type
 * @property {string} text
 */
/**
 * @typedef {Object} InputAudioContentType
 * @property {"input_audio"} type
 * @property {string} [audio] base64-encoded audio data
 * @property {string|null} [transcript]
 */
/**
 * @typedef {Object} TextContentType
 * @property {"text"} type
 * @property {string} text
 */
/**
 * @typedef {Object} AudioContentType
 * @property {"audio"} type
 * @property {string} [audio] base64-encoded audio data
 * @property {string|null} [transcript]
 */
/**
 * @typedef {Object} SystemItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"system"} role
 * @property {Array<InputTextContentType>} content
 */
/**
 * @typedef {Object} UserItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"user"} role
 * @property {Array<InputTextContentType|InputAudioContentType>} content
 */
/**
 * @typedef {Object} AssistantItemType
 * @property {string|null} [previous_item_id]
 * @property {"message"} type
 * @property {ItemStatusType} status
 * @property {"assistant"} role
 * @property {Array<TextContentType|AudioContentType>} content
 */
/**
 * @typedef {Object} FunctionCallItemType
 * @property {string|null} [previous_item_id]
 * @property {"function_call"} type
 * @property {ItemStatusType} status
 * @property {string} call_id
 * @property {string} name
 * @property {string} arguments
 */
/**
 * @typedef {Object} FunctionCallOutputItemType
 * @property {string|null} [previous_item_id]
 * @property {"function_call_output"} type
 * @property {string} call_id
 * @property {string} output
 */
/**
 * @typedef {Object} FormattedToolType
 * @property {"function"} type
 * @property {string} name
 * @property {string} call_id
 * @property {string} arguments
 */
/**
 * @typedef {Object} FormattedPropertyType
 * @property {Int16Array} [audio]
 * @property {string} [text]
 * @property {string} [transcript]
 * @property {FormattedToolType} [tool]
 * @property {string} [output]
 * @property {any} [file]
 */
/**
 * @typedef {Object} FormattedItemType
 * @property {string} id
 * @property {string} object
 * @property {"user"|"assistant"|"system"} [role]
 * @property {FormattedPropertyType} formatted
 */
/**
 * @typedef {SystemItemType|UserItemType|AssistantItemType|FunctionCallItemType|FunctionCallOutputItemType} BaseItemType
 */
/**
 * @typedef {FormattedItemType & BaseItemType} ItemType
 */
/**
 * @typedef {Object} IncompleteResponseStatusType
 * @property {"incomplete"} type
 * @property {"interruption"|"max_output_tokens"|"content_filter"} reason
 */
/**
 * @typedef {Object} FailedResponseStatusType
 * @property {"failed"} type
 * @property {{code: string, message: string}|null} error
 */
/**
 * @typedef {Object} UsageType
 * @property {number} total_tokens
 * @property {number} input_tokens
 * @property {number} output_tokens
 */
/**
 * @typedef {Object} ResponseResourceType
 * @property {"in_progress"|"completed"|"incomplete"|"cancelled"|"failed"} status
 * @property {IncompleteResponseStatusType|FailedResponseStatusType|null} status_details
 * @property {ItemType[]} output
 * @property {UsageType|null} usage
 */

/**
 * RealtimeClient Class
 * @class
 */
export class RealtimeClient extends RealtimeEventHandler {
  /**
   * Create a new RealtimeClient instance
   * @param {{url?: string, apiKey?: string, dangerouslyAllowAPIKeyInBrowser?: boolean, debug?: boolean}} [settings]
   */
  constructor({ url, apiKey, dangerouslyAllowAPIKeyInBrowser, debug } = {}) {
    super();
    this.defaultSessionConfig = {
      modalities: ['text', 'audio'],
      instructions: '',
      voice: 'verse',
      input_audio_format: 'pcm16',
      output_audio_format: 'pcm16',
      input_audio_transcription: null,
      turn_detection: null,
      tools: [],
      tool_choice: 'auto',
      temperature: 0.8,
      max_response_output_tokens: 4096,
    };
    this.sessionConfig = {};
    this.transcriptionModels = [
      {
        model: 'whisper-1',
      },
    ];
    this.defaultServerVadConfig = {
      type: 'server_vad',
      threshold: 0.5, // 0.0 to 1.0,
      prefix_padding_ms: 300, // How much audio to include in the audio stream before the speech starts.
      silence_duration_ms: 200, // How long to wait to mark the speech as stopped.
    };
    this.realtime = new RealtimeAPI({
      url,
      apiKey,
      dangerouslyAllowAPIKeyInBrowser,
      debug,
    });
    this.conversation = new RealtimeConversation();
    this._resetConfig();
    this._addAPIEventHandlers();
  }

  /**
   * Resets sessionConfig and conversationConfig to default
   * @private
   * @returns {true}
   */
  _resetConfig() {
    this.sessionCreated = false;
    this.tools = {};
    this.sessionConfig = JSON.parse(JSON.stringify(this.defaultSessionConfig));
    this.inputAudioBuffer = new Int16Array(0);
    return true;
  }

  /**
   * Sets up event handlers for a fully-functional application control flow
   * @private
   * @returns {true}
   */
  _addAPIEventHandlers() {
    // Event Logging handlers
    this.realtime.on('client.*', (event) => {
      const realtimeEvent = {
        time: new Date().toISOString(),
        source: 'client',
        event: event,
      };
      this.dispatch('realtime.event', realtimeEvent);
    });
    this.realtime.on('server.*', (event) => {
      const realtimeEvent = {
        time: new Date().toISOString(),
        source: 'server',
        event: event,
      };
      this.dispatch('realtime.event', realtimeEvent);
    });
    // Handles session created event, can optionally wait for it
    this.realtime.on(
      'server.session.created',
      () => (this.sessionCreated = true),
    );
    // Setup for application control flow
    const handler = (event, ...args) => {
      const { item, delta } = this.conversation.processEvent(event, ...args);
      return { item, delta };
    };
    const handlerWithDispatch = (event, ...args) => {
      const { item, delta } = handler(event, ...args);
      if (item) {
        // FIXME: If statement is only here because item.input_audio_transcription.completed
        // can fire before `item.created`, resulting in empty item.
        // This happens in VAD mode with empty audio
        this.dispatch('conversation.updated', { item, delta });
      }
      return { item, delta };
    };
    const callTool = async (tool) => {
      try {
        const jsonArguments = JSON.parse(tool.arguments);
        const toolConfig = this.tools[tool.name];
        if (!toolConfig) {
          throw new Error(`Tool "${tool.name}" has not been added`);
        }
        const result = await toolConfig.handler(jsonArguments);
        this.realtime.send('conversation.item.create', {
          item: {
            type: 'function_call_output',
            call_id: tool.call_id,
            output: JSON.stringify(result),
          },
        });
      } catch (e) {
        this.realtime.send('conversation.item.create', {
          item: {
            type: 'function_call_output',
            call_id: tool.call_id,
            output: JSON.stringify({ error: e.message }),
          },
        });
      }
      this.createResponse();
    };
    // Handlers to update internal conversation state
    this.realtime.on('server.response.created', handler);
    this.realtime.on('server.response.output_item.added', handler);
    this.realtime.on('server.response.content_part.added', handler);
    this.realtime.on('server.input_audio_buffer.speech_started', (event) => {
      handler(event);
      this.dispatch('conversation.interrupted');
    });
    this.realtime.on('server.input_audio_buffer.speech_stopped', (event) =>
      handler(event, this.inputAudioBuffer),
    );
    // Handlers to update application state
    this.realtime.on('server.conversation.item.created', (event) => {
      const { item } = handlerWithDispatch(event);
      this.dispatch('conversation.item.appended', { item });
      if (item.status === 'completed') {
        this.dispatch('conversation.item.completed', { item });
      }
    });
    this.realtime.on('server.conversation.item.truncated', handlerWithDispatch);
    this.realtime.on('server.conversation.item.deleted', handlerWithDispatch);
    this.realtime.on(
      'server.conversation.item.input_audio_transcription.completed',
      handlerWithDispatch,
    );
    this.realtime.on(
      'server.response.audio_transcript.delta',
      handlerWithDispatch,
    );
    this.realtime.on('server.response.audio.delta', handlerWithDispatch);
    this.realtime.on('server.response.text.delta', handlerWithDispatch);
    this.realtime.on(
      'server.response.function_call_arguments.delta',
      handlerWithDispatch,
    );
    this.realtime.on('server.response.output_item.done', async (event) => {
      const { item } = handlerWithDispatch(event);
      if (item.status === 'completed') {
        this.dispatch('conversation.item.completed', { item });
      }
      if (item.formatted.tool) {
        callTool(item.formatted.tool);
      }
    });
    return true;
  }

  /**
   * Tells us whether the realtime socket is connected
   * @returns {boolean}
   */
  isConnected() {
    return this.realtime.isConnected();
  }

  /**
   * Resets the client instance entirely: disconnects and clears active config
   * @returns {true}
   */
  reset() {
    this.disconnect();
    this.clearEventHandlers();
    this.realtime.clearEventHandlers();
    this._resetConfig();
    this._addAPIEventHandlers();
    return true;
  }

  /**
   * Connects to the Realtime WebSocket API
   * Updates session config and conversation config
   * @returns {Promise<true>}
   */
  async connect() {
    if (this.isConnected()) {
      throw new Error(`Already connected, use .disconnect() first`);
    }
    await this.realtime.connect();
    this.updateSession();
    return true;
  }

  /**
   * Waits for a session.created event before proceeding
   * @returns {Promise<true>}
   */
  async waitForSessionCreated() {
    if (!this.isConnected()) {
      throw new Error(`Not connected, use .connect() first`);
    }
    while (!this.sessionCreated) {
      await new Promise((r) => setTimeout(() => r(), 1));
    }
    return true;
  }

  /**
   * Disconnects from the Realtime API and clears the conversation history
   */
  disconnect() {
    this.sessionCreated = false;
    this.realtime.isConnected() && this.realtime.disconnect();
    this.conversation.clear();
  }

  /**
   * Gets the active turn detection mode
   * @returns {"server_vad"|null}
   */
  getTurnDetectionType() {
    return this.sessionConfig.turn_detection?.type || null;
  }

  /**
   * Add a tool and handler
   * @param {ToolDefinitionType} definition
   * @param {function} handler
   * @returns {{definition: ToolDefinitionType, handler: function}}
   */
  addTool(definition, handler) {
    if (!definition?.name) {
      throw new Error(`Missing tool name in definition`);
    }
    const name = definition?.name;
    if (this.tools[name]) {
      throw new Error(
        `Tool "${name}" already added. Please use .removeTool("${name}") before trying to add again.`,
      );
    }
    if (typeof handler !== 'function') {
      throw new Error(`Tool "${name}" handler must be a function`);
    }
    this.tools[name] = { definition, handler };
    this.updateSession();
    return this.tools[name];
  }

  /**
   * Removes a tool
   * @param {string} name
   * @returns {true}
   */
  removeTool(name) {
    if (!this.tools[name]) {
      throw new Error(`Tool "${name}" does not exist, can not be removed.`);
    }
    delete this.tools[name];
    return true;
  }

  /**
   * Deletes an item
   * @param {string} id
   * @returns {true}
   */
  deleteItem(id) {
    this.realtime.send('conversation.item.delete', { item_id: id });
    return true;
  }

  /**
   * Updates session configuration
   * If the client is not yet connected, will save details and instantiate upon connection
   * @param {SessionResourceType} [sessionConfig]
   */
  updateSession({
    modalities = void 0,
    instructions = void 0,
    voice = void 0,
    input_audio_format = void 0,
    output_audio_format = void 0,
    input_audio_transcription = void 0,
    turn_detection = void 0,
    tools = void 0,
    tool_choice = void 0,
    temperature = void 0,
    max_response_output_tokens = void 0,
  } = {}) {
    modalities !== void 0 && (this.sessionConfig.modalities = modalities);
    instructions !== void 0 &&
      (this.sessionConfig.instructions = instructions);
    voice !== void 0 && (this.sessionConfig.voice = voice);
    input_audio_format !== void 0 &&
      (this.sessionConfig.input_audio_format = input_audio_format);
    output_audio_format !== void 0 &&
      (this.sessionConfig.output_audio_format = output_audio_format);
    input_audio_transcription !== void 0 &&
      (this.sessionConfig.input_audio_transcription =
        input_audio_transcription);
    turn_detection !== void 0 &&
      (this.sessionConfig.turn_detection = turn_detection);
    tools !== void 0 && (this.sessionConfig.tools = tools);
    tool_choice !== void 0 && (this.sessionConfig.tool_choice = tool_choice);
    temperature !== void 0 && (this.sessionConfig.temperature = temperature);
    max_response_output_tokens !== void 0 &&
      (this.sessionConfig.max_response_output_tokens =
        max_response_output_tokens);
    // Load tools from tool definitions + already loaded tools
    const useTools = [].concat(
      (tools || []).map((toolDefinition) => {
        const definition = {
          type: 'function',
          ...toolDefinition,
        };
        if (this.tools[definition?.name]) {
          throw new Error(
            `Tool "${definition?.name}" has already been defined`,
          );
        }
        return definition;
      }),
      Object.keys(this.tools).map((key) => {
        return {
          type: 'function',
          ...this.tools[key].definition,
        };
      }),
    );
    const session = { ...this.sessionConfig };
    session.tools = useTools;
    if (this.realtime.isConnected()) {
      this.realtime.send('session.update', { session });
    }
    return true;
  }

  /**
   * Sends user message content and generates a response
   * @param {Array<InputTextContentType|InputAudioContentType>} content
   * @returns {true}
   */
  sendUserMessageContent(content = []) {
    if (content.length) {
      for (const c of content) {
        if (c.type === 'input_audio') {
          if (c.audio instanceof ArrayBuffer || c.audio instanceof Int16Array) {
            c.audio = RealtimeUtils.arrayBufferToBase64(c.audio);
          }
        }
      }
      this.realtime.send('conversation.item.create', {
        item: {
          type: 'message',
          role: 'user',
          content,
        },
      });
    }
    this.createResponse();
    return true;
  }

  /**
   * Appends user audio to the existing audio buffer
   * @param {Int16Array|ArrayBuffer} arrayBuffer
   * @returns {true}
   */
  appendInputAudio(arrayBuffer) {
    if (arrayBuffer.byteLength > 0) {
      this.realtime.send('input_audio_buffer.append', {
        audio: RealtimeUtils.arrayBufferToBase64(arrayBuffer),
      });
      this.inputAudioBuffer = RealtimeUtils.mergeInt16Arrays(
        this.inputAudioBuffer,
        arrayBuffer,
      );
    }
    return true;
  }

  /**
   * Forces a model response generation
   * @returns {true}
   */
  createResponse() {
    if (
      this.getTurnDetectionType() === null &&
      this.inputAudioBuffer.byteLength > 0
    ) {
      this.realtime.send('input_audio_buffer.commit');
      this.conversation.queueInputAudio(this.inputAudioBuffer);
      this.inputAudioBuffer = new Int16Array(0);
    }
    this.realtime.send('response.create');
    return true;
  }

  /**
   * Cancels the ongoing server generation and truncates ongoing generation, if applicable
   * If no id provided, will simply send a `response.cancel` event
   * @param {string} id The id of the message to cancel
   * @param {number} [sampleCount] The number of samples to truncate past for the ongoing generation
   * @returns {{item: (AssistantItemType | null)}}
   */
  cancelResponse(id, sampleCount = 0) {
    if (!id) {
      this.realtime.send('response.cancel');
      return { item: null };
    } else if (id) {
      const item = this.conversation.getItem(id);
      if (!item) {
        throw new Error(`Could not find item "${id}"`);
      }
      if (item.type !== 'message') {
        throw new Error(`Can only cancelResponse messages with type "message"`);
      } else if (item.role !== 'assistant') {
        throw new Error(
          `Can only cancelResponse messages with role "assistant"`,
        );
      }
      this.realtime.send('response.cancel');
      const audioIndex = item.content.findIndex((c) => c.type === 'audio');
      if (audioIndex === -1) {
        throw new Error(`Could not find audio on item to cancel`);
      }
      this.realtime.send('conversation.item.truncate', {
        item_id: id,
        content_index: audioIndex,
        audio_end_ms: Math.floor(
          (sampleCount / this.conversation.defaultFrequency) * 1000,
        ),
      });
      return { item };
    }
  }

  /**
   * Utility for waiting for the next `conversation.item.appended` event to be triggered by the server
   * @returns {Promise<{item: ItemType}>}
   */
  async waitForNextItem() {
    const event = await this.waitForNext('conversation.item.appended');
    const { item } = event;
    return { item };
  }

  /**
   * Utility for waiting for the next `conversation.item.completed` event to be triggered by the server
   * @returns {Promise<{item: ItemType}>}
   */
  async waitForNextCompletedItem() {
    const event = await this.waitForNext('conversation.item.completed');
    const { item } = event;
    return { item };
  }
}
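/*
 * Usage sketch (illustrative addition, not part of the original file).
 * Assumes the file is imported directly by path; package consumers would
 * typically import RealtimeClient from the published package instead, and
 * the `get_time` tool below is a hypothetical example.
 *
 *   import { RealtimeClient } from './lib/client.js';
 *
 *   const client = new RealtimeClient({ apiKey: process.env.OPENAI_API_KEY });
 *
 *   // Registering a tool triggers updateSession(), so it is safe pre-connect;
 *   // the session.update event is only sent once the socket is open.
 *   client.addTool(
 *     {
 *       name: 'get_time',
 *       description: 'Returns the current time as an ISO string',
 *       parameters: { type: 'object', properties: {} },
 *     },
 *     async () => ({ time: new Date().toISOString() }),
 *   );
 *
 *   client.on('conversation.updated', ({ item, delta }) => {
 *     // delta may carry incremental audio (Int16Array), transcript text,
 *     // or function-call arguments as the response streams in
 *   });
 *
 *   await client.connect();
 *   client.sendUserMessageContent([{ type: 'input_text', text: 'Hello!' }]);
 *   const { item } = await client.waitForNextCompletedItem();
 */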