// projects/deliberation_at_scale/packages/frontend/hooks/useWhisper/useWhisper.ts
/* eslint-disable no-console */
/* eslint-disable @typescript-eslint/ban-ts-comment */
import { useEffectAsync, useMemoAsync } from '@chengsokdara/react-hooks-async';
import type { RawAxiosRequestHeaders } from 'axios';
import type { Harker } from 'hark';
import type { Encoder } from 'lamejs';
import { useEffect, useRef, useState } from 'react';
import type { Options, RecordRTCPromisesHandler } from 'recordrtc';
import {
defaultStopTimeout,
ffmpegCoreUrl,
silenceRemoveCommand,
whisperApiEndpoint,
} from './configs';
import {
UseWhisperConfig,
UseWhisperHook,
UseWhisperTimeout,
UseWhisperTranscript,
} from './types';
/**
* default useWhisper configuration
*/
const defaultConfig: UseWhisperConfig = {
apiKey: '',
autoStart: false,
autoTranscribe: true,
mode: 'transcriptions',
nonStop: false,
removeSilence: false,
stopTimeout: defaultStopTimeout,
streaming: false,
timeSlice: 1_000,
onDataAvailable: undefined,
onTranscribe: undefined,
};
/**
* default timeout for recorder
*/
const defaultTimeout: UseWhisperTimeout = {
stop: undefined,
};
/**
* default transcript object
*/
const defaultTranscript: UseWhisperTranscript = {
blob: undefined,
text: undefined,
};
/**
* React Hook for OpenAI Whisper
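*
* A minimal usage sketch (hedged: the destructured names come from this hook's
* return value, but the surrounding component and the OPENAI_API_KEY environment
* variable are illustrative assumptions, not part of this module):
*
* @example
* const { recording, transcript, startRecording, stopRecording } = useWhisper({
*   apiKey: process.env.OPENAI_API_KEY, // hypothetical env var
*   removeSilence: true,
* });
* // transcript.text holds the Whisper result once transcribing finishes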
*/
export const useWhisper: UseWhisperHook = (config) => {
const {
apiKey,
autoStart,
autoTranscribe,
mode,
nonStop,
removeSilence,
stopTimeout,
streaming,
timeSlice,
whisperConfig,
onDataAvailable: onDataAvailableCallback,
onTranscribe: onTranscribeCallback,
} = {
...defaultConfig,
...config,
};
if (!apiKey && !onTranscribeCallback) {
throw new Error('apiKey is required if onTranscribe is not provided');
}
const chunks = useRef<Blob[]>([]);
const encoder = useRef<Encoder>();
const listener = useRef<Harker>();
const recorder = useRef<RecordRTCPromisesHandler>();
const stream = useRef<MediaStream>();
const timeout = useRef<UseWhisperTimeout>(defaultTimeout);
const [recording, setRecording] = useState<boolean>(false);
const [speaking, setSpeaking] = useState<boolean>(false);
const [transcribing, setTranscribing] = useState<boolean>(false);
const [transcript, setTranscript] =
useState<UseWhisperTranscript>(defaultTranscript);
/**
* cleanup on component unmount
* - flush and clean up the lamejs encoder instance
* - destroy the recordrtc instance and clear it from the ref
* - clear the setTimeout for onStopRecording
* - clean up hark speaking-detection listeners and clear them from the ref
* - stop all of the user's media stream tracks and remove the stream from the ref
*/
useEffect(() => {
return () => {
if (chunks.current) {
chunks.current = [];
}
if (encoder.current) {
encoder.current.flush();
encoder.current = undefined;
}
if (recorder.current) {
recorder.current.destroy();
recorder.current = undefined;
}
onStopTimeout('stop');
if (listener.current) {
// @ts-ignore
listener.current.off('speaking', onStartSpeaking);
// @ts-ignore
listener.current.off('stopped_speaking', onStopSpeaking);
}
if (stream.current) {
stream.current.getTracks().forEach((track) => track.stop());
stream.current = undefined;
}
};
// eslint-disable-next-line react-hooks/exhaustive-deps
}, []);
/**
* if config.autoStart is true
* start speech recording immediately on component mount
*/
useEffectAsync(async () => {
if (autoStart) {
await onStartRecording();
}
}, [autoStart]);
/**
* start speech recording and start listen for speaking event
*/
const startRecording = async () => {
await onStartRecording();
};
/**
* pause speech recording and clear the stop timeout
*/
const pauseRecording = async () => {
await onPauseRecording();
};
/**
* stop speech recording and start the transcription
*/
const stopRecording = async () => {
await onStopRecording();
};
/**
* start speech recording event
* - first ask user for media stream
* - create recordrtc instance and pass media stream to it
* - create lamejs encoder instance
* - check recorder state and start or resume recorder accordingly
* - if config.nonStop is true, start the stop timeout
* - update recording state to true
*/
const onStartRecording = async () => {
try {
if (!stream.current) {
await onStartStreaming();
}
if (stream.current) {
if (!recorder.current) {
const {
default: { RecordRTCPromisesHandler, StereoAudioRecorder },
} = await import('recordrtc');
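// record 16-bit mono WAV at 44.1 kHz so the raw buffer matches the
// lamejs Mp3Encoder settings created below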
const recorderConfig: Options = {
mimeType: 'audio/wav',
numberOfAudioChannels: 1, // mono
recorderType: StereoAudioRecorder,
sampleRate: 44100, // Sample rate = 44.1khz
timeSlice: streaming ? timeSlice : undefined,
type: 'audio',
ondataavailable:
autoTranscribe && streaming ? onDataAvailable : undefined,
};
recorder.current = new RecordRTCPromisesHandler(
stream.current,
recorderConfig
);
}
if (!encoder.current) {
const { Mp3Encoder } = await import('lamejs');
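// Mp3Encoder(channels, sampleRate, kbps): mono, 44.1 kHz input, 96 kbps output,
// mirroring the recorder configuration above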
// @ts-ignore
encoder.current = new Mp3Encoder(1, 44100, 96);
}
const recordState = await recorder.current.getState();
if (recordState === 'inactive' || recordState === 'stopped') {
await recorder.current.startRecording();
}
if (recordState === 'paused') {
await recorder.current.resumeRecording();
}
if (nonStop) {
onStartTimeout('stop');
}
setRecording(true);
}
} catch (err) {
console.error(err);
}
};
/**
* get user media stream event
* - try to stop all previous media streams
* - ask user for media stream with a system popup
* - register hark speaking detection listeners
*/
const onStartStreaming = async () => {
try {
if (stream.current) {
stream.current.getTracks().forEach((track) => track.stop());
}
stream.current = await navigator.mediaDevices.getUserMedia({
audio: true,
});
if (!listener.current) {
const { default: hark } = await import('hark');
listener.current = hark(stream.current, {
interval: 100,
play: false,
});
listener.current.on('speaking', onStartSpeaking);
listener.current.on('stopped_speaking', onStopSpeaking);
}
} catch (err) {
console.error(err);
}
};
/**
* start stop timeout event
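* - only schedules a timer when none is pending, so repeated calls do not stack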
*/
const onStartTimeout = (type: keyof UseWhisperTimeout) => {
if (!timeout.current[type]) {
timeout.current[type] = setTimeout(onStopRecording, stopTimeout);
}
};
/**
* user start speaking event
* - set speaking state to true
* - clear stop timeout
*/
const onStartSpeaking = () => {
console.log('start speaking');
setSpeaking(true);
onStopTimeout('stop');
};
/**
* user stop speaking event
* - set speaking state to false
* - if config.nonStop is true, restart the stop timeout
*/
const onStopSpeaking = () => {
console.log('stop speaking');
setSpeaking(false);
if (nonStop) {
onStartTimeout('stop');
}
};
/**
* pause speech recording event
* - if recorder state is recording, pause the recorder
* - clear stop timeout
* - set recording state to false
*/
const onPauseRecording = async () => {
try {
if (recorder.current) {
const recordState = await recorder.current.getState();
if (recordState === 'recording') {
await recorder.current.pauseRecording();
}
onStopTimeout('stop');
setRecording(false);
}
} catch (err) {
console.error(err);
}
};
/**
* stop speech recording event
* - flush out lamejs encoder and set it to undefined
* - if recorder state is recording or paused, stop the recorder
* - stop user media stream
* - clear stop timeout
* - set recording state to false
* - start Whisper transcription event
* - destroy recordrtc instance and clear it from ref
*/
const onStopRecording = async () => {
try {
if (recorder.current) {
const recordState = await recorder.current.getState();
if (recordState === 'recording' || recordState === 'paused') {
await recorder.current.stopRecording();
}
onStopStreaming();
onStopTimeout('stop');
setRecording(false);
if (autoTranscribe) {
await onTranscribing();
} else {
const blob = await recorder.current.getBlob();
setTranscript({
blob,
});
}
await recorder.current.destroy();
chunks.current = [];
if (encoder.current) {
encoder.current.flush();
encoder.current = undefined;
}
recorder.current = undefined;
}
} catch (err) {
console.error(err);
}
};
/**
* stop media stream event
* - remove hark speaking detection listeners
* - stop all media stream tracks
* - clear media stream from ref
*/
const onStopStreaming = () => {
if (listener.current) {
// @ts-ignore
listener.current.off('speaking', onStartSpeaking);
// @ts-ignore
listener.current.off('stopped_speaking', onStopSpeaking);
listener.current = undefined;
}
if (stream.current) {
stream.current.getTracks().forEach((track) => track.stop());
stream.current = undefined;
}
};
/**
* stop timeout event
* - clear stop timeout and remove it from ref
*/
const onStopTimeout = (type: keyof UseWhisperTimeout) => {
if (timeout.current[type]) {
clearTimeout(timeout.current[type]);
timeout.current[type] = undefined;
}
};
/**
* start Whisper transcription event
* - make sure the recorder state is stopped
* - set transcribing state to true
* - get the audio blob from recordrtc
* - if config.removeSilence is true, load ffmpeg.wasm and try to remove silence from the speech
* - if config.onTranscribe is provided, delegate transcription to that callback
* - otherwise send the audio to the Whisper API as multipart/form-data
* - set the transcript object with the audio blob and the transcription result from Whisper
* - set transcribing state to false
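*
* A hedged sketch of a custom onTranscribe callback that bypasses the direct
* OpenAI call (the '/api/whisper' route is a hypothetical server endpoint; the
* returned shape follows UseWhisperTranscript):
*
* @example
* const onTranscribe = async (blob: Blob) => {
*   const body = new FormData();
*   body.append('file', blob, 'speech.mp3');
*   // hypothetical proxy route that forwards the audio to Whisper server-side
*   const response = await fetch('/api/whisper', { method: 'POST', body });
*   const { text } = await response.json();
*   return { blob, text };
* };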
*/
const onTranscribing = async () => {
console.log('transcribing speech');
try {
if (encoder.current && recorder.current) {
const recordState = await recorder.current.getState();
if (recordState === 'stopped') {
setTranscribing(true);
let blob = await recorder.current.getBlob();
if (removeSilence) {
const { createFFmpeg } = await import('@ffmpeg/ffmpeg');
const ffmpeg = createFFmpeg({
// @ts-ignore
mainName: 'main',
corePath: ffmpegCoreUrl,
log: true,
});
if (!ffmpeg.isLoaded()) {
await ffmpeg.load();
}
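// copy the recorded WAV into ffmpeg's in-memory filesystem before running the filter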
const buffer = await blob.arrayBuffer();
console.log({ in: buffer.byteLength });
ffmpeg.FS('writeFile', 'in.wav', new Uint8Array(buffer));
await ffmpeg.run(
'-i', // Input
'in.wav',
'-acodec', // Audio codec
'libmp3lame',
'-b:a', // Audio bitrate
'96k',
'-ar', // Audio sample rate
'44100',
'-af', // Audio filter = remove silence from start to end with 2 seconds in between
silenceRemoveCommand,
'out.mp3' // Output
);
const out = ffmpeg.FS('readFile', 'out.mp3');
console.log({ out: out.buffer.byteLength });
// an output of 225 bytes or fewer appears to be an empty mp3 file
if (out.length <= 225) {
ffmpeg.exit();
setTranscript({
blob,
});
setTranscribing(false);
return;
}
blob = new Blob([out.buffer], { type: 'audio/mpeg' });
ffmpeg.exit();
} else {
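// no silence removal: feed the recorder's raw buffer straight into the lamejs MP3 encoder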
const buffer = await blob.arrayBuffer();
console.log({ wav: buffer.byteLength });
const mp3 = encoder.current.encodeBuffer(new Int16Array(buffer));
blob = new Blob([mp3], { type: 'audio/mpeg' });
console.log({ blob, mp3: mp3.byteLength });
}
if (typeof onTranscribeCallback === 'function') {
const transcribed = await onTranscribeCallback(blob);
console.log('onTranscribe', transcribed);
setTranscript(transcribed);
} else {
const file = new File([blob], 'speech.mp3', { type: 'audio/mpeg' });
const text = await onWhispered(file);
console.log('onTranscribing', { text });
setTranscript({
blob,
text,
});
}
setTranscribing(false);
}
}
} catch (err) {
console.info(err);
setTranscribing(false);
}
};
/**
* Get audio data in chunks based on timeSlice
* - while recording, send each audio chunk to Whisper
* - chunks are concatenated in succession
* - set transcript text with interim result
*/
const onDataAvailable = async (data: Blob) => {
console.log('onDataAvailable', data);
try {
if (streaming && recorder.current) {
onDataAvailableCallback?.(data);
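// encode each time slice to MP3 and append it; the full chunk list is
// re-sent below so Whisper always receives the audio from the start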
if (encoder.current) {
const buffer = await data.arrayBuffer();
const mp3chunk = encoder.current.encodeBuffer(new Int16Array(buffer));
const mp3blob = new Blob([mp3chunk], { type: 'audio/mpeg' });
chunks.current.push(mp3blob);
}
const recorderState = await recorder.current.getState();
if (recorderState === 'recording') {
const blob = new Blob(chunks.current, {
type: 'audio/mpeg',
});
if (typeof onTranscribeCallback === 'function') {
const transcribed = await onTranscribeCallback(blob);
console.log('onTranscribe', transcribed);
if (transcribed.text) {
setTranscript((prev) => ({ ...prev, text: transcribed.text }));
}
} else {
const file = new File([blob], 'speech.mp3', {
type: 'audio/mpeg',
});
const text = await onWhispered(file);
console.log('onTranscribing', { text });
if (text) {
setTranscript((prev) => ({ ...prev, text }));
}
}
}
}
} catch (err) {
console.error(err);
}
};
/**
* Send audio file to Whisper to be transcribed
* - create formdata and append file, model, and language
* - append more Whisper config if whisperConfig is provided
* - add the OpenAI API token to the Authorization Bearer header
* - post with axios to the OpenAI Whisper endpoint
* - return transcribed text result
*/
const onWhispered = useMemoAsync(
async (file: File) => {
// Whisper currently only accepts multipart/form-data
const body = new FormData();
body.append('file', file);
body.append('model', 'whisper-1');
if (mode === 'transcriptions') {
body.append('language', whisperConfig?.language ?? 'en');
}
if (whisperConfig?.prompt) {
body.append('prompt', whisperConfig.prompt);
}
if (whisperConfig?.response_format) {
body.append('response_format', whisperConfig.response_format);
}
if (whisperConfig?.temperature) {
body.append('temperature', `${whisperConfig.temperature}`);
}
const headers: RawAxiosRequestHeaders = {};
headers['Content-Type'] = 'multipart/form-data';
if (apiKey) {
headers['Authorization'] = `Bearer ${apiKey}`;
}
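// mode is appended to the base endpoint, e.g. the transcriptions endpoint
// when mode is 'transcriptions' (the default)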
const { default: axios } = await import('axios');
const response = await axios.post(whisperApiEndpoint + mode, body, {
headers,
});
return response.data.text;
},
[apiKey, mode, whisperConfig]
);
/**
* reset transcript to the default state
* - set the transcript back to the defaultTranscript value
*/
const resetTranscript = () => {
setTranscript(defaultTranscript);
};
/**
* reset the recorder buffer; if the recorder was still recording, restart it immediately so capture continues
*/
const resetRecordings = async () => {
if (!recorder.current) {
return;
}
const recordState = await recorder.current.getState();
await recorder.current.reset();
chunks.current = [];
if (recordState === 'recording') {
await recorder.current.startRecording();
}
};
return {
recording,
speaking,
transcribing,
transcript,
pauseRecording,
startRecording,
stopRecording,
resetTranscript,
resetRecordings,
};
};