projects/deliberation_at_scale/packages/edge-functions/supabase/functions/transcribe/index.ts (244 lines of code) (raw):
import { serve } from "https://deno.land/std@0.168.0/http/server.ts";
import { createClient } from '@supabase/supabase-js';
import { Deepgram } from "https://esm.sh/@deepgram/sdk@v2.4.0";
import dayjs from "dayjs";
import base64 from "https://deno.land/x/b64@1.1.28/src/base64.js";
// Transcription backend used by the request handler.
// NOTE: only the 'whisper' branch of the handler currently produces message text; the 'deepgram' branch just logs its result.
const API_MODE: ApiMode = 'whisper';
// Credentials and base URLs of the transcription services, read from the environment (undefined when unset).
const OPENAI_API_KEY = Deno.env.get('OPENAI_API_KEY');
const DEEPGRAM_API_KEY = Deno.env.get('DEEPGRAM_API_KEY');
// Base URL of the Whisper API; '/transcriptions' is appended to it when transcribing.
const WHISPER_API_URL = Deno.env.get('WHISPER_API_URL');
// NOTE(review): DEEPGRAM_API_URL is not referenced anywhere in the visible source — confirm whether it is still needed.
const DEEPGRAM_API_URL = Deno.env.get('DEEPGRAM_API_URL');
// Supabase connection settings; each falls back to an empty string when the variable is unset.
const SUPABASE_URL = Deno.env.get('SUPABASE_URL') ?? '';
const SUPABASE_ANON_KEY = Deno.env.get('SUPABASE_ANON_KEY') ?? '';
const SUPABASE_SERVICE_ROLE_KEY = Deno.env.get('SUPABASE_SERVICE_ROLE_KEY') ?? '';
// Maximum number of sentences combined into one message by upsertMesagesForTranscript.
const SENTENCES_PER_MESSAGE = 2;
// Multiplied by SENTENCES_PER_MESSAGE to get the character count at which a message stops taking more sentences.
const MIN_TEXT_LENGTH_SENTENCE = 50;
// Transcripts of this length or shorter are skipped by upsertMesagesForTranscript.
const MIN_TEXT_LENGTH_BEFORE_UPSERT = 60;
// Defaults applied by transcribeAtWhisper when the caller leaves language/model undefined.
const DEFAULT_WHISPER_LANGUAGE = 'en';
const DEFAULT_WHISPER_MODEL = 'whisper-1';
// CORS headers sent with the OPTIONS response and with every JSON response.
const DEFAULT_HEADERS = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Headers': 'authorization, x-client-info, apikey, content-type',
};
// Supabase client authenticated with the service role key; used for all reads/writes on the `messages` table.
// NOTE(review): presumably bypasses row level security — confirm before exposing new queries through it.
const supabaseAdminClient = createClient(
SUPABASE_URL,
SUPABASE_SERVICE_ROLE_KEY
);
// Deepgram SDK client; constructed at module load regardless of API_MODE, even when DEEPGRAM_API_KEY is undefined.
const deepgramClient = new Deepgram(DEEPGRAM_API_KEY);
// Supported transcription backends.
type ApiMode = 'whisper' | 'deepgram';
/**
 * JSON payload read from the body of a transcription request.
 */
interface RequestBody {
// Audio to transcribe. The handler passes this string to `fetch()` and describes it as base64 encoded,
// so it is presumably a base64 data URL — confirm against the client.
content: string;
// Optional language forwarded to the Whisper transcription.
language?: string;
// Optional model forwarded to the Whisper transcription.
model?: string;
// Stored as `room_id` on the created message.
roomId: string;
// Stored as `participant_id` on the created message.
participantId: string;
// Required by the handler; forwarded as `afterCreatedAt` when storing the transcript.
chunkStartTime: string;
}
/**
 * HTTP entry point: transcribes an uploaded audio chunk and stores the
 * resulting text as a voice message.
 *
 * Flow:
 *  1. answer CORS preflight requests,
 *  2. reject callers without a valid Supabase session,
 *  3. validate the JSON payload,
 *  4. transcribe the audio with the backend selected by `API_MODE`,
 *  5. persist non-empty text as a message and return the raw backend result.
 *
 * Guard failures are reported as a JSON `{ error }` body.
 *
 * NOTE(review): `roomId` and `participantId` come from the client and are
 * written with the admin client without checking that they belong to the
 * authenticated user — confirm whether an ownership check is needed.
 */
serve(async (req) => {
  // guard: handle options request
  if (req.method === 'OPTIONS') {
    return new Response('ok', { headers: DEFAULT_HEADERS });
  }

  // guard: check if the user is logged in
  // NOTE: `auth.getUser()` always resolves to a `{ data, error }` object, so
  // the response itself is always truthy; the contained user must be checked.
  const supabaseUserClient = await getSupabaseUserClient(req);
  const { data: authData, error: authError } = await supabaseUserClient.auth.getUser();

  if (authError || !authData?.user) {
    return createResponse({
      error: 'You need to be logged in to use this function.',
    });
  }

  // get the data from the request
  let payload: Partial<RequestBody>;

  try {
    payload = ((await req.json()) ?? {}) as Partial<RequestBody>;
  } catch {
    // the body is missing or not valid JSON
    return createResponse({
      error: 'Invalid data is passed to handle the transcription.',
    });
  }

  const {
    content, // base64 encoded data URL
    language,
    model,
    roomId,
    participantId,
    chunkStartTime,
  } = payload;

  // guard: check if passed data is valid
  // NOTE: `content` is handed to `fetch()` below, so only inline `data:` URLs
  // are accepted; otherwise a caller could make this function request
  // arbitrary URLs on its behalf.
  if (!chunkStartTime || typeof content !== 'string' || !/^data:/i.test(content)) {
    return createResponse({
      error: 'Invalid data is passed to handle the transcription.',
    });
  }

  // convert base64 content to a file which is required by Whisper
  // SOURCE: https://gist.github.com/AshikNesin/ca4ad1ff1d24c26cb228a3fb5c72e0d5
  let file: File;

  try {
    const fetchedContent = await fetch(content);
    const blob = await fetchedContent.blob();

    file = new File([blob], 'speech.mp3', { type: 'audio/mpeg' });
  } catch (error: unknown) {
    console.error('Failed to decode the audio content', error);
    return createResponse({
      error: 'Invalid data is passed to handle the transcription.',
    });
  }

  let text = '';
  let result: object = {};

  if (API_MODE === 'whisper') {
    // convert to text
    const whisperResult = await transcribeAtWhisper({
      file,
      language,
      model,
    });

    result = whisperResult;
    // the backend may answer with an error object that has no `text`
    text = typeof whisperResult?.text === 'string' ? whisperResult.text : '';
  } else if (API_MODE === 'deepgram') {
    result = await transcribeAtDeepgram({
      file,
      content,
    });
    console.log('DEEPGRAM RESULT')
    console.log(result);
    // TODO: handle results
  }

  // only update messages when there is text and a target to attach it to
  if (text && roomId && participantId) {
    // await upsertMesagesForTranscript({
    const { error: insertError } = await insertMesagesForTranscript({
      text,
      roomId,
      participantId,
      afterCreatedAt: chunkStartTime,
    });

    // keep the response best-effort, but do not lose the failure
    if (insertError) {
      console.error('Failed to store the transcribed message', insertError);
    }
  }

  return createResponse(result);
});
/**
 * Transcript text plus the room/participant it belongs to, shared by the
 * functions that read and write rows of the `messages` table.
 */
interface MessagesContext {
// Transcribed text used as message content.
text: string;
// Written to / matched against the `room_id` column.
roomId: string;
// Written to / matched against the `participant_id` column.
participantId: string;
// Inclusive lower bound on `created_at` when looking up existing messages.
afterCreatedAt: string;
}
/**
 * Update the messages for the given transcript.
 *
 * The trimmed transcript is split into sentences, which are grouped into
 * messages of at most `SENTENCES_PER_MESSAGE` sentences (a message also stops
 * taking sentences once it reaches
 * `MIN_TEXT_LENGTH_SENTENCE * SENTENCES_PER_MESSAGE` characters). The n-th
 * generated message overwrites the n-th existing message found for the
 * context; any further generated messages are inserted as new voice messages.
 *
 * Transcripts of `MIN_TEXT_LENGTH_BEFORE_UPSERT` characters or fewer are
 * ignored. Database failures are logged and never thrown.
 *
 * @param context transcript text and the room/participant it belongs to
 */
async function upsertMesagesForTranscript(context: MessagesContext) {
  const { text: rawText, roomId, participantId } = context;
  const text = rawText?.trim() ?? '';

  // guard: check if the text is long enough to be split into messages
  // NOTE: done before any database access so short transcripts cost no query
  if (text.length <= MIN_TEXT_LENGTH_BEFORE_UPSERT) {
    return;
  }

  const existingMessages = await getMessagesByContext(context);
  const existingMessageAmount = existingMessages.length;
  // NOTE: alternating list of sentence, punctuation, sentence, punctuation, ...
  const sentences = getSentencesWithPunctuation(text);
  const targetMessageAmount = Math.ceil(calculateAmountOfSentences(text) / SENTENCES_PER_MESSAGE);
  const targetCharacterAmount = MIN_TEXT_LENGTH_SENTENCE * SENTENCES_PER_MESSAGE;
  const upsertPromises: PromiseLike<{ error: unknown }>[] = [];
  // NOTE: only consumed by the disabled deactivation step below
  const handledExistingMessageIds = [];

  // loop all the messages
  for (let messageIndex = 0; messageIndex < targetMessageAmount; messageIndex++) {
    let messageContent = '';
    let sentenceAmount = 0;

    // take sentences from the front until the message is long enough or has enough sentences
    while (
      sentences.length > 0 &&
      messageContent.length < targetCharacterAmount &&
      sentenceAmount < SENTENCES_PER_MESSAGE
    ) {
      // NOTE: a sentence and its punctuation are removed together so they won't be used again;
      // the last sentence may have no punctuation
      const [sentence = '', punctuation = ''] = sentences.splice(0, 2);

      messageContent += sentence + punctuation;
      sentenceAmount += 1;
    }

    // guard: make sure the message is valid
    if (!messageContent) {
      break;
    }

    // check whether this should be existing message or a new one
    if (messageIndex < existingMessageAmount) {
      const existingMessage = existingMessages[messageIndex];
      const { id: existingMessageId, content: existingMessageContent } = existingMessage;

      // an existing message that maps to generated content counts as handled, changed or not
      handledExistingMessageIds.push(existingMessageId);

      // guard: skip when content is identical
      if (existingMessageContent === messageContent) {
        continue;
      }

      const updatePromise = supabaseAdminClient
        .from('messages')
        .update({
          active: true,
          content: messageContent,
        })
        .eq('id', existingMessageId);

      upsertPromises.push(updatePromise);
    } else {
      const insertPromise = supabaseAdminClient
        .from('messages')
        .insert({
          active: true,
          content: messageContent,
          room_id: roomId,
          participant_id: participantId,
          type: 'voice',
        });

      upsertPromises.push(insertPromise);
    }
  }

  // deactivate the unhandled existing messages where no content was left for anymore
  // NOTE: intentionally disabled; kept for reference
  // const unhandledExistingMessages = existingMessages.filter((message) => {
  //   const { id: existingMessageId } = message;
  //   return !handledExistingMessageIds.includes(existingMessageId);
  // });
  // unhandledExistingMessages.map((message) => {
  //   const { id: existingMessageId } = message;
  //   const updatePromise = supabaseAdminClient
  //     .from('messages')
  //     .update({
  //       active: false,
  //     })
  //     .eq('id', existingMessageId);
  //   upsertPromises.push(updatePromise);
  // });

  // run all writes and report the ones that failed instead of dropping the outcome
  const results = await Promise.allSettled(upsertPromises);

  for (const result of results) {
    if (result.status === 'rejected') {
      console.error('Failed to upsert a transcript message', result.reason);
    } else if (result.value?.error) {
      console.error('Failed to upsert a transcript message', result.value.error);
    }
  }
}
/**
 * Store the complete transcript text as one new, active voice message for the
 * room and participant of the given context.
 *
 * @param context transcript text and the room/participant it belongs to
 * @returns the response of the Supabase insert query
 */
async function insertMesagesForTranscript(context: MessagesContext) {
  const messagesTable = supabaseAdminClient.from('messages');

  return await messagesTable.insert({
    active: true,
    content: context.text,
    room_id: context.roomId,
    participant_id: context.participantId,
    type: 'voice',
  });
}
/**
 * Get all messages by the given context.
 *
 * Selects the messages of the context's room and participant that were
 * created at or after `afterCreatedAt`, oldest first.
 *
 * @param context room, participant and lower `created_at` bound to filter on
 * @returns the matching rows, or an empty array when nothing was found or the
 *          query failed (the failure is logged)
 */
async function getMessagesByContext(context: MessagesContext) {
  const { roomId, participantId, afterCreatedAt } = context;
  const { data: messages, error } = await supabaseAdminClient
    .from('messages')
    .select('*')
    .eq('room_id', roomId)
    .eq('participant_id', participantId)
    .gte('created_at', afterCreatedAt)
    .order('created_at', { ascending: true });

  // guard: report a failed query instead of silently treating it as "no messages"
  if (error) {
    console.error('Failed to fetch the messages for the given context', error);
    return [];
  }

  return messages ?? [];
}
/**
 * Options accepted by transcribeAtWhisper.
 */
interface TranscribeAtWhisperOptions {
// Audio file sent as the `file` form field.
file: File;
// Sent as the `language` form field; DEFAULT_WHISPER_LANGUAGE is used when undefined.
language?: string;
// Sent as the `model` form field; DEFAULT_WHISPER_MODEL is used when undefined.
model?: string;
}
/**
 * Send the audio file to the Whisper transcription endpoint.
 *
 * The file, language and model are posted as multipart form data to
 * `${WHISPER_API_URL}/transcriptions`, authorized with `OPENAI_API_KEY`.
 *
 * @param options audio file plus optional language and model overrides
 * @returns the parsed JSON body of the API response
 */
async function transcribeAtWhisper(options: TranscribeAtWhisperOptions) {
  // NOTE: the defaults only apply when the option is undefined
  const language = options.language === undefined ? DEFAULT_WHISPER_LANGUAGE : options.language;
  const model = options.model === undefined ? DEFAULT_WHISPER_MODEL : options.model;

  const formData = new FormData();
  formData.append('file', options.file);
  formData.append('language', language);
  formData.append('model', model);

  const whisperResponse = await fetch(`${WHISPER_API_URL}/transcriptions`, {
    method: 'POST',
    // Don't set contentType manually → https://github.com/github/fetch/issues/505#issuecomment-293064470
    headers: { 'Authorization': `Bearer ${OPENAI_API_KEY}` },
    body: formData,
  });

  return whisperResponse.json();
}
/**
 * Options accepted by transcribeAtDeepgram.
 * NOTE(review): only `content` is actually used by the visible implementation.
 */
interface TranscribeAtDeepgramOptions {
// Audio file; read from the options but not used by transcribeAtDeepgram.
file?: File;
// Base64 audio handed to `base64.toArrayBuffer` to build the buffer sent to Deepgram.
content?: string;
// Not read by the visible implementation.
language?: string;
// Not read by the visible implementation.
model?: string;
}
/**
 * Transcribe pre-recorded audio with the Deepgram client.
 *
 * The base64 `content` is decoded into a byte buffer and submitted with the
 * `audio/mpeg` mimetype.
 *
 * @param options options holding the base64 encoded audio in `content`
 * @returns the transcription returned by the Deepgram SDK
 */
async function transcribeAtDeepgram(options: TranscribeAtDeepgramOptions) {
  const audioBytes = new Uint8Array(base64.toArrayBuffer(options.content, true));

  return await deepgramClient.transcription.preRecorded({
    buffer: audioBytes,
    mimetype: 'audio/mpeg',
  });
}
/**
 * Get the number of sentences in a given text.
 *
 * @param text text to count the sentences of
 * @returns a whole number of sentences; a trailing sentence without closing
 *          punctuation counts as one sentence, an empty text has none
 */
function calculateAmountOfSentences(text: string) {
  const sentences = getSentencesWithPunctuation(text);

  // NOTE: the parts alternate between sentence and punctuation, so two parts
  // make one sentence; round up so an unterminated last sentence is not
  // counted as half a sentence
  return Math.ceil(sentences.length / 2);
}

/**
 * Split the given text into sentences.
 *
 * @param text text to split
 * @returns the non-empty parts of the text in their original order, where
 *          every run of `.`, `!` or `?` is kept as its own part
 *          (e.g. `"Hi! Ok?"` → `["Hi", "!", " Ok", "?"]`)
 */
function getSentencesWithPunctuation(text: string) {
  // NOTE: the capture group makes `split` keep the punctuation runs in the result
  return text.split(/([\.!?]+)/g).filter((part) => part.length > 0);
}
/**
 * Get the number of words in a given text.
 *
 * Words are separated by any run of whitespace; leading and trailing
 * whitespace is ignored.
 *
 * @param text text to count the words of
 * @returns the number of words, `0` for an empty or whitespace-only text
 */
function calculateAmountOfWords(text: string) {
  const trimmedText = text.trim();

  // guard: splitting an empty string would yield one (empty) word
  if (!trimmedText) {
    return 0;
  }

  return trimmedText.split(/\s+/).length;
}
/**
 * Get the average amount of words per sentence in a given text.
 *
 * @param text text to analyse
 * @returns the word count divided by the sentence count, where a sentence
 *          count of zero is treated as one to avoid dividing by zero
 */
function calculateAverageWordsPerSentence(text: string) {
  const wordCount = calculateAmountOfWords(text);
  const sentenceCount = calculateAmountOfSentences(text);
  const divisor = sentenceCount || 1;

  return wordCount / divisor;
}
/**
 * Create a Supabase client with the Auth context of the logged in user.
 *
 * The `Authorization` header of the incoming request is forwarded to Supabase
 * when it is present. When it is missing no override is set, instead of
 * forwarding an empty (`null`) value as the header.
 *
 * @param req incoming request whose `Authorization` header identifies the user
 * @returns a Supabase client created with the anon key
 */
async function getSupabaseUserClient(req: Request) {
  const authorization = req.headers.get('Authorization');
  const headers: Record<string, string> = authorization
    ? { Authorization: authorization }
    : {};

  return createClient(
    SUPABASE_URL,
    SUPABASE_ANON_KEY,
    { global: { headers } }
  );
}
/**
 * Create a response with the given data.
 *
 * @param data value that is serialized to JSON and used as the response body
 * @returns a response with a JSON content type and the default CORS headers
 */
function createResponse(data: object) {
  const headers = {
    "Content-Type": "application/json",
    ...DEFAULT_HEADERS,
  };
  const body = JSON.stringify(data);

  return new Response(body, { headers });
}