// node-audio-processing/index.js
import { pipeline } from "@huggingface/transformers";
import wavefile from "wavefile";
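// Both dependencies are installed from npm: npm install @huggingface/transformers wavefile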
// Load model
const transcriber = await pipeline(
  "automatic-speech-recognition",
  "onnx-community/whisper-tiny.en",
);
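// The model files are downloaded and cached on first use. pipeline() also
// accepts an options object as a third argument, e.g. { dtype: "q8" } to pick
// a quantized variant (an assumption; check the Transformers.js docs for the
// values supported by your version).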
// Load audio data
const url =
  "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav";
const buffer = Buffer.from(await fetch(url).then((x) => x.arrayBuffer()));
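// To transcribe a local file instead, the buffer could come from disk
// (a sketch; "audio.wav" is a hypothetical path):
//   import { readFileSync } from "node:fs";
//   const buffer = readFileSync("audio.wav");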
// Read the .wav file and convert it to the format the model expects
const wav = new wavefile.WaveFile(buffer);
wav.toBitDepth("32f"); // Pipeline expects input as a Float32Array
wav.toSampleRate(16000); // Whisper expects a sampling rate of 16000 Hz
let audioData = wav.getSamples();
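// For multi-channel audio, getSamples() returns an array with one
// Float64Array per channel; collapse it to a single mono channel below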
if (Array.isArray(audioData)) {
  if (audioData.length > 1) {
    const SCALING_FACTOR = Math.sqrt(2);
    // Merge channels (into first channel to save memory)
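    // Averaging halves the signal power, so scale by sqrt(2) to keep the
    // mono mix at roughly the original RMS level. Note: only the first two
    // channels are merged here.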
    for (let i = 0; i < audioData[0].length; ++i) {
      audioData[0][i] =
        (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
    }
  }
  // Select first channel
  audioData = audioData[0];
}
// Run model
const start = performance.now();
const output = await transcriber(audioData);
const end = performance.now();
console.log(`Execution duration: ${(end - start) / 1000} seconds`);
console.log(output);
// { text: ' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.' }
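// For audio longer than 30 seconds (Whisper's context window), chunked
// inference can be enabled via a second argument (a sketch; option names
// assumed from the Transformers.js ASR pipeline docs):
//   const output = await transcriber(audioData, {
//     chunk_length_s: 30,
//     stride_length_s: 5,
//   });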