in packages/directlinespeech/src/playCognitiveServicesStream.js [103:207]
export default async function playCognitiveServicesStream(audioContext, stream, { signal = {} } = {}) {
  if (!audioContext) {
    throw new Error('botframework-directlinespeech-sdk: audioContext must be specified.');
  } else if (!stream) {
    throw new Error('botframework-directlinespeech-sdk: stream must be specified.');
  } else if (!stream.format) {
    throw new Error('botframework-directlinespeech-sdk: stream is missing format.');
  } else if (typeof stream.read !== 'function') {
    throw new Error('botframework-directlinespeech-sdk: stream is missing read().');
  }

  const queuedBufferSourceNodes = [];

  try {
    const { format } = stream;
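
    // abortToReject turns the abort signal into a promise that rejects when the signal aborts.
    // It is raced against pending reads below so an abort can interrupt a read that may never resolve.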
    const abortPromise = abortToReject(signal);
    const array = new Uint8Array(DEFAULT_BUFFER_SIZE);
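
    // read() fetches the next chunk from the Speech SDK audio stream into the reusable `array` buffer.
    // It resolves to a Uint8Array (sliced down for a partial read), or undefined when the stream ends
    // or when the abort promise settles first, which ends the read loop below.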
    const read = () =>
      Promise.race([
        abortPromise.catch(() => {
          // Abort will gracefully end the queue. We will check signal.aborted later and throw an abort error.
        }),
        stream
          .read(array.buffer)
          .then(numBytes => (numBytes === array.byteLength ? array : numBytes ? array.slice(0, numBytes) : undefined))
      ]);
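
    // Bail out before reading any audio if the caller has already aborted.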
    if (signal.aborted) {
      throw new Error('aborted');
    }

    let { samplesPerSec } = format;

    // TODO: [P0] #3692 Remove the following if-condition block when the underlying bugs are resolved.
    // There is a bug in Speech SDK 1.15.0 that returns 24kHz instead of 16kHz.
    // Even if we explicitly specify the output audio format as 16kHz, there is another bug that ignores it.
    // In short, the DLSpeech service currently always streams in RIFF WAV format, instead of MP3.
    // https://github.com/microsoft/cognitive-services-speech-sdk-js/issues/313
    // https://github.com/microsoft/cognitive-services-speech-sdk-js/issues/314
    if (format.requestAudioFormatString === 'audio-24khz-48kbitrate-mono-mp3') {
      samplesPerSec = 16000;
    }

    let newSamplesPerSec = samplesPerSec;
    let sampleRateMultiplier = 1;

    // Safari requires a minimum sample rate of 22100 Hz.
    // A multiplier is calculated so the data meets the minimum sample rate.
    // An integer-based multiplier is used to simplify our upsampler.
    // For security, data will only be upsampled up to 96000 Hz.
    while (newSamplesPerSec < MIN_SAMPLE_RATE && newSamplesPerSec < 96000) {
      sampleRateMultiplier++;
      newSamplesPerSec = samplesPerSec * sampleRateMultiplier;
    }
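
    // For example, a 16000 Hz stream gets a 2x multiplier (32000 Hz), and an 8000 Hz stream gets a 3x multiplier (24000 Hz).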

    // The third parameter is the buffer size in samples.
    // For example, if the Speech SDK sends Web Chat 4096 bytes of 16-bit samples, there will be 2048 samples per channel.
    // The multi-buffering player is set up to handle 2048 samples per buffer.
    // If the multiplier is 3x, it will handle 6144 samples per buffer.
    const player = createMultiBufferingPlayer(
      audioContext,
      { ...format, samplesPerSec: newSamplesPerSec },
      (DEFAULT_BUFFER_SIZE / (format.bitsPerSample / 8)) * sampleRateMultiplier
    );

    // For security, the maximum number of chunks handled will be 1000.
    for (
      let chunk = await read(), maxChunks = 0;
      chunk && maxChunks < 1000 && !signal.aborted;
      chunk = await read(), maxChunks++
    ) {
      if (signal.aborted) {
        break;
      }

      // Data received from the Speech SDK is interleaved; 2 channels (e.g. A and B) will be sent as "ABABABABAB".
      // Each sample (A/B) is an 8- to 32-bit number.
      // Convert each sample into a floating-point number, as required by the Web Audio API.
      const interleavedArray = formatAudioDataArrayBufferToFloatArray(format, chunk.buffer);

      // Deinterleave the data back into one array per channel, e.g. "AAAAA" and "BBBBB".
      const multiChannelArray = deinterleave(interleavedArray, format);

      // Upsample the data if necessary. If the multiplier is 2x, "AAAAA" will be upsampled to "AAAAAAAAAA" (with anti-aliasing).
      const upsampledMultiChannelArray = multiChannelArray.map(array =>
        multiplySampleRate(array, sampleRateMultiplier)
      );

      // Queue to the buffering player.
      player.push(upsampledMultiChannelArray);
    }
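
    // If an abort arrives from this point on, immediately drop any queued-but-unplayed buffers.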
    abortPromise.catch(() => player.cancelAll());

    if (signal.aborted) {
      throw new Error('aborted');
    }
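
    // Wait until the player has drained all queued buffers; an abort rejects the race and surfaces as an error.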
    await Promise.race([abortPromise, player.flush()]);
  } finally {
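    // Stop any queued AudioBufferSourceNode so no audio keeps playing after an error or abort.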
    queuedBufferSourceNodes.forEach(node => node.stop());
  }
}