in speech-commands/src/browser_fft_recognizer.ts [174:281]
  async listen(
      callback: RecognizerCallback,
      config?: StreamingRecognitionConfig): Promise<void> {
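    // Only one streaming session may be active per recognizer at a time.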
    if (this.streaming) {
      throw new Error(
          'Cannot start streaming again when streaming is ongoing.');
    }

    await this.ensureModelLoaded();

    if (config == null) {
      config = {};
    }
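    // With no explicit probabilityThreshold, default to 0 so every window
    // passes the score check in the spectrogram callback below.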
    let probabilityThreshold =
        config.probabilityThreshold == null ? 0 : config.probabilityThreshold;
    if (config.includeEmbedding) {
      // Override probability threshold to 0 if includeEmbedding is true.
      probabilityThreshold = 0;
    }
    tf.util.assert(
        probabilityThreshold >= 0 && probabilityThreshold <= 1,
        () => `Invalid probabilityThreshold value: ${probabilityThreshold}`);
    let invokeCallbackOnNoiseAndUnknown =
        config.invokeCallbackOnNoiseAndUnknown == null ?
        false :
        config.invokeCallbackOnNoiseAndUnknown;
    if (config.includeEmbedding) {
      // Override invokeCallbackOnNoiseAndUnknown to true if
      // includeEmbedding is true.
      invokeCallbackOnNoiseAndUnknown = true;
    }
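
    // Negative suppression times are rejected here; a null/undefined value
    // falls back to DEFAULT_SUPPRESSION_TIME_MILLIS further below.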
    if (config.suppressionTimeMillis < 0) {
      throw new Error(
          `suppressionTimeMillis is expected to be >= 0, ` +
          `but got ${config.suppressionTimeMillis}`);
    }
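
    // overlapFactor determines how much successive spectrogram windows
    // overlap: 0.5 means each new window shares half its frames with the
    // previous one.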
    const overlapFactor =
        config.overlapFactor == null ? 0.5 : config.overlapFactor;
    tf.util.assert(
        overlapFactor >= 0 && overlapFactor < 1,
        () => `Expected overlapFactor to be >= 0 and < 1, but got ${
            overlapFactor}`);
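
    // Runs once per spectrogram window emitted by the feature extractor;
    // returning true triggers the suppression interval.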
    const spectrogramCallback: SpectrogramCallback =
        async (x: tf.Tensor, timeData?: tf.Tensor) => {
          const normalizedX = normalize(x);
          let y: tf.Tensor;
          let embedding: tf.Tensor;
          if (config.includeEmbedding) {
            await this.ensureModelWithEmbeddingOutputCreated();
            [y, embedding] =
                this.modelWithEmbeddingOutput.predict(normalizedX) as
                tf.Tensor[];
          } else {
            y = this.model.predict(normalizedX) as tf.Tensor;
          }
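
          // Read the scores and the top-scoring index back from the backend,
          // then free the intermediate tensors.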
          const scores = await y.data() as Float32Array;
          const maxIndexTensor = y.argMax(-1);
          const maxIndex = (await maxIndexTensor.data())[0];
          const maxScore = Math.max(...scores);
          tf.dispose([y, maxIndexTensor, normalizedX]);
          if (maxScore < probabilityThreshold) {
            return false;
          } else {
            let spectrogram: SpectrogramData = undefined;
            if (config.includeSpectrogram) {
              spectrogram = {
                data: await x.data() as Float32Array,
                frameSize: this.nonBatchInputShape[1],
              };
            }
            let wordDetected = true;
            if (!invokeCallbackOnNoiseAndUnknown) {
              // Skip background noise and unknown tokens.
              if (this.words[maxIndex] === BACKGROUND_NOISE_TAG ||
                  this.words[maxIndex] === UNKNOWN_TAG) {
                wordDetected = false;
              }
            }
            if (wordDetected) {
              callback({scores, spectrogram, embedding});
            }
            // Trigger suppression only if the word is neither unknown nor
            // background noise.
            return wordDetected;
          }
        };
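
    // After a detection, callbacks are suppressed for this many milliseconds.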
    const suppressionTimeMillis = config.suppressionTimeMillis == null ?
        this.DEFAULT_SUPPRESSION_TIME_MILLIS :
        config.suppressionTimeMillis;
    this.audioDataExtractor = new BrowserFftFeatureExtractor({
      sampleRateHz: this.parameters.sampleRateHz,
      numFramesPerSpectrogram: this.nonBatchInputShape[0],
      columnTruncateLength: this.nonBatchInputShape[1],
      suppressionTimeMillis,
      spectrogramCallback,
      overlapFactor
    });
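
    // Starting the extractor requests microphone access via getUserMedia,
    // honoring any caller-supplied audio track constraints.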
    await this.audioDataExtractor.start(config.audioTrackConstraints);

    this.streaming = true;
  }
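
For orientation, a minimal usage sketch of listen() from application code. It assumes the package's create('BROWSER_FFT') factory and its default vocabulary; the threshold and suppression values are illustrative choices, not prescribed by this method.

import * as speechCommands from '@tensorflow-models/speech-commands';

async function startRecognition() {
  // Build a recognizer backed by the browser's native FFT and load its model.
  const recognizer = speechCommands.create('BROWSER_FFT');
  await recognizer.ensureModelLoaded();

  // wordLabels() is aligned with the scores array passed to the callback,
  // and includes the background-noise and unknown tags.
  const words = recognizer.wordLabels();

  await recognizer.listen(async result => {
    const scores = result.scores as Float32Array;
    // Report the top-scoring word for this window.
    let maxIndex = 0;
    for (let i = 1; i < scores.length; ++i) {
      if (scores[i] > scores[maxIndex]) {
        maxIndex = i;
      }
    }
    console.log(`Heard "${words[maxIndex]}" (p = ${scores[maxIndex].toFixed(2)})`);
  }, {
    probabilityThreshold: 0.75,   // Skip low-confidence windows.
    overlapFactor: 0.5,           // The default window overlap.
    suppressionTimeMillis: 1000   // Debounce repeated detections of one word.
  });

  // To end the streaming session: await recognizer.stopListening();
}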