speech/betaFeatures.js

// Copyright 2017 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * This application demonstrates how to perform basic recognize operations
 * with the Google Cloud Speech API.
 *
 * For more information, see the README.md under /speech and the documentation
 * at https://cloud.google.com/speech/docs.
 */

'use strict';

async function speechTranscribeDiarization(fileName) {
  // [START speech_transcribe_diarization_beta]
  const fs = require('fs');

  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    enableSpeakerDiarization: true,
    minSpeakerCount: 2,
    maxSpeakerCount: 2,
    model: 'phone_call',
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  console.log('Speaker Diarization:');
  const result = response.results[response.results.length - 1];
  const wordsInfo = result.alternatives[0].words;
  // Note: The transcript within each result is separate and sequential per result.
  // However, the words list within an alternative includes all the words
  // from all the results thus far. Thus, to get all the words with speaker
  // tags, you only have to take the words list from the last result:
  wordsInfo.forEach(a =>
    console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)
  );
  // [END speech_transcribe_diarization_beta]
}

async function asyncSpeechTranscribeDiarizationGCS(gcsUri) {
  // [START speech_transcribe_diarization_gcs_beta]
  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following line before running the sample.
   */
  // const gcsUri = 'Path to GCS audio file, e.g. gs://bucket/audio.wav';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 8000,
    languageCode: 'en-US',
    enableSpeakerDiarization: true,
    minSpeakerCount: 2,
    maxSpeakerCount: 2,
    model: 'phone_call',
  };

  const audio = {
    uri: gcsUri,
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  console.log('Speaker Diarization:');
  const result = response.results[response.results.length - 1];
  const wordsInfo = result.alternatives[0].words;
  // Note: The transcript within each result is separate and sequential per result.
  // However, the words list within an alternative includes all the words
  // from all the results thus far. Thus, to get all the words with speaker
  // tags, you only have to take the words list from the last result:
  wordsInfo.forEach(a =>
    console.log(` word: ${a.word}, speakerTag: ${a.speakerTag}`)
  );
  // [END speech_transcribe_diarization_gcs_beta]
}
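// A minimal sketch, not part of the original sample: grouping diarized words
// by speaker instead of printing them one per line. It assumes the same
// `wordsInfo` shape used above, where each word carries an integer speakerTag
// assigned by the service. The function is illustrative and is not wired into
// the CLI below.
function groupWordsBySpeaker(wordsInfo) {
  const bySpeaker = new Map();
  for (const {word, speakerTag} of wordsInfo) {
    if (!bySpeaker.has(speakerTag)) {
      bySpeaker.set(speakerTag, []);
    }
    bySpeaker.get(speakerTag).push(word);
  }
  // e.g. Map { 1 => ['how', 'are', 'you'], 2 => ['fine', 'thanks'] }
  return bySpeaker;
}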
async function speechTranscribeMultiChannel(fileName) {
  // [START speech_transcribe_multichannel_beta]
  const fs = require('fs');

  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'LINEAR16',
    languageCode: 'en-US',
    audioChannelCount: 2,
    enableSeparateRecognitionPerChannel: true,
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(
      result =>
        ` Channel Tag: ${result.channelTag} ${result.alternatives[0].transcript}`
    )
    .join('\n');
  console.log(`Transcription: \n${transcription}`);
  // [END speech_transcribe_multichannel_beta]
}

async function speechTranscribeMultichannelGCS(gcsUri) {
  // [START speech_transcribe_multichannel_gcs_beta]
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  const config = {
    encoding: 'LINEAR16',
    languageCode: 'en-US',
    audioChannelCount: 2,
    enableSeparateRecognitionPerChannel: true,
  };

  const audio = {
    uri: gcsUri,
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(
      result =>
        ` Channel Tag: ${result.channelTag} ${result.alternatives[0].transcript}`
    )
    .join('\n');
  console.log(`Transcription: \n${transcription}`);
  // [END speech_transcribe_multichannel_gcs_beta]
}

async function speechTranscribeMultilang(fileName) {
  // [START speech_transcribe_multilanguage_beta]
  const fs = require('fs');

  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 44100,
    languageCode: 'en-US',
    alternativeLanguageCodes: ['es-ES', 'en-US'],
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  // [END speech_transcribe_multilanguage_beta]
}

async function speechTranscribeMultilangGCS(gcsUri) {
  // [START speech_transcribe_multilanguage_gcs_beta]
  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following line before running the sample.
   */
  // const gcsUri = 'Path to GCS audio file, e.g. gs://bucket/audio.wav';

  const config = {
    encoding: 'LINEAR16',
    sampleRateHertz: 44100,
    languageCode: 'en-US',
    alternativeLanguageCodes: ['es-ES', 'en-US'],
  };

  const audio = {
    uri: gcsUri,
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [operation] = await client.longRunningRecognize(request);
  const [response] = await operation.promise();
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  console.log(`Transcription: ${transcription}`);
  // [END speech_transcribe_multilanguage_gcs_beta]
}
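// A minimal sketch, not part of the original sample: when
// alternativeLanguageCodes is set, each result should also report which
// language the recognizer settled on. This assumes the v1p1beta1 result
// exposes a populated languageCode field; the samples above do not rely on
// it, and this helper is not wired into the CLI below.
function logDetectedLanguages(response) {
  response.results.forEach((result, i) => {
    console.log(
      `Result ${i} [${result.languageCode}]: ${result.alternatives[0].transcript}`
    );
  });
}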
async function speechTranscribeWordLevelConfidence(fileName) {
  // [START speech_transcribe_word_level_confidence_beta]
  const fs = require('fs');

  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const fileName = 'Local path to audio file, e.g. /path/to/audio.raw';

  const config = {
    encoding: 'FLAC',
    sampleRateHertz: 16000,
    languageCode: 'en-US',
    enableWordConfidence: true,
  };

  const audio = {
    content: fs.readFileSync(fileName).toString('base64'),
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  const confidence = response.results
    .map(result => result.alternatives[0].confidence)
    .join('\n');
  console.log(`Transcription: ${transcription} \n Confidence: ${confidence}`);
  console.log('Word-Level-Confidence:');
  const words = response.results.map(result => result.alternatives[0]);
  words[0].words.forEach(a => {
    console.log(` word: ${a.word}, confidence: ${a.confidence}`);
  });
  // [END speech_transcribe_word_level_confidence_beta]
}

async function speechTranscribeWordLevelConfidenceGCS(gcsUri) {
  // [START speech_transcribe_word_level_confidence_gcs_beta]
  // Imports the Google Cloud client library
  const speech = require('@google-cloud/speech').v1p1beta1;

  // Creates a client
  const client = new speech.SpeechClient();

  /**
   * TODO(developer): Uncomment the following line before running the sample.
   */
  // const gcsUri = 'Path to GCS audio file, e.g. gs://bucket/audio.wav';

  const config = {
    encoding: 'FLAC',
    sampleRateHertz: 16000,
    languageCode: 'en-US',
    enableWordConfidence: true,
  };

  const audio = {
    uri: gcsUri,
  };

  const request = {
    config: config,
    audio: audio,
  };

  const [response] = await client.recognize(request);
  const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
  const confidence = response.results
    .map(result => result.alternatives[0].confidence)
    .join('\n');
  console.log(`Transcription: ${transcription} \n Confidence: ${confidence}`);
  console.log('Word-Level-Confidence:');
  const words = response.results.map(result => result.alternatives[0]);
  words[0].words.forEach(a => {
    console.log(` word: ${a.word}, confidence: ${a.confidence}`);
  });
  // [END speech_transcribe_word_level_confidence_gcs_beta]
}
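// A minimal sketch, not part of the original sample: collecting words whose
// confidence falls below a threshold, using the same response shape as the
// word-level confidence samples above. The 0.9 cutoff is arbitrary, flatMap
// needs Node 11+, and this helper is not wired into the CLI below.
function lowConfidenceWords(response, threshold = 0.9) {
  return response.results
    .flatMap(result => result.alternatives[0].words || [])
    .filter(wordInfo => wordInfo.confidence < threshold)
    .map(({word, confidence}) => ({word, confidence}));
}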
require('yargs')
  .demand(1)
  .command(
    'Diarization',
    'Isolate distinct speakers in an audio file',
    {},
    opts => speechTranscribeDiarization(opts.speechFile)
  )
  .command(
    'DiarizationGCS',
    'Isolate distinct speakers in an audio file located in a Google Cloud Storage bucket.',
    {},
    opts => asyncSpeechTranscribeDiarizationGCS(opts.gcsUri)
  )
  .command(
    'multiChannelTranscribe',
    'Differentiates input by audio channel in local audio file.',
    {},
    opts => speechTranscribeMultiChannel(opts.speechFile)
  )
  .command(
    'multiChannelTranscribeGCS',
    'Differentiates input by audio channel in an audio file located in a Google Cloud Storage bucket.',
    {},
    opts => speechTranscribeMultichannelGCS(opts.gcsUri)
  )
  .command(
    'multiLanguageTranscribe',
    'Transcribes multiple languages from local audio file.',
    {},
    opts => speechTranscribeMultilang(opts.speechFile)
  )
  .command(
    'multiLanguageTranscribeGCS',
    'Transcribes multiple languages from GCS audio file.',
    {},
    opts => speechTranscribeMultilangGCS(opts.gcsUri)
  )
  .command(
    'wordLevelConfidence',
    'Detects word level confidence from local audio file.',
    {},
    opts => speechTranscribeWordLevelConfidence(opts.speechFile)
  )
  .command(
    'wordLevelConfidenceGCS',
    'Detects word level confidence from GCS audio file.',
    {},
    opts => speechTranscribeWordLevelConfidenceGCS(opts.gcsUri)
  )
  .options({
    speechFile: {
      alias: 'f',
      global: true,
      requiresArg: false,
      type: 'string',
    },
    gcsUri: {
      alias: 'u',
      global: true,
      requiresArg: true,
      type: 'string',
    },
  })
  .example('node $0 Diarization -f ./resources/commercial_mono.wav')
  .example(
    'node $0 DiarizationGCS -u gs://cloud-samples-tests/speech/commercial_mono.wav'
  )
  .example(
    'node $0 multiChannelTranscribe -f ./resources/commercial_stereo.wav'
  )
  .example(
    'node $0 multiChannelTranscribeGCS -u gs://cloud-samples-tests/speech/commercial_stereo.wav'
  )
  .example('node $0 multiLanguageTranscribe -f ./resources/multi.wav')
  .example(
    'node $0 multiLanguageTranscribeGCS -u gs://nodejs-docs-samples/multi_mono.wav'
  )
  .example('node $0 wordLevelConfidence -f ./resources/brooklyn.flac')
  .example(
    'node $0 wordLevelConfidenceGCS -u gs://cloud-samples-tests/speech/brooklyn.flac'
  )
  .wrap(120)
  .recommendCommands()
  .epilogue('For more information, see https://cloud.google.com/speech/docs')
  .help()
  .strict().argv;