src/extensions/speech_extension/commands/synthesize_command.cs (465 lines of code) (raw):
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using System.Collections.Generic;
using System.Net;
namespace Azure.AI.Details.Common.CLI
{
public class SynthesizeCommand : Command
{
/// <summary>
/// Creates the synthesize command; <paramref name="values"/> is passed unchanged to the base
/// <c>Command</c> constructor.
/// </summary>
/// <param name="values">Command values forwarded to the base class.</param>
public SynthesizeCommand(ICommandValues values) : base(values)
{
}
/// <summary>
/// Entry point for the command: either prints the voice list or runs a synthesis.
/// </summary>
/// <returns>
/// The result of <c>DoListVoices</c> when "synthesizer.list.voices" is set; otherwise the
/// "passed" value (defaulting to <c>true</c>) read after <c>Synthesize</c> has finished.
/// </returns>
public bool RunCommand()
{
    if (_values.GetOrDefault("synthesizer.list.voices", false))
    {
        return DoListVoices();
    }

    Synthesize();
    return _values.GetOrDefault("passed", true);
}
/// <summary>
/// Downloads the voice list from the URL built by <c>GetVoiceNameListUrl</c> and prints it as JSON.
/// </summary>
/// <returns>
/// <c>true</c> when the downloaded content is non-empty; <c>false</c> when it is empty or when
/// no URL could be determined.
/// </returns>
private bool DoListVoices()
{
    string url = GetVoiceNameListUrl();

    // GetVoiceNameListUrl yields null when none of endpoint/host/region is configured;
    // report that explicitly instead of handing a null URL to the downloader.
    if (string.IsNullOrEmpty(url))
    {
        _values.AddThrowError("ERROR:", "Listing voices; requires one of: region, endpoint, or host.");
        return false;
    }

    var downloaded = HttpHelpers.DownloadFileWithRetry(url, "Downloading voice name list...", _values);
    var content = FileHelpers.ReadAllText(downloaded, Encoding.Default);

    Console.WriteLine("Voice names:");
    JsonHelpers.PrintJson(content);

    return !string.IsNullOrEmpty(content);
}
/// <summary>
/// Picks the URL used to list voices, in priority order: the configured endpoint URI (returned
/// as-is), then the configured host, then the regional TTS host.
/// </summary>
/// <returns>The selected URL, or <c>null</c> when endpoint, host and region are all empty.</returns>
private string GetVoiceNameListUrl()
{
    var host = _values["service.config.host"];
    var region = _values["service.config.region"];
    var endpoint = _values["service.config.endpoint.uri"];

    if (!string.IsNullOrEmpty(endpoint))
    {
        return endpoint;
    }

    if (!string.IsNullOrEmpty(host))
    {
        return $"{host}/cognitiveservices/voices/list";
    }

    if (!string.IsNullOrEmpty(region))
    {
        return $"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list";
    }

    return null;
}
/// <summary>
/// Runs one synthesis session: starts the command, dispatches on "synthesizer.input.type",
/// then stops the command, disposes tracked objects and deletes temporary files.
/// </summary>
private void Synthesize()
{
    StartCommand();

    var inputType = _values["synthesizer.input.type"];
    if (string.IsNullOrEmpty(inputType) || inputType == "interactive" || inputType == "interactive+")
    {
        // "interactive" shares the repeating "interactive+" behaviour; the single-shot
        // call SynthesizeInteractive(false) is not currently used.
        SynthesizeInteractive(true);
    }
    else if (inputType == "text")
    {
        SynthesizeText();
    }
    else if (inputType == "text.file")
    {
        SynthesizeTextFile();
    }
    else if (inputType == "ssml")
    {
        SynthesizeSsml();
    }
    else if (inputType == "ssml.file")
    {
        SynthesizeSsmlFile();
    }
    // Any other input type: nothing is synthesized.

    StopCommand();
    DisposeAfterStop();
    DeleteTemporaryFiles();
}
/// <summary>
/// Prompts for text on the console and speaks each line until the user enters an empty line
/// or one of "stop", "quit" or "exit".
/// </summary>
/// <param name="repeatedly">
/// When <c>false</c>, only a single line is synthesized; when <c>true</c>, the prompt repeats
/// until an exit word is entered or a cancellation has been signalled.
/// </param>
private void SynthesizeInteractive(bool repeatedly = false)
{
    SpeechSynthesizer synthesizer = CreateSpeechSynthesizer();
    synthesizer.SynthesisStarted += SynthesisStarted;
    synthesizer.Synthesizing += Synthesizing;
    synthesizer.SynthesisCompleted += SynthesisCompleted;
    synthesizer.SynthesisCanceled += SynthesisCanceled;

    var wordboundary = _values["config.metadata.wordBoundaryEnabled"];
    if (wordboundary == "true")
    {
        synthesizer.WordBoundary += SynthesisWordBoundary;
    }

    // Compared ordinally and case-insensitively: a culture-sensitive ToLower() maps 'I' to a
    // dotless 'ı' under Turkish cultures, so "EXIT"/"QUIT" would not be recognised there.
    var exitWords = new[] { "stop", "quit", "exit" };

    while (true)
    {
        Console.Write("Enter text: ");
        var text = ConsoleHelpers.ReadLineOrDefault("", "exit");

        // A null or empty line ends the session, the same as an explicit exit word.
        if (string.IsNullOrEmpty(text)) break;
        if (exitWords.Contains(text, StringComparer.OrdinalIgnoreCase)) break;

        var task = synthesizer.SpeakTextAsync(text);
        WaitForStopOrCancel(synthesizer, task);

        if (!repeatedly) break;
        if (_canceledEvent.WaitOne(0)) break;
    }
}
/// <summary>
/// Speaks the text stored under "synthesizer.input.text" (an empty string when it is not set).
/// </summary>
private void SynthesizeText() => SynthesizeText(_values.GetOrEmpty("synthesizer.input.text"));
/// <summary>
/// Locates the file named by "synthesizer.input.text.file" in the data path, reads it, and
/// speaks its content as plain text.
/// </summary>
private void SynthesizeTextFile()
{
    var requestedName = _values.GetOrEmpty("synthesizer.input.text.file");
    var resolvedPath = FileHelpers.DemandFindFileInDataPath(requestedName, _values, "text input");
    var fileContent = FileHelpers.ReadAllText(resolvedPath, Encoding.Default);

    SynthesizeText(fileContent);
}
/// <summary>
/// Creates a synthesizer, hooks up the event handlers, and speaks <paramref name="text"/>
/// as plain text, waiting until the request finishes, stops or is canceled.
/// </summary>
/// <param name="text">Plain text to synthesize.</param>
private void SynthesizeText(string text)
{
    var speaker = CreateSpeechSynthesizer();

    speaker.SynthesisStarted += SynthesisStarted;
    speaker.Synthesizing += Synthesizing;
    speaker.SynthesisCompleted += SynthesisCompleted;
    speaker.SynthesisCanceled += SynthesisCanceled;

    // Word boundary events are only observed when explicitly enabled with the exact value "true".
    var wordBoundaryFlag = _values["config.metadata.wordBoundaryEnabled"];
    if (wordBoundaryFlag == "true")
    {
        speaker.WordBoundary += SynthesisWordBoundary;
    }

    WaitForStopOrCancel(speaker, speaker.SpeakTextAsync(text));
}
/// <summary>
/// Speaks the SSML stored under "synthesizer.input.ssml" (an empty string when it is not set).
/// </summary>
private void SynthesizeSsml() => SynthesizeSsml(_values.GetOrEmpty("synthesizer.input.ssml"));
/// <summary>
/// Reads the input file named by "synthesizer.input.ssml.file" (falling back to
/// "synthesizer.input.text.file") and speaks it. Content whose first non-whitespace
/// character is an opening angle bracket is spoken as SSML; anything else as plain text.
/// </summary>
private void SynthesizeSsmlFile()
{
    var fallbackName = _values.GetOrEmpty("synthesizer.input.text.file");
    var requestedName = _values.GetOrDefault("synthesizer.input.ssml.file", fallbackName);
    var resolvedPath = FileHelpers.DemandFindFileInDataPath(requestedName, _values, "ssml input");
    var payload = FileHelpers.ReadAllText(resolvedPath, Encoding.Default);

    var looksLikeMarkup = payload.TrimStart().StartsWith("<");
    if (looksLikeMarkup)
    {
        SynthesizeSsml(payload);
        return;
    }

    SynthesizeText(payload);
}
/// <summary>
/// Creates a synthesizer, hooks up the event handlers, and speaks <paramref name="ssml"/>
/// as SSML, waiting until the request finishes, stops or is canceled.
/// </summary>
/// <param name="ssml">SSML document to synthesize.</param>
private void SynthesizeSsml(string ssml)
{
    var speaker = CreateSpeechSynthesizer();

    speaker.SynthesisStarted += SynthesisStarted;
    speaker.Synthesizing += Synthesizing;
    speaker.SynthesisCompleted += SynthesisCompleted;
    speaker.SynthesisCanceled += SynthesisCanceled;

    // Word boundary events are only observed when explicitly enabled with the exact value "true".
    var wordBoundaryFlag = _values["config.metadata.wordBoundaryEnabled"];
    if (wordBoundaryFlag == "true")
    {
        speaker.WordBoundary += SynthesisWordBoundary;
    }

    WaitForStopOrCancel(speaker, speaker.SpeakSsmlAsync(ssml));
}
/// <summary>
/// Builds a <c>SpeechSynthesizer</c> from the current speech and audio configuration and
/// registers the disposable objects it creates for clean-up after the command stops.
/// </summary>
/// <returns>The new synthesizer; it is disposed through the dispose-after-stop list.</returns>
private SpeechSynthesizer CreateSpeechSynthesizer()
{
    SpeechConfig config = CreateSpeechConfig();
    AudioConfig? audioConfig = CreateAudioConfig();

    // Register the audio config before constructing the synthesizer so it is still cleaned
    // up if the constructor throws, and never put a null entry in the dispose list.
    if (audioConfig != null)
    {
        _disposeAfterStop.Add(audioConfig);
    }

    // The two constructors are not interchangeable, so only pass an audio config when one exists.
    var synthesizer = audioConfig != null
        ? new SpeechSynthesizer(config, audioConfig)
        : new SpeechSynthesizer(config);
    _disposeAfterStop.Add(synthesizer);

    // _output!.EnsureCachePropertyCollection("synthesizer", synthesizer.Properties);
    return synthesizer;
}
/// <summary>
/// Builds the <c>SpeechConfig</c> for synthesis from the configured endpoint, host or region
/// (in that priority order), applies the authorization token and output format when present,
/// and then applies the remaining settings via <c>SetSpeechConfigProperties</c>.
/// </summary>
/// <returns>The configured <c>SpeechConfig</c>.</returns>
/// <remarks>
/// Errors are reported through <c>_values.AddThrowError</c> when none of region/endpoint/host
/// is configured, or when a region is used without a key or token.
/// </remarks>
private SpeechConfig CreateSpeechConfig()
{
    var key = _values["service.config.key"];
    var host = _values["service.config.host"];
    var region = _values["service.config.region"];
    var endpoint = _values["service.config.endpoint.uri"];
    var tokenValue = _values["service.config.token.value"];

    // Use the same test as SetSpeechConfigProperties: only a true flag selects embedded mode.
    // Checking mere presence would let an explicit "false" overwrite the key and region with
    // placeholders while the cloud backend is still in use.
    var embedded = _values.GetOrDefault("embedded.config.embedded", false);
    if (embedded)
    {
        key = "UNUSED";
        region = "UNUSED";
    }

    if (string.IsNullOrEmpty(endpoint) && string.IsNullOrEmpty(region) && string.IsNullOrEmpty(host))
    {
        _values.AddThrowError("ERROR:", $"Creating SpeechConfig; requires one of: region, endpoint, or host.");
    }
    else if (!string.IsNullOrEmpty(region) && string.IsNullOrEmpty(tokenValue) && string.IsNullOrEmpty(key))
    {
        _values.AddThrowError("ERROR:", $"Creating SpeechConfig; use of region requires one of: key or token.");
    }

    SpeechConfig config;
    if (!string.IsNullOrEmpty(endpoint))
    {
        config = string.IsNullOrEmpty(key)
            ? SpeechConfig.FromEndpoint(new Uri(endpoint))
            : SpeechConfig.FromEndpoint(new Uri(endpoint), key);
    }
    else if (!string.IsNullOrEmpty(host))
    {
        config = string.IsNullOrEmpty(key)
            ? SpeechConfig.FromHost(new Uri(host))
            : SpeechConfig.FromHost(new Uri(host), key);
    }
    else // region
    {
        config = string.IsNullOrEmpty(tokenValue)
            ? SpeechConfig.FromSubscription(key, region)
            : SpeechConfig.FromAuthorizationToken(tokenValue, region);
    }

    // Endpoint/host based configs are created without the token, so apply it afterwards.
    if (!string.IsNullOrEmpty(tokenValue))
    {
        config.AuthorizationToken = tokenValue;
    }

    var format = _values["audio.output.format"];
    if (!string.IsNullOrEmpty(format))
    {
        config.SetSpeechSynthesisOutputFormat(AudioOutputHelpers.OutputFormatFrom(format));
    }

    SetSpeechConfigProperties(config);
    return config;
}
/// <summary>
/// Applies the optional settings found in the command values to <paramref name="config"/>:
/// log file, voice, language, proxy, endpoint id, traffic type, endpoint query string,
/// HTTP header, string properties and, when enabled, the embedded-synthesis settings.
/// </summary>
/// <param name="config">The speech configuration to update in place.</param>
private void SetSpeechConfigProperties(SpeechConfig config)
{
    ConfigHelpers.SetupLogFile(config, _values);

    var voiceName = _values["synthesizer.output.voice.name"];
    if (!string.IsNullOrEmpty(voiceName))
    {
        config.SpeechSynthesisVoiceName = voiceName;
    }

    var synthesisLanguage = _values["target.language.config"];
    if (!string.IsNullOrEmpty(synthesisLanguage))
    {
        config.SpeechSynthesisLanguage = synthesisLanguage;
    }

    var proxyHost = _values["service.config.proxy.host"];
    if (!string.IsNullOrEmpty(proxyHost))
    {
        var proxyPort = _values.GetOrDefault("service.config.proxy.port", 80);
        config.SetProxy(proxyHost, proxyPort);
    }

    var customEndpointId = _values["service.config.endpoint.id"];
    if (!string.IsNullOrEmpty(customEndpointId))
    {
        config.EndpointId = customEndpointId;
    }

    // Disabled settings, kept for reference:
    // var needDetailedText = _output != null && (_output.NeedsLexicalText() || _output.NeedsItnText());
    // if (needDetailedText) config.OutputFormat = OutputFormat.Detailed;
    // var profanity = _values["service.output.config.profanity.option"];
    // if (profanity == "removed") config.SetProfanity(ProfanityOption.Removed);
    // if (profanity == "masked") config.SetProfanity(ProfanityOption.Masked);
    // if (profanity == "raw") config.SetProfanity(ProfanityOption.Raw);
    // var contentLogging = _values.GetOrDefault("service.config.content.logging.enabled", false);
    // if (contentLogging) config.EnableAudioLogging();

    // The traffic type is always sent as a URI query parameter; it defaults to "spx".
    var trafficType = _values.GetOrDefault("service.config.endpoint.traffic.type", "spx");
    config.SetServiceProperty("traffictype", trafficType, ServicePropertyChannel.UriQueryParameter);

    var queryString = _values.GetOrEmpty("service.config.endpoint.query.string");
    if (!string.IsNullOrEmpty(queryString))
    {
        ConfigHelpers.SetEndpointParams(config, queryString);
    }

    var header = _values.GetOrEmpty("service.config.endpoint.http.header");
    if (!string.IsNullOrEmpty(header))
    {
        SetHttpHeaderProperty(config, header);
    }

    var singleProperty = _values.GetOrEmpty("config.string.property");
    if (!string.IsNullOrEmpty(singleProperty))
    {
        ConfigHelpers.SetStringProperty(config, singleProperty);
    }

    var multipleProperties = _values.GetOrEmpty("config.string.properties");
    if (!string.IsNullOrEmpty(multipleProperties))
    {
        ConfigHelpers.SetStringProperties(config, multipleProperties);
    }

    if (_values.GetOrDefault("embedded.config.embedded", false))
    {
        SetEmbeddedProperties(config);
    }

    CheckNotYetImplementedConfigProperties();
}
/// <summary>
/// Switches <paramref name="config"/> to the embedded (offline) synthesis backend, forces the
/// 24 kHz 16-bit mono PCM RIFF output format, and sets the model key and model data path.
/// An error is reported when "Tokens.xml" is not found under the model path.
/// </summary>
/// <param name="config">The speech configuration to update in place.</param>
private void SetEmbeddedProperties(SpeechConfig config)
{
    // Select the embedded (offline) text-to-speech engine.
    config.SetProperty("SPEECH-SynthesisBackend", "offline");

    // The device neural voices only support 24kHz and the offline engine has no ability to resample.
    config.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm);

    config.SetProperty("SPEECH-SynthesisModelKey", _values.GetOrEmpty("embedded.config.model.key"));

    var modelPath = _values.GetOrEmpty("embedded.config.model.path");
    var tokensFile = Path.Combine(modelPath, "Tokens.xml");
    var tokensFileFullPath = Path.GetFullPath(tokensFile);
    var modelFound = File.Exists(tokensFileFullPath);
    if (!modelFound)
    {
        _values.AddThrowError(
            "ERROR:", "Missing or invalid speech synthesis model path!", "",
            "USE:", $"{Program.Name} synthesize --embedded --embeddedModelPath PATH [...]");
    }

    config.SetProperty("SPEECH-SynthesisOfflineDataPath", modelPath);
}
/// <summary>
/// Splits <paramref name="httpHeader"/> into a name and a value and, when the split succeeds,
/// sets it on <paramref name="config"/> as an HTTP-header service property.
/// </summary>
/// <param name="config">The speech configuration to update in place.</param>
/// <param name="httpHeader">Header text to be split by <c>StringHelpers.SplitNameValue</c>.</param>
private static void SetHttpHeaderProperty(SpeechConfig config, string httpHeader)
{
    string headerName = "";
    string headerValue = "";

    var parsed = StringHelpers.SplitNameValue(httpHeader, out headerName, out headerValue);
    if (!parsed)
    {
        return;
    }

    config.SetServiceProperty(headerName, headerValue, ServicePropertyChannel.HttpHeader);
}
/// <summary>
/// Reports, through <c>_values.AddThrowError</c>, any option that has been given a value but
/// is not supported by the synthesize command yet.
/// </summary>
private void CheckNotYetImplementedConfigProperties()
{
    // Listed explicitly rather than split from a ';'-joined string, whose leading separator
    // produced an empty key that was then looked up as well.
    var notYetImplemented = new[]
    {
        "config.token.type",
        "config.token.password",
        "config.token.username",
        "synthesizer.property",
    };

    foreach (var key in notYetImplemented)
    {
        var value = _values[key];
        if (!string.IsNullOrEmpty(value))
        {
            _values.AddThrowError("WARNING:", $"'{key}={value}' NOT YET IMPLEMENTED!!");
        }
    }
}
/// <summary>
/// Normalizes the synthesis input settings before a run: downloads the input file when it is
/// given as an http(s) URL, derives a missing input id from the URL or file, derives a missing
/// input type from the id, and resolves a missing input file from the id for file-based types.
/// </summary>
private void CheckSynthesizerInput()
{
    var id = _values["synthesizer.input.id"];
    var input = _values["synthesizer.input.type"];

    // A text file takes precedence over an SSML file when both could apply.
    var hasTextFile = _values.Contains("synthesizer.input.text.file");
    var fileValueDisplayName = hasTextFile ? "text file" : "ssml file";
    var fileValueName = hasTextFile ? "synthesizer.input.text.file" : "synthesizer.input.ssml.file";
    var file = _values.GetOrEmpty(fileValueName);

    var url = "";
    if (!string.IsNullOrEmpty(file) && file.StartsWith("http", StringComparison.Ordinal))
    {
        url = file;
        file = DownloadInputFile(url, fileValueName, fileValueDisplayName);
    }

    if (string.IsNullOrEmpty(id) && !string.IsNullOrEmpty(url))
    {
        id = GetIdFromInputUrl(url, "synthesizer.input.id");
    }

    if (string.IsNullOrEmpty(id) && !string.IsNullOrEmpty(file))
    {
        id = GetIdFromInputFile(input, file, "synthesizer.input.id", fileValueDisplayName);
    }

    if (string.IsNullOrEmpty(input) && !string.IsNullOrEmpty(id))
    {
        input = GetInputFromId(id);
    }

    // The input type is still null when neither a type nor an id was supplied (for example a
    // plain interactive run), so it must be null-checked before it is inspected.
    var isFileInput = input != null && input.EndsWith("file", StringComparison.Ordinal);
    if (isFileInput && string.IsNullOrEmpty(file) && !string.IsNullOrEmpty(id))
    {
        // Called for its side effect: it records the resolved file name in _values.
        GetInputFileFromId(id);
    }

    // _interactive = (input == "microphone" || string.IsNullOrEmpty(input));
}
/// <summary>
/// Derives the input id from the input type and file, stores it in the command values under
/// <paramref name="idValueName"/>, and returns it.
/// </summary>
/// <param name="input">The input type; empty or "speaker" yields the id "speaker".</param>
/// <param name="file">Input file name, used when the input type ends with "file".</param>
/// <param name="idValueName">Name under which the id is stored.</param>
/// <param name="fileValueDisplayName">Display name passed on to the file lookup.</param>
/// <returns>
/// "speaker", the file name without extension of the located file, or "error" when neither applies.
/// </returns>
private string GetIdFromInputFile(string input, string file, string idValueName, string fileValueDisplayName)
{
    var id = "error";

    if (string.IsNullOrEmpty(input) || input == "speaker")
    {
        id = "speaker";
    }
    else if (!string.IsNullOrEmpty(file) && input.EndsWith("file"))
    {
        var located = FileHelpers.DemandFindFileInDataPath(file, _values, fileValueDisplayName);
        id = Path.GetFileNameWithoutExtension(located);
    }

    _values.Add(idValueName, id);
    return id;
}
/// <summary>
/// Infers the input type from <paramref name="id"/> and stores it under "synthesizer.input.type".
/// </summary>
/// <param name="id">The input id.</param>
/// <returns>
/// "speaker" for the id "speaker"; "text.file" when "{id}.txt" exists in the data path;
/// "ssml.file" when "{id}" or "{id}.ssml" exists there or an id URL template is configured.
/// Otherwise an error is reported and <c>null</c> is returned.
/// </returns>
private string GetInputFromId(string id)
{
    string inputType;

    if (id == "speaker")
    {
        inputType = "speaker";
    }
    else if (FileHelpers.FileExistsInDataPath(id + ".txt", _values))
    {
        inputType = "text.file";
    }
    else if (FileHelpers.FileExistsInDataPath(id, _values)
        || FileHelpers.FileExistsInDataPath(id + ".ssml", _values)
        || _values.Contains("synthesizer.input.id.url"))
    {
        // An existing file without the .txt suffix, or a configured URL template, is treated as SSML.
        inputType = "ssml.file";
    }
    else
    {
        _values.AddThrowError("ERROR:", $"Cannot find synthesis input file: \"{id}.txt\" or \"{id}.ssml\"");
        return null;
    }

    _values.Add("synthesizer.input.type", inputType);
    return inputType;
}
/// <summary>
/// Resolves the input file for <paramref name="id"/>: looks in the data path for "{id}",
/// "{id}.txt" and "{id}.ssml", then falls back to downloading from the
/// "synthesizer.input.id.url" template with "{id}" replaced. The result is stored under
/// "synthesizer.input.text.file" for a ".txt" file, otherwise "synthesizer.input.ssml.file".
/// </summary>
/// <param name="id">The input id.</param>
/// <returns>
/// The resolved file name, or an empty string after an error has been reported because no
/// file could be found or downloaded.
/// </returns>
private string GetInputFileFromId(string id)
{
    var existing = FileHelpers.FindFileInDataPath(id, _values);
    if (existing == null) existing = FileHelpers.FindFileInDataPath(id + ".txt", _values);
    if (existing == null) existing = FileHelpers.FindFileInDataPath(id + ".ssml", _values);

    if (existing == null)
    {
        var url = _values["synthesizer.input.id.url"];
        if (!string.IsNullOrEmpty(url))
        {
            url = url.Replace("{id}", id);
            existing = HttpHelpers.DownloadFileWithRetry(url);
        }
    }

    // Nothing was found locally and nothing could be downloaded: report it instead of
    // dereferencing a null file name below.
    if (existing == null)
    {
        _values.AddThrowError("ERROR:", $"Cannot find synthesis input file: \"{id}\", \"{id}.txt\" or \"{id}.ssml\"");
        return string.Empty;
    }

    var fileValueName = existing.EndsWith(".txt", StringComparison.Ordinal)
        ? "synthesizer.input.text.file"
        : "synthesizer.input.ssml.file";
    _values.Add(fileValueName, existing);

    return existing;
}
/// <summary>
/// Creates the audio output configuration selected by "audio.output.type".
/// </summary>
/// <returns>
/// A speaker configuration when the type is empty or "speaker"; a file configuration when the
/// type is "file" and "audio.output.file" is set; otherwise an error is reported and
/// <c>null</c> is returned.
/// </returns>
private AudioConfig? CreateAudioConfig()
{
    var output = _values["audio.output.type"];
    var file = _values["audio.output.file"];

    if (string.IsNullOrEmpty(output) || output == "speaker")
    {
        return AudioOutputHelpers.CreateAudioConfigForSpeaker();
    }

    if (output == "file" && !string.IsNullOrEmpty(file))
    {
        // The file name is passed through ReplaceFileNameValues with the input id value name.
        var resolvedFile = ReplaceFileNameValues(file, "synthesizer.input.id");
        return AudioOutputHelpers.CreateAudioConfigForFile(resolvedFile);
    }

    _values.AddThrowError("WARNING:", $"'audio.output.type={output}' NOT YET IMPLEMENTED!!");
    return null;
}
/// <summary>
/// Handler subscribed to the synthesizer's SynthesisStarted event. Enters the reader lock,
/// resets the stop event, and forwards the event to the display and output helpers.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data passed on to the display and output helpers.</param>
private void SynthesisStarted(object? sender, SpeechSynthesisEventArgs e)
{
// Paired with ExitReaderLockOnce in SynthesisCompleted through the shared _expectSynthesisCompleted counter.
_lock!.EnterReaderLockOnce(ref _expectSynthesisCompleted);
_stopEvent.Reset();
_display!.DisplaySynthesisStarted(e);
_output!.SynthesisStarted(e);
}
/// <summary>
/// Handler subscribed to the synthesizer's Synthesizing event; forwards the event to the
/// display and output helpers.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data passed on to the display and output helpers.</param>
private void Synthesizing(object? sender, SpeechSynthesisEventArgs e)
{
_display!.DisplaySynthesizing(e);
_output!.Synthesizing(e);
}
/// <summary>
/// Handler subscribed to the synthesizer's SynthesisCompleted event. Forwards the event to the
/// display and output helpers, signals the stop event, and exits the reader lock.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data passed on to the display and output helpers.</param>
private void SynthesisCompleted(object? sender, SpeechSynthesisEventArgs e)
{
_display!.DisplaySynthesisCompleted(e);
_output!.SynthesisCompleted(e);
// WaitForStopOrCancel polls this event to end its wait.
_stopEvent.Set();
// Paired with EnterReaderLockOnce in SynthesisStarted through the shared _expectSynthesisCompleted counter.
_lock!.ExitReaderLockOnce(ref _expectSynthesisCompleted);
}
/// <summary>
/// Handler subscribed to the synthesizer's SynthesisCanceled event. Forwards the event to the
/// display and output helpers and signals the canceled event.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data passed on to the display and output helpers.</param>
private void SynthesisCanceled(object? sender, SpeechSynthesisEventArgs e)
{
_display!.DisplaySynthesisCanceled(e);
_output!.SynthesisCanceled(e);
// Polled by WaitForStopOrCancel and by the interactive loop in SynthesizeInteractive.
_canceledEvent.Set();
}
/// <summary>
/// Handler for the synthesizer's WordBoundary event; it is only subscribed when
/// "config.metadata.wordBoundaryEnabled" is "true". Forwards the event to the display and
/// output helpers.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Word boundary event data passed on to the display and output helpers.</param>
private void SynthesisWordBoundary(object? sender, SpeechSynthesisWordBoundaryEventArgs e)
{
_display!.DisplaySynthesisWordBoundary(e);
_output!.SynthesisWordBoundary(e);
}
/// <summary>
/// Blocks until <paramref name="task"/> completes, or until the stop or canceled event is
/// observed as signalled, polling in 100 ms steps.
/// </summary>
/// <param name="synthesizer">The synthesizer that produced the task (not used here).</param>
/// <param name="task">The pending synthesis request to wait for.</param>
private void WaitForStopOrCancel(SpeechSynthesizer synthesizer, Task<SpeechSynthesisResult> task)
{
    const int pollMilliseconds = 100;

    var finished = task.Wait(pollMilliseconds);
    while (!finished && !_stopEvent.WaitOne(0) && !_canceledEvent.WaitOne(0))
    {
        finished = task.Wait(pollMilliseconds);
    }
}
/// <summary>
/// Prepares a synthesis run: validates the path and input settings, creates the display and
/// output helpers, starts output, records the input id with the output helper, and starts
/// the lock used to track outstanding synthesis requests.
/// </summary>
private void StartCommand()
{
    CheckPath();
    CheckSynthesizerInput();

    _display = new DisplayHelper(_values);

    var output = new OutputHelper(_values);
    _output = output;
    output.StartOutput();

    // Read after CheckSynthesizerInput, which may have derived and stored the id.
    var inputId = _values["synthesizer.input.id"];
    output.EnsureOutputAll("synthesizer.input.id", inputId);
    output.EnsureOutputEach("synthesizer.input.id", inputId);

    _lock = new SpinLock();
    _lock.StartLock();
    _expectSynthesisCompleted = 0;
}
/// <summary>
/// Ends a synthesis run: stops the lock started in StartCommand, then checks and stops output.
/// </summary>
private void StopCommand()
{
// NOTE(review): 5000 is presumably a timeout in milliseconds — confirm against SpinLock.StopLock.
_lock!.StopLock(5000);
_output!.CheckOutput();
_output!.StopOutput();
}
// Lock created in StartCommand and stopped in StopCommand; entered in SynthesisStarted and exited in SynthesisCompleted.
private SpinLock? _lock = null;
// Counter passed by reference to EnterReaderLockOnce / ExitReaderLockOnce; reset to 0 in StartCommand.
private int _expectSynthesisCompleted = 0;
// Output helper created in StartCommand; receives every synthesis event.
OutputHelper? _output = null;
// Display helper created in StartCommand; receives every synthesis event.
DisplayHelper? _display = null;
}
}