in PollyTTSEngine/PollyManager.cpp [151:225]
// Requests word-level speech marks from Polly for the text of `item` and turns
// each returned JSON line into a SpeechMark carrying a duration and byte length.
//
// item:       sentence item whose pItem text is sent to Polly; the request is
//             flagged as SSML when the lower-cased text starts with "<speak".
// streamSize: total byte count used to size the last word (the last word gets
//             whatever is left after all earlier words). Presumably the size of
//             the synthesized audio stream — confirm against the caller.
//
// Returns a response whose SpeechMarks holds the parsed marks. When the Polly
// call fails, SpeechMarks is left empty and ErrorMessage describes the failure.
// When Polly returns no usable marks, SpeechMarks is empty and ErrorMessage is
// left untouched.
PollySpeechMarksResponse PollyManager::GenerateSpeechMarks(CSentItem& item, std::streamsize streamSize)
{
    // NOTE(review): 32 bytes per millisecond presumably corresponds to 16 kHz,
    // 16-bit mono PCM (16000 * 2 / 1000) — confirm against the audio request.
    constexpr int kBytesPerMs = 32;

    SynthesizeSpeechRequest speechMarksRequest;
    PollySpeechMarksResponse response;
    // Direct-initialization: the shared_ptr to the concrete provider must first
    // convert to the base provider pointer the client constructor accepts.
    Aws::Polly::PollyClient p(
        Aws::MakeShared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>(ALLOCATION_TAG, "polly-windows"));
    auto text = Aws::Utils::StringUtils::FromWString(item.pItem);
    m_logger->debug("{}: Asking Polly for '{}'", __FUNCTION__, text.c_str());
    speechMarksRequest.SetOutputFormat(OutputFormat::json);
    speechMarksRequest.SetVoiceId(m_vVoiceId);
    speechMarksRequest.SetText(text);
    speechMarksRequest.AddSpeechMarkTypes(SpeechMarkType::word);
    // rfind(prefix, 0) only inspects position 0, i.e. a starts-with test.
    if (Aws::Utils::StringUtils::ToLower(text.c_str()).rfind("<speak", 0) == 0)
    {
        m_logger->debug("Text type = ssml");
        speechMarksRequest.SetTextType(TextType::ssml);
    }
    else
    {
        m_logger->debug("Text type = text");
        speechMarksRequest.SetTextType(TextType::text);
    }
    speechMarksRequest.SetSampleRate("16000");

    auto speech_marks = p.SynthesizeSpeech(speechMarksRequest);
    if (!speech_marks.IsSuccess())
    {
        // Report a non-empty message so callers can tell the request failed.
        // NOTE(review): the original GetMessage-based text was commented out
        // (it appeared as GetMessageW, presumably due to the Windows GetMessage
        // macro); the exception name is used here instead.
        std::stringstream errorText;
        errorText << "Unable to generate speech marks: " << speech_marks.GetError().GetExceptionName();
        response.ErrorMessage = errorText.str();
        m_logger->error("{}: {}", __FUNCTION__, errorText.str());
        return response;
    }

    auto& result = speech_marks.GetResult();
    auto& markStream = result.GetAudioStream();
    std::vector<SpeechMark> speechMarks;
    long bytesProcessed = 0;
    std::string line;
    // The stream is consumed one JSON object per line.
    while (std::getline(markStream, line))
    {
        if (line.empty())
        {
            continue;
        }
        m_logger->debug("SpeechMarks response line: {}", line);

        rapidjson::Document d;
        d.Parse(line.c_str());
        const bool wellFormed = !d.HasParseError() && d.IsObject()
            && d.HasMember("time") && d["time"].IsInt()
            && d.HasMember("start") && d["start"].IsInt()
            && d.HasMember("end") && d["end"].IsInt()
            && d.HasMember("value") && d["value"].IsString();
        if (!wellFormed)
        {
            // Accessing missing or mistyped members is not safe, so skip the line.
            m_logger->warn("{}: skipping malformed speech mark line: {}", __FUNCTION__, line);
            continue;
        }

        SpeechMark sm;
        sm.StartInMs = d["time"].GetInt();
        sm.StartByte = d["start"].GetInt();
        sm.EndByte = d["end"].GetInt();
        sm.Text = d["value"].GetString();

        if (!speechMarks.empty())
        {
            // A word's duration is only known once the next word's start time
            // arrives, so complete the previous mark now.
            SpeechMark& previous = speechMarks.back();
            previous.TimeInMs = sm.StartInMs - previous.StartInMs;
            previous.LengthInBytes = kBytesPerMs * previous.TimeInMs;
            bytesProcessed += previous.LengthInBytes;
            m_logger->debug("Word: {}, Start: {}, End: {}, Time: {}\n", previous.Text.c_str(), previous.StartInMs,
                previous.EndByte,
                previous.TimeInMs);
        }
        speechMarks.push_back(sm);
    }

    if (speechMarks.empty())
    {
        // Nothing to size; indexing the last element would be out of bounds.
        m_logger->debug("{}: Polly returned no speech marks", __FUNCTION__);
        return response;
    }

    // The last word receives the remaining bytes, never a negative amount.
    const std::streamsize remainingBytes = streamSize > bytesProcessed ? streamSize - bytesProcessed : 0;
    SpeechMark finalMark = speechMarks.back();
    finalMark.LengthInBytes = remainingBytes;
    finalMark.TimeInMs = finalMark.LengthInBytes / kBytesPerMs;
    speechMarks.back() = finalMark;
    m_logger->debug("Word: {}, Start: {}, End: {}, Time: {}\n", finalMark.Text.c_str(), finalMark.StartInMs,
        finalMark.EndByte,
        finalMark.TimeInMs);
    m_logger->debug("Total words generated: {}", speechMarks.size());
    // NOTE(review): the final mark is appended a second time, so the returned
    // list ends with two identical entries. This looks unintentional, but it is
    // kept because callers outside this view may rely on it — confirm and remove.
    speechMarks.push_back(finalMark);
    response.SpeechMarks = speechMarks;
    return response;
}