PollySpeechResponse PollyManager::GenerateSpeech()

in PollyTTSEngine/PollyManager.cpp [60:122]


PollySpeechResponse PollyManager::GenerateSpeech(CSentItem& item)
{
	PollySpeechResponse response;
	
	auto creds = Aws::MakeShared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>(ALLOCATION_TAG, "polly-windows");
	Aws::Client::ClientConfiguration config = Aws::Client::ClientConfiguration("polly-windows");
	config.userAgent = config.userAgent + " request-source/polly-windows/PRODUCTVERSION";
	Aws::Polly::PollyClient p = Aws::Polly::PollyClient(creds, config);
	SynthesizeSpeechRequest speech_request;
	
	auto speech_text = Aws::Utils::StringUtils::FromWString(item.pItem);
	if (Aws::Utils::StringUtils::ToLower(speech_text.c_str()).find("</voice>") != std::string::npos)
	{
		speech_text = "<speak>" + speech_text.replace(speech_text.find("</voice>"), sizeof("</voice>") - 1, "");
	}
	tinyxml2::XMLDocument doc;
	tinyxml2::XMLError res = doc.Parse(speech_text.c_str());
	speech_request.SetTextType(TextType::text);
	if (res == tinyxml2::XML_SUCCESS && strcmp(doc.RootElement()->Name(), "speak") == 0) {
		m_logger->debug("Text type = ssml");
		speech_request.SetTextType(TextType::ssml);
		speech_text = speech_text.replace(speech_text.find("<speak>"), sizeof("<speak>") - 1, "");
	}
	speech_request.SetTextType(TextType::ssml);
	if (m_isNews)
	{
		speech_text = "<speak><amazon:domain name=\"news\">" + speech_text + "</amazon:domain></speak>";
	}
	else if (m_isConversational)
	{
		speech_text = "<speak><amazon:domain name=\"conversational\">" + speech_text + "</amazon:domain></speak>";
	}
	else
	{
		speech_text = "<speak>" + speech_text + "</speak>";
	}
	m_logger->debug("{}: Asking Polly for '{}'", __FUNCTION__, speech_text.c_str());
	speech_request.SetOutputFormat(OutputFormat::pcm);
	speech_request.SetVoiceId(m_vVoiceId);
	m_logger->debug("Generating speech: {}", speech_text);
	speech_request.SetText(speech_text);

	speech_request.SetSampleRate("16000");
	if (m_isNeural) {
		m_logger->debug("Neural voice? Yes");
		speech_request.SetEngine(Engine::neural);
	}
	auto speech = p.SynthesizeSpeech(speech_request);
	response.IsSuccess = speech.IsSuccess();
	if (!speech.IsSuccess())
	{
		std::stringstream error;
		//error << speech.GetError().GetMessageW();
		response.ErrorMessage = error.str();
		return response;
	}
	auto &r = speech.GetResult();

	auto& stream = r.GetAudioStream();
	stream.read(reinterpret_cast<char*>(&response.AudioData[0]), MAX_SIZE);
	response.Length = stream.gcount();
	return response;
}