PollySpeechMarksResponse PollyManager::GenerateSpeechMarks()

in PollyTTSEngine/PollyManager.cpp [151:225]


PollySpeechMarksResponse PollyManager::GenerateSpeechMarks(CSentItem& item, std::streamsize streamSize)
{
	SynthesizeSpeechRequest speechMarksRequest;
	PollySpeechMarksResponse response;
	Aws::Polly::PollyClient p = Aws::MakeShared<Aws::Auth::ProfileConfigFileAWSCredentialsProvider>(ALLOCATION_TAG, "polly-windows");
	auto text = Aws::Utils::StringUtils::FromWString(item.pItem);
	m_logger->debug("{}: Asking Polly for '{}'", __FUNCTION__, text.c_str());
	speechMarksRequest.SetOutputFormat(OutputFormat::json);
	speechMarksRequest.SetVoiceId(m_vVoiceId);
	speechMarksRequest.SetText(text);
	speechMarksRequest.AddSpeechMarkTypes(SpeechMarkType::word);
	if (Aws::Utils::StringUtils::ToLower(text.c_str()).find("<speak") == 0)
	{
		m_logger->debug("Text type = ssml");
		speechMarksRequest.SetTextType(TextType::ssml);
	}
	else
	{
		m_logger->debug("Text type = text");
		speechMarksRequest.SetTextType(TextType::text);
	}
	speechMarksRequest.SetSampleRate("16000");
	auto speech_marks = p.SynthesizeSpeech(speechMarksRequest);
	if (!speech_marks.IsSuccess())
	{
		std::stringstream error;
		//error << "Unable to generate speech marks: " << speech_marks.GetError().GetMessageW();
		response.ErrorMessage = error.str();
		return response;
	}
	auto &m = speech_marks.GetResult();
	auto& m_stream = m.GetAudioStream();
	std::string json_str;
	std::vector<SpeechMark> speechMarks;
	auto firstWord = true;
	long bytesProcessed = 0;
	m_logger->debug("SpeechMarks response:\n\n{}\n\n", json_str);
	while (getline(m_stream, json_str)) {
		SpeechMark sm;
		rapidjson::Document d;
		d.Parse(json_str.c_str());
		assert(d.HasMember("end"));
		assert(d["end"].GetInt());
		sm.StartInMs = d["time"].GetInt();
		sm.StartByte = d["start"].GetInt();
		sm.EndByte = d["end"].GetInt();
		sm.Text = d["value"].GetString();
		SpeechMark displaySpeechMark;
		if (!firstWord)
		{
			auto currentSm = speechMarks[speechMarks.size()-1];
			currentSm.TimeInMs = sm.StartInMs - currentSm.StartInMs;
			currentSm.LengthInBytes = 32 * currentSm.TimeInMs;
			displaySpeechMark = currentSm;
			bytesProcessed += currentSm.LengthInBytes;
			speechMarks[speechMarks.size() - 1] = currentSm;
		}
		m_logger->debug("Word: {}, Start: {}, End: {}, Time: {}\n", sm.Text.c_str(), sm.StartInMs,
			sm.EndByte,
			sm.TimeInMs);
		speechMarks.push_back(sm);
		firstWord = false;
	}
	auto sm = speechMarks[speechMarks.size() - 1];
	sm.LengthInBytes = streamSize - bytesProcessed;
	sm.TimeInMs = sm.LengthInBytes / 32;
	speechMarks[speechMarks.size() - 1] = sm;
	m_logger->debug("Word: {}, Start: {}, End: {}, Time: {}\n", sm.Text.c_str(), sm.StartInMs,
		sm.EndByte,
		sm.TimeInMs);
	m_logger->debug("Total words generated: {}", speechMarks.size());
	speechMarks.push_back(sm);
	response.SpeechMarks = speechMarks;
	return response;
}