in PollyTTSEngine/ttsengobj.cpp [261:433]
/*****************************************************************************
* CTTSEngObj::OutputSentence *
*----------------------------*
*   Description:
*       Takes the first item of ItemList, cleans its text up (typographic
*       quotes -> ASCII quotes, XML reserved characters -> entities, outer
*       whitespace trimmed), hands the item to PollyManager for synthesis
*       and writes the returned audio to the output site in a single call.
*       No SAPI events (word boundaries, bookmarks) are queued.
*
*       An item containing a <voice name="..."> tag only switches the voice
*       stored in m_pPollyVoice; such an item, and a lone </speak>, produce
*       no audio.
*
*   Parameters:
*       ItemList    - sentence items; only the head item is read. Its pItem
*                     is redirected to the cleaned text for the duration of
*                     the call and restored before returning.
*       pOutputSite - site that receives the synthesized audio.
*
*   Returns:
*       S_OK   - nothing had to be spoken (empty list, tag-only item, empty
*                audio).
*       E_FAIL - PollyManager reported a failure (a message box is shown).
*       Otherwise the HRESULT of ISpTTSEngineSite::Write.
*****************************************************************************/
HRESULT CTTSEngObj::OutputSentence( CItemList& ItemList, ISpTTSEngineSite* pOutputSite )
{
    m_logger->debug(__FUNCTION__);

    //--- Nothing queued: there is no head item to fetch
    SPLISTPOS ListPos = ItemList.GetHeadPosition();
    if (!ListPos)
    {
        return S_OK;
    }
    CSentItem& Item = ItemList.GetNext(ListPos);
    if (!Item.pItem)
    {
        return S_OK;
    }

    //--- Map typographic quotes to their ASCII equivalents.
    // NOTE(review): the four source literals were unreadable (mis-encoded);
    // they are assumed to be U+2018, U+2019, U+201C and U+201D - confirm.
    std::wstring text_w(Item.pItem);
    text_w = ReplaceText(text_w, L"\u2018", L"'");
    text_w = ReplaceText(text_w, L"\u2019", L"'");
    text_w = ReplaceText(text_w, L"\u201C", L"\"");
    text_w = ReplaceText(text_w, L"\u201D", L"\"");

    //--- Escape the XML reserved characters in one pass.
    // NOTE(review): the replacement strings were reconstructed as the five
    // predefined XML entities - confirm against the repository copy.
    const std::string narrow(CW2A(text_w.c_str()));
    std::string speech;
    speech.reserve(narrow.size());
    for (const char ch : narrow)
    {
        switch (ch)
        {
        case '&':  speech += "&amp;";  break;
        case '\'': speech += "&apos;"; break;
        case '"':  speech += "&quot;"; break;
        case '>':  speech += "&gt;";   break;
        case '<':  speech += "&lt;";   break;
        default:   speech += ch;       break;
        }
    }
    boost::trim(speech);

    //--- Convert back with the code page CW2A used. Widening char by char
    //    would sign-extend every byte >= 0x80 into an unrelated character.
    const std::wstring stemp(CA2W(speech.c_str()));

    //--- Point the item at the cleaned text only while this function runs.
    //    stemp is a local, so the original pointer is put back on every
    //    exit path instead of being left dangling in the caller's list.
    struct ItemTextRestorer
    {
        CSentItem& item;
        decltype(CSentItem::pItem) original;
        ~ItemTextRestorer() { item.pItem = original; }
    };
    ItemTextRestorer restoreText = { Item, Item.pItem };
    Item.pItem = stemp.c_str();

    //--- Voice override tag.
    // NOTE(review): this check runs on the escaped text, where every '<' has
    // already become an entity, so it cannot match as written - confirm
    // whether it was meant to run before the escaping step.
    const bool hasVoiceTag = speech.find("<voice name=") != std::string::npos;
    if (hasVoiceTag)
    {
        tinyxml2::XMLDocument doc;
        doc.Parse(speech.c_str());
        const tinyxml2::XMLElement* root = doc.RootElement();
        const tinyxml2::XMLElement* voice_node = root ? root->FirstChildElement() : nullptr;
        const char* voice_name = voice_node ? voice_node->Attribute("name") : nullptr;
        if (voice_name)
        {
            // NOTE(review): nothing visible here bounds the copy by the
            // capacity of m_voiceOveride - confirm the buffer is large enough.
            mbstowcs(m_voiceOveride, voice_name, strlen(voice_name) + 1);
            m_pPollyVoice = m_voiceOveride;
        }
        else
        {
            m_logger->debug("{}: voice tag without a readable name attribute", __FUNCTION__);
        }
    }

    //--- Tag-only items carry nothing to speak
    if (hasVoiceTag || speech == "</speak>")
    {
        return S_OK;
    }

    //--- Synthesize the item
    PollyManager pm(m_pPollyVoice, m_isNeural, m_isNews, m_isConversational);
    auto resp = pm.GenerateSpeech(Item);
    if (!resp.IsSuccess)
    {
        std::stringstream message;
        message << "Error generating speech:\n\n" << resp.ErrorMessage << "\nSpeech:\n" << speech;
        MessageBoxA(NULL, message.str().c_str(), "Error", MB_OK);
        // FAILED(ERROR_SUCCESS) evaluates to 0 (S_OK); report a real failure.
        return E_FAIL;
    }

    //--- No audio: avoid taking the address of the first byte of an empty buffer
    if (resp.Length == 0)
    {
        return S_OK;
    }

    //--- Hand the whole buffer to SAPI in one call
    return pOutputSite->Write(reinterpret_cast<char*>(&resp.AudioData[0]), resp.Length, NULL);
} /* CTTSEngObj::OutputSentence */