HRESULT CTTSEngObj::OutputSentence()

in PollyTTSEngine/ttsengobj.cpp [261:433]


namespace
{
    //--- Returns a copy of `text` in which each of the five XML reserved
    //    characters (& ' " > <) is replaced by its predefined entity reference.
    //    All other characters are copied through unchanged.
    std::string EscapeXmlReservedChars( const std::string& text )
    {
        std::string escaped;
        escaped.reserve( text.size() );
        for( const char ch : text )
        {
            switch( ch )
            {
              case '&':  escaped += "&amp;";  break;
              case '\'': escaped += "&apos;"; break;
              case '"':  escaped += "&quot;"; break;
              case '>':  escaped += "&gt;";   break;
              case '<':  escaped += "&lt;";   break;
              default:   escaped += ch;       break;
            }
        }
        return escaped;
    }
}

//--- Synthesizes the first item of ItemList with Polly and writes the
//    resulting audio to pOutputSite in a single Write() call.
//
//    The item text is normalized (typographic quotes -> ASCII quotes),
//    XML-escaped and trimmed before it is handed to PollyManager.
//
//    Returns S_OK when there is nothing to speak or the item is skipped,
//    E_FAIL when speech generation fails (after showing a message box), and
//    otherwise whatever ISpTTSEngineSite::Write() returns.
//
//    Only the head item is processed, and no word-boundary, silence or
//    bookmark events are queued by this function.
HRESULT CTTSEngObj::OutputSentence( CItemList& ItemList, ISpTTSEngineSite* pOutputSite )
{
    m_logger->debug(__FUNCTION__);

    //--- An empty list has no head item to read.
    SPLISTPOS ListPos = ItemList.GetHeadPosition();
    if( !ListPos )
    {
        return S_OK;
    }

    CSentItem& Item = ItemList.GetNext(ListPos);
    if( !Item.pItem )
    {
        return S_OK;
    }

    //--- NOTE(review): pItem is read up to its null terminator here, not up to
    //    Item.ulItemLen -- confirm the sentence parser guarantees termination.
    std::wstring text_w(Item.pItem);

    //--- Map typographic quotes to their ASCII equivalents. Universal
    //    character names are used so the literals do not depend on the
    //    encoding of this source file.
    text_w = ReplaceText(text_w, L"\u2018", L"'");
    text_w = ReplaceText(text_w, L"\u2019", L"'");
    text_w = ReplaceText(text_w, L"\u201C", L"\"");
    text_w = ReplaceText(text_w, L"\u201D", L"\"");

    std::string speech = CW2A(text_w.c_str());
    speech = EscapeXmlReservedChars(speech);
    boost::trim(speech);

    //--- NOTE(review): '<' and '"' have already been escaped above, so neither
    //    the "<voice name=" test nor the "</speak>" comparison below can match
    //    as written. Confirm whether tag detection is meant to run on the
    //    unescaped text instead.
    const bool hasVoiceTag = speech.find("<voice name=") != std::string::npos;
    if( hasVoiceTag )
    {
        tinyxml2::XMLDocument doc;
        if( doc.Parse(speech.c_str()) == tinyxml2::XML_SUCCESS && doc.RootElement() )
        {
            const auto voice_node = doc.RootElement()->FirstChildElement();
            const char* voice_name = voice_node ? voice_node->Attribute("name") : nullptr;
            if( voice_name )
            {
                //--- NOTE(review): the capacity of m_voiceOveride is not visible
                //    from here; verify it can hold the longest voice name.
                mbstowcs(m_voiceOveride, voice_name, strlen(voice_name) + 1);
                m_pPollyVoice = m_voiceOveride;
            }
        }
    }

    //--- Voice-selection and closing tags carry no text to synthesize.
    if( hasVoiceTag || !speech.compare("</speak>") )
    {
        return S_OK;
    }

    PollyManager pm = PollyManager(m_pPollyVoice, m_isNeural, m_isNews, m_isConversational);

    //--- Convert back with the counterpart of CW2A; widening byte-by-byte would
    //    turn non-ASCII bytes into wrong code units where char is signed.
    const std::wstring escaped_w( CA2W(speech.c_str()) );

    //--- Point the item at the escaped text only for the duration of the call,
    //    then restore it: escaped_w is a local, and the item belongs to the
    //    caller's list, so the pointer must not outlive this function.
    //    NOTE(review): Item.ulItemLen is left untouched -- confirm that
    //    GenerateSpeech() does not rely on it.
    const auto originalText = Item.pItem;
    Item.pItem = escaped_w.c_str();
    auto resp = pm.GenerateSpeech(Item);
    Item.pItem = originalText;

    if( !resp.IsSuccess )
    {
        std::stringstream message;
        message << "Error generating speech:\n\n" << resp.ErrorMessage << "\nSpeech:\n" << speech;
        MessageBoxA(NULL, message.str().c_str(), "Error", MB_OK);
        //--- FAILED(ERROR_SUCCESS) evaluates to 0, i.e. S_OK; report a real failure.
        return E_FAIL;
    }

    //--- Nothing was synthesized; avoid indexing into an empty buffer.
    if( resp.Length == 0 )
    {
        return S_OK;
    }

    //--- The whole clip is written in one call.
    return pOutputSite->Write(reinterpret_cast<char*>(&resp.AudioData[0]), resp.Length, NULL);
} /* CTTSEngObj::OutputSentence */