in TTS/facebookmms_handler.py [0:0]
def process(self, llm_sentence):
    """Synthesize speech for *llm_sentence* and yield fixed-size int16 chunks.

    Args:
        llm_sentence: Either the text to speak, or a ``(text, language_code)``
            tuple. When a language code is given and differs from the current
            ``self.language``, the model is reloaded for that language first.

    Yields:
        ``np.ndarray`` of dtype int16 and length ``self.chunk_size`` — 16 kHz
        audio; the final chunk is zero-padded up to ``chunk_size``.

    Side effects:
        Prints the assistant text to the console, and sets
        ``self.should_listen`` once generation is finished (or produced no
        audio), signalling the pipeline to resume listening.
    """
    language_code = None
    if isinstance(llm_sentence, tuple):
        llm_sentence, language_code = llm_sentence

    console.print(f"[green]ASSISTANT: {llm_sentence}")
    logger.debug(f"Processing text: {llm_sentence}")
    logger.debug(f"Language code: {language_code}")

    # Switch models only when a different, explicit language was requested.
    if language_code is not None and self.language != language_code:
        try:
            logger.info(f"Switching language from {self.language} to {language_code}")
            self.load_model(language_code)
        except KeyError:
            # Unsupported language: keep the current model rather than fail.
            console.print(f"[red]Language {language_code} not supported by Facebook MMS. Using {self.language} instead.")
            logger.warning(f"Unsupported language: {language_code}")

    audio_output = self.generate_audio(llm_sentence)
    if audio_output is None or audio_output.numel() == 0:
        logger.warning("No audio output generated")
        self.should_listen.set()
        return

    audio_numpy = audio_output.cpu().numpy().squeeze()
    logger.debug(f"Raw audio shape: {audio_numpy.shape}, dtype: {audio_numpy.dtype}")

    # Resample from the model's native rate down to the pipeline's 16 kHz.
    audio_resampled = librosa.resample(
        audio_numpy, orig_sr=self.model.config.sampling_rate, target_sr=16000
    )
    logger.debug(f"Resampled audio shape: {audio_resampled.shape}, dtype: {audio_resampled.dtype}")

    # BUGFIX: the previous `* 32768` overflowed for samples >= 1.0 — 32768
    # wraps to -32768 in int16, producing an audible click. Clip to [-1, 1]
    # and scale by 32767 instead.
    audio_int16 = (np.clip(audio_resampled, -1.0, 1.0) * 32767).astype(np.int16)
    logger.debug(f"Final audio shape: {audio_int16.shape}, dtype: {audio_int16.dtype}")

    # The former `if self.stream:` / `else:` branches were byte-identical
    # duplicated loops; one chunking loop serves both modes. The last chunk
    # is zero-padded so every yielded array has exactly `chunk_size` samples.
    for i in range(0, len(audio_int16), self.chunk_size):
        chunk = audio_int16[i : i + self.chunk_size]
        yield np.pad(chunk, (0, self.chunk_size - len(chunk)))

    self.should_listen.set()