in VAD/vad_handler.py
import logging

import numpy as np
import torch
import torchaudio
from df.enhance import enhance  # DeepFilterNet noise suppression

logger = logging.getLogger(__name__)

def process(self, audio_chunk):
    # Convert raw 16-bit PCM bytes into a float32 waveform for the VAD.
    audio_int16 = np.frombuffer(audio_chunk, dtype=np.int16)
    audio_float32 = int2float(audio_int16)
    # The VAD iterator buffers chunks and returns the accumulated speech
    # segments once it detects the end of an utterance; otherwise None.
    vad_output = self.iterator(torch.from_numpy(audio_float32))
    if vad_output is not None and len(vad_output) != 0:
        logger.debug("VAD: end of speech detected")
        array = torch.cat(vad_output).cpu().numpy()
        duration_ms = len(array) / self.sample_rate * 1000
        if duration_ms < self.min_speech_ms or duration_ms > self.max_speech_ms:
            # Drop segments that are too short (likely spurious noise) or
            # too long to be a valid turn.
            logger.debug(
                f"audio input of duration: {len(array) / self.sample_rate}s, skipping"
            )
        else:
            # Pause capture while the rest of the pipeline handles this turn.
            self.should_listen.clear()
            logger.debug("Stop listening")
            if self.audio_enhancement:
                if self.sample_rate != self.df_state.sr():
                    # DeepFilterNet runs at its own sample rate: resample in,
                    # enhance, then resample back to the pipeline rate.
                    audio_float32 = torchaudio.functional.resample(
                        torch.from_numpy(array),
                        orig_freq=self.sample_rate,
                        new_freq=self.df_state.sr(),
                    )
                    enhanced = enhance(
                        self.enhanced_model,
                        self.df_state,
                        audio_float32.unsqueeze(0),
                    )
                    enhanced = torchaudio.functional.resample(
                        enhanced,
                        orig_freq=self.df_state.sr(),
                        new_freq=self.sample_rate,
                    )
                else:
                    # Sample rates already match: enhance the full detected
                    # segment directly.
                    enhanced = enhance(
                        self.enhanced_model,
                        self.df_state,
                        torch.from_numpy(array).unsqueeze(0),
                    )
                array = enhanced.numpy().squeeze()
            yield array
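
The int2float helper referenced above is defined elsewhere in the repo. A minimal sketch of a compatible implementation, assuming it follows the int16-to-float normalization used in the Silero VAD examples:

def int2float(sound):
    # Scale 16-bit PCM samples into [-1.0, 1.0] float32, skipping the
    # scaling when the buffer is pure silence (all zeros).
    abs_max = np.abs(sound).max()
    sound = sound.astype("float32")
    if abs_max > 0:
        sound *= 1 / 32768
    return sound.squeeze()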
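
Since process is a generator, a caller drains it once per incoming chunk. A hypothetical driver loop for illustration; run_vad_loop and chunk_source are assumptions, not part of the repo:

def run_vad_loop(handler, chunk_source):
    # handler: an instance of the class owning process(); chunk_source:
    # any iterable of raw int16 PCM bytes (e.g. 512-sample mic reads).
    for chunk in chunk_source:
        # process() yields a float32 numpy array only when the VAD has
        # closed out a speech segment of acceptable duration.
        for utterance in handler.process(chunk):
            print(f"utterance: {len(utterance) / handler.sample_rate:.2f}s")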