in src/diarizers/data/synthetic_pipeline.py [0:0]
def refine_audio_segment_timestamps(self, audio_segment, speaker):
"""Refine audio_segment timestamps using a Voice Activity Detector.
Args:
audio_segment (numpy.ndarray): audio segment.
speaker (str): speaker id.
Returns:
audio_segment (numpy.ndarray): croped audio segment - removes the beginning and end of the segment if there is no speech.
file_timestamps_start (list): list of refined start timestamps.
file_timestamps_end (list): list of refined end timestamps.
speakers (list): List of speakers associated to file_timestamps_start and file_timestamps_end.
"""
    # Detect speech regions; each entry is a {"start": ..., "end": ...} dict
    # of sample indices.
    speech_timestamps = self.get_speech_timestamps(audio_segment, self.vad_model, sampling_rate=self.sample_rate)

    if len(speech_timestamps):
        # Crop leading and trailing non-speech from the segment.
        audio_segment_start_index = int(speech_timestamps[0]["start"])
        audio_segment_end_index = int(speech_timestamps[-1]["end"])
        audio_segment = audio_segment[audio_segment_start_index:audio_segment_end_index]
        # Re-express the speech timestamps in seconds, relative to the start
        # of the cropped segment.
        file_timestamps_start = [
            (timestamps["start"] - speech_timestamps[0]["start"]) / self.sample_rate
            for timestamps in speech_timestamps
        ]
        file_timestamps_end = [
            (timestamps["end"] - speech_timestamps[0]["start"]) / self.sample_rate
            for timestamps in speech_timestamps
        ]
        speakers = [speaker] * len(speech_timestamps)
    else:
        # No speech detected: fall back to labelling the whole (uncropped)
        # segment as belonging to the speaker.
        file_timestamps_start = [0]
        file_timestamps_end = [len(audio_segment) / self.sample_rate]
        speakers = [speaker]

    assert len(speakers) > 0
    return (audio_segment, file_timestamps_start, file_timestamps_end, speakers)
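
# --- Usage sketch (not part of the repository) -------------------------------
# The call pattern above matches Silero VAD's get_speech_timestamps utility,
# which returns a list of {"start": <sample>, "end": <sample>} dicts in sample
# indices. The class below is a hypothetical stand-in showing how a pipeline
# object could wire up the attributes this method relies on (vad_model,
# get_speech_timestamps, sample_rate); the VadPipeline name and setup are
# illustrative assumptions, not the repository's actual pipeline code.
import numpy as np
import torch


class VadPipeline:
    def __init__(self, sample_rate: int = 16000):
        self.sample_rate = sample_rate
        # Silero VAD via torch.hub; the returned utils tuple starts with
        # get_speech_timestamps.
        self.vad_model, vad_utils = torch.hub.load("snakers4/silero-vad", "silero_vad")
        self.get_speech_timestamps = vad_utils[0]


# Attach the method above for this sketch (in the repo it is defined on the class).
VadPipeline.refine_audio_segment_timestamps = refine_audio_segment_timestamps

pipeline = VadPipeline()
segment = np.zeros(3 * pipeline.sample_rate, dtype=np.float32)  # 3 s of silence
cropped, starts, ends, speakers = pipeline.refine_audio_segment_timestamps(segment, "speaker_0")
# Pure silence yields no VAD hits, so the fallback branch labels the whole
# segment: starts == [0], ends == [3.0], speakers == ["speaker_0"].
print(starts, ends, speakers)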