def refine_audio_segment_timestamps()

in src/diarizers/data/synthetic_pipeline.py


    def refine_audio_segment_timestamps(self, audio_segment, speaker):
        """Refine audio_segment timestamps using a Voice Activity Detector.

        Args:
            audio_segment (numpy.ndarray): audio segment.
            speaker (str): speaker id.

        Returns:
            audio_segment (numpy.ndarray): cropped audio segment, with the non-speech audio before the first and after the last detected speech chunk removed.
            file_timestamps_start (list): refined start timestamps, in seconds, relative to the cropped segment.
            file_timestamps_end (list): refined end timestamps, in seconds, relative to the cropped segment.
            speakers (list): speaker ids aligned with file_timestamps_start and file_timestamps_end.
        """

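        # Detect speech chunks with the VAD; each entry is a dict with "start"/"end" sample indices.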
        speech_timestamps = self.get_speech_timestamps(audio_segment, self.vad_model, sampling_rate=self.sample_rate)

        if speech_timestamps:
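            # Crop the segment to span from the first detected speech start to the last speech end.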
            audio_segment_start_index = int(speech_timestamps[0]["start"])
            audio_segment_end_index = int(speech_timestamps[-1]["end"])
            audio_segment = audio_segment[audio_segment_start_index:audio_segment_end_index]

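            # Convert chunk boundaries to seconds, measured from the new (cropped) segment start.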
            file_timestamps_start = [
                (timestamps["start"] - speech_timestamps[0]["start"]) / self.sample_rate
                for timestamps in speech_timestamps
            ]
            file_timestamps_end = [
                (timestamps["end"] - speech_timestamps[0]["start"]) / self.sample_rate
                for timestamps in speech_timestamps
            ]
            speakers = [speaker] * len(speech_timestamps)

        else:
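            # No speech detected: keep the segment intact and report it as a single chunk.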
            file_timestamps_start = [0]
            file_timestamps_end = [len(audio_segment) / self.sample_rate]
            speakers = [speaker]

        assert len(speakers) > 0

        return (audio_segment, file_timestamps_start, file_timestamps_end, speakers)
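
To make the VAD contract concrete, below is a minimal, self-contained sketch of the call this method builds on. It assumes the Silero VAD torch hub entry point is where `vad_model` and `get_speech_timestamps` come from (an assumption inferred from the call signature above; the pipeline wires these attributes up during its own setup), and it uses a hypothetical one-second segment:

    import torch

    # Silero VAD via torch hub: returns the model plus a tuple of helpers, the
    # first of which is get_speech_timestamps (assumed to be the same callable
    # stored as self.get_speech_timestamps above).
    vad_model, utils = torch.hub.load("snakers4/silero-vad", "silero_vad")
    get_speech_timestamps = utils[0]

    sample_rate = 16000
    audio_segment = 0.01 * torch.randn(sample_rate)  # one second of near-silence

    # Each chunk is a dict with "start"/"end" sample indices; the method slices
    # the segment from the first start to the last end and rescales every
    # boundary to seconds relative to the cropped segment.
    speech_timestamps = get_speech_timestamps(audio_segment, vad_model, sampling_rate=sample_rate)
    for chunk in speech_timestamps:
        print(chunk["start"] / sample_rate, chunk["end"] / sample_rate)

When the VAD returns no chunks, the method falls back to a single chunk covering the whole segment, so callers always receive at least one (start, end, speaker) triple, as the final assert checks.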