def sample_meeting_segments()

in src/diarizers/data/synthetic_pipeline.py [0:0]


    def sample_meeting_segments(self):
        """Sample the audio segments that will be concatenated to form one meeting.

        Draws ``nb_speakers_per_meeting`` speakers from ``speakers_to_sample_from``,
        then picks ``segments_per_meeting`` segment indexes without replacement,
        one per turn, from the current speaker's pool. A speaker whose pool
        becomes empty is dropped from the meeting. After each pick the next
        speaker is chosen by ``sample_next_speaker``.

        Side effects:
            Overwrites ``self.sampled_speakers``, ``self.audio_index_pool`` and
            ``self.current_speaker``. ``self.speaker_indexes_in_dataset`` is not
            modified (the per-speaker index lists are copied).

        Returns:
            batch_samples (HuggingFace dataset): batch of audio segments to be concatenated to form a meeting.

        Raises:
            ValueError: if the sampled speakers hold fewer segments in total than
                ``segments_per_meeting`` (also raised by ``random.sample`` when
                ``nb_speakers_per_meeting`` exceeds the number of available speakers).
        """

        # Sample nb_speakers_per_meeting from the list of speakers_to_sample_from:
        self.sampled_speakers = random.sample(self.speakers_to_sample_from, self.nb_speakers_per_meeting)
        # Get the pool of segments associated with the speakers. Each list is copied
        # so that removing drawn indexes never alters speaker_indexes_in_dataset:
        self.audio_index_pool = {
            speaker: self.speaker_indexes_in_dataset[str(speaker)].copy() for speaker in self.sampled_speakers
        }

        # Fail early with an explicit message instead of running out of segments
        # part-way through the sampling loop.
        available_segments = sum(len(pool) for pool in self.audio_index_pool.values())
        if available_segments < self.segments_per_meeting:
            raise ValueError(
                f"Cannot sample {self.segments_per_meeting} segments: the "
                f"{len(self.sampled_speakers)} sampled speakers only provide "
                f"{available_segments} segments."
            )

        self.current_speaker = self.sampled_speakers[0]

        indexes = []
        # Sample segments_per_meeting segments:
        for _ in range(self.segments_per_meeting):
            speaker_pool = self.audio_index_pool[self.current_speaker]

            # select a segment from the current speaker and remove it from the pool of segments:
            chosen_index = random.choice(speaker_pool)
            speaker_pool.remove(chosen_index)
            indexes.append(chosen_index)

            # A speaker with no segments left can no longer take part in the meeting.
            if not speaker_pool:
                del self.audio_index_pool[self.current_speaker]
                self.sampled_speakers.remove(self.current_speaker)

            # Sample next speaker. Given the check above, the speaker list can only
            # be empty right after the final segment was drawn, when no further
            # speaker is needed; skipping the call avoids sampling from an empty list.
            if self.sampled_speakers:
                self.current_speaker = self.sample_next_speaker()

        batch_samples = self.speakers_to_sample_from_dataset.select(indexes)

        # Internal sanity check: one selected row per requested segment.
        assert len(batch_samples) == self.segments_per_meeting
        return batch_samples