in src/diarizers/data/synthetic_pipeline.py [0:0]
def sample_meeting_segments(self):
    """Sample segments that will be used for meeting generation.

    Draws ``self.nb_speakers_per_meeting`` speakers from
    ``self.speakers_to_sample_from`` and then picks
    ``self.segments_per_meeting`` segment indexes one at a time. Each pick is
    chosen at random from the current speaker's pool and removed from that
    pool; a speaker whose pool becomes empty is dropped from both
    ``self.audio_index_pool`` and ``self.sampled_speakers``. After every pick
    the next speaker is obtained from ``self.sample_next_speaker()``.

    The method overwrites ``self.sampled_speakers``, ``self.audio_index_pool``
    and ``self.current_speaker``. The per-speaker index lists are copied
    first, so ``self.speaker_indexes_in_dataset`` itself is not modified.

    NOTE(review): there is no guard here for running out of segments or
    speakers before ``segments_per_meeting`` picks are made; what happens
    then depends on ``sample_next_speaker`` -- confirm the caller guarantees
    enough segments.

    Returns:
        batch_samples (HuggingFace dataset): batch of audio segments to be concatenated to form a meeting.

    Raises:
        AssertionError: if the selected batch does not contain exactly
            ``self.segments_per_meeting`` rows.
    """
    # Sample nb_speakers_per_meeting from the list of speakers_to_sample_from:
    self.sampled_speakers = random.sample(self.speakers_to_sample_from, self.nb_speakers_per_meeting)

    # Get the pool of segments associated with the speakers. The lists are
    # copied so that consuming them below leaves speaker_indexes_in_dataset intact:
    self.audio_index_pool = {
        speaker: self.speaker_indexes_in_dataset[str(speaker)].copy() for speaker in self.sampled_speakers
    }

    self.current_speaker = self.sampled_speakers[0]
    indexes = []

    # Sample segments_per_meeting segments:
    for _ in range(self.segments_per_meeting):
        # select a segment from the current speaker and remove it from the pool of segments:
        pool = self.audio_index_pool[self.current_speaker]
        picked = random.choice(pool)
        pool.remove(picked)
        indexes.append(picked)

        # A speaker with no segments left can no longer be sampled from:
        if not pool:
            del self.audio_index_pool[self.current_speaker]
            self.sampled_speakers.remove(self.current_speaker)

        # Sample next speaker
        self.current_speaker = self.sample_next_speaker()

    batch_samples = self.speakers_to_sample_from_dataset.select(indexes)

    # Sanity check: one selected row per requested segment.
    assert len(batch_samples) == self.segments_per_meeting

    return batch_samples