in src/diarizers/data/speaker_diarization.py [0:0]
def construct_dataset(self, num_proc=1):
    """Build one dataset per subset and gather them in ``self.spd_dataset``.

    For every subset key in ``self.audio_paths``, each annotation file listed
    in ``self.annotations_paths[subset]`` is parsed with ``process_rttm_file``
    or ``process_cha_file`` (selected by ``self.annotations_type``). The
    results are stored in a dataset with the columns ``audio``,
    ``timestamps_start``, ``timestamps_end`` and ``speakers``, where the
    ``audio`` column is cast to ``Audio(sampling_rate=self.sample_rate)``.

    Rows pair the i-th audio path with the i-th annotation file of a subset,
    so both lists must be in the same order.

    When ``self.crop_unannotated_regions`` is truthy, ``self.crop_audio`` is
    mapped over each subset in batches of 8; the original columns are dropped
    in favour of whatever ``crop_audio`` returns, and ``audio`` is cast again.

    Args:
        num_proc: Forwarded to ``Dataset.map`` as ``num_proc``. Only used
            when ``self.crop_unannotated_regions`` is set.

    Returns:
        self.spd_dataset: HF dataset compatible with diarizers.

    Raises:
        ValueError: If an annotation file has to be parsed and
            ``self.annotations_type`` is neither ``"rttm"`` nor ``"cha"``.
    """
    self.spd_dataset = DatasetDict()

    for subset in self.audio_paths:
        timestamps_start = []
        timestamps_end = []
        speakers = []

        for annotations in self.annotations_paths[subset]:
            if self.annotations_type == "rttm":
                file_starts, file_ends, file_speakers = self.process_rttm_file(annotations)
            elif self.annotations_type == "cha":
                file_starts, file_ends, file_speakers = self.process_cha_file(annotations)
            else:
                # Without this branch the appends below would either fail
                # with an opaque UnboundLocalError or silently reuse the
                # values parsed from the previous file.
                raise ValueError(
                    f"Unsupported annotations_type {self.annotations_type!r}; "
                    "expected 'rttm' or 'cha'."
                )

            # One entry per annotation file, i.e. one entry per dataset row.
            timestamps_start.append(file_starts)
            timestamps_end.append(file_ends)
            speakers.append(file_speakers)

        subset_dataset = Dataset.from_dict(
            {
                "audio": self.audio_paths[subset],
                "timestamps_start": timestamps_start,
                "timestamps_end": timestamps_end,
                "speakers": speakers,
            }
        ).cast_column("audio", Audio(sampling_rate=self.sample_rate))

        if self.crop_unannotated_regions:
            subset_dataset = subset_dataset.map(
                self.crop_audio,
                batched=True,
                batch_size=8,
                remove_columns=subset_dataset.column_names,
                num_proc=num_proc,
            ).cast_column("audio", Audio(sampling_rate=self.sample_rate))

        self.spd_dataset[subset] = subset_dataset

    return self.spd_dataset