def construct_dataset()

in src/diarizers/data/speaker_diarization.py [0:0]


    def construct_dataset(self, num_proc=1):
        """Build the speaker-diarization dataset, one split per subset.

        For every subset key in ``self.audio_paths`` the matching annotation
        files in ``self.annotations_paths[subset]`` are parsed (RTTM or CHA,
        depending on ``self.annotations_type``) and combined with the audio
        paths into a ``Dataset`` with the columns ``audio``,
        ``timestamps_start``, ``timestamps_end`` and ``speakers``. The
        ``audio`` column is cast to ``Audio`` at ``self.sample_rate``.

        When ``self.crop_unannotated_regions`` is truthy, ``self.crop_audio``
        is mapped over the split in batches of 8 and replaces all original
        columns with its output.

        Args:
            num_proc (int): number of processes forwarded to ``Dataset.map``
                for the optional cropping step. Defaults to 1.

        Returns:
            self.spd_dataset: HF dataset compatible with diarizers.

        Raises:
            ValueError: if ``self.annotations_type`` is neither ``"rttm"``
                nor ``"cha"``.
        """
        # Fail fast on an unsupported annotation format: without this check
        # the per-file loop would hit an UnboundLocalError on its first
        # iteration instead of reporting the actual configuration problem.
        supported_types = ("rttm", "cha")
        if self.annotations_type not in supported_types:
            raise ValueError(
                f"Unsupported annotations_type {self.annotations_type!r}; "
                f"expected one of {supported_types}."
            )

        # The parser does not depend on the file, so pick it once.
        if self.annotations_type == "rttm":
            parse_annotations = self.process_rttm_file
        else:
            parse_annotations = self.process_cha_file

        self.spd_dataset = DatasetDict()

        for subset in self.audio_paths:
            timestamps_start = []
            timestamps_end = []
            speakers = []

            # One entry per annotation file; each entry holds that file's
            # values as returned by the parser.
            for annotations in self.annotations_paths[subset]:
                start_file, end_file, speakers_file = parse_annotations(annotations)
                timestamps_start.append(start_file)
                timestamps_end.append(end_file)
                speakers.append(speakers_file)

            split = Dataset.from_dict(
                {
                    "audio": self.audio_paths[subset],
                    "timestamps_start": timestamps_start,
                    "timestamps_end": timestamps_end,
                    "speakers": speakers,
                }
            ).cast_column("audio", Audio(sampling_rate=self.sample_rate))

            if self.crop_unannotated_regions:
                split = split.map(
                    self.crop_audio,
                    batched=True,
                    batch_size=8,
                    # Drop every input column so only crop_audio's output remains.
                    remove_columns=split.column_names,
                    num_proc=num_proc,
                ).cast_column("audio", Audio(sampling_rate=self.sample_rate))

            self.spd_dataset[subset] = split

        return self.spd_dataset