def encode_example()

in src/datasets/features/audio.py [0:0]
40 lines of code
18 McCabe index (conditional complexity)

    def encode_example(self, value: Union[str, bytes, bytearray, dict, "AudioDecoder"]) -> dict:
        """Encode example into a format for Arrow.

        The accepted inputs are handled as follows:

        - `str`: kept as the `"path"`, with `"bytes"` set to `None`.
        - `bytes` / `bytearray`: kept as the `"bytes"`, with `"path"` set to `None`.
        - `AudioDecoder` (only checked when `config.TORCHCODEC_AVAILABLE` is true):
          delegated to `encode_torchcodec_audio`.
        - `dict` containing `"array"`: the array is written as WAV bytes using
          `value["sampling_rate"]`. Any array-like accepted by `np.asarray` works
          (e.g. a NumPy array or a list of samples).
        - `dict` whose `"path"` is an existing local file: only the path is kept,
          except for paths ending in `"pcm"`, which are converted to WAV bytes
          (this requires `"sampling_rate"`).
        - `dict` with `"bytes"` and/or `"path"` otherwise: both are passed through.

        Args:
            value (`str`, `bytes`,`bytearray`,`dict`, `AudioDecoder`):
                Data passed as input to Audio feature.

        Returns:
            `dict`: a dictionary with exactly the keys `"bytes"` and `"path"`.

        Raises:
            ImportError: If `soundfile` is not installed.
            ValueError: If `value` is `None`, or if a dict input has neither
                `"bytes"` nor `"path"` set.
            KeyError: If `"sampling_rate"` is missing or `None` when encoding an
                `"array"` or a PCM file.
        """
        try:
            import soundfile as sf  # needed to write audio files
        except ImportError as err:
            raise ImportError("To support encoding audio data, please install 'soundfile'.") from err

        if value is None:
            raise ValueError("value must be provided")

        # torchcodec is optional: only import its decoder type when it is available.
        if config.TORCHCODEC_AVAILABLE:
            from torchcodec.decoders import AudioDecoder
        else:
            AudioDecoder = None

        # Non-dict inputs are dispatched first.
        if isinstance(value, str):
            return {"bytes": None, "path": value}
        if isinstance(value, (bytes, bytearray)):
            return {"bytes": value, "path": None}
        if AudioDecoder is not None and isinstance(value, AudioDecoder):
            return encode_torchcodec_audio(value)

        if "array" in value:
            # Validate the sampling rate the same way the PCM branch below does,
            # so a missing/None rate gives an actionable message.
            if value.get("sampling_rate") is None:
                raise KeyError("To encode an audio 'array', please specify a 'sampling_rate' in Audio object")
            # convert the audio array to wav bytes; np.asarray lets plain lists
            # through (they have no `.T`) and is a no-op for NumPy arrays.
            samples = np.asarray(value["array"]).T
            buffer = BytesIO()
            sf.write(buffer, samples, value["sampling_rate"], format="wav")
            return {"bytes": buffer.getvalue(), "path": None}

        path = value.get("path")
        raw_bytes = value.get("bytes")

        if path is not None and os.path.isfile(path):
            if not path.endswith("pcm"):
                # we set "bytes": None to not duplicate the data if they're already available locally
                return {"bytes": None, "path": path}

            # "PCM" only has raw audio bytes, so the sampling rate must be given
            # explicitly in order to produce WAV bytes.
            if value.get("sampling_rate") is None:
                raise KeyError("To use PCM files, please specify a 'sampling_rate' in Audio object")
            if raw_bytes:
                # PCM bytes were already provided: use them instead of reading the file.
                pcm_samples = np.frombuffer(raw_bytes, dtype=np.int16)
            else:
                pcm_samples = np.memmap(path, dtype="h", mode="r")
            # The 16-bit integer samples are converted to float32 and divided by 32767.
            float_samples = pcm_samples.astype(np.float32) / 32767

            buffer = BytesIO(b"")
            sf.write(buffer, float_samples, value["sampling_rate"], format="wav")
            return {"bytes": buffer.getvalue(), "path": None}

        if raw_bytes is not None or path is not None:
            # store the audio bytes, and path is used to infer the audio format using the file extension
            return {"bytes": raw_bytes, "path": path}

        raise ValueError(
            f"An audio sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
        )