def _read_audio_and_transform_to_feature()

in ludwig/features/audio_feature.py [0:0]


    def _read_audio_and_transform_to_feature(filepath, audio_feature_dict,
                                             feature_dim, max_length,
                                             padding_value, normalization_type,
                                             audio_stats):
        """
        Read one audio file and convert it into a fixed-size float32 feature
        array of shape ``(max_length, feature_dim)``.

        The audio is loaded with ``soundfile``, turned into either a raw or a
        2D feature, optionally normalized, and finally truncated / padded
        along the first axis so that every file yields the same shape.

        :param filepath: path to the audio
        :param audio_feature_dict: dictionary describing audio feature see
               default; its ``TYPE`` entry selects the feature: ``'raw'``,
               ``'stft'``, ``'stft_phase'``, ``'group_delay'`` or ``'fbank'``
        :param feature_dim: dimension of each feature frame (second axis of
               the returned array)
        :param max_length: max audio length defined by user; number of rows
               of the returned array. Longer features are truncated, shorter
               ones are filled up with ``padding_value``
        :param padding_value: value written into the rows that are not
               covered by the audio feature
        :param normalization_type: ``'per_file'`` standardizes the feature
               with the mean / standard deviation taken over axis 0 of this
               file only; ``'global'`` is not implemented; any other value
               leaves the feature untouched
        :param audio_stats: passed on to ``AudioFeatureMixin._update``
               together with the audio and its sampling rate (presumably an
               accumulator of dataset statistics -- verify against
               ``_update``)
        :return: ``np.float32`` array of shape ``(max_length, feature_dim)``
        :raises ValueError: if the feature type is not recognized or if
                ``normalization_type`` is ``'global'``
        :raises SystemExit: if ``soundfile`` cannot be imported
        """
        # soundfile is an optional dependency, so it is imported lazily here.
        try:
            import soundfile
        except ImportError:
            logger.error(
                ' soundfile is not installed. '
                'In order to install all audio feature dependencies run '
                'pip install ludwig[audio]'
            )
            sys.exit(-1)

        kind = audio_feature_dict[TYPE]
        samples, rate_hz = soundfile.read(filepath)
        AudioFeatureMixin._update(audio_stats, samples, rate_hz)

        # Build the (frames, feature_dim) representation.
        if kind == 'raw':
            # Append a trailing axis so each sample becomes its own frame.
            frames = np.expand_dims(samples, axis=-1)
        elif kind in ('stft', 'stft_phase', 'group_delay', 'fbank'):
            feature_2d = AudioFeatureMixin._get_2D_feature(
                samples, kind, audio_feature_dict, rate_hz
            )
            # Transposed so that axis 0 is the one truncated / padded below.
            frames = np.transpose(feature_2d)
        else:
            raise ValueError('{} is not recognized.'.format(kind))

        # Optional normalization.
        if normalization_type == 'per_file':
            center = np.mean(frames, axis=0)
            spread = np.std(frames, axis=0)
            # The small epsilon avoids a division by zero for constant
            # feature columns.
            frames = (frames - center) / (spread + 1.0e-10)
        elif normalization_type == 'global':
            raise ValueError('not implemented yet')

        # Copy at most max_length frames into a buffer pre-filled with the
        # padding value.
        n_kept = min(frames.shape[0], max_length)
        padded = np.full(
            (max_length, feature_dim), padding_value, dtype=np.float32
        )
        padded[:n_kept, :] = frames[:n_kept, :]

        return padded