in 01-byoc/code/dataset.py [0:0]
def preprocessing(self, wav_data, sr):
    """Convert wav_data to a fixed-size log mel spectrogram.

    Steps:
        1. normalize the wav_data according to ``self.normalize``
        2. convert the wav_data into mono-channel
        3. resample the wav_data to ``self.params.sr`` if ``sr`` differs
        4. compute the log mel spectrogram with librosa
        5. zero-pad or truncate the result to exactly 500 frames

    Args:
        wav_data: An np.array indicating wav data in np.int16 datatype,
            either 1-D or 2-D with channels on axis 1.
        sr: An integer specifying the sampling rate of this wav data

    Return:
        inpt: A float32 np.array of shape (1, 500, self.params.mel)
            holding the log mel spectrogram of data
    """
    # Do all arithmetic in float64. Squaring (or taking abs of) raw int16
    # samples silently wraps around and corrupts the normalization gain.
    wav_data = np.asarray(wav_data, dtype=np.float64)

    # normalize wav_data
    if self.normalize == 'peak':
        # Scale by the largest magnitude, not the largest signed value, so
        # the result stays in [-1, 1] even when the biggest excursion is
        # negative. A silent signal is left untouched to avoid 0/0.
        peak = np.max(np.abs(wav_data))
        samples = wav_data / peak if peak > 0 else wav_data
    elif self.normalize == 'rms':
        # NOTE(review): rms_level looks like a target level in dB; with 0
        # the linear target r is 1 either way -- confirm the /10 vs /20
        # convention before ever changing rms_level.
        rms_level = 0
        r = 10 ** (rms_level / 10.0)
        energy = np.sum(wav_data ** 2)
        if energy > 0:
            # len() counts rows (frames), which keeps the original scaling.
            gain = np.sqrt((len(wav_data) * r ** 2) / energy)
            samples = wav_data * gain
        else:
            # Silent input: nothing to scale, and dividing would yield NaN.
            samples = wav_data
    else:
        # Default: map the int16 range onto [-1, 1).
        samples = wav_data / 32768.0

    # convert samples to mono-channel by averaging the channels
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # resample samples to the configured sampling rate
    if sr != self.params.sr:
        samples = resampy.resample(samples, sr, self.params.sr)

    # transform samples to mel spectrogram; the audio must be passed as the
    # keyword ``y`` because librosa >= 0.10 no longer accepts it positionally
    spec = librosa.feature.melspectrogram(
        y=samples,
        sr=self.params.sr,
        n_fft=self.params.nfft,
        hop_length=self.params.hop,
        n_mels=self.params.mel,
    )
    # transpose so that frames run along axis 0
    spec_db = librosa.power_to_db(spec).T

    # force a fixed number of frames: zero-pad short clips, truncate long ones
    inpt_x = 500
    missing = inpt_x - spec_db.shape[0]
    if missing > 0:
        spec_db = np.pad(spec_db, ((0, missing), (0, 0)), mode='constant')
    else:
        spec_db = spec_db[:inpt_x]

    # add a leading singleton axis -> (1, frames, mels)
    inpt = spec_db[np.newaxis, :, :]
    return inpt.astype('float32')