in datasets/preprocessing.py [0:0]
def __call__(self, sig, sr, duration=None):
    """Preprocess one audio clip to a fixed-length mono float32 signal.

    Args:
        sig: audio samples as a numpy array, either 2-D ``(channels, frames)``
            or 1-D ``(frames,)`` (already mono); may be ``None`` for missing
            audio when ``self.missing_as_zero`` is set.
        sr: sample rate in Hz.
        duration: target duration in seconds; falls back to ``self.duration``
            when ``None``.

    Returns:
        ``(sig, sr)`` where ``sig`` has shape ``(1, num_frames)`` — a numpy
        array, or a torch tensor when ``self.to_tensor`` is set.

    Configuration read from ``self`` (set elsewhere in the class — not
    visible in this chunk): ``duration``, ``missing_as_zero``, ``trim_pad``,
    ``augment``, ``volume``, ``to_tensor``.
    """
    if duration is None:
        duration = self.duration
    num_frames = int(duration * sr)
    # Substitute silence for missing audio when configured to do so.
    if self.missing_as_zero and sig is None:
        sig = np.zeros((1, num_frames), dtype=np.float32)
    # Downmix to mono. Only average over channels for multi-channel input:
    # calling mean(0) on a 1-D signal would collapse it to a scalar and
    # break the shape logic below.
    if sig.ndim > 1:
        sig = sig.mean(0)
    sig = sig.astype(np.float32)
    # Trim or zero-pad to a constant number of frames so batches stack.
    if self.trim_pad:
        if sig.shape[0] > num_frames:
            sig = sig[:num_frames]
        elif sig.shape[0] < num_frames:
            n_pad = num_frames - sig.shape[0]
            sig = np.pad(sig, (0, n_pad), mode='constant', constant_values=(0., 0.))
    # Augment by randomly scaling volume within +/- self.volume (e.g. 10%).
    if self.augment:
        sig *= random.uniform(1. - self.volume, 1. + self.volume)
    # Restore a leading channel dimension: (num_frames,) -> (1, num_frames).
    sig = sig[np.newaxis]
    if self.to_tensor:
        sig = torch.from_numpy(sig)
    return sig, sr