# tutorials/video_classification_example/train.py

# Imports this snippet needs; Normalize and UniformTemporalSubsample are
# PyTorchVideo's video-aware transforms, the rest come from TorchAudio and
# torchvision.
import torch
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    UniformTemporalSubsample,
)
from torchaudio.transforms import MelSpectrogram, Resample
from torchvision.transforms import Compose, Lambda
def _audio_transform(self):
    """
    Example transforms using both PyTorchVideo and TorchAudio in the same
    Callable: resample the raw waveform, convert it to a log-mel spectrogram,
    uniformly subsample a fixed number of time steps, reshape to a video-style
    (C, T, H, W) tensor, and normalize.
    """
    args = self.args
    # Convert the window and step sizes from milliseconds to samples at the
    # resampled rate: samples = rate / 1000 * milliseconds.
    n_fft = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_window_size
    )
    hop_length = int(
        float(args.audio_resampled_rate) / 1000 * args.audio_mel_step_size
    )
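    # For example, a 16 kHz resampled rate with a 32 ms window and a 16 ms step
    # (illustrative values, not the tutorial's argparse defaults) gives
    # n_fft = 16000 / 1000 * 32 = 512 and hop_length = 256 samples.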
    eps = 1e-10  # floor applied before the log so silent frames don't hit log(0)
    return ApplyTransformToKey(
        key="audio",
        transform=Compose(
            [
                # TorchAudio: resample the raw waveform and take a log-mel
                # spectrogram (the clamp is the numerical floor for the log).
                Resample(
                    orig_freq=args.audio_raw_sample_rate,
                    new_freq=args.audio_resampled_rate,
                ),
                MelSpectrogram(
                    sample_rate=args.audio_resampled_rate,
                    n_fft=n_fft,
                    hop_length=hop_length,
                    n_mels=args.audio_num_mels,
                    center=False,
                ),
                Lambda(lambda x: x.clamp(min=eps)),
                Lambda(torch.log),
                # PyTorchVideo: subsample a fixed number of time steps and
                # reshape the spectrogram into a video-style (C, T, H, W) tensor.
                UniformTemporalSubsample(args.audio_mel_num_subsample),
                Lambda(lambda x: x.transpose(1, 0)),  # (F, T) -> (T, F)
                Lambda(
                    lambda x: x.view(1, x.size(0), 1, x.size(1))
                ),  # (T, F) -> (1, T, 1, F)
                Normalize((args.audio_logmel_mean,), (args.audio_logmel_std,)),
            ]
        ),
    )
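

# A minimal, self-contained sketch of the same log-mel recipe on a dummy
# waveform, with the temporal subsampling written out by hand
# (torch.linspace + index_select, which is also how uniform temporal
# subsampling picks its frames) so the shape bookkeeping is explicit. The
# rates, sizes, and the helper name _demo_log_mel are illustrative
# assumptions; the tutorial drives these values from argparse instead.
def _demo_log_mel(waveform: torch.Tensor) -> torch.Tensor:
    resampled = Resample(orig_freq=44_100, new_freq=16_000)(waveform)
    mel = MelSpectrogram(
        sample_rate=16_000, n_fft=512, hop_length=256, n_mels=80, center=False
    )(resampled)  # (F, T) = (80, num_frames)
    log_mel = mel.clamp(min=1e-10).log()
    # Uniformly keep 128 of the available time steps.
    idx = torch.linspace(0, log_mel.size(1) - 1, 128).long()
    sub = log_mel.index_select(1, idx)
    return sub.transpose(1, 0).view(1, 128, 1, 80)  # (C, T, H, W)


# e.g. _demo_log_mel(torch.randn(44_100 * 2)).shape -> torch.Size([1, 128, 1, 80])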