in avhubert/hubert_dataset.py [0:0]
def load_feature(self, mix_name):
    """
    Load video frames and audio features for one sample.

    Args:
        mix_name: (video_fn, audio_fn) pair of paths. audio_fn may carry a
            ':'-suffixed annotation, which is stripped before reading.
    Returns:
        (video_feats, audio_feats) where
        video_feats: numpy.ndarray of shape [T, H, W, 1], or None when
            'video' is not in self.modalities;
        audio_feats: numpy.ndarray of shape [T', F*stack_order_audio]
            (float32), or None when 'audio' is not in self.modalities.
        When both are present, audio is zero-padded or truncated so that
        len(audio_feats) == len(video_feats).
    Raises:
        ValueError: if the wav file is not 16 kHz mono.
    """
    def stacker(feats, stack_order):
        """
        Concatenate every `stack_order` consecutive audio frames into one.

        Args:
            feats: numpy.ndarray of shape [T, F]
            stack_order: int (number of neighboring frames to concatenate)
        Returns:
            numpy.ndarray of shape [ceil(T/stack_order), F*stack_order];
            the input is zero-padded at the end so T divides evenly.
        """
        feat_dim = feats.shape[1]
        if len(feats) % stack_order != 0:
            # Zero-pad the tail so the frame count is a multiple of stack_order.
            n_pad = stack_order - len(feats) % stack_order
            padding = np.zeros([n_pad, feat_dim], dtype=feats.dtype)
            feats = np.concatenate([feats, padding], axis=0)
        # Single reshape is equivalent to the former reshape->reshape pair.
        return feats.reshape(-1, stack_order * feat_dim)

    video_fn, audio_fn = mix_name
    if 'video' in self.modalities:
        video_feats = self.load_video(video_fn)  # [T, H, W, 1]
    else:
        video_feats = None
    if 'audio' in self.modalities:
        audio_fn = audio_fn.split(':')[0]  # drop any ':'-suffixed annotation
        sample_rate, wav_data = wavfile.read(audio_fn)
        # Explicit raise instead of `assert`: asserts vanish under `python -O`.
        if sample_rate != 16_000 or wav_data.ndim != 1:
            raise ValueError(
                f"expected 16 kHz mono wav, got {sample_rate} Hz with shape {wav_data.shape}: {audio_fn}"
            )
        # Randomly mix in noise for augmentation with probability noise_prob.
        if np.random.rand() < self.noise_prob:
            wav_data = self.add_noise(wav_data)
        audio_feats = logfbank(wav_data, samplerate=sample_rate).astype(np.float32)  # [T, F]
        audio_feats = stacker(audio_feats, self.stack_order_audio)  # [T/stack_order_audio, F*stack_order_audio]
    else:
        audio_feats = None
    if audio_feats is not None and video_feats is not None:
        # Align audio length to video length: pad audio with zero frames or truncate.
        diff = len(audio_feats) - len(video_feats)
        if diff < 0:
            pad = np.zeros([-diff, audio_feats.shape[-1]], dtype=audio_feats.dtype)
            audio_feats = np.concatenate([audio_feats, pad])
        elif diff > 0:
            audio_feats = audio_feats[:-diff]
    return video_feats, audio_feats