in avhubert/hubert_dataset.py [0:0]
def load_audio_visual(manifest_path, max_keep, min_keep, frame_rate, label_paths, label_rates, tol=0.1):
    """Read a tab-separated manifest and select the usable audio-visual entries.

    The first manifest line is the root directory. Every following line is a
    tab-separated record read as ``items[0]`` = id, ``items[1]`` = video path,
    ``items[2]`` = audio path, and ``items[-2]`` = integer size of the entry
    (presumably a frame count, since it is divided by ``frame_rate`` to get a
    duration -- confirm against the manifest producer).

    An entry is skipped when it is shorter than ``min_keep``, longer than
    ``max_keep``, or (only when no label rate equals ``-1``) when its duration
    ``size / frame_rate`` differs by ``tol`` or more from any label duration
    ``token_count / label_rate`` on the matching line of a label file.

    Args:
        manifest_path: Path of the manifest file.
        max_keep: Largest size to keep, or ``None`` for no upper bound.
        min_keep: Smallest size to keep, or ``None`` for no lower bound.
        frame_rate: Divisor converting an entry size into a duration.
        label_paths: Label files, one whitespace-tokenised line per entry.
        label_rates: Rate of each label file; a ``-1`` anywhere disables the
            duration-alignment check for all label files.
        tol: Exclusive bound on the allowed audio/label duration difference.

    Returns:
        Tuple ``(root, names, inds, tot, sizes)`` where ``names`` holds
        ``(video_path, "audio_path:id")`` pairs of the kept entries, ``inds``
        their zero-based manifest line indices, ``sizes`` their sizes, and
        ``tot`` the total number of manifest entries read (``0`` when the
        manifest has no entries).
    """

    def is_audio_label_aligned(audio_dur, label_durs):
        # Aligned only if every label stream agrees with the audio duration.
        return all(abs(audio_dur - label_dur) < tol for label_dur in label_durs)

    n_long, n_short, n_unaligned = 0, 0, 0
    names, inds, sizes = [], [], []
    is_seq_label = any(x == -1 for x in label_rates)

    dur_from_label_list = []
    for label_path, label_rate in zip(label_paths, label_rates):
        # Context manager so the label file handle is closed deterministically.
        with open(label_path) as label_file:
            label_lengths = [len(line.rstrip().split()) / label_rate for line in label_file]
        dur_from_label_list.append(label_lengths)
    # Transpose: one tuple of per-label-file durations for each manifest entry.
    dur_from_label_list = list(zip(*dur_from_label_list))

    # Initialised up front so a manifest with no entries yields tot == 0
    # instead of failing on an unbound loop variable.
    tot = 0
    with open(manifest_path) as f:
        root = f.readline().strip()
        for ind, line in enumerate(f):
            tot = ind + 1
            items = line.strip().split("\t")
            sz = int(items[-2])
            if min_keep is not None and sz < min_keep:
                n_short += 1
            elif max_keep is not None and sz > max_keep:
                n_long += 1
            elif (not is_seq_label) and (not is_audio_label_aligned(sz / frame_rate, dur_from_label_list[ind])):
                n_unaligned += 1
            else:
                video_path = items[1]
                audio_path = items[2]
                audio_id = items[0]
                names.append((video_path, audio_path + ':' + audio_id))
                inds.append(ind)
                sizes.append(sz)

    # `default=None` keeps the summary from raising when nothing was loaded.
    logger.info(
        (
            f"max_keep={max_keep}, min_keep={min_keep}, "
            f"loaded {len(names)}, skipped {n_short} short and {n_long} long and {n_unaligned} unaligned, "
            f"longest-loaded={max(sizes, default=None)}, shortest-loaded={min(sizes, default=None)}"
        )
    )
    return root, names, inds, tot, sizes