in avhubert/preparation/lrs3_prepare.py [0:0]
def _read_word_alignment(txt_fn):
    """Parse one transcript file into its raw text and word timings.

    The raw text is whatever follows the last ':' on the first line. Word
    timings are read from the lines after the first line that starts with
    'WORD'; each such line must hold four whitespace-separated fields
    (word, start, end, score). The score is parsed but not used.

    Args:
        txt_fn: path of the transcript file.

    Returns:
        A ``(raw_text, word_intervals)`` tuple where ``word_intervals`` is a
        list of ``[word, start, end]`` with float start/end.

    Raises:
        ValueError: if the file is empty, has no 'WORD' header line, lists no
            words after it, or has a word line without exactly four fields.
    """
    # Context manager so the handle is closed even when parsing fails.
    with open(txt_fn) as fi:
        lns = fi.readlines()
    if not lns:
        raise ValueError(f"{txt_fn}: empty transcript file")
    raw_text = lns[0].strip().split(':')[-1].strip()

    # Locate the header explicitly; without this check a file lacking the
    # header would silently reuse the index found in the previous file.
    start_index = next(
        (i_line for i_line, ln in enumerate(lns) if ln[:4] == 'WORD'), None
    )
    if start_index is None:
        raise ValueError(f"{txt_fn}: no 'WORD' header line found")

    word_intervals = []
    for ln in lns[start_index + 1:]:
        fields = ln.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word, start, end, _score = fields
        word_intervals.append([word, float(start), float(end)])
    if not word_intervals:
        raise ValueError(f"{txt_fn}: no word timings after 'WORD' header")
    return raw_text, word_intervals


def _split_on_pauses(fid, word_intervals, min_interval):
    """Group consecutive words into sentences, cutting at long pauses.

    A new sentence starts whenever the gap between the end of the previous
    word and the start of the current one exceeds ``min_interval``.

    Args:
        fid: utterance id, used only in the error message.
        word_intervals: non-empty list of ``[word, start, end]``.
        min_interval: pause length above which a cut is made.

    Returns:
        A list of sentences, each a list of ``[word, start, end]``.

    Raises:
        AssertionError: if a word starts before the previous word ends.
    """
    sents = [[list(word_intervals[0])]]
    for word, start, end in word_intervals[1:]:
        prev_end = sents[-1][-1][-1]
        # Raised explicitly (instead of `assert`) so the check still runs
        # under `python -O`; type and message match the previous assert.
        if not start >= prev_end:
            raise AssertionError(
                f"{fid} , {word}, start-{start}, prev-{prev_end}"
            )
        if start - prev_end > min_interval:
            sents.append([[word, start, end]])
        else:
            sents[-1].append([word, start, end])
    return sents


def make_short_manifest(pretrain_dir, output_fn):
    """Write a manifest that cuts long pretrain utterances into short segments.

    Every ``<pretrain_dir>/<subdir>/*txt`` transcript is read. If its last
    word ends before 15, the utterance is kept whole (start 0, end -1) with
    the raw transcript text. Otherwise the words are split into sentences at
    pauses longer than 0.4, and each sentence becomes a row ``<fid>_<k>``
    whose boundaries are the midpoints of the pauses to the neighbouring
    sentences. The first segment starts at 0 and the last one ends at -1.

    Two summary percentages are printed, then the rows are written to
    ``output_fn`` as plain comma-joined lines under the header
    ``id,text,start,end``. Fields are not quoted or escaped.

    Sub-directories and transcript files are visited in sorted order so the
    manifest is reproducible across runs and file systems.

    Args:
        pretrain_dir: directory containing one sub-directory per group of
            transcripts.
        output_fn: path of the manifest file to create or overwrite.

    Raises:
        ValueError: if a transcript file is malformed.
        AssertionError: if word timings in a transcript overlap.
    """
    min_interval = 0.4
    max_duration = 15
    rows = []  # one (fid, text, start, end) tuple per manifest line

    for subdir in tqdm(sorted(os.listdir(pretrain_dir))):
        txt_fns = sorted(glob.glob(os.path.join(pretrain_dir, subdir+'/*txt')))
        for txt_fn in txt_fns:
            # Id is the path relative to pretrain_dir minus its last 4 chars.
            fid = os.path.relpath(txt_fn, pretrain_dir)[:-4]
            raw_text, word_intervals = _read_word_alignment(txt_fn)

            if word_intervals[-1][-1] < max_duration:
                # Short enough: keep whole; -1 marks an open-ended segment.
                rows.append((fid, raw_text, 0, -1))
                continue

            sents = _split_on_pauses(fid, word_intervals, min_interval)
            last_sent = len(sents) - 1
            for i_sent, sent in enumerate(sents):
                if i_sent == 0:
                    sent_start = 0
                else:
                    # Midpoint of the pause before this sentence.
                    sent_start = (sent[0][1] + sents[i_sent-1][-1][2])/2
                if i_sent == last_sent:
                    sent_end = -1
                else:
                    # Midpoint of the pause after this sentence.
                    sent_end = (sent[-1][2] + sents[i_sent+1][0][1])/2
                sent_words = ' '.join(x[0] for x in sent)
                rows.append((fid+'_'+str(i_sent), sent_words, sent_start, sent_end))

    # NOTE: rows with the -1 end sentinel yield a negative "duration" here,
    # so whole utterances and final segments never count as long.
    durations = [end - start for _, _, start, end in rows]
    if durations:  # avoid dividing by zero when no transcript was found
        num_long = sum(1 for d in durations if d > 15)
        print(f"Percentage of >15 second: {100*num_long/len(durations)}%")
        num_long = sum(1 for d in durations if d > 20)
        print(f"Percentage of >20 second: {100*num_long/len(durations)}%")

    with open(output_fn, 'w') as fo:
        fo.write('id,text,start,end\n')
        fo.writelines(
            f"{fid},{text},{start:.3f},{end:.3f}\n"
            for fid, text, start, end in rows
        )
    return