def make_short_manifest()

in avhubert/preparation/lrs3_prepare.py [0:0]


def _read_alignment(txt_fn):
    """Parse one alignment text file into its transcript and word timings.

    The first line is expected to hold the transcript after a ``:``; a line
    starting with ``WORD`` marks the header of the word table, and every
    non-blank line after it must have four whitespace-separated fields
    (word, start, end, score).

    Args:
        txt_fn: Path of the alignment text file.

    Returns:
        ``(raw_text, word_intervals)`` where ``word_intervals`` is a
        non-empty list of ``[word, start, end]`` with float times.

    Raises:
        ValueError: If the ``WORD`` header is missing, a table row does not
            have exactly four fields, or the table has no rows.
    """
    # Context manager so the handle is closed even when parsing fails.
    with open(txt_fn) as fi:
        lns = fi.readlines()

    # Look the header up per file; never reuse an index from a previous file.
    start_index = None
    for i_line, ln in enumerate(lns):
        if ln[:4] == 'WORD':
            start_index = i_line
            break
    if start_index is None:
        raise ValueError(f"{txt_fn}: no 'WORD' header line found")

    raw_text = lns[0].strip().split(':')[-1].strip()

    word_intervals = []
    for ln in lns[start_index+1:]:
        fields = ln.strip().split()
        if not fields:
            # Tolerate blank/trailing lines inside or after the word table.
            continue
        word, start, end, _score = fields
        word_intervals.append([word, float(start), float(end)])
    if not word_intervals:
        raise ValueError(f"{txt_fn}: no word alignments after 'WORD' header")
    return raw_text, word_intervals


def _split_on_pauses(word_intervals, min_interval, fid):
    """Group consecutive words, cutting wherever the silence exceeds min_interval.

    Args:
        word_intervals: Non-empty list of ``[word, start, end]``.
        min_interval: A gap strictly larger than this (same unit as the
            timings) between one word's end and the next word's start
            begins a new group.
        fid: File id, used only in the assertion message.

    Returns:
        List of groups, each a non-empty list of ``[word, start, end]``.
    """
    sents = []
    cur_sent = [word_intervals[0]]
    for word, start, end in word_intervals[1:]:
        prev_end = cur_sent[-1][-1]
        # Words must not overlap or go backwards in time.
        assert start >= prev_end, f"{fid} , {word}, start-{start}, prev-{prev_end}"
        if start - prev_end > min_interval:
            sents.append(cur_sent)
            cur_sent = [[word, start, end]]
        else:
            cur_sent.append([word, start, end])
    sents.append(cur_sent)
    return sents


def make_short_manifest(pretrain_dir, output_fn):
    """Write a CSV manifest that cuts long utterances into shorter segments.

    Every ``<pretrain_dir>/<subdir>/*txt`` alignment file is read. If its last
    word ends before 15 it yields a single row covering the whole file
    (start ``0``, end ``-1``). Otherwise the words are split at pauses longer
    than 0.4 and one row per segment is written, with boundaries placed at
    the midpoint of the pause; the first segment starts at ``0`` and the last
    one ends at ``-1``.

    NOTE(review): ``-1`` presumably means "until the end of the clip" for the
    consumer of this manifest, and the timings are presumably seconds (the
    printed statistics say "second") -- confirm against the caller.

    Sub-directories and files are processed in sorted order so the output is
    reproducible across runs and file systems.

    Args:
        pretrain_dir: Directory whose sub-directories hold the ``*txt`` files.
        output_fn: Path of the CSV file to write (columns
            ``id,text,start,end``). Fields are joined with plain commas and
            are not quoted.

    Raises:
        ValueError: If an alignment file is malformed (see ``_read_alignment``).
    """
    min_interval = 0.4
    max_duration = 15
    rows = []       # (fid, text, start, end) exactly as written to the CSV
    durations = []  # per-row duration estimate used only for the statistics

    for subdir in tqdm(sorted(os.listdir(pretrain_dir))):
        txt_fns = sorted(glob.glob(os.path.join(pretrain_dir, subdir, '*txt')))
        for txt_fn in txt_fns:
            # Drop the 4-character extension to get the file id.
            fid = os.path.relpath(txt_fn, pretrain_dir)[:-4]
            raw_text, word_intervals = _read_alignment(txt_fn)
            last_word_end = word_intervals[-1][-1]

            if last_word_end < max_duration:
                # Short enough: keep the file whole with its original text.
                rows.append((fid, raw_text, 0, -1))
                durations.append(last_word_end)
                continue

            sents = _split_on_pauses(word_intervals, min_interval, fid)
            last_index = len(sents) - 1
            for i_sent, sent in enumerate(sents):
                sent_words = ' '.join([x[0] for x in sent])
                if i_sent == 0:
                    sent_start = 0
                else:
                    # Midpoint of the pause before this segment.
                    sent_start = (sent[0][1] + sents[i_sent-1][-1][2])/2
                if i_sent == last_index:
                    sent_end = -1
                    # The -1 sentinel is not a time; use the last aligned
                    # word's end so the statistics are not skewed negative.
                    effective_end = last_word_end
                else:
                    # Midpoint of the pause after this segment.
                    sent_end = (sent[-1][2] + sents[i_sent+1][0][1])/2
                    effective_end = sent_end
                rows.append((fid+'_'+str(i_sent), sent_words, sent_start, sent_end))
                durations.append(effective_end - sent_start)

    # Skip the statistics when nothing was found, to avoid dividing by zero.
    if durations:
        for threshold in (15, 20):
            num_long = sum(1 for d in durations if d > threshold)
            print(f"Percentage of >{threshold} second: {100*num_long/len(durations)}%")

    with open(output_fn, 'w') as fo:
        fo.write('id,text,start,end\n')
        for fid, sent, start, end in rows:
            fo.write(','.join([fid, sent, '%.3f' % start, '%.3f' % end])+'\n')
    return