def make_musan_tsv()

in avhubert/preparation/noise_manifest.py [0:0]


def make_musan_tsv(musan_root, output_dir):
    """Write train/valid/test file lists for the MUSAN noise categories.

    Reads ``{musan_root}/nframes.audio``, a text file with one
    ``<path>\\t<frame count>`` entry per line, and for each of the categories
    ``babble``, ``music`` and ``noise`` collects the files matching
    ``{musan_root}/{category}/*/*wav`` whose frame count lies in
    ``[3 * 16000, 11 * 16000)`` (presumably 3 s to 11 s of 16 kHz audio --
    the unit of the counts is not verified here).

    The selected files are then split three ways:

    * ``babble``: by the part of the file name before the first ``-``
      (``train``, ``valid`` or ``test``); files with any other prefix are
      left out of every split.
    * ``music`` / ``noise``: shuffled in place with ``np.random.shuffle``
      (global NumPy RNG, so seed it beforehand for a reproducible split),
      then the first 80% go to train, the next 10% to valid and the
      remainder to test.

    Each split is written to ``{output_dir}/{category}/{split}.tsv`` as one
    absolute path per line.

    Args:
        musan_root: Directory holding ``nframes.audio`` and the category
            sub-directories. The paths in ``nframes.audio`` must equal
            ``os.path.abspath`` of the globbed wav paths.
        output_dir: Directory to create the per-category output folders in.

    Returns:
        None.

    Raises:
        KeyError: If a globbed wav file has no entry in ``nframes.audio``.
        ValueError: If a non-blank line of ``nframes.audio`` does not have
            exactly two tab-separated fields, or a looked-up frame count is
            not an integer.
    """
    os.makedirs(output_dir, exist_ok=True)
    sample_rate = 16_000
    min_dur, max_dur = 3 * sample_rate, 11 * sample_rate
    split_names = ('train', 'valid', 'test')

    # Parse the frame-count manifest. The context manager closes the file
    # deterministically; blank lines (e.g. a trailing one) are skipped instead
    # of aborting the whole run.
    nframes_by_path = {}
    with open(f"{musan_root}/nframes.audio") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            path, nframes = line.split('\t')
            nframes_by_path[path] = nframes

    for category in ['babble', 'music', 'noise']:
        wav_fns = glob.glob(f"{musan_root}/{category}/*/*wav")
        target_fns = []
        for wav_fn in tqdm(wav_fns):
            dur = int(nframes_by_path[os.path.abspath(wav_fn)])
            if min_dur <= dur < max_dur:
                target_fns.append(wav_fn)
        print(f"{category}: {len(target_fns)}/{len(wav_fns)}")

        output_subdir = f"{output_dir}/{category}"
        os.makedirs(output_subdir, exist_ok=True)

        if category == 'babble':
            # Babble files carry their split in the file name
            # ("<split>-....wav"); anything else is deliberately ignored.
            splits = {name: [] for name in split_names}
            for wav_fn in target_fns:
                split_id = os.path.basename(wav_fn)[:-4].split('-')[0]
                if split_id in splits:
                    splits[split_id].append(wav_fn)
        else:
            # Random 80/10/rest split; the test split absorbs rounding.
            num_train = int(0.8 * len(target_fns))
            num_valid = int(0.1 * len(target_fns))
            np.random.shuffle(target_fns)
            splits = {
                'train': target_fns[:num_train],
                'valid': target_fns[num_train:num_train + num_valid],
                'test': target_fns[num_train + num_valid:],
            }

        for split in split_names:
            split_fns = [os.path.abspath(fn) for fn in splits[split]]
            print(os.path.abspath(output_subdir), split, len(split_fns))
            with open(f"{output_subdir}/{split}.tsv", 'w') as fo:
                fo.write('\n'.join(split_fns) + '\n')