def main()

in data_creation/finalize_qda.py [0:0]


def main():
    parser  = argparse.ArgumentParser(description='Gather into train, valid and test')
    parser.add_argument('-ns', '--num_selected', default=15, type=int, metavar='N',
                        help='number of selected passages')
    parser.add_argument('-nc', '--num_context', default=1, type=int, metavar='N',
                        help='number of sentences per passage')
    parser.add_argument('-sr_l', '--subreddit_list', default='["explainlikeimfive"]', type=str,
                        help='subreddit name')
    args    = parser.parse_args()
    n_sel   = args.num_selected
    n_cont  = args.num_context
    for name in json.loads(args.subreddit_list):
        data_split  = json.load(open('pre_computed/%s_split_keys.json' % (name,)))
        qda_list    = []
        for f_name in glob('processed_data/selected_%d_%d/%s/selected_slice_*.json' % (n_sel, n_cont, name)):
            qda_list    += json.load(open(f_name))
        qda_dict    = dict([(dct['id'], dct) for dct in qda_list])
        for spl in ['train', 'valid', 'test']:
            split_list  = [qda_dict[k] for k in data_split[spl] if k in qda_dict]
            json.dump(split_list, open('processed_data/selected_%d_%d/%s_%s.json' % (n_sel, n_cont, name, spl), 'w'))