def main()

in data_creation/select_sentences_tfidf.py [0:0]


def _load_json(path):
    """Return the parsed contents of the JSON file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes, instead of being left to the garbage collector.
    """
    with open(path) as f_in:
        return json.load(f_in)


def main():
    """Build selected-passage examples for one slice of collected documents.

    Command-line options:
        -sid / --slice_id:        index of the slice file to process (default 0).
        -ns / --num_selected:     passed to ``make_example`` as the number of
                                  selected passages (default 15).
        -nc / --num_context:      passed to ``make_example`` as the number of
                                  sentences per passage (default 3).
        -sr_n / --subreddit_name: subreddit whose data is processed
                                  (default 'explainlikeimfive').

    Inputs (all relative to the current working directory):
        processed_data/collected_docs/<subreddit>/slices/slice_<id>.json
        processed_data/<subreddit>_qalist.json
        pre_computed/<subreddit>_unigram_counts.json

    Output:
        processed_data/selected_<ns>_<nc>/<subreddit>/selected_slice_<id>.json

    If the slice file does not exist the function returns without reading or
    writing anything. Progress is printed to stdout every 10 slice entries.
    """
    parser = argparse.ArgumentParser(description='Gather into train, valid and test')
    parser.add_argument('-sid', '--slice_id', default=0, type=int, metavar='N',
                        help='slice to process')
    parser.add_argument('-ns', '--num_selected', default=15, type=int, metavar='N',
                        help='number of selected passages')
    parser.add_argument('-nc', '--num_context', default=3, type=int, metavar='N',
                        help='number of sentences per passage')
    parser.add_argument('-sr_n', '--subreddit_name', default='explainlikeimfive', type=str,
                        help='subreddit name')
    args = parser.parse_args()
    reddit = args.subreddit_name
    n_slice = args.slice_id
    n_sents = args.num_selected
    n_context = args.num_context

    slice_path = "processed_data/collected_docs/%s/slices/slice_%d.json" % (reddit, n_slice)
    if not isfile(slice_path):
        # Nothing to do for a slice that was never collected; stay silent so
        # batch launchers can over-enumerate slice ids safely.
        return

    print("loading data", reddit, n_slice)
    # dict() accepts either a mapping or a list of (key, value) pairs, so the
    # on-disk layout of these files may be either -- TODO confirm with the
    # scripts that produce them.
    qa_data = dict(_load_json("processed_data/%s_qalist.json" % (reddit,)))
    docs_slice = _load_json(slice_path)
    word_counts = _load_json("pre_computed/%s_unigram_counts.json" % (reddit,))
    qt_freqs = dict(word_counts['question_title'])
    d_freqs = dict(word_counts['document'])
    # Each entry pairs a frequency table with the sum of its counts.
    word_freqs = {'title': (qt_freqs, sum(qt_freqs.values())),
                  'doc': (d_freqs, sum(d_freqs.values()))}
    print("loaded data")

    processed = []
    st_time = time()
    for i, (k, docs_list) in enumerate(docs_slice):
        # Entries whose key has no matching QA record are skipped.
        if k in qa_data:
            processed.append(
                make_example(qa_data[k], docs_list, word_freqs, n_sents, n_context))
        if i % 10 == 0:
            print(i, len(processed), time() - st_time)

    out_dir = 'processed_data/selected_%d_%d/%s' % (n_sents, n_context, reddit)
    # exist_ok avoids a FileExistsError when several slice jobs start at once
    # and race to create the same output directory.
    os.makedirs(out_dir, exist_ok=True)
    with open('%s/selected_slice_%d.json' % (out_dir, n_slice), 'w') as f_out:
        json.dump(processed, f_out)