in data_creation/select_sentences_tfidf.py [0:0]
import argparse
import json
import os
from os.path import isdir, isfile
from time import time


def main():
    parser = argparse.ArgumentParser(description='Select supporting passages with TF-IDF')
    parser.add_argument('-sid', '--slice_id', default=0, type=int, metavar='N',
                        help='slice to process')
    parser.add_argument('-ns', '--num_selected', default=15, type=int, metavar='N',
                        help='number of selected passages')
    parser.add_argument('-nc', '--num_context', default=3, type=int, metavar='N',
                        help='number of sentences per passage')
    parser.add_argument('-sr_n', '--subreddit_name', default='explainlikeimfive', type=str,
                        help='subreddit name')
    args = parser.parse_args()
    reddit = args.subreddit_name
    n_slice = args.slice_id
    n_sents = args.num_selected
    n_context = args.num_context
    # Only run if this slice of collected documents exists on disk.
    if isfile("processed_data/collected_docs/%s/slices/slice_%d.json" % (reddit, n_slice)):
        print("loading data", reddit, n_slice)
        # Question-answer pairs keyed by question id, and the document slice to score.
        qa_data = dict(json.load(open("processed_data/%s_qalist.json" % (reddit,))))
        docs_slice = json.load(open("processed_data/collected_docs/%s/slices/slice_%d.json" % (reddit, n_slice)))
        # Pre-computed unigram counts over question titles and documents; each is
        # kept as a (frequency dict, total count) pair for TF-IDF scoring.
        word_counts = json.load(open("pre_computed/%s_unigram_counts.json" % (reddit,)))
        qt_freqs = dict(word_counts['question_title'])
        qt_sum = sum(qt_freqs.values())
        d_freqs = dict(word_counts['document'])
        d_sum = sum(d_freqs.values())
        word_freqs = {'title': (qt_freqs, qt_sum),
                      'doc': (d_freqs, d_sum)}
        print("loaded data")
        processed = []
        st_time = time()
        # Select supporting passages for every question that has collected docs;
        # make_example is assumed to be defined earlier in this file.
        for i, (k, docs_list) in enumerate(docs_slice):
            if k in qa_data:
                processed.append(make_example(qa_data[k], docs_list, word_freqs, n_sents, n_context))
            if i % 10 == 0:
                print(i, len(processed), time() - st_time)
        # Write the selected passages for this slice.
        out_dir = 'processed_data/selected_%d_%d/%s' % (n_sents, n_context, reddit)
        if not isdir(out_dir):
            os.makedirs(out_dir)
        with open('%s/selected_slice_%d.json' % (out_dir, n_slice), 'w') as f:
            json.dump(processed, f)
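
# Example invocation, using the flags defined in main() above:
#   python data_creation/select_sentences_tfidf.py -sr_n explainlikeimfive -sid 0 -ns 15 -nc 3

# make_example is not shown in this excerpt. As a rough sketch only: given the
# (frequency dict, total count) pairs stored in word_freqs, a TF-IDF-style
# selection score could weight the overlap between question and candidate
# sentence by inverse background frequency. tfidf_score and its arguments are
# hypothetical names for illustration, not this file's actual API.
import math

def tfidf_score(question_words, sentence_words, word_freqs):
    d_freqs, d_sum = word_freqs['doc']
    score = 0.0
    for w in set(question_words) & set(sentence_words):
        # +1 smoothing keeps the ratio finite for words unseen in the counts.
        score += math.log(d_sum / (d_freqs.get(w, 0) + 1))
    return score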