in data_creation/download_reddit_qalist.py [0:0]
def main():
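    """Download monthly Reddit dump files and assemble question-answer pairs for the requested subreddits."""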
parser = argparse.ArgumentParser(description='Subreddit QA pair downloader')
parser.add_argument('-sy', '--start_year', default=2011, type=int, metavar='N',
help='starting year')
parser.add_argument('-ey', '--end_year', default=2018, type=int, metavar='N',
help='end year')
    parser.add_argument('-sm', '--start_month', default=7, type=int, metavar='N',
                        help='starting month')
    parser.add_argument('-em', '--end_month', default=7, type=int, metavar='N',
                        help='end month')
    parser.add_argument('-sr_l', '--subreddit_list', default='["explainlikeimfive"]', type=str,
                        help='JSON list of subreddit names')
    parser.add_argument('-Q', '--questions_only', action='store_true',
                        help='only download submissions (questions)')
    parser.add_argument('-A', '--answers_only', action='store_true',
                        help='only download comments (answers)')
args = parser.parse_args()
### collect submissions and comments monthly URLs
date_to_url_submissions = gather_dump_urls(REDDIT_URL,
"submissions")
date_to_url_comments = gather_dump_urls(REDDIT_URL,
"comments")
date_to_urls = {}
for k, v in date_to_url_submissions.items():
date_to_urls[k] = (v, date_to_url_comments.get(k, ''))
### download, filter, process, remove
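    # temporary directory for the downloaded dump files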
subprocess.run(['mkdir', 'reddit_tmp'], stdout=subprocess.PIPE)
st_time = time()
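    # the subreddit list is passed on the command line as a JSON-encoded list of names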
subreddit_names = json.loads(args.subreddit_list)
    output_files = {name: "processed_data/%s_qalist.json" % (name,)
                    for name in subreddit_names}
    qa_dict = {name: {} for name in subreddit_names}
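    # reload any previously processed QA pairs so an interrupted run can resume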
for name, fname in output_files.items():
if isfile(fname):
print("loading already processed documents from", fname)
            with open(fname) as f:
                qa_dict[name] = dict(json.load(f))
            print("loaded already processed documents")
    # process the dumps one month at a time, saving progress after each month
n_months = 0
for year in range(args.start_year, args.end_year + 1):
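        # the start/end month arguments only constrain the first and last year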
st_month = args.start_month if year == args.start_year else 1
end_month = args.end_month if year == args.end_year else 12
months = range(st_month, end_month + 1)
for month in months:
merged_comments = 0
submissions_url, comments_url = date_to_urls[(year, month)]
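            # submissions provide the questions, comments provide the answers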
if not args.answers_only:
try:
processed_submissions = download_and_process(submissions_url,
'submissions',
subreddit_names,
st_time)
                except FileNotFoundError:
                    sleep(60)
                    print("retrying %s once" % (submissions_url))
processed_submissions = download_and_process(submissions_url,
'submissions',
subreddit_names,
st_time)
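                # index each submission by its Reddit id so comments can later be attached to it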
for name in subreddit_names:
for dct in processed_submissions[name]:
qa_dict[name][dct['id']] = dct
if not args.questions_only:
try:
processed_comments = download_and_process(comments_url,
'comments',
subreddit_names,
st_time)
                except FileNotFoundError:
                    sleep(60)
                    print("retrying %s once" % (comments_url))
processed_comments = download_and_process(comments_url,
'comments',
subreddit_names,
st_time)
# merge submissions and comments
for name in subreddit_names:
merged_comments = 0
for dct in processed_comments[name]:
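                        # parent_id looks like "t3_<submission id>"; keep only the id part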
did = dct['parent_id'].split('_')[-1]
# did = dct['parent_id'][3:]
if did in qa_dict[name]:
merged_comments += 1
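                            # keep each question's answer list sorted by comment score, highest first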
comments_list = qa_dict[name][did].get('comments', []) + [dct]
                            qa_dict[name][did]['comments'] = sorted(
                                comments_list, key=lambda x: x['score'], reverse=True)
print("----- added to global dictionary", name, year, month,
time() - st_time,
merged_comments,
len(qa_dict[name]))
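            # write the QA pairs gathered so far to disk after each month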
for name, out_file_name in output_files.items():
fo = open(out_file_name, "w")
json.dump([(eli_k, eli_dct) for eli_k, eli_dct in qa_dict[name].items()], fo)
fo.close()
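    # final pass: post-process answers and keep only questions that still have
    # comments and whose URL matches the subreddit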
if not args.questions_only:
for name, out_file_name in output_files.items():
print('post-processing', name)
            qa_dct_list = [(k, post_process(rdct, name))
                           for k, rdct in qa_dict[name].items() if 'comments' in rdct]
            qa_dct_list = [x for x in qa_dct_list
                           if len(x[1]['comments']) > 0 and name in x[1]['url']]
            with open(out_file_name, "w") as fo:
                json.dump(qa_dct_list, fo)