def main()

in data_creation/download_reddit_qalist.py
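
Downloads the monthly Reddit dump files for the requested date range, filters
them to the listed subreddits, attaches comments to their parent submissions,
and writes one JSON file of QA pairs per subreddit under processed_data/.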


import argparse
import json
import subprocess

from os.path import isfile
from time import sleep, time

# gather_dump_urls, download_and_process, post_process, and REDDIT_URL are
# helpers defined elsewhere in data_creation/download_reddit_qalist.py.


def main():
    parser  = argparse.ArgumentParser(description='Subreddit QA pair downloader')
    parser.add_argument('-sy', '--start_year', default=2011, type=int, metavar='N',
                        help='starting year')
    parser.add_argument('-ey', '--end_year', default=2018, type=int, metavar='N',
                        help='end year')
    parser.add_argument('-sm', '--start_month', default=7, type=int, metavar='N',
                        help='starting month')
    parser.add_argument('-em', '--end_month', default=7, type=int, metavar='N',
                        help='end month')
    parser.add_argument('-sr_l', '--subreddit_list', default='["explainlikeimfive"]', type=str,
                        help='JSON list of subreddit names')
    parser.add_argument('-Q', '--questions_only', action='store_true',
                        help='only download submissions (questions)')
    parser.add_argument('-A', '--answers_only', action='store_true',
                        help='only download comments (answers)')
    args        = parser.parse_args()
    ### collect submissions and comments monthly URLs
    date_to_url_submissions = gather_dump_urls(REDDIT_URL,
                                               "submissions")
    date_to_url_comments    = gather_dump_urls(REDDIT_URL,
                                               "comments")
    date_to_urls    = {}
    for k, v in date_to_url_submissions.items():
        date_to_urls[k]    = (v, date_to_url_comments.get(k, ''))
    ### download, filter, process, remove
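    # scratch directory for the downloaded monthly dump files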
    subprocess.run(['mkdir', 'reddit_tmp'], stdout=subprocess.PIPE)
    st_time    = time()
    subreddit_names = json.loads(args.subreddit_list)
    output_files    = dict([(name, "processed_data/%s_qalist.json" % (name,))
                            for name in subreddit_names])
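    # per-subreddit map from submission id to its merged QA record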
    qa_dict         = dict([(name, {}) for name in subreddit_names])
    for name, fname in output_files.items():
        if isfile(fname):
            print("loading already processed documents from", fname)
            f = open(fname)
            qa_dict[name] = dict(json.load(f))
            f.close()
            print("loaded already processed documents")
    # download each monthly slice, merge it into qa_dict, and save after each month
    n_months    = 0
    for year in range(args.start_year, args.end_year + 1):
        st_month    = args.start_month if year == args.start_year else 1
        end_month   = args.end_month if year == args.end_year else 12
        months      = range(st_month, end_month + 1)
        for month in months:
            merged_comments = 0
            submissions_url, comments_url   = date_to_urls[(year, month)]
            if not args.answers_only:
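                # fetch and filter this month's submissions dump; on a
                # transient failure, wait a minute and retry once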
                try:
                    processed_submissions   = download_and_process(submissions_url,
                                                                   'submissions',
                                                                   subreddit_names,
                                                                   st_time)
                except FileNotFoundError:
                    sleep(60)
                    print("retrying %s once" % (submissions_url))
                    processed_submissions   = download_and_process(submissions_url,
                                                                   'submissions',
                                                                   subreddit_names,
                                                                   st_time)
                for name in subreddit_names:
                    for dct in processed_submissions[name]:
                        qa_dict[name][dct['id']]  = dct
            if not args.questions_only:
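                # fetch and filter this month's comments dump, with the
                # same one-shot retry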
                try:
                    processed_comments      = download_and_process(comments_url,
                                                                   'comments',
                                                                   subreddit_names,
                                                                   st_time)
                except FileNotFoundError:
                    sleep(60)
                    print("retrying %s once" % (comments_url))
                    processed_comments      = download_and_process(comments_url,
                                                                   'comments',
                                                                   subreddit_names,
                                                                   st_time)
                # merge submissions and comments
                for name in subreddit_names:
                    merged_comments = 0
                    for dct in processed_comments[name]:
                        did = dct['parent_id'].split('_')[-1]
                        # did = dct['parent_id'][3:]
                        if did in qa_dict[name]:
                            merged_comments += 1
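                            # attach the comment and keep the submission's
                            # answer list sorted by score, highest first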
                            comments_list = qa_dict[name][did].get('comments', []) + [dct]
                            qa_dict[name][did]['comments'] = sorted(comments_list,
                                                                    key=lambda x: x['score'],
                                                                    reverse=True)
                    print("----- added to global dictionary", name, year, month,
                                                              time() - st_time,
                                                              merged_comments,
                                                              len(qa_dict[name]))
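            # checkpoint: rewrite each output file with everything merged so far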
            for name, out_file_name in output_files.items():
                fo = open(out_file_name, "w")
                json.dump([(eli_k, eli_dct) for eli_k, eli_dct in qa_dict[name].items()], fo)
                fo.close()
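    # final pass: post-process answers and keep only records that still have
    # comments and whose URL contains the subreddit name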
    if not args.questions_only:
        for name, out_file_name in output_files.items():
            print('post-processing', name)
            qa_dct_list = [(k, post_process(rdct, name)) for k, rdct in qa_dict[name].items() if 'comments' in rdct]
            qa_dct_list = [x for x in qa_dct_list if len(x[1]['comments']) > 0 and name in x[1]['url']]
            fo = open(out_file_name, "w")
            json.dump(qa_dct_list, fo)
            fo.close()
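
A usage sketch, inferred from the parser above (not part of the source):
output lands in processed_data/<subreddit>_qalist.json, and the
processed_data/ directory is assumed to exist beforehand. Downloading
questions first and answers second mirrors the merge logic, since
comments are only attached to submissions already present in qa_dict:

    python data_creation/download_reddit_qalist.py -Q -sr_l '["explainlikeimfive"]'
    python data_creation/download_reddit_qalist.py -A -sr_l '["explainlikeimfive"]'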