def post_process()

in data_creation/download_reddit_qalist.py [0:0]


def post_process(reddit_dct, name=''):
    """Clean up one scraped Reddit post in place and return it.

    Two things happen:

    * When ``name`` is ``'explainlikeimfive'``, "ELI5"-style tags are removed
      from the title text and the result is stripped of surrounding
      whitespace.
    * ``reddit_dct['comments']`` is replaced by a filtered, de-duplicated and
      sorted list: comments whose body text has fewer than 8
      whitespace-separated words are dropped, only the first comment seen for
      a given ``id`` is kept, and the survivors are ordered by
      ``(score, word count, id)`` descending.

    Args:
        reddit_dct: Mapping with a ``'title'`` entry that unpacks into two
            items (the title string and a second value that is passed through
            unchanged) and a ``'comments'`` list of mappings, each providing
            ``'id'``, ``'score'`` and ``'body'`` (whose first element is the
            comment text).
        name: Subreddit name; only ``'explainlikeimfive'`` triggers the title
            clean-up.

    Returns:
        The same ``reddit_dct`` object, mutated in place.
    """
    if name == 'explainlikeimfive':
        # Raw string so the bracket escapes reach the regex engine untouched
        # (a plain literal makes them invalid *string* escapes).
        # Pattern: optional '[', optional space, 'eli' + '5' or 'f', optional
        # space, optional ']', then optionally one of the characters ] ? [ : ,
        # NOTE(review): the pattern is not anchored, so sub() removes every
        # match in the title, not only a leading one - confirm this is wanted.
        eli5_tag_re = re.compile(r'[\[]?[ ]?eli[5f][ ]?[\]]?[]?[:,]?', re.IGNORECASE)
        title, uls = reddit_dct['title']
        reddit_dct['title'] = [eli5_tag_re.sub('', title).strip(), uls]

    # Single pass with a set of already-seen ids instead of rebuilding the list
    # of earlier ids for every comment. Ids are recorded even for comments that
    # fail the length filter, so a later comment sharing an id with an earlier
    # short one is still treated as a duplicate.
    seen_ids = set()
    kept = []
    for comment in reddit_dct['comments']:
        comment_id = comment['id']
        is_duplicate = comment_id in seen_ids
        seen_ids.add(comment_id)
        if not is_duplicate and len(comment['body'][0].split()) >= 8:
            kept.append(comment)

    # Highest score first; ties broken by longer body, then by id.
    kept.sort(
        key=lambda c: (c['score'], len(c['body'][0].split()), c['id']),
        reverse=True,
    )
    reddit_dct['comments'] = kept
    return reddit_dct