in data_creation/download_reddit_qalist.py [0:0]
# Matches an "ELI5" tag such as "ELI5:", "[ELI5]" or "eli5,": an optional "[",
# an optional space, "eli5" or "elif", an optional space, an optional "]", and
# at most one further character out of "]", "?", "[", ":" and ",".
# Matching is case-insensitive.
# NOTE(review): the pattern is not anchored, so substitution removes every
# match in the title, not only a leading one -- confirm this is intended.
_ELI5_TAG_RE = re.compile(r'[\[]?[ ]?eli[5f][ ]?[\]]?[]?[:,]?', re.IGNORECASE)


def post_process(reddit_dct, name=''):
    """Clean the title and comments of a Reddit thread dict in place.

    Args:
        reddit_dct: Thread dict. ``reddit_dct['title']`` is unpacked as a
            two-element sequence whose first element is the title text; the
            second element is written back unchanged.
            ``reddit_dct['comments']`` is a list of dicts providing ``'id'``,
            ``'score'`` and ``'body'``, where ``body[0]`` is the comment text.
        name: Subreddit name. ELI5 tags are removed from the title only when
            this equals ``'explainlikeimfive'``.

    Returns:
        The same ``reddit_dct`` object. Its ``'comments'`` entry is replaced
        by a new list holding the first occurrence of each comment id whose
        text has at least 8 whitespace-separated words, sorted in descending
        order of ``(score, word count, id)``.
    """
    if name == 'explainlikeimfive':
        title, uls = reddit_dct['title']
        reddit_dct['title'] = [_ELI5_TAG_RE.sub('', title).strip(), uls]

    # Dedupe and filter comments in one pass. Every id is recorded, including
    # the ids of comments rejected for being too short, so a later comment is
    # dropped whenever *any* earlier comment carried the same id.
    seen_ids = set()
    kept = []
    for comment in reddit_dct['comments']:
        comment_id = comment['id']
        is_repeat = comment_id in seen_ids
        seen_ids.add(comment_id)
        if not is_repeat and len(comment['body'][0].split()) >= 8:
            kept.append(comment)

    # Highest score first; ties broken by longer text, then by larger id.
    kept.sort(
        key=lambda c: (c['score'], len(c['body'][0].split()), c['id']),
        reverse=True,
    )
    reddit_dct['comments'] = kept
    return reddit_dct