in clutrr/template_mturk.py [0:0]
def main():
    """Build placeholder-template lookup files from an MTurk results CSV.

    Command-line arguments:
        --mfile:   path of the CSV file to read (default ``amt_mturk.csv``).
        --outfile: prefix of the JSON files to write
                   (default ``amt_placeholders``).
        --split:   fraction of each template list that goes to the train
                   file; the remainder goes to the test file (default 0.8).

    Rows whose ``review`` column equals ``'rejected'`` are dropped, the rest
    are passed through ``extract_placeholder``, and the resulting
    ``template`` values are grouped as
    ``{f_comb: {template_gender: [template, ...]}}``.

    Three files are written: ``<outfile>.train.json``,
    ``<outfile>.test.json`` and ``<outfile>.json`` (the unsplit mapping).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mfile', type=str, default='amt_mturk.csv', help='MTurk generated file')
    parser.add_argument('--outfile', type=str, default='amt_placeholders', help='placeholders json file')
    parser.add_argument('--split', type=float, default=0.8, help='Train/Test split.')
    args = parser.parse_args()

    df = pd.read_csv(args.mfile)
    # do not use the rejected samples
    df = df[df.review != 'rejected']
    print("Number of accepted rows : {}".format(len(df)))

    # NOTE(review): `skipped` is only used for membership tests against the
    # index labels yielded by iterrows(); presumably it holds the labels of
    # rows that extract_placeholder could not process -- confirm there.
    df, skipped = extract_placeholder(df)

    # create a json file for easy lookup:
    # {f_comb: {template_gender: [template, ...]}}
    placeholders = {}
    for i, row in df.iterrows():
        if i in skipped:
            continue
        by_gender = placeholders.setdefault(row['f_comb'], {})
        by_gender.setdefault(row['template_gender'], []).append(row['template'])

    # training and testing split of the placeholders: the first
    # int(len * split) templates of every list go to train, the rest to test
    train_p = {}
    test_p = {}
    for key, by_gender in placeholders.items():
        train_p[key] = {}
        test_p[key] = {}
        for gender, templates in by_gender.items():
            split = int(len(templates) * args.split)
            train_p[key][gender] = templates[:split]
            test_p[key][gender] = templates[split:]

    # save; `with` guarantees every file is flushed and closed, even if
    # serialisation of one of the payloads fails
    outputs = (
        ('.train.json', train_p),
        ('.test.json', test_p),
        ('.json', placeholders),
    )
    for suffix, payload in outputs:
        with open(args.outfile + suffix, 'w') as fp:
            json.dump(payload, fp)
    print("Done.")