def main()

in clutrr/template_mturk.py [0:0]


def main():
    """Build placeholder-template lookup files from an MTurk CSV.

    Command-line options:
        --mfile    path of the CSV to read (default ``amt_mturk.csv``)
        --outfile  prefix for the JSON files written (default ``amt_placeholders``)
        --split    fraction of each template list that goes to the train file
                   (default ``0.8``)

    Steps:
        1. Load the CSV and drop rows whose ``review`` column equals
           ``'rejected'``.
        2. Pass the frame through ``extract_placeholder``, which returns a
           frame plus a collection of row indices to skip.
        3. Group the ``template`` values by ``f_comb`` and then by
           ``template_gender``.
        4. Cut every template list at ``int(len(list) * split)``: the head goes
           to the train mapping, the tail to the test mapping.
        5. Write ``<outfile>.train.json``, ``<outfile>.test.json`` and
           ``<outfile>.json`` (the unsplit mapping).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mfile', type=str, default='amt_mturk.csv', help='MTurk generated file')
    parser.add_argument('--outfile', type=str, default='amt_placeholders', help='placeholders json file')
    parser.add_argument('--split', type=float, default=0.8, help='Train/Test split.')
    args = parser.parse_args()

    df = pd.read_csv(args.mfile)
    # do not use the rejected samples
    df = df[df.review != 'rejected']
    print("Number of accepted rows : {}".format(len(df)))
    df, skipped = extract_placeholder(df)

    # create a nested mapping for easy lookup:
    # f_comb -> template_gender -> [template, ...]
    placeholders = {}
    for i, row in df.iterrows():
        if i in skipped:
            continue
        by_gender = placeholders.setdefault(row['f_comb'], {})
        by_gender.setdefault(row['template_gender'], []).append(row['template'])

    # training and testing split of the placeholders; the cut is positional,
    # so the order in which rows were read decides which side a template lands on
    train_p = {}
    test_p = {}
    for key, gv in placeholders.items():
        train_p[key] = {}
        test_p[key] = {}
        for gk, ps in gv.items():
            split = int(len(ps) * args.split)
            train_p[key][gk] = ps[:split]
            test_p[key][gk] = ps[split:]

    # save; the context manager guarantees each file is flushed and closed
    outputs = (
        ('.train.json', train_p),
        ('.test.json', test_p),
        ('.json', placeholders),
    )
    for suffix, payload in outputs:
        with open(args.outfile + suffix, 'w') as fh:
            json.dump(payload, fh)
    print("Done.")