def main()

in scripts/attr_prep_tag_NP.py [0:0]


def main(args):
    nlp = StanfordCoreNLP(args.corenlp_path)
    props = {'annotators': 'ssplit, tokenize, lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}

    # load anet captions
    with open(args.train_cap_file) as f:
        database_cap = json.load(f)
    with open(args.val_cap_file) as f:
        database_cap.update(json.load(f))
    print('Number of videos in ActivityNet Captions (train+val): {}'.format(len(database_cap)))

    # load raw annotation output anet bb
    with open(args.src_file) as f:
        database = json.load(f)['database']
    print('Number of videos in ActivityNet-BB (train+val): {}'.format(len(database)))

    if os.path.isfile(args.split_file):
        with open(args.split_file) as f:
            all_splits = json.load(f)
            splits = [all_splits['training'], all_splits['validation'], all_splits['testing']]
    else:
        # The block below regenerates the split with define_split(); remove the raise to enable it.
        raise FileNotFoundError('[WARNING] Cannot find the split file! Remove the raise above if you want to create a new split.')
        splits = define_split(database)
        all_splits = {'training': splits[0], 'validation': splits[1], 'testing': splits[2]}
        with open(args.split_file, 'w') as f:
            json.dump(all_splits, f)

    attr_all = extract_attr(database, splits[:2]) # define object classes on train/val data

    # build the object class list and the word mapping (w2l) from the most frequent attributes
    obj_cls_lst, w2l = freq_obj_list(attr_all, nlp, props)

    # tag noun phrases in the captions and align the object annotations with the class vocabulary
    new_database, new_database_np = prep_all(database, database_cap, obj_cls_lst, w2l, nlp)

    # write raw annotation file
    new_database_np = {'database': new_database_np}
    with open(args.target_np_file, 'w') as f:
        json.dump(new_database_np, f)

    # write pre-processed annotation file
    new_database = {'vocab': obj_cls_lst, 'annotations': new_database}
    with open(args.target_file, 'w') as f:
        json.dump(new_database, f)

    nlp.close()
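
For reference, a minimal sketch of the command-line driver that would supply `args` to main(). The flag names below are assumptions inferred from the attributes accessed on `args`; the actual script may use different names or defaults. main() also assumes `import os, json`, a CoreNLP Python wrapper providing `StanfordCoreNLP` (e.g., the stanfordcorenlp package), and the helper functions define_split, extract_attr, freq_obj_list, and prep_all defined elsewhere in the file.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Tag noun phrases and build the object vocabulary for '
                    'ActivityNet Captions / ActivityNet-BB annotations.')
    # All flag names are hypothetical, derived from the args.* attributes used in main().
    parser.add_argument('--corenlp_path', type=str, required=True,
                        help='root directory of the Stanford CoreNLP installation')
    parser.add_argument('--train_cap_file', type=str, required=True,
                        help='ActivityNet Captions training annotation JSON')
    parser.add_argument('--val_cap_file', type=str, required=True,
                        help='ActivityNet Captions validation annotation JSON')
    parser.add_argument('--src_file', type=str, required=True,
                        help='raw ActivityNet-BB annotation JSON (with a top-level "database" key)')
    parser.add_argument('--split_file', type=str, required=True,
                        help='JSON file with training/validation/testing video splits')
    parser.add_argument('--target_np_file', type=str, required=True,
                        help='output path for the raw noun-phrase annotation file')
    parser.add_argument('--target_file', type=str, required=True,
                        help='output path for the pre-processed annotation file (with vocab)')
    main(parser.parse_args())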