in scripts/attr_prep_tag_NP.py [0:0]
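# Imports assumed by this excerpt (the full script defines them elsewhere):
# json and os are standard library; StanfordCoreNLP is assumed to come from the
# stanfordcorenlp wrapper package. define_split, extract_attr, freq_obj_list and
# prep_all are helper functions defined elsewhere in this script.
import json
import os
from stanfordcorenlp import StanfordCoreNLP
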
def main(args):
    # start a CoreNLP client for tokenization, sentence splitting and lemmatization
    nlp = StanfordCoreNLP(args.corenlp_path)
    # annotators listed in dependency order: tokenize -> ssplit -> lemma
    props = {'annotators': 'tokenize, ssplit, lemma', 'pipelineLanguage': 'en', 'outputFormat': 'json'}
    # load anet captions
    with open(args.train_cap_file) as f:
        database_cap = json.load(f)
    with open(args.val_cap_file) as f:
        database_cap.update(json.load(f))
    print('Number of videos in ActivityNet Captions (train+val): {}'.format(len(database_cap)))
    # load the raw annotation output of ActivityNet-BB (bounding boxes)
    with open(args.src_file) as f:
        database = json.load(f)['database']
    print('Number of videos in ActivityNet-BB (train+val): {}'.format(len(database)))
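    # The split file is expected to be a JSON dict keyed by 'training', 'validation'
    # and 'testing' (inferred from the keys read below); illustrative layout:
    #   {"training": ["v_xxx", ...], "validation": ["v_yyy", ...], "testing": ["v_zzz", ...]}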
    if os.path.isfile(args.split_file):
        with open(args.split_file) as f:
            all_splits = json.load(f)
        splits = [all_splits['training'], all_splits['validation'], all_splits['testing']]
    else:
        # A plain string cannot be raised in Python 3, so raise a proper exception here.
        # Remove this raise if you want to create a new split with the code below.
        raise FileNotFoundError('Cannot find the split file! Remove this raise if you want to create a new split.')
        splits = define_split(database)
        all_splits = {'training': splits[0], 'validation': splits[1], 'testing': splits[2]}
        with open(args.split_file, 'w') as f:
            json.dump(all_splits, f)
    # define object classes on train/val data only
    attr_all = extract_attr(database, splits[:2])
    # build the object-class vocabulary (and the w2l word mapping) from the extracted attributes
    obj_cls_lst, w2l = freq_obj_list(attr_all, nlp, props)
    # prepare both annotation databases: raw noun phrases and pre-processed annotations
    new_database, new_database_np = prep_all(database, database_cap, obj_cls_lst, w2l, nlp)
    # write raw annotation file
    new_database_np = {'database': new_database_np}
    with open(args.target_np_file, 'w') as f:
        json.dump(new_database_np, f)
    # write pre-processed annotation file
    new_database = {'vocab': obj_cls_lst, 'annotations': new_database}
    with open(args.target_file, 'w') as f:
        json.dump(new_database, f)
    nlp.close()
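
# Hypothetical command-line entry point, sketched only to show how main() could be
# invoked; the flag names mirror the attributes read inside main() and may differ
# from the argparse definitions in the actual script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Tag noun phrases and object classes for ActivityNet Captions')
    parser.add_argument('--corenlp_path', required=True, help='path to the Stanford CoreNLP package')
    parser.add_argument('--train_cap_file', required=True, help='ActivityNet Captions train annotation JSON')
    parser.add_argument('--val_cap_file', required=True, help='ActivityNet Captions val annotation JSON')
    parser.add_argument('--src_file', required=True, help='raw ActivityNet-BB annotation JSON')
    parser.add_argument('--split_file', required=True, help='train/val/test split JSON')
    parser.add_argument('--target_np_file', required=True, help='output: raw noun-phrase annotation file')
    parser.add_argument('--target_file', required=True, help='output: pre-processed annotation file')
    main(parser.parse_args())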