# in scripts/attr_prep_tag_NP.py [0:0]
def freq_obj_list(attr_all, nlp, props):
    """Collect frequent object classes from attribute noun phrases.

    Each attribute phrase is run through the CoreNLP annotator; lemmas of
    noun (NN*) and pronoun (PRP*) tokens in the first sentence are collected,
    and those whose corpus frequency is at least the module-level
    ``args.freq_thresh`` are kept. Also builds a word -> lemma map, with a
    small hand-curated override table for known lemmatizer mistakes.

    Args:
        attr_all: iterable of attribute noun-phrase strings.
        nlp: CoreNLP client exposing ``annotate(text, properties=...)``
            returning a JSON string (assumes a running CoreNLP server).
        props: annotation properties dict forwarded to ``nlp.annotate``.

    Returns:
        (obj_cls_lst, w2l): list of frequent object-class lemmas
        (ascii-encoded bytes) and the word-to-lemma dict.
    """
    num_nn_per_attr = []
    anet_obj_cls = []
    nn_wo_noun = []  # noun phrases that contain no nouns
    w2lemma = defaultdict(list)
    for i, v in enumerate(attr_all):
        if i % 10000 == 0:
            print(i)  # progress indicator for long runs
        out = json.loads(nlp.annotate(v.encode('utf-8'), properties=props))
        # BUG FIX: original asserted `out['sentences'] > 0`, comparing the
        # sentence *list* to an int (TypeError on Py3, vacuously true on Py2).
        # The intended check is that at least one sentence came back.
        assert len(out['sentences']) > 0
        counter = 0
        # only the first sentence of each phrase is inspected
        for token in out['sentences'][0]['tokens']:
            # keep nouns (NN, NNS, NNP, ...) and pronouns (PRP, PRP$)
            if ('NN' in token['pos']) or ('PRP' in token['pos']):
                lemma_w = token['lemma']
                anet_obj_cls.append(lemma_w)
                w2lemma[token['word']].append(lemma_w)
                counter += 1
        num_nn_per_attr.append(counter)
        if counter == 0:
            nn_wo_noun.append(v)
    top_nn_wo_noun = Counter(nn_wo_noun)
    print('Frequency of NPs w/o nouns:')
    print(top_nn_wo_noun.most_common(10))
    print('Frequency of number of nouns per attribute:')
    print(Counter(num_nn_per_attr))
    top_obj_cls = Counter(anet_obj_cls)
    # message fixed to match the actual count printed (was "Top 10")
    print('Top 20 objects:', top_obj_cls.most_common(20))
    obj_cls_lst = []
    for w, freq in top_obj_cls.items():
        if freq >= args.freq_thresh:
            obj_cls_lst.append(w.encode('ascii'))
    # Manual corrections for known machine-lemmatization mistakes; hoisted
    # out of the loop so the table is built once, not once per word.
    spec_w2l = {'outfits': 'outfit', 'mariachi': 'mariachi',
                'barrios': 'barrio', 'mans': 'man', 'bags': 'bag',
                'aerobics': 'aerobic', 'motobikes': 'motobike',
                'graffiti': 'graffiti', 'semi': 'semi', 'los': 'los',
                'tutus': 'tutu'}
    w2l = {}
    for w, l in w2lemma.items():
        if w in spec_w2l:  # one special case...
            w2l[w] = spec_w2l[w]
            print('Ambiguous lemma for: {}'.format(w))
        else:
            # outside the override table, every occurrence of a word is
            # expected to lemmatize identically
            assert len(set(l)) == 1
            w2l[w] = list(set(l))[0]
    print('Number of words derived from lemma visual words {}'.format(len(w2l)))
    return obj_cls_lst, w2l