freq_obj_list()

defined in scripts/attr_prep_tag_NP.py


def freq_obj_list(attr_all, nlp, props):
    """Derive frequent object classes and a word->lemma map from noun phrases.

    Args:
        attr_all: iterable of attribute strings (noun phrases) to annotate.
        nlp: CoreNLP-style client; ``nlp.annotate(text, properties=props)``
            must return a JSON string containing POS-tagged tokens.
        props: annotation-properties dict passed through to ``nlp.annotate``.

    Returns:
        obj_cls_lst: list of ascii-encoded lemmas occurring at least
            ``args.freq_thresh`` times (``args`` is module-level CLI args).
        w2l: dict mapping each surface word to its single lemma.

    Raises:
        AssertionError: if an annotation yields no sentences, or a word
            (outside the manual override table) maps to multiple lemmas.
    """
    num_nn_per_attr = []  # noun count per attribute string
    anet_obj_cls = []     # every noun/pronoun lemma encountered
    nn_wo_noun = []       # noun phrases that contain no nouns
    w2lemma = defaultdict(list)

    for i, v in enumerate(attr_all):
        if i % 10000 == 0:  # progress heartbeat for long runs
            print(i)
        out = json.loads(nlp.annotate(v.encode('utf-8'), properties=props))
        # Bug fix: compare the sentence COUNT. The original compared the
        # list itself to 0, which raises TypeError on Python 3.
        assert(len(out['sentences']) > 0)
        counter = 0
        for token in out['sentences'][0]['tokens']:
            # Keep nouns (NN, NNS, ...) and pronouns (PRP, PRP$).
            if ('NN' in token['pos']) or ('PRP' in token['pos']):
                lemma_w = token['lemma']
                anet_obj_cls.append(lemma_w)
                w2lemma[token['word']].append(lemma_w)
                counter += 1
        num_nn_per_attr.append(counter)
        if counter == 0:
            nn_wo_noun.append(v)

    top_nn_wo_noun = Counter(nn_wo_noun)
    print('Frequency of NPs w/o nouns:')
    print(top_nn_wo_noun.most_common(10))

    print('Frequency of number of nouns per attribute:')
    print(Counter(num_nn_per_attr))

    top_obj_cls = Counter(anet_obj_cls)
    # Message fix: 20 entries are printed, not 10.
    print('Top 20 objects:', top_obj_cls.most_common(20))

    # Keep lemmas that clear the frequency threshold from the CLI args.
    obj_cls_lst = [w.encode('ascii') for w, freq in top_obj_cls.items()
                   if freq >= args.freq_thresh]

    # Manual corrections for known lemmatizer mistakes — hoisted out of
    # the loop (the dict is loop-invariant; it was rebuilt per iteration).
    spec_w2l = {'outfits': 'outfit', 'mariachi': 'mariachi',
                'barrios': 'barrio', 'mans': 'man', 'bags': 'bag',
                'aerobics': 'aerobic', 'motobikes': 'motobike',
                'graffiti': 'graffiti', 'semi': 'semi', 'los': 'los',
                'tutus': 'tutu'}
    w2l = {}
    for w, l in w2lemma.items():
        if w in spec_w2l:  # ambiguous lemmatization; use manual override
            w2l[w] = spec_w2l[w]
            print('Ambiguous lemma for: {}'.format(w))
        else:
            # Every remaining word must map to exactly one lemma.
            assert(len(set(l)) == 1)
            w2l[w] = list(set(l))[0]
    print('Number of words derived from lemma visual words {}'.format(len(w2l)))

    return obj_cls_lst, w2l