def grd_eval()

in scripts/eval_grd_anet_entities.py


    def grd_eval(self, mode='all'):
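        """Compute grounding precision, recall, and F1 for the current split.

        mode='all' evaluates all object words; mode='loc' evaluates only the
        correctly-predicted object words. Returns the class-balanced scores
        followed by the per-sentence (per-segment) averages.
        """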
        if mode == 'all':
            print('Evaluating on all object words.')
        elif mode == 'loc':
            print('Evaluating only on correctly-predicted object words.')
        else:
            raise ValueError('Invalid loc mode: {}'.format(mode))

        prec, recall, prec_per_sent, rec_per_sent, vocab_in_split = self.precision_recall_util(mode=mode)
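        # prec / recall map each object word to a list of 0/1 grounding outcomes,
        # so sum(hm)/len(hm) below is that word's grounding accuracy;
        # prec_per_sent / rec_per_sent hold the same kind of hit lists keyed by segment id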

        # compute the class-balanced (macro-averaged) precision, recall, and F1 scores:
        # each object class contributes equally, regardless of its frequency
        num_vocab = len(vocab_in_split)
        print('Number of groundable objects in this split: {}'.format(num_vocab))
        print('Number of objects in prec and recall: {}, {}'.format(len(prec), len(recall)))
        prec_cls = np.sum([sum(hm)*1./len(hm) for i,hm in prec.items()])*1./num_vocab
        recall_cls = np.sum([sum(hm)*1./len(hm) for i,hm in recall.items()])*1./num_vocab
        f1_cls = 2. * prec_cls * recall_cls / (prec_cls + recall_cls)

        print('-' * 80)
        print('The overall precision_{0} / recall_{0} / F1_{0} are {1:.4f} / {2:.4f} / {3:.4f}'.format(mode, prec_cls, recall_cls, f1_cls))
        print('-' * 80)
        if self.verbose:
            print('Object frequency and grounding accuracy per class (descending by object frequency):')
            accu_per_clss = {}
            for i in vocab_in_split:
                prec_clss = sum(prec[i])*1./len(prec[i]) if i in prec else 0
                recall_clss = sum(recall[i])*1./len(recall[i]) if i in recall else 0
                # use .get() for the counts as well, so a word missing from prec or recall
                # does not raise a KeyError here
                accu_per_clss[(i, prec_clss, recall_clss)] = (len(prec.get(i, [])), len(recall.get(i, [])))
            accu_per_clss = sorted(accu_per_clss.items(), key=lambda x:x[1][1], reverse=True)
            for accu in accu_per_clss:
                print('{} ({} / {}): {:.4f} / {:.4f}'.format(accu[0][0], accu[1][0], accu[1][1], accu[0][1], accu[0][2]))

        # compute the per-sent precision, recall, and F1 scores
        num_segment_without_labels = 0
        prec_sent, rec_sent, f1_sent = [], [], []
        for seg_id, prec_list in prec_per_sent.items():

            if rec_per_sent[seg_id] == []:
                # skip the segment if no target objects
                num_segment_without_labels += 1
            else:
                current_prec = 0 if prec_list == [] else np.mean(prec_list) # avoid empty prec_list
                current_rec = np.mean(rec_per_sent[seg_id])

                # if precision and recall are both 0, set the f1 to be 0
                if current_prec == 0.0 and current_rec == 0.0:
                    current_f1_score = 0.0
                else:
                    current_f1_score = 2. * current_prec * current_rec / (current_prec + current_rec) # per-sent F1

                prec_sent.append(current_prec)
                rec_sent.append(current_rec)
                f1_sent.append(current_f1_score)

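        # total number of predicted segments across all videos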
        num_predictions = 0
        for _, pred_seg in self.pred.items():
            num_predictions += len(pred_seg)

        # average over all predicted segments, excluding those without target objects
        avg_prec = np.sum(prec_sent) / (num_predictions - num_segment_without_labels)
        avg_rec = np.sum(rec_sent) / (num_predictions - num_segment_without_labels)
        avg_f1 = np.sum(f1_sent) / (num_predictions - num_segment_without_labels)

        print('-' * 80)
        print('The overall precision_{0}_per_sent / recall_{0}_per_sent / F1_{0}_per_sent are {1:.4f} / {2:.4f} / {3:.4f}'.format(mode, avg_prec, avg_rec, avg_f1))
        print('-' * 80)

        return prec_cls, recall_cls, f1_cls, avg_prec, avg_rec, avg_f1
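
For orientation, here is a minimal, self-contained sketch of the two metric families computed above, run on hand-made toy data. The dictionaries prec, recall, prec_per_sent, and rec_per_sent below are hypothetical stand-ins for the structures returned by precision_recall_util, not outputs of the evaluator itself.

    import numpy as np

    # per-class hit lists: object word -> list of 0/1 grounding outcomes
    prec = {'dog': [1, 0, 1], 'ball': [1]}
    recall = {'dog': [1, 1], 'ball': [0, 1]}
    vocab_in_split = ['dog', 'ball']

    # class-balanced (macro) scores: average within each class first, then
    # across the vocabulary, so rare and frequent objects count equally
    prec_cls = np.mean([np.mean(prec[w]) if w in prec else 0. for w in vocab_in_split])
    recall_cls = np.mean([np.mean(recall[w]) if w in recall else 0. for w in vocab_in_split])
    f1_cls = 2. * prec_cls * recall_cls / (prec_cls + recall_cls) if (prec_cls + recall_cls) else 0.

    # per-segment hit lists: segment id -> list of 0/1 grounding outcomes
    prec_per_sent = {'seg0': [1, 0], 'seg1': []}
    rec_per_sent = {'seg0': [1], 'seg1': []}

    sent_prec, sent_rec, sent_f1 = [], [], []
    for seg_id, p_list in prec_per_sent.items():
        r_list = rec_per_sent[seg_id]
        if not r_list:  # no target objects in this segment: skip it
            continue
        p = np.mean(p_list) if p_list else 0.
        r = np.mean(r_list)
        f = 2. * p * r / (p + r) if (p + r) else 0.
        sent_prec.append(p)
        sent_rec.append(r)
        sent_f1.append(f)

    print('class-balanced P/R/F1: {:.4f} / {:.4f} / {:.4f}'.format(prec_cls, recall_cls, f1_cls))
    print('per-segment P/R/F1:    {:.4f} / {:.4f} / {:.4f}'.format(
        np.mean(sent_prec), np.mean(sent_rec), np.mean(sent_f1)))

Note that grd_eval normalizes the per-sentence averages by the number of predicted segments minus the segments without target objects; when every predicted segment has an entry in prec_per_sent, this is the same as the plain mean used in the sketch.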