in scripts/eval_grd_anet_entities.py
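# NOTE (assumption): this method belongs to the evaluator class defined in this
# script and relies on a module-level `import numpy as np`.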
def grd_eval(self, mode='all'):
    if mode == 'all':
        print('Evaluating on all object words.')
    elif mode == 'loc':
        print('Evaluating only on correctly-predicted object words.')
    else:
        raise ValueError("Invalid mode: expected 'all' or 'loc', got {!r}".format(mode))

    prec, recall, prec_per_sent, rec_per_sent, vocab_in_split = self.precision_recall_util(mode=mode)
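    # Assumed structure, based on how the results are consumed below:
    #   prec / recall map each object word to a list of per-instance 0/1
    #   grounding hits; prec_per_sent / rec_per_sent map each segment id to
    #   the analogous per-sentence hit lists.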
    # compute the per-class precision, recall, and F1 scores
    num_vocab = len(vocab_in_split)
    print('Number of groundable objects in this split: {}'.format(num_vocab))
    print('Number of objects in prec and recall: {}, {}'.format(len(prec), len(recall)))
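    # Macro-average over the full vocabulary: each class contributes its mean
    # hit rate, and classes absent from prec/recall implicitly count as zero
    # because the sum is divided by num_vocab rather than len(prec).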
    prec_cls = np.sum([sum(hm) / len(hm) for hm in prec.values()]) / num_vocab
    recall_cls = np.sum([sum(hm) / len(hm) for hm in recall.values()]) / num_vocab
    # guard against a zero denominator when both averages are zero
    f1_cls = (2. * prec_cls * recall_cls / (prec_cls + recall_cls)
              if (prec_cls + recall_cls) > 0 else 0.)
    print('-' * 80)
    print('The overall precision_{0} / recall_{0} / F1_{0} are {1:.4f} / {2:.4f} / {3:.4f}'.format(
        mode, prec_cls, recall_cls, f1_cls))
    print('-' * 80)
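    # Optional per-class breakdown, sorted by how often each object appears in
    # the ground truth (the length of its recall hit list).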
    if self.verbose:
        print('Object frequency and grounding accuracy per class (descending by object frequency):')
        accu_per_clss = {}
        for i in vocab_in_split:
            prec_clss = sum(prec[i]) / len(prec[i]) if i in prec else 0.
            recall_clss = sum(recall[i]) / len(recall[i]) if i in recall else 0.
            # guard the counts as well: a class can be missing from prec or recall
            num_prec = len(prec[i]) if i in prec else 0
            num_recall = len(recall[i]) if i in recall else 0
            accu_per_clss[(i, prec_clss, recall_clss)] = (num_prec, num_recall)
        accu_per_clss = sorted(accu_per_clss.items(), key=lambda x: x[1][1], reverse=True)
        for accu in accu_per_clss:
            print('{} ({} / {}): {:.4f} / {:.4f}'.format(
                accu[0][0], accu[1][0], accu[1][1], accu[0][1], accu[0][2]))
    # compute the per-sent precision, recall, and F1 scores
    num_segment_without_labels = 0
    prec_sent, rec_sent, f1_sent = [], [], []  # renamed to avoid shadowing the per-class dicts
    for seg_id, prec_list in prec_per_sent.items():
        if not rec_per_sent[seg_id]:
            # skip the segment if it has no target objects
            num_segment_without_labels += 1
        else:
            current_prec = 0. if not prec_list else np.mean(prec_list)  # avoid an empty prec_list
            current_rec = np.mean(rec_per_sent[seg_id])
            # if precision and recall are both 0, set F1 to 0 as well
            if current_prec == 0.0 and current_rec == 0.0:
                current_f1_score = 0.0
            else:
                current_f1_score = 2. * current_prec * current_rec / (current_prec + current_rec)  # per-sent F1
            prec_sent.append(current_prec)
            rec_sent.append(current_rec)
            f1_sent.append(current_f1_score)
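    # Normalize by the number of predicted segments that carry labels. self.pred
    # is assumed to map each video id to its list of predicted segments, so
    # summing the list lengths counts every prediction in the split.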
    num_predictions = sum(len(pred_seg) for pred_seg in self.pred.values())

    # divide the summed scores by the number of labeled predictions
    avg_prec = np.sum(prec_sent) / (num_predictions - num_segment_without_labels)
    avg_rec = np.sum(rec_sent) / (num_predictions - num_segment_without_labels)
    avg_f1 = np.sum(f1_sent) / (num_predictions - num_segment_without_labels)
    print('-' * 80)
    print('The overall precision_{0}_per_sent / recall_{0}_per_sent / F1_{0}_per_sent are {1:.4f} / {2:.4f} / {3:.4f}'.format(
        mode, avg_prec, avg_rec, avg_f1))
    print('-' * 80)

    return prec_cls, recall_cls, f1_cls, avg_prec, avg_rec, avg_f1
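
# Hypothetical usage sketch; the evaluator class name and constructor
# arguments below are assumptions, not confirmed by this excerpt:
#   evaluator = ANetGrdEval(...)  # construct with reference/submission files
#   prec_cls, recall_cls, f1_cls, avg_prec, avg_rec, avg_f1 = evaluator.grd_eval(mode='all')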