in metaicl/data.py [0:0]
def evaluate(self, predictions, groundtruths, is_classification):
    """Score predictions against references by accuracy or macro-F1.

    Predictions and references are compared after stripping surrounding
    whitespace. A reference may be a single string (exact match required)
    or a list of acceptable strings (the prediction must equal one of them).

    Args:
        predictions: Sized sequence of predicted strings; must contain
            exactly one entry per item in ``self.metadata``.
        groundtruths: Iterable paired with ``predictions`` via ``zip``;
            each item is a string or a list of strings.
        is_classification: If false, return plain accuracy. If true,
            return F1 macro-averaged over the labels that occur in
            ``groundtruths``; every reference must then be a single string.

    Returns:
        The ``np.mean`` of the per-example correctness flags
        (non-classification) or of the per-label F1 scores
        (classification).

    Raises:
        AssertionError: If ``len(predictions) != len(self.metadata)``.
        TypeError: If ``is_classification`` is true and a reference is a
            list (it cannot serve as a label key).
    """
    assert len(predictions) == len(self.metadata), (
        "got %d predictions for %d metadata entries"
        % (len(predictions), len(self.metadata))
    )

    accs = []
    precisions = defaultdict(list)  # predicted label -> correctness flags
    recalls = defaultdict(list)     # reference label -> correctness flags

    for prediction, groundtruth in zip(predictions, groundtruths):
        prediction = prediction.strip()
        # Decide once whether this example has several acceptable answers;
        # isinstance also accepts list subclasses, unlike `type(x) == list`.
        multi_reference = isinstance(groundtruth, list)
        if multi_reference:
            groundtruth = [gt.strip() for gt in groundtruth]
            is_correct = prediction in groundtruth
        else:
            groundtruth = groundtruth.strip()
            is_correct = prediction == groundtruth
        accs.append(is_correct)

        if is_classification:
            if multi_reference:
                # Fail with an explicit message instead of the opaque
                # "unhashable type: 'list'" raised by the dict lookup.
                raise TypeError(
                    "classification evaluation requires a single string "
                    "groundtruth per example, got a list"
                )
            recalls[groundtruth].append(is_correct)
            precisions[prediction].append(is_correct)

    if not is_classification:
        return np.mean(accs)

    f1s = []
    # Macro-average over reference labels only; labels that were predicted
    # but never appear as a reference do not get their own F1 term.
    for label, hits in recalls.items():
        # A label that was never predicted gets precision 1.0; its recall
        # is necessarily 0, so its F1 below still comes out as 0.
        precision = np.mean(precisions[label]) if label in precisions else 1.0
        recall = np.mean(hits)
        if precision + recall == 0:
            f1s.append(0)
        else:
            f1s.append(2 * precision * recall / (precision + recall))
    return np.mean(f1s)