in datasets.py
def evaluate_example(self, example: InputExample, output_sentence: str, model=None, tokenizer=None) -> Counter:
"""
Evaluate an output sentence on a single example of this dataset.
"""
    # parse the predicted intent and entities from the output sentence;
    # run_inference also reports whether the output could not be reconstructed
    # exactly (wrong_reconstruction) and whether it contained label or format errors
    predicted_intent, predicted_entities, wrong_reconstruction, label_error, format_error = \
        self.output_format.run_inference(
            example,
            output_sentence,
            entity_types=self.entity_types,
        )
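    # each entity is a tuple whose first element is the type; dropping it gives a
    # span-only view used for the untyped ("no_type") boundary metrics below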
    predicted_entities_no_type = {entity[1:] for entity in predicted_entities}
# load ground truth entities
    gt_entities = {entity.to_tuple() for entity in example.entities}
    gt_entities_no_type = {entity[1:] for entity in gt_entities}
# compute correct entities
correct_entities = predicted_entities & gt_entities
correct_entities_no_type = gt_entities_no_type & predicted_entities_no_type
# load ground truth intent
gt_intent = example.intent
# print(f"Ground truth: {gt_intent} ||| Predicted: {predicted_intent}")
# compute correct intent
correct_intent = int(predicted_intent == gt_intent.natural)
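    # sanity checks: correct counts are set intersections, so they can never
    # exceed either the predicted or the ground-truth counts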
assert len(correct_entities) <= len(predicted_entities)
assert len(correct_entities) <= len(gt_entities)
assert len(correct_entities_no_type) <= len(predicted_entities_no_type)
assert len(correct_entities_no_type) <= len(gt_entities_no_type)
res = Counter({
'num_sentences': 1,
'wrong_reconstructions': 1 if wrong_reconstruction else 0,
'label_error': 1 if label_error else 0,
'format_error': 1 if format_error else 0,
        'predicted_intent': 1 if predicted_intent else 0,
'gt_intent': 1,
'correct_intent': correct_intent,
'gt_entities': len(gt_entities),
'predicted_entities': len(predicted_entities),
'correct_entities': len(correct_entities),
'gt_entities_no_type': len(gt_entities_no_type),
'predicted_entities_no_type': len(predicted_entities_no_type),
'correct_entities_no_type': len(correct_entities_no_type),
})
    # add per-intent-type counts so that macro scores over intents can be computed
    if self.intents is not None:
        for intent_type in self.intents.values():
            predicted = int(predicted_intent == intent_type.natural)
            gt = int(gt_intent.natural == intent_type.natural)
            # correct *for this intent type* requires both the prediction and
            # the ground truth to be this type
            correct = int(predicted and gt)
            res['predicted_intent', intent_type.natural] = predicted
            res['gt_intent', intent_type.natural] = gt
            res['correct_intent', intent_type.natural] = correct
    # add per-entity-type counts so that macro-F1 scores over entity types can be computed
    if self.entity_types is not None:
        for entity_type in self.entity_types.values():
            predicted = {entity for entity in predicted_entities if entity[0] == entity_type.natural}
            gt = {entity for entity in gt_entities if entity[0] == entity_type.natural}
            correct = predicted & gt
            res['predicted_entities', entity_type.natural] = len(predicted)
            res['gt_entities', entity_type.natural] = len(gt)
            res['correct_entities', entity_type.natural] = len(correct)
return res
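
A minimal sketch of how these per-example Counters can be consumed downstream, assuming only the key names built above (plain string keys for overall counts, `(metric, type)` tuple keys for per-type counts); `aggregate_scores` and `f1` are hypothetical helpers, not part of this file:

from collections import Counter
from typing import Dict, Iterable, List


def f1(correct: int, predicted: int, gt: int) -> float:
    # precision/recall/F1 from raw counts, guarding against empty denominators
    precision = correct / predicted if predicted > 0 else 0.0
    recall = correct / gt if gt > 0 else 0.0
    if precision + recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


def aggregate_scores(example_counts: Iterable[Counter], entity_types: List[str]) -> Dict[str, float]:
    # Counters support +, so corpus-level totals are just the per-example sum
    totals = sum(example_counts, Counter())

    # micro-F1: pool all entities, using the plain string keys
    micro = f1(totals['correct_entities'], totals['predicted_entities'], totals['gt_entities'])

    # macro-F1: average the per-type F1 scores, using the (metric, type) tuple keys
    per_type = [
        f1(
            totals['correct_entities', t],
            totals['predicted_entities', t],
            totals['gt_entities', t],
        )
        for t in entity_types
    ]
    macro = sum(per_type) / len(per_type) if per_type else 0.0
    return {'entity_micro_f1': micro, 'entity_macro_f1': macro}

Returning a Counter rather than precomputed ratios keeps per-example evaluation associative: shards evaluated in parallel can be merged by simple addition before any division happens.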