# datasets.py
from collections import Counter

# InputExample and the output-format/type helpers referenced below are
# defined elsewhere in this codebase.
def evaluate_example(self, example: InputExample, output_sentence: str, model=None, tokenizer=None,
                     eval_nll=False) -> Counter:
    """
    Evaluate an output sentence on a single example of this dataset.

    NLL inference is used only as a fallback, when the generated output
    sentence does not contain a recognized relation.
    """
    if not eval_nll and not self.eval_nll:
        # evaluate by parsing the relation out of the generated output sentence
        predicted_entities, predicted_relations = self.output_format.run_inference(
            example,
            output_sentence,
            entity_types=self.entity_types,
            relation_types=self.relation_types,
        )
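        # each predicted relation is assumed to be a tuple whose first element
        # is the relation string; guard against an empty set so that a malformed
        # output falls through to NLL evaluation instead of raising StopIteration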
        predicted_relation_str = next(iter(predicted_relations))[0] if predicted_relations else None
        if predicted_relation_str not in [relation_type.natural for relation_type in self.relation_types.values()]:
            # the predicted relation is missing or not among the possible
            # relations, so use NLL evaluation instead (eval_nll=True makes the
            # recursion terminate after a single step)
            return self.evaluate_example(
                example=example,
                output_sentence=output_sentence,
                model=model,
                tokenizer=tokenizer,
                eval_nll=True,
            )
    else:
        # NLL evaluation: score every candidate relation and keep the most likely one
        predicted_relation_type = self.nll_inference(
            example=example,
            relation_types=list(self.relation_types.values()),
            model=model,
            tokenizer=tokenizer,
        )
        predicted_relation_str = predicted_relation_type.natural
    # load the ground-truth relation; each example carries exactly one
    gt_relation_str = example.relations[0].type.natural
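    # micro-average bookkeeping: NO_RELATION never counts towards the predicted
    # or ground-truth totals, and a sentence where both sides are NO_RELATION
    # (a true negative) is excluded from num_sentences entirely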
    if gt_relation_str == self.NO_RELATION and predicted_relation_str == self.NO_RELATION:
        return Counter({
            'num_sentences': 0,
            'gt_relations': 0,
            'predicted_relations': 0,
            'correct_relations': 0,
        })
    elif gt_relation_str == self.NO_RELATION:
        # false positive: a relation was predicted where the gold label is NO_RELATION
        return Counter({
            'num_sentences': 1,
            'gt_relations': 0,
            'predicted_relations': 1,
            'correct_relations': 0,
        })
    elif predicted_relation_str == self.NO_RELATION:
        # false negative: a gold relation was missed
        return Counter({
            'num_sentences': 1,
            'gt_relations': 1,
            'predicted_relations': 0,
            'correct_relations': 0,
        })
    else:
        # both sides carry a real relation: correct only on an exact match
        return Counter({
            'num_sentences': 1,
            'gt_relations': 1,
            'predicted_relations': 1,
            'correct_relations': 1 if predicted_relation_str == gt_relation_str else 0,
        })
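

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original file). The Counter returned by
# evaluate_example can be summed across a dataset and turned into
# micro-averaged precision/recall/F1; only the four counter keys used above
# are assumed here.

def micro_precision_recall_f1(counts: Counter):
    predicted = counts['predicted_relations']
    gt = counts['gt_relations']
    correct = counts['correct_relations']
    precision = correct / predicted if predicted else 0.0
    recall = correct / gt if gt else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# e.g. totals = sum((dataset.evaluate_example(ex, out) for ex, out in pairs), Counter())


# Hedged sketch of what nll_inference is assumed to do, for readers of this
# excerpt (the real method lives elsewhere in the class): render the output
# sentence each candidate relation would produce, score it with a Hugging Face
# seq2seq model (whose `loss` is the mean cross-entropy over label tokens when
# `labels` is passed), and return the lowest-NLL candidate.
# `format_output_for_relation` and `example.input_text` are hypothetical names,
# not the real API of this codebase.

def nll_inference_sketch(example, relation_types, model, tokenizer, output_format):
    import torch

    input_ids = tokenizer(example.input_text, return_tensors='pt').input_ids
    best_relation, best_nll = None, float('inf')
    for relation_type in relation_types:
        # hypothetical helper: the gold-style output sentence for this candidate
        target_sentence = output_format.format_output_for_relation(example, relation_type)
        labels = tokenizer(target_sentence, return_tensors='pt').input_ids
        with torch.no_grad():
            nll = model(input_ids=input_ids, labels=labels).loss.item()
        if nll < best_nll:
            best_relation, best_nll = relation_type, nll
    return best_relation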