def evaluate_example()

in datasets.py
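
This method belongs to a dataset class; the snippet assumes `Counter` (from `collections`), `InputExample`, and the class attributes `output_format`, `entity_types`, and `intents` are defined elsewhere in datasets.py.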


    def evaluate_example(self, example: InputExample, output_sentence: str, model=None, tokenizer=None) -> Counter:
        """
        Evaluate an output sentence on a single example of this dataset.
        """
        # extract the predicted intent and entities from the output sentence
        res = self.output_format.run_inference(
            example,
            output_sentence,
            entity_types=self.entity_types,
        )
        predicted_intent, predicted_entities, wrong_reconstruction, label_error, format_error = res

        predicted_entities_no_type = {entity[1:] for entity in predicted_entities}

        # load ground truth entities
        gt_entities = {entity.to_tuple() for entity in example.entities}
        gt_entities_no_type = {entity[1:] for entity in gt_entities}

        # compute correct entities
        correct_entities = predicted_entities & gt_entities
        correct_entities_no_type = gt_entities_no_type & predicted_entities_no_type

        # load ground truth intent
        gt_intent = example.intent

        # print(f"Ground truth: {gt_intent} ||| Predicted: {predicted_intent}")

        # compute correct intent
        correct_intent = int(predicted_intent == gt_intent.natural)


        assert len(correct_entities) <= len(predicted_entities)
        assert len(correct_entities) <= len(gt_entities)
        assert len(correct_entities_no_type) <= len(predicted_entities_no_type)
        assert len(correct_entities_no_type) <= len(gt_entities_no_type)

        res = Counter({
            'num_sentences': 1,
            'wrong_reconstructions': 1 if wrong_reconstruction else 0,
            'label_error': 1 if label_error else 0,
            'format_error': 1 if format_error else 0, 
            'predicted_intent': 1 if len(predicted_intent) > 0 else 0,
            'gt_intent': 1,
            'correct_intent': correct_intent,
            'gt_entities': len(gt_entities),
            'predicted_entities': len(predicted_entities),
            'correct_entities': len(correct_entities),
            'gt_entities_no_type': len(gt_entities_no_type),
            'predicted_entities_no_type': len(predicted_entities_no_type),
            'correct_entities_no_type': len(correct_entities_no_type),
        })
        
        if self.intents is not None:
            for intent_type in self.intents.values():
                predicted = int(predicted_intent == intent_type.natural)
                gt = int(gt_intent.natural == intent_type.natural)
                correct = predicted * gt  # correct for this intent type only if it is both predicted and gold
                res['predicted_intent', intent_type.natural] = predicted
                res['gt_intent', intent_type.natural] = gt
                res['correct_intent', intent_type.natural] = correct

        # add per-type counts for each entity type so that we can compute the macro-F1 scores
        if self.entity_types is not None:
            for entity_type in self.entity_types.values():
                predicted = set(entity for entity in predicted_entities if entity[0] == entity_type.natural)
                gt = set(entity for entity in gt_entities if entity[0] == entity_type.natural)
                correct = predicted & gt
                res['predicted_entities', entity_type.natural] = len(predicted)
                res['gt_entities', entity_type.natural] = len(gt)
                res['correct_entities', entity_type.natural] = len(correct)

        return res
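
The returned Counter adds element-wise, so corpus-level counts follow from summing the per-example results. The sketch below is a hypothetical helper (not part of datasets.py; the `f1_from_counts` name is an assumption) showing one way to turn the aggregated counts, keyed as built above, into micro precision/recall/F1.

from collections import Counter

def f1_from_counts(results: Counter, prefix: str = 'entities') -> dict:
    # Hypothetical helper: compute micro precision/recall/F1 for one family of keys
    # ('entities', 'entities_no_type', 'intent', ...) in the aggregated counts.
    predicted = results[f'predicted_{prefix}']
    gt = results[f'gt_{prefix}']
    correct = results[f'correct_{prefix}']
    precision = correct / predicted if predicted > 0 else 0.0
    recall = correct / gt if gt > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return {'precision': precision, 'recall': recall, 'f1': f1}

# Counters sum element-wise, so aggregation over a dataset is a single sum:
#   results = sum((dataset.evaluate_example(ex, out) for ex, out in pairs), Counter())
#   entity_scores = f1_from_counts(results, 'entities')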