def evaluate_dataset()

in datasets.py [0:0]


    def evaluate_dataset(self, data_args: DataTrainingArguments, model, device, batch_size: int, macro: bool = False) \
            -> Dict[str, float]:
        """
        Evaluate the model on this dataset and return relation metrics.

        Runs generation over all examples, scores each generated output
        sentence, and aggregates the per-example counts into
        precision/recall/F1 for relations, both typed and untyped
        (``*_no_type``), plus raw ground-truth/predicted counts.

        NOTE(review): the ``macro`` flag is accepted but never read in this
        body — presumably kept for interface compatibility; confirm with
        callers before removing.
        """
        # Accumulate per-example count dictionaries (correct/predicted/gt
        # relations and entities) into a single tally.
        counts = Counter()
        for example, generated in self.generate_output_sentences(data_args, model, device, batch_size):
            counts += self.evaluate_example(
                example=example,
                output_sentence=generated,
                tokenizer=self.tokenizer,
            )

        # Scores for relations with entity/relation types taken into account.
        typed_p, typed_r, typed_f1 = get_precision_recall_f1(
            num_correct=counts['correct_relations'],
            num_predicted=counts['predicted_relations'],
            num_gt=counts['gt_relations'],
        )

        # Scores ignoring types (match on structure only).
        untyped_p, untyped_r, untyped_f1 = get_precision_recall_f1(
            num_correct=counts['correct_relations_no_type'],
            num_predicted=counts['predicted_relations_no_type'],
            num_gt=counts['gt_relations_no_type'],
        )

        return {
            'relation_precision': typed_p,
            'relation_recall': typed_r,
            'relation_f1': typed_f1,
            'relation_precision_no_type': untyped_p,
            'relation_recall_no_type': untyped_r,
            'relation_f1_no_type': untyped_f1,
            'num_gt_triggers': counts['gt_entities'],
            'num_pred_triggers': counts['predicted_entities'],
            'num_gt_relations': counts['gt_relations'],
            'num_pred_relations': counts['predicted_relations'],
        }