def evaluate_dataset()

in datasets.py


    def evaluate_dataset(self, data_args, model, device, batch_size=8, macro=False, by_relation_type=False) \
            -> Dict[str, float]:
        """
        Evaluate model on this dataset.
        """
        documents_to_chunk_data = defaultdict(list)  # document_id -> per-chunk predictions
        predictions = {}  # document_id -> aggregated document-level predictions

        for example, output_sentence in self.generate_output_sentences(data_args, model, device, batch_size):
            document_id = example.document_id

            # parse the generated output sentence into structured predictions for this chunk
            data = self.output_format.run_inference(
                example=example,
                output_sentence=output_sentence,
            )

            # shift all span indices by the chunk offset to obtain document-level
            # positions, dropping predictions whose second element is missing
            offset = example.offset
            data = [tuple(tuple(y + offset for y in x) for x in z) for z in data if z[1] is not None]

            documents_to_chunk_data[document_id].append(data)

            if len(documents_to_chunk_data[document_id]) == len(self.documents[document_id].chunks):
                # all chunks of this document have been processed: merge them into document-level predictions
                predictions[document_id] = self.get_document_predictions(documents_to_chunk_data[document_id])

        # collect predicted clusters and gold clusters (as (start, end) spans) for every document
        predictions_list = []
        labels_list = []
        for document_id, document in self.documents.items():
            predictions_list.append(predictions[document_id])
            labels_list.append([
                [(entity.start, entity.end) for entity in group]
                for group in document.groups
            ])

        metrics = CorefAllMetrics().get_all_metrics(labels_list, predictions_list)

        # flatten the micro-averaged metrics into {'<metric_name>_<submetric>': value} entries
        return {
            f'{metric_name}_{x}': v
            for metric_name, metric_values in metrics['micro'].items()
            for x, v in metric_values.items()
        }
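
The method returns a flat dictionary of micro-averaged coreference scores, so a caller only needs the prepared dataset object, a loaded model, and a device. A minimal usage sketch follows; `dataset`, `model`, and `data_args` are assumed to have been built elsewhere by the repository's data and model loading code, and the printed metric names are illustrative rather than taken from the source.

    # Illustrative usage only: `dataset`, `model`, and `data_args` are assumptions,
    # prepared elsewhere by the repository's data/model loading utilities.
    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    with torch.no_grad():
        results = dataset.evaluate_dataset(
            data_args=data_args,
            model=model,
            device=device,
            batch_size=8,
        )

    # `results` is flat, e.g. {'muc_f1': 0.71, ...}; the exact key names are
    # '<metric_name>_<submetric>' and depend on what CorefAllMetrics reports.
    for name, value in sorted(results.items()):
        print(f'{name}: {value:.4f}')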