in tabular/src/autogluon/tabular/learner/abstract_learner.py
def evaluate_predictions(self, y_true, y_pred, silent=False, auxiliary_metrics=True, detailed_report=False):
""" Evaluate predictions. Does not support sample weights since this method reports a variety of metrics.
Args:
silent (bool): Should we print which metric is being used as well as performance.
auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric?
detailed_report (bool): Should we compute more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True).
Returns dict where keys = metric names, values = performance along each metric.
If auxiliary_metrics=False, only the eval_metric is included.
"""
if self.weight_evaluation:
raise AssertionError('evaluate_predictions does not support `weight_evaluation=True`. Use `predictor.leaderboard` instead.')
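# Determine whether y_pred holds hard class predictions or prediction probabilities (a DataFrame is treated as probabilities).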
is_proba = False
assert isinstance(y_true, (np.ndarray, pd.Series))
assert isinstance(y_pred, (np.ndarray, pd.Series, pd.DataFrame))
self._validate_class_labels(y_true)
if isinstance(y_pred, np.ndarray):
if self.problem_type == QUANTILE:
y_pred = pd.DataFrame(data=y_pred, columns=self.quantile_levels)
elif len(y_pred.shape) > 1:
y_pred = pd.DataFrame(data=y_pred, columns=self.class_labels)
if isinstance(y_pred, pd.DataFrame):
is_proba = True
elif not self.eval_metric.needs_pred:
raise AssertionError(f'`evaluate_predictions` requires y_pred_proba input '
f'when evaluating "{self.eval_metric.name}"... Please generate valid input via `predictor.predict_proba(data)`.\n'
f'This may have occurred if you passed in `predict` output instead of `predict_proba` output, '
f'or if you called `predictor.predict_proba(data, as_multiclass=False)`, '
f'which is not supported by `evaluate_predictions`.')
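# If probabilities were supplied, derive hard class predictions from them for the pred-based metrics.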
if is_proba:
y_pred_proba = y_pred
y_pred = get_pred_from_proba_df(y_pred_proba, problem_type=self.problem_type)
if self.problem_type == BINARY:
# roc_auc crashes if this isn't done
y_pred_proba = y_pred_proba[self.positive_class]
else:
y_pred_proba = None
y_pred = pd.Series(y_pred)
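# Map labels (and probabilities) into the learner's internal numeric label space so all metrics are computed consistently.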
if y_pred_proba is not None:
y_pred_proba_internal = self.label_cleaner.transform_proba(y_pred_proba, as_pandas=True)
else:
y_pred_proba_internal = None
y_true_internal = self.label_cleaner.transform(y_true) # Get labels in numeric order
y_true_internal = y_true_internal.fillna(-1)
y_pred_internal = self.label_cleaner.transform(y_pred) # Get labels in numeric order
# Compute auxiliary metrics:
auxiliary_metrics_lst = [self.eval_metric]
performance_dict = {}
if auxiliary_metrics:
if self.problem_type == REGRESSION: # Adding regression metrics
auxiliary_metrics_lst += [
'root_mean_squared_error',
'mean_squared_error',
'mean_absolute_error',
'r2',
'pearsonr',
'median_absolute_error',
]
if self.problem_type in [BINARY, MULTICLASS]: # Adding classification metrics
auxiliary_metrics_lst += [
'accuracy',
'balanced_accuracy',
# 'log_loss', # Don't include as it probably adds more confusion to novice users (can be infinite)
'mcc',
]
if self.problem_type == BINARY: # binary-specific metrics
auxiliary_metrics_lst += [
'roc_auc',
'f1',
'precision',
'recall',
]
scoring_args = dict(
y=y_true,
y_internal=y_true_internal,
# sample_weight=sample_weight, # TODO: add sample_weight support
)
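# Resolve each metric name to a metric object via get_metric and score it, preferring probability-based scoring when probabilities are available.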
for aux_metric in auxiliary_metrics_lst:
if isinstance(aux_metric, str):
aux_metric = get_metric(metric=aux_metric, problem_type=self.problem_type, metric_type='aux_metric')
if not aux_metric.needs_pred and y_pred_proba_internal is None:
logger.log(15, f'Skipping {aux_metric.name} because no prediction probabilities are available to score.')
continue
if aux_metric.name not in performance_dict:
if y_pred_proba_internal is not None:
score = self._score_with_pred_proba(
y_pred_proba_internal=y_pred_proba_internal,
metric=aux_metric,
**scoring_args
)
else:
score = self._score_with_pred(
y_pred_internal=y_pred_internal,
metric=aux_metric,
**scoring_args
)
performance_dict[aux_metric.name] = score
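# Report the eval_metric score; internal scores are always higher-is-better, so note when the sign differs from the metric's original value.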
if self.eval_metric.name in performance_dict:
score_eval = performance_dict[self.eval_metric.name]
score_eval_flipped = self.eval_metric.convert_score_to_sklearn_val(score_eval)  # undo the internal higher-is-better sign flip to recover the metric's original value
flipped = score_eval_flipped != score_eval
if not silent:
logger.log(20, f"Evaluation: {self.eval_metric.name} on test data: {score_eval}")
if flipped:
logger.log(20, f"\tNote: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.")
if not silent:
logger.log(20, "Evaluations on test data:")
logger.log(20, json.dumps(performance_dict, indent=4))
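# detailed_report adds a confusion matrix and a per-class classification report for classification problems.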
if detailed_report and (self.problem_type != REGRESSION):
# Construct confusion matrix
try:
performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred, labels=self.label_cleaner.ordered_class_labels, output_format='pandas_dataframe')
except ValueError:
pass
# One final set of metrics to report
cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True)
metric_name = 'classification_report'
if metric_name not in performance_dict:
try:  # skip the classification report if it cannot be computed for these labels
performance_dict[metric_name] = cl_metric(y_true, y_pred)
except ValueError:
pass
if not silent and metric_name in performance_dict:
logger.log(20, "Detailed (per-class) classification report:")
logger.log(20, json.dumps(performance_dict[metric_name], indent=4))
return performance_dict
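
# --- Usage sketch (illustrative, not part of the source file) ---
# A minimal example of how this method is typically reached through the public
# TabularPredictor API. The file names and the 'class' label column below are
# hypothetical placeholders.
from autogluon.tabular import TabularDataset, TabularPredictor

train_data = TabularDataset('train.csv')  # hypothetical training file
test_data = TabularDataset('test.csv')    # hypothetical test file
predictor = TabularPredictor(label='class').fit(train_data)

y_true = test_data['class']
# Passing predict_proba output (a DataFrame) enables probability-based metrics such as roc_auc;
# passing predict output (a Series) restricts evaluation to pred-based metrics.
y_pred_proba = predictor.predict_proba(test_data)

# TabularPredictor.evaluate_predictions delegates to this learner method.
perf = predictor.evaluate_predictions(y_true=y_true, y_pred=y_pred_proba, auxiliary_metrics=True)
print(perf)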