in assets/training/model_evaluation/src/utils.py
def _log_metrics(metrics, artifacts):
"""Log metrics and artifacts to current run.
Args:
metrics (_type_): _description_
artifacts (_type_): _description_
Raises:
ModelEvaluationException: _description_
"""
table_scores = {}
nonscalar_scores = {}
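# Metrics in this list arrive as per-example lists (e.g. perplexity) and are handled separately from scalars.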
list_metrics = [metrics_constants.Metric.FMPerplexity]
run = current_run.run
list_scores = {}
classwise_scores = {}
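# Bucket the artifact metrics by how they need to be logged: tables, lists, non-scalar artifacts and classwise scores.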
for name, score in artifacts.items():
if score is None:
logger.warning("Empty score for {}. Skipping.".format(name))
continue
elif _scoring_utilities.is_table_metric(name) or name in metrics_constants.Metric.QA_GPT_METRICS_SET \
or name == metrics_constants.Metric.BERTScore:
table_scores[name] = score
elif name in list_metrics:
try:
list_scores[name] = list(score)
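# Also surface mean perplexity as a scalar metric.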
if name == metrics_constants.Metric.FMPerplexity:
metrics["mean_" + name] = np.mean(score)
except TypeError:
logger.warning(f"{name} is not of type list.")
elif name in metrics_constants.Metric.NONSCALAR_FULL_SET or \
name in metrics_constants.FULL_NONSCALAR_SET:
nonscalar_scores[name] = score
elif name in metrics_constants.TrainingResultsType.ALL_TIME:
# Filter out time metrics as we do not log these
pass
elif name in metrics_constants.FULL_CLASSWISE_SET:
classwise_scores[name] = score
else:
logger.warning("Unknown metric {}. Will not log.".format(name))
try:
# Log the scalar metrics. (Currently, these are stored in CosmosDB)
for name, score in metrics.items():
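# List-valued entries cannot be logged as scalar metrics; set them aside with the other list metrics.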
if isinstance(score, list):
list_scores[name] = list(score)
continue
run.log(name, score)
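# Log the table metrics (BERTScore histograms, GPT rating counts and other tables).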
for name, score in table_scores.items():
if name == metrics_constants.Metric.BERTScore:
for k, v in score.items():
    if not isinstance(v, (np.ndarray, list)):
        continue
    # TODO: check if we need to look in different keys for precision, recall and f1
    counts, bin_edges = np.histogram(v, bins=10)
    counts = np.array(counts).tolist()
    bin_edges = np.array(bin_edges).tolist()
    hist_table = {"_score": json.dumps(bin_edges[1:]), "count": json.dumps(counts)}
    run.log_table("Bert F1 Score", value=hist_table)
    run.log_table("Bert Precision", value=hist_table)
    run.log_table("Bert Recall", value=hist_table)
elif name in metrics_constants.Metric.QA_GPT_METRICS_SET:
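# GPT-judged QA metrics are integer ratings from 1 to 5; log them as a rating/count table.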
try:
if not isinstance(score, (list, np.ndarray)):
logger.warning(f"{name} is not an iterable. \nValue: {score}")
continue
int_score = [int(i) for i in score]
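# Tally how many examples received each rating from 1 to 5.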
counts = [0]*5
for i in int_score:
counts[i-1] += 1
cur_score = {
"_rating": [i for i in range(1, 6)],
"count": counts
}
run.log_table(name, cur_score)
except Exception as e:
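# When metric computation failed upstream, the artifact may hold the exception class name instead of scores.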
if isinstance(score, (list, np.ndarray)) and len(score) > 0:
exception_cls_name = score[0]
logger.warning(f"Ignoring metric: {name}\n Computation Failed due to: {exception_cls_name}")
else:
logger.warning(f"Ignoring metric: {name}\n Logging Failed due to: {repr(e)}")
else:
run.log_table(name, score)
# for name, score in list_scores.items():
# # TODO: Add checks for logging longer lists
# pass
# # run.log_list(name, score)
# Log the non-scalar metrics. (Currently, these are all artifact-based.)
for name, score in nonscalar_scores.items():
if name == metrics_constants.Metric.AccuracyTable:
run.log_accuracy_table(name, score)
elif name in metrics_constants.Metric.IMAGE_LEVEL_BINARY_CLASSIFIER_METRICS:
run.log_table(name, score)
elif name == metrics_constants.Metric.ConfusionMatrix:
run.log_confusion_matrix(name, score)
elif name == metrics_constants.Metric.CONFUSION_MATRICES_PER_SCORE_THRESHOLD:
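# Log one confusion matrix per score threshold.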
for key, confusion_matrix in score.items():
run.log_confusion_matrix(key, confusion_matrix)
elif name == metrics_constants.Metric.Residuals:
run.log_residuals(name, score)
elif name == metrics_constants.Metric.PredictedTrue:
run.log_predictions(name, score)
elif name in metrics_constants.Metric.NONSCALAR_FORECAST_SET:
# Filter out non-scalar forecasting metrics as we do not log these yet
pass
else:
logger.warning("Unsupported non-scalar metric {}. Will not log.".format(name))
# Log the classwise metrics. (Currently, these are all artifact-based.)
for name, score in classwise_scores.items():
try:
if name == metrics_constants.Metric.PER_LABEL_METRICS:
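# Log each label's metrics table under the metric name.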
for classwise_metric in score.values():
    run.log_table(name, classwise_metric)
else:
logger.warning("Unsupported non-scalar metric {}. Will not log.".format(name))
except Exception as ex:  # TODO
    e = ModelEvaluationException(f"Failed to log classwise metric {name} with value {score}. Error: {repr(ex)}")
log_traceback(e, logger)
raise e
except Exception as e:
exception = get_azureml_exception(ComputeMetricsException, MetricsLoggingError, e, wrap_azureml_ex=False,
metric_name=name, error=repr(e))
log_traceback(exception, logger)
raise exception