# example_zoo/tensorflow/models/ncf_main/official/recommendation/neumf_model.py
def compute_eval_loss_and_metrics(logits,              # type: tf.Tensor
                                  softmax_logits,      # type: tf.Tensor
                                  duplicate_mask,      # type: tf.Tensor
                                  num_training_neg,    # type: int
                                  match_mlperf=False,  # type: bool
                                  use_tpu_spec=False   # type: bool
                                 ):
  # type: (...) -> tf.estimator.EstimatorSpec
"""Model evaluation with HR and NDCG metrics.
The evaluation protocol is to rank the test interacted item (truth items)
among the randomly chosen 999 items that are not interacted by the user.
The performance of the ranked list is judged by Hit Ratio (HR) and Normalized
Discounted Cumulative Gain (NDCG).
For evaluation, the ranked list is truncated at 10 for both metrics. As such,
the HR intuitively measures whether the test item is present on the top-10
list, and the NDCG accounts for the position of the hit by assigning higher
scores to hits at top ranks. Both metrics are calculated for each test user,
and the average scores are reported.
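
  As an illustrative example (not from the original docstring): if a user's
  true item lands at 1-indexed rank r <= 10, HR@10 contributes 1 and NDCG@10
  contributes 1 / log2(r + 1). A hit at rank 3 scores HR = 1 and
  NDCG = 1 / log2(4) = 0.5, while a miss (r > 10) scores 0 for both.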

  If `match_mlperf` is True, then the HR and NDCG computations are done in a
  slightly unusual way to match the MLPerf reference implementation.
  Specifically, if the evaluation negatives contain duplicate items, each
  duplicated item is treated as if it only appeared once: for duplicate items
  in a row, the predicted score for all but one of the occurrences is set to
  -infinity.

  For example, suppose we have the following inputs:

    logits_by_user: [[ 2,  3,  3],
                     [ 5,  4,  4]]

    items_by_user: [[10, 20, 20],
                    [30, 40, 40]]

    # Note: items_by_user is not explicitly present. Instead, the relevant
    #       information is contained within `duplicate_mask`.

    top_k: 2

  Then with match_mlperf=True, the HR would be 2/2 = 1.0, while with
  match_mlperf=False, the HR would be 1/2 = 0.5. This is because each user has
  predicted scores for only 2 unique items: 10 and 20 for the first user, and
  30 and 40 for the second. Therefore, with match_mlperf=True, the first item's
  score is guaranteed to be in the top 2. With match_mlperf=False, this
  function would conclude that the first user's first item is not in the top 2,
  because item 20 has a higher score and occurs twice.
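
  (Illustratively, the MLPerf convention amounts to masking the duplicate
  positions to -infinity before ranking, so logits_by_user above effectively
  becomes [[2, 3, -inf], [5, 4, -inf]], leaving only two finite scores per
  user.)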

  Args:
    logits: A tensor containing the predicted logits for each user. The shape
      of logits is (num_users_per_batch * (1 + NUM_EVAL_NEGATIVES),). Logits
      for a user are grouped, and the last element of the group is the true
      element.
    softmax_logits: The same tensor, but with a column of zeros concatenated
      on the left, so that each logit can be treated as a 2-class softmax
      input.
    duplicate_mask: A vector with the same shape as logits, with a value of 1
      if the item corresponding to the logit at that position has already
      appeared for that user.
    num_training_neg: The number of negatives per positive during training.
    match_mlperf: Use the MLPerf reference convention for computing rank.
    use_tpu_spec: Whether to return a TPUEstimatorSpec instead of an
      EstimatorSpec. Required for TPUs, and also when XLA is used on a GPU.
      Despite its name, a TPUEstimatorSpec works with GPUs as well.

  Returns:
    An EstimatorSpec for evaluation.
  """
  in_top_k, ndcg, metric_weights, logits_by_user = compute_top_k_and_ndcg(
      logits, duplicate_mask, match_mlperf)

  # Examples are provided by the eval Dataset in a structured format, so eval
  # labels can be reconstructed on the fly.
  eval_labels = tf.reshape(shape=(-1,), tensor=tf.one_hot(
      tf.zeros(shape=(logits_by_user.shape[0],), dtype=tf.int32) +
      rconst.NUM_EVAL_NEGATIVES, logits_by_user.shape[1], dtype=tf.int32))
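  # The one-hot above places a single 1 at index NUM_EVAL_NEGATIVES (the last
  # slot) within each user's group of (1 + NUM_EVAL_NEGATIVES) logits, i.e. at
  # the true item's position, then flattens back to the flat logit layout.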
  eval_labels_float = tf.cast(eval_labels, tf.float32)

  # During evaluation, the ratio of negatives to positives is much higher
  # than during training. (Typically 999 to 1 vs. 4 to 1.) By adjusting the
  # weights for the negative examples we compute a loss which is consistent
  # with the training data. (And provides an apples-to-apples comparison.)
  negative_scale_factor = num_training_neg / rconst.NUM_EVAL_NEGATIVES
  example_weights = (
      (eval_labels_float + (1 - eval_labels_float) * negative_scale_factor) *
      (1 + rconst.NUM_EVAL_NEGATIVES) / (1 + num_training_neg))
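  # Worked example (illustrative numbers): with num_training_neg = 4 and
  # NUM_EVAL_NEGATIVES = 999, negative_scale_factor = 4 / 999, so each
  # positive gets weight 1 * 1000 / 5 = 200 and each negative gets weight
  # (4 / 999) * 1000 / 5 ~= 0.8. Per user, total positive mass (200) to total
  # negative mass (999 * 0.8 ~= 800) is then 1:4, matching the training ratio.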

  # Tile metric weights back to logit dimensions.
  expanded_metric_weights = tf.reshape(tf.tile(
      metric_weights[:, tf.newaxis], (1, rconst.NUM_EVAL_NEGATIVES + 1)), (-1,))

  # Ignore padded examples.
  example_weights *= tf.cast(expanded_metric_weights, tf.float32)

  cross_entropy = tf.losses.sparse_softmax_cross_entropy(
      logits=softmax_logits, labels=eval_labels, weights=example_weights)

  def metric_fn(top_k_tensor, ndcg_tensor, weight_tensor):
    return {
        rconst.HR_KEY: tf.metrics.mean(top_k_tensor, weights=weight_tensor,
                                       name=rconst.HR_METRIC_NAME),
        rconst.NDCG_KEY: tf.metrics.mean(ndcg_tensor, weights=weight_tensor,
                                         name=rconst.NDCG_METRIC_NAME),
    }
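  # TPUEstimatorSpec takes eval metrics as a (metric_fn, tensors) pair rather
  # than a precomputed dict: the tensors are gathered from the TPU cores and
  # metric_fn is then invoked on the host CPU, where tf.metrics ops can run.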
  if use_tpu_spec:
    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=tf.estimator.ModeKeys.EVAL, loss=cross_entropy,
        eval_metrics=(metric_fn, [in_top_k, ndcg, metric_weights]))

  return tf.estimator.EstimatorSpec(
      mode=tf.estimator.ModeKeys.EVAL,
      loss=cross_entropy,
      eval_metric_ops=metric_fn(in_top_k, ndcg, metric_weights)
  )
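

# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original file). It runs
# random logits through compute_eval_loss_and_metrics to show the expected
# tensor layout: one group of (1 + NUM_EVAL_NEGATIVES) logits per user, true
# item last. The (n, 1) logit shape, the all-zero duplicate mask, and
# num_training_neg=4 are assumptions chosen for this demo, not values taken
# from the original module.
def _demo_eval_spec(num_users=2):
  n = num_users * (1 + rconst.NUM_EVAL_NEGATIVES)
  logits = tf.random_uniform((n, 1))
  # Prepend a zero column so each logit becomes a 2-class softmax input, as
  # described for `softmax_logits` in the docstring above.
  softmax_logits = tf.concat([tf.zeros_like(logits), logits], axis=1)
  duplicate_mask = tf.zeros((n, 1))  # demo assumption: no duplicate negatives
  return compute_eval_loss_and_metrics(
      logits, softmax_logits, duplicate_mask, num_training_neg=4)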