# in ludwig/decoders/sequence_decoders.py [0:0]
def extract_sequence_probabilities(decoder_output, beam_width, sequence_id=0):
    """Recover per-step output-vocabulary probabilities for the predicted
    sequence of one beam from a beam-search decoder's output.

    For each (batch, step) position it locates which of the ``beam_width``
    beams produced the predicted token, follows ``parent_ids`` back to the
    beam row whose scores generated that token, gathers the corresponding
    log-probability vector, and exponentiates it.

    # NOTE(review): assumes `decoder_output` has the structure produced by
    # a TF beam-search decoder (e.g. tfa.seq2seq.BeamSearchDecoder):
    #   predicted_ids: [batch, steps, beams]
    #   beam_search_decoder_output.scores: [batch, steps, beams, vocab]
    #   beam_search_decoder_output.predicted_ids / parent_ids:
    #     [batch, steps, beams]
    # -- confirm against the caller.

    :param decoder_output: final output object of a beam-search decoder.
    :param beam_width: number of beams used during decoding.
    :param sequence_id: index of the beam whose predictions to extract
        probabilities for (default 0, the top-scoring beam).
    :return: tensor of probabilities of shape [batch, steps, vocab_size].
    """
    # obtain tensors needed
    predictions = decoder_output.predicted_ids[:, :, sequence_id]
    all_log_probs = decoder_output.beam_search_decoder_output.scores
    top_ids = decoder_output.beam_search_decoder_output.predicted_ids
    parent_rows = decoder_output.beam_search_decoder_output.parent_ids
    # tile predictions so that they have the same shape
    # as top_ids, [b, s, beam]
    preds_tiled = tf.tile(tf.expand_dims(predictions, -1),
                          [1, 1, beam_width])
    # figure out the location among the top k ids of the ones
    # we ended up using for predictions, by first obtaining a boolean tensor
    # that reports if they match or not, and then using tf.where
    # to obtain the coordinates where they appear.
    # the output is a tensor preds_locs_all of size = [n, dims]
    # where n is the number of matches and dims is
    # the number of axes of the coordinates
    # (the rank of the preds_locs_bool tensor).
    # They are not always the first, because of the way beam search works.
    preds_locs_bool = tf.equal(preds_tiled, top_ids)
    preds_locs_all = tf.where(preds_locs_bool)
    # the predicted ids may have appeared multiple times across
    # the different beams, so we need to select the first one (as it's the
    # one with highest probability).
    # to do so we create segment ids to use with the segment_min function.
    # to obtain the segments we use the first 2 coordinates of preds_locs,
    # multiply the first by the max length of the second and then
    # add the second to obtain contiguous numbering.
    # for example if we know that the maximum length is 12,
    # location [2,3] becomes segment 2 * 12 + 3 = 27
    segments = ((preds_locs_all[:, 0] *
                 tf.cast(tf.shape(predictions)[-1], tf.int64)) +
                preds_locs_all[:, 1])
    # segment_min takes the min (first occurrence) of
    # the predicted sequence element among all the beams
    preds_locs = tf.math.segment_min(
        preds_locs_all[:, 2], segments
    )
    # as we want to gather the values in parent_rows,
    # we need to construct the coordinates xs and ys, as the preds_locs
    # are the values of the third axis (beam size)
    # from which we want to gather.
    # we know for sure the values of xs and ys because we know for sure
    # that at least one of the beams contains the pred id at each step,
    # so we know for sure that there will be b*s rows in preds_locs
    # and so we can concatenate xs and ys that have the same size.
    # xs = [0,0,...,0, 1,1,...,1, ...] (each batch index repeated s times)
    xs = tf.repeat(
        tf.range(tf.shape(parent_rows)[0], dtype=tf.int64),
        tf.repeat(
            tf.shape(parent_rows)[1], tf.shape(parent_rows)[0])
    )
    # ys = [0,1,...,s-1, 0,1,...,s-1, ...] (step indices tiled b times)
    ys = tf.tile(tf.range(tf.shape(parent_rows)[1], dtype=tf.int64),
                 tf.shape(parent_rows)[0:1])
    preds_locs_for_gather = tf.concat(
        [xs[:, tf.newaxis], ys[:, tf.newaxis],
         preds_locs[:, tf.newaxis]],
        axis=-1
    )
    # now that we have a [b*s, 3] tensor of (batch, step, beam) coordinates,
    # we can use it to gather from the parent_rows tensor
    rows_from_log_probs_to_select = tf.gather_nd(
        parent_rows,
        preds_locs_for_gather
    )
    # we can reuse xs and ys to concatenate to the id of rows
    # from log probs to select in order to obtain the coordinates
    # in the all_log_probs tensor to gather
    rows_from_log_probs_for_gather = tf.concat(
        [xs[:, tf.newaxis], ys[:, tf.newaxis],
         tf.cast(rows_from_log_probs_to_select[:, tf.newaxis],
                 dtype=tf.int64)],
        axis=-1
    )
    # let's finally gather the log probs
    log_probs_to_reshape = tf.gather_nd(
        all_log_probs,
        rows_from_log_probs_for_gather
    )
    # and let's reshape them in a [b, s, v] shape where v is
    # the size of the output vocabulary
    log_probs = tf.reshape(
        log_probs_to_reshape,
        tf.stack(
            [tf.shape(all_log_probs)[0], tf.shape(all_log_probs)[1],
             tf.shape(all_log_probs)[3]], axis=0)
    )
    # as they are log probs, exponentiating them returns probabilities
    probabilities = tf.exp(log_probs)
    return probabilities