in tensorflow_transform/mappers.py [0:0]
def ngrams(tokens: tf.SparseTensor,
ngram_range: Tuple[int, int],
separator: str,
name: Optional[str] = None) -> tf.SparseTensor:
"""Create a `SparseTensor` of n-grams.
Given a `SparseTensor` of tokens, returns a `SparseTensor` containing the
ngrams that can be constructed from each row.
`separator` is inserted between each pair of tokens, so " " would be an
appropriate choice if the tokens are words, while "" would be an appropriate
choice if they are characters.
Example:
>>> tokens = tf.SparseTensor(
... indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3]],
... values=['One', 'was', 'Johnny', 'Two', 'was', 'a', 'rat'],
... dense_shape=[2, 4])
>>> print(tft.ngrams(tokens, ngram_range=(1, 3), separator=' '))
SparseTensor(indices=tf.Tensor(
[[0 0] [0 1] [0 2] [0 3] [0 4] [0 5]
[1 0] [1 1] [1 2] [1 3] [1 4] [1 5] [1 6] [1 7] [1 8]],
shape=(15, 2), dtype=int64),
values=tf.Tensor(
[b'One' b'One was' b'One was Johnny' b'was' b'was Johnny' b'Johnny' b'Two'
b'Two was' b'Two was a' b'was' b'was a' b'was a rat' b'a' b'a rat'
b'rat'], shape=(15,), dtype=string),
dense_shape=tf.Tensor([2 9], shape=(2,), dtype=int64))
Args:
    tokens: a two-dimensional `SparseTensor` of dtype `tf.string` containing
      tokens that will be used to construct ngrams.
ngram_range: A pair with the range (inclusive) of ngram sizes to return.
separator: a string that will be inserted between tokens when ngrams are
constructed.
name: (Optional) A name for this operation.
Returns:
    A `SparseTensor` containing all ngrams from each row of the input. Note:
    if an ngram appears multiple times in an input row, it will be present the
    same number of times in the output. For unique ngrams, see
    `tft.bag_of_words`.
Raises:
ValueError: if `tokens` is not 2D.
    ValueError: if `ngram_range[0] < 1` or `ngram_range[1] < ngram_range[0]`.
"""
# This function is implemented as follows. Assume we start with the following
# `SparseTensor`:
#
# indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [2, 0], [2, 1], [2, 2]]
# values=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
# dense_shape=[3, 4]
#
  # First we create shifts of the values and of the first column of the
  # indices, padding the end to avoid overrunning the array, so the shifted
  # values (if we are constructing ngrams up to size 3) are
#
# shifted_batch_indices[0]=[0, 0, 0, 0, 1, 2, 2, 2]
# shifted_tokens[0]=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
#
# shifted_batch_indices[1]=[0, 0, 0, 1, 2, 2, 2, -1]
# shifted_tokens[1]=['b', 'c', 'd', 'q', 'x', 'y', 'z', '']
#
# shifted_batch_indices[2]=[0, 0, 1, 2, 2, 2, -1, -1]
# shifted_tokens[2]=['c', 'd', 'q', 'x', 'y', 'z', '', '']
#
  # These shifted tensors are used to create the ngrams as follows. We use
  # tf.strings.join to join shifted_tokens[:k] elementwise to create k-grams,
  # with the `separator` string inserted between each pair of tokens in the
  # k-gram. The batch that the first token of each k-gram belonged to is given
  # by shifted_batch_indices[0]. However, some of these k-grams will cross the
  # boundaries between 'batches', so we create a boolean mask which is True
  # only when shifted_batch_indices[:k] are all equal.
#
# This results in tensors of ngrams, their batch indices and a boolean mask,
# which we then use to construct the output SparseTensor.
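  #
  # Continuing this example with ngram_range=(1, 3) and separator='', stacking
  # the joined k-grams for k = 1..3 gives, per position,
  #
  #   [['a', 'ab', 'abc'], ['b', 'bc', 'bcd'], ['c', 'cd', 'cdq'],
  #    ['d', 'dq', 'dqx'], ['q', 'qx', 'qxy'], ['x', 'xy', 'xyz'],
  #    ['y', 'yz', 'yz'], ['z', 'z', 'z']]
  #
  # and the mask keeps only the entries whose tokens all came from one row, so
  # the output rows are ['a', 'ab', 'abc', 'b', 'bc', 'bcd', 'c', 'cd', 'd'],
  # ['q'] and ['x', 'xy', 'xyz', 'y', 'yz', 'z'].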
if tokens.get_shape().ndims != 2:
raise ValueError('ngrams requires `tokens` to be 2-dimensional')
with tf.compat.v1.name_scope(name, 'ngrams'):
if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]:
raise ValueError('Invalid ngram_range: %r' % (ngram_range,))
def _sliding_windows(values, num_shifts, fill_value):
buffered_values = tf.concat(
[values, tf.fill([num_shifts - 1], fill_value)], 0)
return [
tf.slice(buffered_values, [i], tf.shape(input=values))
for i in range(num_shifts)
]
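    # For example, _sliding_windows(['a', 'b', 'c'], 2, '') evaluates to the
    # list of tensors [['a', 'b', 'c'], ['b', 'c', '']].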
shifted_batch_indices = _sliding_windows(
tokens.indices[:, 0], ngram_range[1] + 1,
tf.constant(-1, dtype=tf.int64))
shifted_tokens = _sliding_windows(tokens.values, ngram_range[1] + 1, '')
    # Construct a tensor of the form
    # [['a', 'ab', 'abc'], ['b', 'bc', 'bcd'], ...]
    def _string_join(tensors):
      # `tensors` is never empty here since k >= ngram_range[0] >= 1; the else
      # branch returns None explicitly rather than falling off the end.
      if tensors:
        return tf.strings.join(tensors, separator=separator)
      else:
        return None
ngrams_array = [_string_join(shifted_tokens[:k])
for k in range(ngram_range[0], ngram_range[1] + 1)]
ngrams_tensor = tf.stack(ngrams_array, 1)
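    # ngrams_tensor has shape [num_tokens, ngram_range[1] + 1 - ngram_range[0]];
    # column j holds the (ngram_range[0] + j)-gram starting at each token.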
    # Construct a boolean mask for whether each ngram in ngrams_tensor is
    # valid, in that every token in it came from the same batch row.
valid_ngram = tf.equal(
tf.math.cumprod(
tf.cast(
tf.equal(
tf.stack(shifted_batch_indices, 1),
tf.expand_dims(shifted_batch_indices[0], 1)),
dtype=tf.int32),
axis=1), 1)
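    # Column j of the cumulative product is True iff shifts 0..j all share the
    # same batch index, i.e. the (j + 1)-gram starting at that token does not
    # cross a row boundary; slice out the columns for the requested sizes.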
valid_ngram = valid_ngram[:, (ngram_range[0] - 1):ngram_range[1]]
    # Construct a tensor with the batch that each ngram in ngrams_tensor
    # belongs to.
batch_indices = tf.tile(tf.expand_dims(tokens.indices[:, 0], 1),
[1, ngram_range[1] + 1 - ngram_range[0]])
# Apply the boolean mask and construct a SparseTensor with the given indices
# and values, where another index is added to give the position within a
# batch.
batch_indices = tf.boolean_mask(tensor=batch_indices, mask=valid_ngram)
ngrams_tensor = tf.boolean_mask(tensor=ngrams_tensor, mask=valid_ngram)
instance_indices = segment_indices(batch_indices)
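    # segment_indices numbers each ngram within its row, e.g. batch_indices of
    # [0, 0, 1, 2, 2] yields instance_indices of [0, 1, 0, 0, 1].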
    # If there are no valid ngrams at all, reduce_max returns int64's minimum
    # value; tf.maximum clamps it to -1 so the second dimension becomes 0.
    dense_shape_second_dim = tf.maximum(
        tf.reduce_max(input_tensor=instance_indices), -1) + 1
return tf.SparseTensor(
indices=tf.stack([batch_indices, instance_indices], 1),
values=ngrams_tensor,
dense_shape=tf.stack(
[tokens.dense_shape[0], dense_shape_second_dim]))