def ngrams()

in tensorflow_transform/mappers.py [0:0]
49 lines of code
13 McCabe index (conditional complexity)

def ngrams(tokens: tf.SparseTensor,
           ngram_range: Tuple[int, int],
           separator: str,
           name: Optional[str] = None) -> tf.SparseTensor:
  """Create a `SparseTensor` of n-grams.

  Given a `SparseTensor` of tokens, returns a `SparseTensor` containing the
  ngrams that can be constructed from each row.

  `separator` is inserted between each pair of tokens, so " " would be an
  appropriate choice if the tokens are words, while "" would be an appropriate
  choice if they are characters.

  Example:

  >>> tokens = tf.SparseTensor(
  ...         indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [1, 3]],
  ...         values=['One', 'was', 'Johnny', 'Two', 'was', 'a', 'rat'],
  ...         dense_shape=[2, 4])
  >>> print(tft.ngrams(tokens, ngram_range=(1, 3), separator=' '))
  SparseTensor(indices=tf.Tensor(
      [[0 0] [0 1] [0 2] [0 3] [0 4] [0 5]
       [1 0] [1 1] [1 2] [1 3] [1 4] [1 5] [1 6] [1 7] [1 8]],
       shape=(15, 2), dtype=int64),
    values=tf.Tensor(
      [b'One' b'One was' b'One was Johnny' b'was' b'was Johnny' b'Johnny' b'Two'
       b'Two was' b'Two was a' b'was' b'was a' b'was a rat' b'a' b'a rat'
       b'rat'], shape=(15,), dtype=string),
    dense_shape=tf.Tensor([2 9], shape=(2,), dtype=int64))

  Args:
    tokens: a two-dimensional`SparseTensor` of dtype `tf.string` containing
      tokens that will be used to construct ngrams.
    ngram_range: A pair with the range (inclusive) of ngram sizes to return.
    separator: a string that will be inserted between tokens when ngrams are
      constructed.
    name: (Optional) A name for this operation.

  Returns:
    A `SparseTensor` containing all ngrams from each row of the input. Note:
    if an ngram appears multiple times in the input row, it will be present the
    same number of times in the output. For unique ngrams, see tft.bag_of_words.

  Raises:
    ValueError: if `tokens` is not 2D.
    ValueError: if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]
  """
  # This function is implemented as follows.  Assume we start with the following
  # `SparseTensor`:
  #
  # indices=[[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [2, 0], [2, 1], [2, 2]]
  # values=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
  # dense_shape=[3, 4]
  #
  # First we then create shifts of the values and first column of indices,
  # buffering to avoid overrunning the end of the array, so the shifted values
  # (if we are ngrams up to size 3) are
  #
  # shifted_batch_indices[0]=[0, 0, 0, 0, 1, 2, 2, 2]
  # shifted_tokens[0]=['a', 'b', 'c', 'd', 'q', 'x', 'y', 'z']
  #
  # shifted_batch_indices[1]=[0, 0, 0, 1, 2, 2, 2, -1]
  # shifted_tokens[1]=['b', 'c', 'd', 'q', 'x', 'y', 'z', '']
  #
  # shifted_batch_indices[2]=[0, 0, 1, 2, 2, 2, -1, -1]
  # shifted_tokens[2]=['c', 'd', 'q', 'x', 'y', 'z', '', '']
  #
  # These shifted ngrams are used to create the ngrams as follows.  We use
  # tf.string_join to join shifted_tokens[:k] to create k-grams. The `separator`
  # string is inserted between each pair of tokens in the k-gram.
  # The batch that the first of these belonged to is given by
  # shifted_batch_indices[0]. However some of these will cross the boundaries
  # between 'batches' and so we we create a boolean mask which is True when
  # shifted_indices[:k] are all equal.
  #
  # This results in tensors of ngrams, their batch indices and a boolean mask,
  # which we then use to construct the output SparseTensor.
  if tokens.get_shape().ndims != 2:
    raise ValueError('ngrams requires `tokens` to be 2-dimensional')
  with tf.compat.v1.name_scope(name, 'ngrams'):
    if ngram_range[0] < 1 or ngram_range[1] < ngram_range[0]:
      raise ValueError('Invalid ngram_range: %r' % (ngram_range,))

    def _sliding_windows(values, num_shifts, fill_value):
      buffered_values = tf.concat(
          [values, tf.fill([num_shifts - 1], fill_value)], 0)
      return [
          tf.slice(buffered_values, [i], tf.shape(input=values))
          for i in range(num_shifts)
      ]

    shifted_batch_indices = _sliding_windows(
        tokens.indices[:, 0], ngram_range[1] + 1,
        tf.constant(-1, dtype=tf.int64))
    shifted_tokens = _sliding_windows(tokens.values, ngram_range[1] + 1, '')

    # Construct a tensor of the form
    # [['a', 'ab, 'abc'], ['b', 'bcd', cde'], ...]
    def _string_join(tensors):
      if tensors:
        return tf.strings.join(tensors, separator=separator)
      else:
        return

    ngrams_array = [_string_join(shifted_tokens[:k])
                    for k in range(ngram_range[0], ngram_range[1] + 1)]
    ngrams_tensor = tf.stack(ngrams_array, 1)

    # Construct a boolean mask for whether each ngram in ngram_tensor is valid,
    # in that each character came from the same batch.
    valid_ngram = tf.equal(
        tf.math.cumprod(
            tf.cast(
                tf.equal(
                    tf.stack(shifted_batch_indices, 1),
                    tf.expand_dims(shifted_batch_indices[0], 1)),
                dtype=tf.int32),
            axis=1), 1)
    valid_ngram = valid_ngram[:, (ngram_range[0] - 1):ngram_range[1]]

    # Construct a tensor with the batch that each ngram in ngram_tensor belongs
    # to.
    batch_indices = tf.tile(tf.expand_dims(tokens.indices[:, 0], 1),
                            [1, ngram_range[1] + 1 - ngram_range[0]])

    # Apply the boolean mask and construct a SparseTensor with the given indices
    # and values, where another index is added to give the position within a
    # batch.
    batch_indices = tf.boolean_mask(tensor=batch_indices, mask=valid_ngram)
    ngrams_tensor = tf.boolean_mask(tensor=ngrams_tensor, mask=valid_ngram)
    instance_indices = segment_indices(batch_indices)
    dense_shape_second_dim = tf.maximum(
        tf.reduce_max(input_tensor=instance_indices), -1) + 1
    return tf.SparseTensor(
        indices=tf.stack([batch_indices, instance_indices], 1),
        values=ngrams_tensor,
        dense_shape=tf.stack(
            [tokens.dense_shape[0], dense_shape_second_dim]))