def _build_from_token_counts()

in tensorflow_datasets/core/deprecated/text/subword_text_encoder.py


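The method assumes these module-level imports from the enclosing file
(`collections` and `six` for counting and dict iteration, TensorFlow for
byte conversion; the exact TensorFlow import alias used by the file may
differ):

import collections

import six
import tensorflow as tf
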
  @classmethod
  def _build_from_token_counts(cls, token_counts, min_token_count,
                               reserved_tokens, num_iterations,
                               max_subword_length):
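    """Builds a vocabulary of subwords from token counts.

    Each of the `num_iterations` passes segments every token with the
    current vocabulary, counts candidate substrings along that
    segmentation, and keeps candidates seen at least `min_token_count`
    times, longest first. Returns a `cls` instance built from the result.
    """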
    # Start with subwords initialized to only reserved_tokens
    subwords = list(reserved_tokens)

    for _ in range(num_iterations):
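      # Each pass re-segments every token with the vocabulary produced by
      # the previous pass, so candidate counts reflect the segmentation the
      # encoder would actually use.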
      encoder = cls(vocab_list=subwords)
      subword_counts = collections.defaultdict(int)
      for token, count in six.iteritems(token_counts):
        start_idx = 0
        for subword in encoder._token_to_subwords(token):  # pylint: disable=protected-access
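          # Count every substring of length 1..max_subword_length that
          # starts where this subword starts, e.g. for token "coder" at
          # start_idx=0 with max_subword_length=3: "c", "co", "cod".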
          last_idx = min(len(token), start_idx + max_subword_length)
          for end_idx in range(start_idx + 1, last_idx + 1):
            candidate_subword = token[start_idx:end_idx]
            subword_counts[candidate_subword] += count
          start_idx += len(subword)

      # Group subword candidates by length and filter bad candidates
      len_to_subwords = [set() for _ in range(max_subword_length + 1)]
      for subword, count in six.iteritems(subword_counts):
        if count < min_token_count:
          continue
        # Skip single bytes because they're always in the vocab
        if len(tf.compat.as_bytes(subword)) <= 1:
          continue
        len_to_subwords[len(subword)].add(subword)

      # Consider subword candidates by descending length so that if a longer
      # subword is accepted, its prefixes can have their counts decremented.
      candidate_subwords = []
      for subword_len in reversed(range(max_subword_length + 1)):
        for subword in len_to_subwords[subword_len]:
          count = subword_counts[subword]
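          # Re-check the threshold: this count may have been pushed below
          # min_token_count by prefix decrements from longer subwords
          # accepted earlier in this loop.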
          if count < min_token_count:
            continue
          candidate_subwords.append((count, subword))
          # Decrement prefix counts
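          # e.g. accepting "cod" with count 10 subtracts 10 from "c" and
          # "co", so shorter prefixes survive only on occurrences outside
          # the longer subword.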
          for end_idx in range(1, subword_len):
            subword_counts[subword[:end_idx]] -= count

      # Sort subwords by count in descending order, keeping reserved_tokens as
      # the beginning.
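      # (Ties on count break reverse-lexicographically on the subword,
      # since the list holds (count, subword) tuples.)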
      candidate_subwords.sort(reverse=True)
      subwords = reserved_tokens + [s for _, s in candidate_subwords]

    return cls(vocab_list=subwords)
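
A minimal usage sketch (hypothetical values; assumes the enclosing
SubwordTextEncoder class from this file, whose constructor and `subwords`
property are used below):

token_counts = {u"hello": 12, u"help": 7, u"world": 9}
encoder = SubwordTextEncoder._build_from_token_counts(  # pylint: disable=protected-access
    token_counts=token_counts,
    min_token_count=2,
    reserved_tokens=["<pad>"],
    num_iterations=4,
    max_subword_length=10)
print(encoder.subwords)  # Reserved tokens first, then learned subwords.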