in tensorflow_datasets/core/deprecated/text/subword_text_encoder.py
# Requires (imported at the top of this file): collections, six, and
# tensorflow as tf.
@classmethod
def _build_from_token_counts(cls, token_counts, min_token_count,
                             reserved_tokens, num_iterations,
                             max_subword_length):
  """Builds an encoder by iteratively refining a subword vocabulary.

  Args:
    token_counts: dict mapping each token (str) to its corpus count.
    min_token_count: minimum count for a subword candidate to be kept.
    reserved_tokens: tokens pinned to the front of the vocabulary.
    num_iterations: number of refinement passes.
    max_subword_length: maximum character length of a subword candidate.

  Returns:
    An encoder instance built from the final subword vocabulary.
  """
  # Start with subwords initialized to only reserved_tokens
  subwords = list(reserved_tokens)
  for _ in range(num_iterations):
    encoder = cls(vocab_list=subwords)
    subword_counts = collections.defaultdict(int)
    for token, count in six.iteritems(token_counts):
      start_idx = 0
      for subword in encoder._token_to_subwords(token):  # pylint: disable=protected-access
        last_idx = min(len(token), start_idx + max_subword_length)
        for end_idx in range(start_idx + 1, last_idx + 1):
          candidate_subword = token[start_idx:end_idx]
          subword_counts[candidate_subword] += count
        start_idx += len(subword)
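
    # Illustrative example (not in the source): if the current encoder splits
    # the token "bay" into ["b", "a", "y"] and max_subword_length >= 3, the
    # loop above counts the candidates "b", "ba", "bay" (from index 0),
    # "a", "ay" (from index 1), and "y" (from index 2), each incremented by
    # the token's corpus count.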

    # Group subword candidates by length and filter bad candidates
    len_to_subwords = [set() for _ in range(max_subword_length + 1)]
    for subword, count in six.iteritems(subword_counts):
      if count < min_token_count:
        continue
      # Skip single bytes because they're always in the vocab
      if len(tf.compat.as_bytes(subword)) <= 1:
        continue
      len_to_subwords[len(subword)].add(subword)

    # Consider subword candidates by descending length so that if a longer
    # subword is accepted, its prefixes can have their counts decremented.
    candidate_subwords = []
    for subword_len in reversed(range(max_subword_length + 1)):
      for subword in len_to_subwords[subword_len]:
        count = subword_counts[subword]
        if count < min_token_count:
          continue
        candidate_subwords.append((count, subword))
        # Decrement prefix counts
        for end_idx in range(1, subword_len):
          subword_counts[subword[:end_idx]] -= count
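
    # Illustrative example (not in the source): if "bay" is accepted with a
    # running count of 100, the counts of its proper prefixes "ba" and "b"
    # each drop by 100, so shorter candidates survive only on occurrences not
    # already covered by an accepted longer subword.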

    # Sort subwords by count in descending order, keeping reserved_tokens at
    # the beginning.
    candidate_subwords.sort(reverse=True)
    subwords = reserved_tokens + [s for _, s in candidate_subwords]

  return cls(vocab_list=subwords)
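

# Usage sketch (not part of the source file): how the builder above might be
# driven. The corpus, token counts, reserved tokens, and parameter values are
# illustrative assumptions; `SubwordTextEncoder` is the class this classmethod
# is defined on.
import collections

corpus = ["the bay", "the bays", "a bay"]
token_counts = collections.Counter(
    token for line in corpus for token in line.split())

encoder = SubwordTextEncoder._build_from_token_counts(  # pylint: disable=protected-access
    token_counts=token_counts,
    min_token_count=1,
    reserved_tokens=["<pad>", "<EOS>"],
    num_iterations=4,
    max_subword_length=20)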