tensorflow_text/tools/wordpiece_vocab/generate_vocab.py [51:74]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
flags.DEFINE_integer('upper_thresh', 10000000,
                     'Upper threshold for binary search.')
flags.DEFINE_integer('lower_thresh', 10, 'Lower threshold for binary search.')
flags.DEFINE_integer('num_iterations', 4,
                     'Number of iterations in wordpiece learning algorithm.')
flags.DEFINE_integer('num_pad_tokens', 100, 'Number of padding tokens to '
                     'include in vocab.')
flags.DEFINE_integer('max_input_tokens', 5000000,
                     'Maximum number of input tokens, where -1 means no max.')
flags.DEFINE_integer('max_token_length', 50, 'Maximum length of a token.')
flags.DEFINE_integer('max_unique_chars', 1000,
                     'Maximum number of unique characters as tokens.')
flags.DEFINE_integer('vocab_size', 110000, 'Target size of generated vocab, '
                     'where vocab_size is an upper bound and the size of vocab '
                     'can be within slack_ratio less than the vocab_size.')
flags.DEFINE_float('slack_ratio', 0.05,
                   'Difference permitted between target and actual vocab size.')
flags.DEFINE_bool('include_joiner_token', True,
                  'Whether to include joiner token in word suffixes.')
flags.DEFINE_string('joiner', '##', 'Joiner token in word suffixes.')
flags.DEFINE_list('reserved_tokens',
                  ['<unk>', '<s>', '</s>', '<mask>',
                   '<cls>', '<sep>', '<S>', '<T>'],
                  'Reserved tokens to be included in vocab.')
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
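For context, flag blocks like the one above are absl flags: the values are typically parsed from the command line when the program starts via absl.app.run() and are then available on flags.FLAGS. Below is a minimal, self-contained sketch of that pattern; the flag name demo_vocab_size and the main body are illustrative stand-ins, not the actual entry point of generate_vocab.py.

# Minimal absl-flags sketch (illustrative; not generate_vocab.py's real main).
from absl import app
from absl import flags

flags.DEFINE_integer('demo_vocab_size', 110000,
                     'Target size of generated vocab (illustrative flag).')

FLAGS = flags.FLAGS


def main(argv):
  del argv  # Unused; absl has already parsed the command line into FLAGS.
  print('target vocab size:', FLAGS.demo_vocab_size)


if __name__ == '__main__':
  app.run(main)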



tensorflow_text/tools/wordpiece_vocab/wordpiece_tokenizer_learner.py [33:56]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
flags.DEFINE_integer('upper_thresh', 10000000,
                     'Upper threshold for binary search.')
flags.DEFINE_integer('lower_thresh', 10, 'Lower threshold for binary search.')
flags.DEFINE_integer('num_iterations', 4,
                     'Number of iterations in wordpiece learning algorithm.')
flags.DEFINE_integer('num_pad_tokens', 100, 'Number of padding tokens to '
                     'include in vocab.')
flags.DEFINE_integer('max_input_tokens', 5000000,
                     'Maximum number of input tokens, where -1 means no max.')
flags.DEFINE_integer('max_token_length', 50, 'Maximum length of a token.')
flags.DEFINE_integer('max_unique_chars', 1000,
                     'Maximum number of unique characters as tokens.')
flags.DEFINE_integer('vocab_size', 110000, 'Target size of generated vocab, '
                     'where vocab_size is an upper bound and the size of vocab '
                     'can be within slack_ratio less than the vocab_size.')
flags.DEFINE_float('slack_ratio', 0.05,
                   'Difference permitted between target and actual vocab size.')
flags.DEFINE_bool('include_joiner_token', True,
                  'Whether to include joiner token in word suffixes.')
flags.DEFINE_string('joiner', '##', 'Joiner token in word suffixes.')
flags.DEFINE_list('reserved_tokens',
                  ['<unk>', '<s>', '</s>', '<mask>',
                   '<cls>', '<sep>', '<S>', '<T>'],
                  'Reserved tokens to be included in vocab.')
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
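The same flag block appears verbatim in wordpiece_tokenizer_learner.py, so both tools accept identical tuning parameters. As a hedged illustration of how a list-valued flag such as reserved_tokens is supplied and parsed, here is a self-contained sketch; demo_reserved_tokens is an illustrative flag name, not one registered by the learner itself.

# Self-contained sketch of flags.DEFINE_list parsing; demo_reserved_tokens is
# an illustrative flag, not one defined in wordpiece_tokenizer_learner.py.
from absl import flags

flags.DEFINE_list('demo_reserved_tokens',
                  ['<unk>', '<s>', '</s>'],
                  'Reserved tokens to be included in vocab.')

FLAGS = flags.FLAGS

# Parsing a fake argv: the comma-separated value is split into a Python list.
FLAGS(['wordpiece_tokenizer_learner',
       '--demo_reserved_tokens=<unk>,<s>,</s>,<mask>'])
print(FLAGS.demo_reserved_tokens)  # ['<unk>', '<s>', '</s>', '<mask>']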



