in torchmoji/sentence_tokenizer.py [0:0]
def __init__(self, vocabulary, fixed_length, custom_wordgen=None,
             ignore_sentences_with_only_custom=False, masking_value=0,
             unknown_value=1):
    """ Store the tokenizer configuration and pick the word generator.

    Needs a dictionary as input for the vocabulary. The vocabulary is
    deep-copied, so the caller's object is never modified through this
    instance.

    # Arguments:
        vocabulary: Dictionary of tokens. Its length may not exceed the
            maximum value of a numpy uint16.
        fixed_length: Stored as `self.fixed_length`; not otherwise used
            here (presumably the length sentences are padded or cut to
            elsewhere in the class -- confirm against the other methods).
        custom_wordgen: Optional word generator to use instead of the
            default one, e.g. for domain-specific filtering. Its `stream`
            attribute must be None, because the sentences are fed to the
            generator at a later point.
        ignore_sentences_with_only_custom: Stored as-is on the instance.
        masking_value: Stored as-is on the instance.
        unknown_value: Stored as-is on the instance.

    # Raises:
        ValueError: If the vocabulary holds more entries than a numpy
            uint16 can represent.
        AssertionError: If `custom_wordgen` already has a stream attached.
    """
    max_tokens = np.iinfo('uint16').max
    if len(vocabulary) > max_tokens:
        raise ValueError('Dictionary is too big ({} tokens) for the numpy '
                         'datatypes used (max limit={}). Reduce vocabulary'
                         ' or adjust code accordingly!'
                         .format(len(vocabulary), max_tokens))

    # Shouldn't be able to modify the given vocabulary
    self.vocabulary = deepcopy(vocabulary)
    self.fixed_length = fixed_length
    self.ignore_sentences_with_only_custom = ignore_sentences_with_only_custom
    self.masking_value = masking_value
    self.unknown_value = unknown_value

    # Initialized with an empty stream of sentences that must then be fed
    # to the generator at a later point for reusability.
    # A custom word generator can be used for domain-specific filtering etc
    if custom_wordgen is not None:
        # Explicit check instead of a bare `assert`, which is stripped when
        # Python runs with -O. AssertionError is kept as the exception type
        # so existing callers see the same failure as before.
        if custom_wordgen.stream is not None:
            raise AssertionError('custom_wordgen must have an empty stream '
                                 '(stream is None) when passed to the '
                                 'tokenizer')
        self.wordgen = custom_wordgen
        self.uses_custom_wordgen = True
    else:
        self.wordgen = WordGenerator(None, allow_unicode_text=True,
                                     ignore_emojis=False,
                                     remove_variation_selectors=True,
                                     break_replacement=True)
        self.uses_custom_wordgen = False