in torchmoji/word_generator.py [0:0]
def convert_unicode_word(self, word):
    """Clean up a word that may contain non-ASCII characters.

    The word is returned untouched when ``self.check_ascii`` already
    accepts it. Otherwise it is NFKC-normalized, passed through
    ``self.convert_unicode_punctuation`` and, when ``self.ignore_emojis``
    is set, stripped down to the second item returned by
    ``separate_emojis_and_text`` (presumably the non-emoji text).

    Returns a ``(keep, word)`` tuple. ``keep`` is False, and the word is
    the empty string, only when the processed word still fails
    ``self.check_ascii`` and ``self.allow_unicode_text`` is falsy.
    """
    # Fast path: nothing to convert.
    if self.check_ascii(word):
        return True, word

    # Normalize before anything else so that equivalent Unicode
    # sequences share one canonical (NFKC) representation.
    cleaned = unicodedata.normalize("NFKC", word)

    # Punctuation is mapped regardless of whether other Unicode text is
    # allowed, so that e.g. "\u203c" (double exclamation mark) ends up
    # being handled exactly like "!!".
    cleaned = self.convert_unicode_punctuation(cleaned)

    if self.ignore_emojis:
        _emojis, cleaned = separate_emojis_and_text(cleaned)

    # Whatever is still non-ASCII at this point was not fixed by the
    # punctuation conversion or the emoji removal above.
    still_unicode = not self.check_ascii(cleaned)
    if still_unicode and not self.allow_unicode_text:
        # Dropping the word entirely (e.g. during vocabulary creation)
        # keeps strange Unicode tokens from low quality datasets from
        # polluting the result.
        return False, ''

    return True, cleaned