in torchmoji/filter_utils.py [0:0]
def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
    """Check whether enough of the counted words appear in ``english``.

    A word is counted only if it passes every filter, tested in this order:
    its length is at least ``min_length``, ``punct_word`` does not flag it,
    and (when ``ignore_special_tokens`` is set) ``is_special_token`` does
    not flag it. Each counted word is then looked up in ``english``.

    # Arguments:
        words: Iterable of words to examine.
        english: Container queried with ``in`` for every counted word, or
            None to skip the check entirely.
        pct_eng_short: Required fraction of matches when fewer than 5 words
            were counted.
        pct_eng_long: Required fraction of matches when 5 or more words
            were counted.
        ignore_special_tokens: If truthy, words flagged by
            ``is_special_token`` are not counted.
        min_length: Words shorter than this are not counted.

    # Returns:
        Tuple ``(valid, n_words, n_english)``: whether the threshold was
        met, how many words were counted, and how many of them were found
        in ``english``. ``valid`` is always True when ``english`` is None
        (counts reported as 0) or when fewer than 2 words were counted.
    """
    # No vocabulary to compare against: accept the text as-is.
    if english is None:
        return True, 0, 0

    counted = 0
    matched = 0
    for token in words:
        # Short-circuit order matters: the length test runs first, so the
        # helper predicates only ever see words of sufficient length.
        skip = (
            len(token) < min_length
            or punct_word(token)
            or (ignore_special_tokens and is_special_token(token))
        )
        if skip:
            continue

        counted += 1
        if token in english:
            matched += 1

    # Too few usable words to judge; give the text the benefit of the doubt.
    if counted < 2:
        return True, counted, matched

    # Short texts (fewer than 5 counted words) use their own threshold.
    required_fraction = pct_eng_short if counted < 5 else pct_eng_long
    return matched >= counted * required_fraction, counted, matched