def mostly_english()

in torchmoji/filter_utils.py [0:0]


def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
    """ Ensure text meets threshold for containing English words """

    n_words = 0
    n_english = 0

    if english is None:
        return True, 0, 0

    for w in words:
        if len(w) < min_length:
            continue
        if punct_word(w):
            continue
        if ignore_special_tokens and is_special_token(w):
            continue
        n_words += 1
        if w in english:
            n_english += 1

    if n_words < 2:
        return True, n_words, n_english
    if n_words < 5:
        valid_english = n_english >= n_words * pct_eng_short
    else:
        valid_english = n_english >= n_words * pct_eng_long
    return valid_english, n_words, n_english