src/filter_words.py
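"""Collect profanity wordlist entries that survive as single tokens in the
Mozilla/smart-tab-topic tokenizer vocabulary, and dump both the words and
their token ids (e.g. for use as `bad_words_ids` during generation)."""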
import requests
from transformers import AutoTokenizer
import json
model_name = "Mozilla/smart-tab-topic"
# Function to load words from a URL
def load_words_from_url(url):
    response = requests.get(url)
    response.raise_for_status()
    words = {line.strip() for line in response.text.splitlines()}
    return words
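
# add_prefix_space=True makes standalone words tokenize the same way they would
# appear mid-sentence (relevant for BPE-style tokenizers that encode a leading space).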
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
bad_words = load_words_from_url(
    "https://raw.githubusercontent.com/snguyenthanh/better_profanity/master/better_profanity/profanity_wordlist.txt"
)
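
# Map every vocabulary token back to its surface form (lowercased, stripped) and
# keep only the entries that also appear in the profanity wordlist, i.e. bad
# words the model can emit as a single token.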
vocab = tokenizer.get_vocab()
vocab = [tokenizer.convert_tokens_to_string([token]).lower().strip() for token in vocab]
vocab_bad_words = [word for word in vocab if word in bad_words]
print(len(vocab_bad_words), vocab_bad_words)
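
# Encode each matched word on its own (no special tokens) so we get the exact
# token id sequences a generation call would need to block.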
def get_tokens_as_list(word_list):
    tokens_list = []
    for word in word_list:
        tokenized_word = tokenizer([word], add_special_tokens=False).input_ids[0]
        tokens_list.append(tokenized_word)
    return tokens_list
bad_words_ids = get_tokens_as_list(vocab_bad_words)
print({'bad_words_ids': bad_words_ids})
with open("bad_words_smart_topic.json", "w") as file:
    json.dump({
        'bad_words': vocab_bad_words,
        'bad_words_ids': bad_words_ids
    }, file, indent=4)
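
# A minimal sketch of how the dumped ids could be consumed later. This assumes
# the Mozilla/smart-tab-topic checkpoint is a seq2seq model loadable with
# AutoModelForSeq2SeqLM and that its `generate()` accepts `bad_words_ids`;
# kept commented out so running this script only produces the JSON file.
#
#   import json
#   from transformers import AutoModelForSeq2SeqLM
#
#   with open("bad_words_smart_topic.json") as f:
#       bad_words_ids = json.load(f)["bad_words_ids"]
#   model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
#   inputs = tokenizer("example tab titles to label", return_tensors="pt")
#   outputs = model.generate(**inputs, bad_words_ids=bad_words_ids)
#   print(tokenizer.decode(outputs[0], skip_special_tokens=True))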