in misc/precision_filtering/wordlist_gen.py [0:0]
def save(tokenizer_id, selected_language):
    """Build and write the filtered wordlist for one language.

    Loads two pickled frequency tables for ``selected_language``:

    * the tokenizer-specific table at
      ``<root_path>/<tokenizer_id>/<selected_language>.pkl``
    * the shared table at
      ``/fsx/user_dir/common_freq/<selected_language>.pkl``

    The tokenizer-specific table is passed through
    ``filter_top_percentile(..., 95)`` and the result is compared with the
    shared table via ``filter_by_ratio(..., 0.85)``.  The keys of the final
    mapping are written one per line to
    ``./wordlists-0.85/<selected_language>.txt`` (the directory is created
    if it does not exist).

    Args:
        tokenizer_id: Name of the sub-directory of ``root_path`` holding the
            per-language pickles for one tokenizer.
        selected_language: Base name (without extension) of the pickle files
            to read and of the ``.txt`` file to write.

    Raises:
        OSError: If either pickle file cannot be opened, or the output
            directory/file cannot be created.
    """
    tokenizer_path = os.path.join(root_path, tokenizer_id)

    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # both inputs are assumed to be trusted, locally produced artifacts.
    with open(os.path.join(tokenizer_path, selected_language + '.pkl'), 'rb') as f:
        language_tf = pickle.load(f)
    with open(os.path.join('/fsx/user_dir/common_freq', selected_language + '.pkl'), 'rb') as f:
        common_tf = pickle.load(f)

    # NOTE(review): the exact semantics of the two helpers are defined
    # elsewhere in this file; presumably 95 is a percentile cut-off and 0.85
    # a ratio threshold -- confirm against their definitions.
    language_filter_tf = filter_top_percentile(language_tf, 95)
    filtered_counter = filter_by_ratio(language_filter_tf, common_tf, 0.85)

    output_dir = './wordlists-0.85'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f'{selected_language}.txt')

    # Save the filtered keys, one per line.  The encoding is pinned to UTF-8
    # because the words are language-specific and frequently non-ASCII; the
    # platform default encoding would otherwise depend on the locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        # Only the keys are written; the associated value is not needed.
        f.writelines(f'{key}\n' for key, _ratio in filtered_counter.items())