in misc/precision_filtering/count_common.py [0:0]
def load_and_save_tokenizer_freq(tokenizer_id: str, selected_language: str) -> None:
    """Sum, over all languages, the counts of the selected language's tokens.

    Loads ``<root_path>/<tokenizer_id>/<selected_language>.pkl`` and uses its
    keys as the vocabulary of interest. Every ``*.pkl`` file in the same
    tokenizer directory (including the selected language's own file) is then
    loaded, restricted to that vocabulary, and accumulated into one
    ``Counter``. The result is pickled to
    ``<save_path>/<selected_language>.pkl``.

    ``root_path`` and ``save_path`` are not defined in this function; they
    are expected to be provided by the enclosing module.

    Args:
        tokenizer_id: Name of the tokenizer sub-directory under ``root_path``.
        selected_language: Base name (without ``.pkl``) of the language file
            whose tokens define the vocabulary.

    Raises:
        FileNotFoundError: If the tokenizer directory or the selected
            language's pickle file does not exist.
    """
    tokenizer_path = os.path.join(root_path, tokenizer_id)

    # Load the selected language; its keys define which tokens are counted.
    # NOTE: pickle.load executes arbitrary code from the file, so these
    # pickles must come from a trusted source.
    with open(os.path.join(tokenizer_path, selected_language + '.pkl'), 'rb') as f:
        language_tf = pickle.load(f)

    os.makedirs(save_path, exist_ok=True)  # Ensure the output directory exists

    common_freq = Counter()
    # Iterate over all pickle files in the tokenizer directory.
    for filename in os.listdir(tokenizer_path):
        if not filename.endswith(".pkl"):
            continue
        with open(os.path.join(tokenizer_path, filename), "rb") as f:
            counts = pickle.load(f)
        # Keep only the tokens that also occur in the selected language.
        filtered_counts = {word: counts[word] for word in language_tf if word in counts}
        common_freq.update(filtered_counts)

    # Use a distinct local name for the output file: assigning to `save_path`
    # here would make it local to the whole function and break the
    # `os.makedirs(save_path, ...)` call above with UnboundLocalError.
    output_file = os.path.join(save_path, f"{selected_language}.pkl")
    with open(output_file, "wb") as f:
        pickle.dump(common_freq, f)