def load_and_save_tokenizer_freq()

in misc/precision_filtering/count_common.py [0:0]


def load_and_save_tokenizer_freq(tokenizer_id, selected_language):
    """Sum, across all pickle files of a tokenizer, the counts of the
    selected language's tokens, and save the result.

    Reads ``<root_path>/<tokenizer_id>/<selected_language>.pkl`` to obtain the
    tokens of interest, then walks every ``.pkl`` file in that same directory
    (the selected language's own file included) and accumulates the counts of
    those tokens. The resulting ``Counter`` is pickled to
    ``<save_path>/<selected_language>.pkl``.

    ``root_path`` and ``save_path`` are module-level names; this function
    only reads them.

    Args:
        tokenizer_id: Name of the sub-directory of ``root_path`` that holds
            the per-language pickle files.
        selected_language: Stem of the pickle file whose keys decide which
            tokens are counted; also used as the stem of the output file.

    Raises:
        FileNotFoundError: If the tokenizer directory or the selected
            language's pickle file does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at
    # pickle files produced by a trusted pipeline.

    # Load the selected language's token frequencies (a Counter).
    tokenizer_path = os.path.join(root_path, tokenizer_id)
    with open(os.path.join(tokenizer_path, selected_language + '.pkl'), 'rb') as f:
        language_tf = pickle.load(f)

    os.makedirs(save_path, exist_ok=True)  # Ensure the output directory exists

    common_freq = Counter()

    # Visit every pickle file in the tokenizer directory. Sorting makes the
    # accumulation order (and hence the saved Counter's key order)
    # deterministic, since os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(tokenizer_path)):
        if not filename.endswith(".pkl"):
            continue

        with open(os.path.join(tokenizer_path, filename), "rb") as f:
            counts = pickle.load(f)

        # Keep only the tokens that also occur in the selected language.
        filtered_counts = {word: counts[word] for word in language_tf if word in counts}
        common_freq.update(filtered_counts)

    # Use a distinct local name for the output file. Assigning to
    # ``save_path`` here would make it local to the whole function and turn
    # the ``os.makedirs(save_path, ...)`` call above into an
    # UnboundLocalError.
    output_path = os.path.join(save_path, f"{selected_language}.pkl")
    with open(output_path, "wb") as f:
        pickle.dump(common_freq, f)