in misc/precision_filtering/slurm_count_word.py [0:0]
def run(self, _data, rank, world_size):
    import os
    import pickle
    from collections import Counter

    from datatrove.utils.text import TextNormConfig, simplify_text, split_into_words

    norm_config = TextNormConfig(
        lowercase=False,
        norm_numbers=False,
        norm_weekdays=False,
        norm_monthnames=False,
        remove_punctuation=True,
        norm_unicode_diacritics=False,
        norm_whitespace=True,
    )
    root_dir = "/fsx/user_dir/glotlid-corpus/v3.1/"  # change this to your local copy of the GlotLID corpus (or another corpus)
    # list of language codes, one subdirectory per code
    language_codes = sorted(os.listdir(root_dir))
    # shard of the language codes that this task (rank) will process
    language_codes = language_codes[rank::world_size]
    base_save_path = f"/fsx/user_dir/language_tf/{self.tokenizer_name}"  # save path for the term frequencies
    os.makedirs(base_save_path, exist_ok=True)
    for code in language_codes:
        dir_path = os.path.join(root_dir, code)
        print(f"Processing language code {code} with tokenizer {self.tokenizer_name} ({self.example_language})")
        if os.path.exists(os.path.join(base_save_path, f"{code}.pkl")):
            # term frequencies for this language were already computed
            continue
        word_counts = Counter()
        if os.path.isdir(dir_path):
            print(f"Processing {dir_path}...")
            # iterate over all text files in the directory
            for filename in os.listdir(dir_path):
                file_path = os.path.join(dir_path, filename)
                if filename.endswith(".txt") and os.path.isfile(file_path):
                    with open(file_path, "r", encoding="utf-8") as f:
                        # count words on every line of the file
                        for line in f:
                            # no line limit here; re-add one (e.g. stop after 100k lines) if you think it will be an issue
                            simplified = simplify_text(line.strip(), norm_config)
                            # use example_language as the representative language for this tokenizer
                            word_counts.update(split_into_words(simplified, self.example_language))
        with open(os.path.join(base_save_path, f"{code}.pkl"), "wb") as f:
            pickle.dump(word_counts, f)
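
A minimal sketch of how this run method might be wired up, assuming it lives on a custom datatrove PipelineStep subclass and is launched with datatrove's SlurmPipelineExecutor; the class name WordCountStep, its constructor fields, and all executor parameter values below are illustrative assumptions rather than taken from the original script.

# Sketch only: WordCountStep and the executor settings are illustrative assumptions.
from datatrove.executor import SlurmPipelineExecutor
from datatrove.pipeline.base import PipelineStep


class WordCountStep(PipelineStep):
    def __init__(self, tokenizer_name: str, example_language: str):
        super().__init__()
        self.tokenizer_name = tokenizer_name
        self.example_language = example_language

    # def run(self, _data, rank, world_size): ...  <- the method shown above


if __name__ == "__main__":
    executor = SlurmPipelineExecutor(
        job_name="count_words",
        pipeline=[WordCountStep(tokenizer_name="gpt2", example_language="en")],
        tasks=64,                 # each rank processes language_codes[rank::world_size]
        time="24:00:00",          # illustrative Slurm settings
        partition="cpu",
        logging_dir="/fsx/user_dir/logs/count_words",
    )
    executor.run()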