# text/pretraining/continual-pretraining/finemath/finemath-tokenize.py
import argparse
parser = argparse.ArgumentParser("Quickly launch thom's style of tokenization.")
# python /fsx/loubna/projects/datatrove/examples/edu_fw.py hf://datasets/bigcode/stackoverflow-clean stackoverflow --n_tasks 50 --tokenizer HuggingFaceTB/cosmo2-tokenizer
# parser.add_argument("data_path", type=str, help="Path to the data to tokenize.")
parser.add_argument("output_name", type=str, help="Output name.")
parser.add_argument("--n_tasks", type=int, help="nb of tokenization tasks", default=100)
parser.add_argument("--max_toks", type=int, help="max tokens per file", default=1e9)
parser.add_argument("--tokenizer", type=str, help="tokenizer to use", default="meta-llama/Llama-3.2-1B")
parser.add_argument("--text_key", type=str, default="text")
if __name__ == "__main__":
    args = parser.parse_args()
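
    # datatrove is only imported here, after argument parsing, so `--help` and
    # argparse errors work without importing the pipeline dependencies.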
    from datatrove.executor import SlurmPipelineExecutor
    from datatrove.pipeline.filters import SamplerFilter
    from datatrove.pipeline.readers import ParquetReader, JsonlReader
    from datatrove.pipeline.tokens.tokenizer import DocumentTokenizer
    from datatrove.pipeline.tokens.merger import DocumentTokenizerMerger
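
    # Dataset name -> Hugging Face location of the parquet shards to tokenize.
    # Commented-out entries are alternative sources that this run does not process.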
    dic = {
        "finemath-3-plus": "hf://datasets/HuggingFaceTB/finemath/finemath-3plus",
        "finemath-4-plus": "hf://datasets/HuggingFaceTB/finemath/finemath-4plus",
        "infiwebmath-3-plus": "hf://datasets/HuggingFaceTB/finemath/infiwebmath-3plus",
        "infiwebmath-4-plus": "hf://datasets/HuggingFaceTB/finemath/infiwebmath-4plus",
        # "fw-edu-dedup": "hf://datasets/HuggingFaceTB/smollm-corpus/fineweb-edu-dedup",
        # "infiwebmath-ablation-new": "hf://datasets/Infi-MM/InfiMM-WebMath-40B",
        # "owm-ablation": "hf://datasets/open-web-math/open-web-math/data",
    }
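
    # One Slurm job per dataset: read the parquet shards, tokenize the documents,
    # and write shuffled token files to the output folder for that dataset.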
    for name, path in dic.items():
        dist_executor = SlurmPipelineExecutor(
            job_name=f"tok-{name}",
            pipeline=[
                ParquetReader(
                    path,  # read directly from huggingface
                    glob_pattern="*.parquet",  # "**/*.parquet",
                    text_key=args.text_key,
                ),
                # SamplerFilter(rate=0.5),
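                # DocumentTokenizer tokenizes each document with the chosen tokenizer
                # and writes shuffled token shards, rolling over to a new file once
                # --max_toks tokens have been written.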
                DocumentTokenizer(
                    output_folder=f"/fsx/elie_bakouch/data/{name}",
                    tokenizer_name_or_path=args.tokenizer,
                    batch_size=10000,
                    max_tokens_per_file=args.max_toks,
                    shuffle=True,
                ),
            ],
            tasks=args.n_tasks,
            time="20:00:00",
            partition="hopper-cpu",
            logging_dir=f"/fsx/elie_bakouch/tokenize_logs/fw-edu-classico/{name}",
            cpus_per_task=32,
            mem_per_cpu_gb=2,
            qos="high",
            mail_user=args.email,
        )
        dist_executor.run()
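
# Illustrative launch (the values below are examples, not a prescribed command):
#   python finemath-tokenize.py finemath-llama3 --n_tasks 100 \
#       --tokenizer meta-llama/Llama-3.2-1B --email you@example.org
# Note: the positional output_name is parsed but not used by the pipeline;
# output folders are derived from the dataset names in `dic`.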