path # lines of code # active days days since first update days since last update # commits # contributors first updated last updated first contributor last contributor ablations/evaluation/launch_evals.py 139 1 207 207 1 1 2024-12-06 2024-12-06 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com ablations/evaluation/launch_random_evals.py 50 1 207 207 1 1 2024-12-06 2024-12-06 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com ablations/tokenization/launch_tokenization.py 63 1 207 207 1 1 2024-12-06 2024-12-06 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com ablations/training/launch_exp.py 226 1 207 207 2 2 2024-12-06 2024-12-06 nostrumg@gmail.com kydlicek.hynek@gmail.com fineweb-2-pipeline.py 227 1 207 207 1 1 2024-12-06 2024-12-06 nostrumg@gmail.com nostrumg@gmail.com misc/reference_datasets/monolingual/zh/download_mnbvc.py 19 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/zh/download_mapcc.py 140 1 208 208 2 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/zh/download_tigerbot.py 19 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/hi/download_sangraha.py 21 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/hi/download_odaigen.py 17 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/ar/download_101b_arabicwords.py 18 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/ar/download_arabicweb24.py 74 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/fr/download_croissant.py 75 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/th/download_sea_commoncrawl.py 17 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/tr/download_vngrs.py 19 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/monolingual/te/download_sangraha.py 39 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/download_culturax.py 24 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/download_mc4.py 29 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/download_hplt_split.py 20 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/part jsons.py 112 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/download_hplt.py 41 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/download_cc-100.py 162 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/reference_datasets/multilingual/copy_raw_data.py 105 1 208 208 1 1 2024-12-05 2024-12-05 kydlicek.hynek@gmail.com kydlicek.hynek@gmail.com misc/precision_filtering/slurm_count_word.py 71 2 27 15 2 2 2025-06-04 2025-06-16 kargaranamir@gmail.com nostrumg@gmail.com misc/precision_filtering/run_precision_filtering.py 106 3 13 6 3 2 2025-06-18 2025-06-25 guilherme@huggingface.co guilherme@huggingface.co misc/precision_filtering/wordlist_score.py 40 2 27 15 2 2 2025-06-04 2025-06-16 kargaranamir@gmail.com nostrumg@gmail.com misc/precision_filtering/count_common.py 44 2 27 15 2 2 2025-06-04 2025-06-16 kargaranamir@gmail.com nostrumg@gmail.com misc/precision_filtering/wordlist_gen.py 56 3 27 6 3 3 2025-06-04 2025-06-25 kargaranamir@gmail.com guilherme@huggingface.co