id: 1 unit: def run() file: misc/reference_datasets/multilingual/download_cc-100.py start line: 0 end line: 0 size: 147 LOC McCabe index: 6 number of parameters: 4 id: 2 unit: def run() file: misc/precision_filtering/slurm_count_word.py start line: 0 end line: 0 size: 37 LOC McCabe index: 14 number of parameters: 4 id: 3 unit: def __init__() file: misc/reference_datasets/multilingual/copy_raw_data.py start line: 0 end line: 0 size: 32 LOC McCabe index: 1 number of parameters: 15 id: 4 unit: def __init__() file: misc/precision_filtering/run_precision_filtering.py start line: 0 end line: 0 size: 31 LOC McCabe index: 8 number of parameters: 5 id: 5 unit: def read_files_shard() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 29 LOC McCabe index: 9 number of parameters: 2 id: 6 unit: def read_files_shard() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 29 LOC McCabe index: 9 number of parameters: 2 id: 7 unit: def get_checkpoints_to_run() file: ablations/evaluation/launch_evals.py start line: 0 end line: 0 size: 24 LOC McCabe index: 12 number of parameters: 6 id: 8 unit: def run() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 23 LOC McCabe index: 4 number of parameters: 4 id: 9 unit: def read_file() file: misc/reference_datasets/multilingual/copy_raw_data.py start line: 0 end line: 0 size: 23 LOC McCabe index: 7 number of parameters: 2 id: 10 unit: def run() file: misc/reference_datasets/multilingual/copy_raw_data.py start line: 0 end line: 0 size: 20 LOC McCabe index: 8 number of parameters: 4 id: 11 unit: def url_filter() file: misc/precision_filtering/run_precision_filtering.py start line: 0 end line: 0 size: 19 LOC McCabe index: 8 number of parameters: 2 id: 12 unit: def run() file: misc/reference_datasets/multilingual/download_hplt.py start line: 0 end line: 0 size: 18 LOC McCabe index: 7 number of parameters: 4 id: 13 unit: def __init__() file: misc/reference_datasets/monolingual/ar/download_arabicweb24.py start line: 0 end line: 0 size: 17 LOC McCabe index: 1 number of parameters: 0 id: 14 unit: def __init__() file: misc/reference_datasets/monolingual/fr/download_croissant.py start line: 0 end line: 0 size: 17 LOC McCabe index: 1 number of parameters: 0 id: 15 unit: def read_file() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 17 LOC McCabe index: 5 number of parameters: 2 id: 16 unit: def read_file() file: misc/reference_datasets/monolingual/ar/download_arabicweb24.py start line: 0 end line: 0 size: 16 LOC McCabe index: 4 number of parameters: 2 id: 17 unit: def read_file() file: misc/reference_datasets/monolingual/fr/download_croissant.py start line: 0 end line: 0 size: 16 LOC McCabe index: 4 number of parameters: 2 id: 18 unit: def load_and_save_tokenizer_freq() file: misc/precision_filtering/count_common.py start line: 0 end line: 0 size: 16 LOC McCabe index: 5 number of parameters: 2 id: 19 unit: def print_differences() file: ablations/training/launch_exp.py start line: 0 end line: 0 size: 14 LOC McCabe index: 8 number of parameters: 2 id: 20 unit: def save() file: misc/precision_filtering/wordlist_gen.py start line: 0 end line: 0 size: 14 LOC McCabe index: 2 number of parameters: 2 id: 21 unit: def checkpoint_exists() file: ablations/evaluation/launch_evals.py start line: 0 end line: 0 size: 13 LOC McCabe index: 6 number of parameters: 4 id: 22 unit: def read() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 13 LOC McCabe index: 5 number of parameters: 2 id: 23 unit: def read() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 13 LOC McCabe index: 5 number of parameters: 2 id: 24 unit: def load_words_from_txt() file: misc/precision_filtering/wordlist_score.py start line: 0 end line: 0 size: 13 LOC McCabe index: 8 number of parameters: 1 id: 25 unit: def adapter() file: misc/reference_datasets/multilingual/download_mc4.py start line: 0 end line: 0 size: 10 LOC McCabe index: 2 number of parameters: 4 id: 26 unit: def launch_slurm_job() file: ablations/evaluation/launch_evals.py start line: 0 end line: 0 size: 9 LOC McCabe index: 2 number of parameters: 2 id: 27 unit: def launch_slurm_job() file: ablations/evaluation/launch_random_evals.py start line: 0 end line: 0 size: 9 LOC McCabe index: 2 number of parameters: 2 id: 28 unit: def _open_next_file() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 9 LOC McCabe index: 3 number of parameters: 1 id: 29 unit: def __init__() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 30 unit: def _open_next_file() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 9 LOC McCabe index: 3 number of parameters: 1 id: 31 unit: def __init__() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 32 unit: def filter_by_ratio() file: misc/precision_filtering/wordlist_gen.py start line: 0 end line: 0 size: 9 LOC McCabe index: 7 number of parameters: 3 id: 33 unit: def filter_top_percentile() file: misc/precision_filtering/wordlist_gen.py start line: 0 end line: 0 size: 8 LOC McCabe index: 4 number of parameters: 2 id: 34 unit: def parse_date() file: ablations/evaluation/launch_evals.py start line: 0 end line: 0 size: 7 LOC McCabe index: 3 number of parameters: 1 id: 35 unit: def adapter() file: misc/reference_datasets/multilingual/download_culturax.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 4 id: 36 unit: def adapter() file: misc/reference_datasets/multilingual/copy_raw_data.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 4 id: 37 unit: def wordlist_filter() file: misc/precision_filtering/run_precision_filtering.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 2 id: 38 unit: def filter_score() file: misc/precision_filtering/wordlist_score.py start line: 0 end line: 0 size: 7 LOC McCabe index: 8 number of parameters: 2 id: 39 unit: def run() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 4 id: 40 unit: def wordlist() file: misc/precision_filtering/run_precision_filtering.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 1 id: 41 unit: def launch_slurm_job() file: ablations/training/launch_exp.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 42 unit: def __init__() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 43 unit: def __init__() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 44 unit: def open_concatenated_gzip_files() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 1 id: 45 unit: def open_concatenated_gzip_files() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 1 id: 46 unit: def close() file: misc/reference_datasets/monolingual/zh/download_mapcc.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 1 id: 47 unit: def close() file: misc/reference_datasets/multilingual/part jsons.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 1 id: 48 unit: def __init__() file: misc/precision_filtering/slurm_count_word.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 49 unit: def above_lang_threshold() file: fineweb-2-pipeline.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 50 unit: def generate_tokenizer_identifier() file: misc/precision_filtering/slurm_count_word.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 51 unit: def filter() file: misc/precision_filtering/run_precision_filtering.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 52 unit: def generate_tokenizer_identifier() file: misc/precision_filtering/count_common.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 53 unit: def generate_tokenizer_identifier() file: misc/precision_filtering/wordlist_gen.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1