id: 1 unit: def run() file: src/datatrove/pipeline/tokens/merger.py start line: 0 end line: 0 size: 97 LOC McCabe index: 24 number of parameters: 4 id: 2 unit: def run() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 86 LOC McCabe index: 26 number of parameters: 4 id: 3 unit: def run() file: src/datatrove/executor/ray.py start line: 0 end line: 0 size: 68 LOC McCabe index: 14 number of parameters: 1 id: 4 unit: def run() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 66 LOC McCabe index: 21 number of parameters: 4 id: 5 unit: def run() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 61 LOC McCabe index: 21 number of parameters: 4 id: 6 unit: def run() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 59 LOC McCabe index: 21 number of parameters: 4 id: 7 unit: def main() file: src/datatrove/tools/inspect_data.py start line: 0 end line: 0 size: 58 LOC McCabe index: 18 number of parameters: 0 id: 8 unit: def run() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 57 LOC McCabe index: 20 number of parameters: 4 id: 9 unit: def load_tokenizer_assignments() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 54 LOC McCabe index: 20 number of parameters: 0 id: 10 unit: def run() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 52 LOC McCabe index: 21 number of parameters: 4 id: 11 unit: def run() file: src/datatrove/executor/local.py start line: 0 end line: 0 size: 52 LOC McCabe index: 10 number of parameters: 1 id: 12 unit: def main() file: src/datatrove/tools/jobs_status.py start line: 0 end line: 0 size: 52 LOC McCabe index: 10 number of parameters: 0 id: 13 unit: def remove_dup_sentences() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 47 LOC McCabe index: 20 number of parameters: 3 id: 14 unit: def launch_job() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 46 LOC McCabe index: 17 number of parameters: 1 id: 15 unit: def process_record() file: src/datatrove/pipeline/readers/warc.py start line: 0 end line: 0 size: 41 LOC McCabe index: 20 number of parameters: 1 id: 16 unit: def filter() file: src/datatrove/pipeline/filters/gopher_quality_filter.py start line: 0 end line: 0 size: 40 LOC McCabe index: 35 number of parameters: 2 id: 17 unit: def run() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 39 LOC McCabe index: 12 number of parameters: 4 id: 18 unit: def filter() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 38 LOC McCabe index: 24 number of parameters: 2 id: 19 unit: def run() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 37 LOC McCabe index: 12 number of parameters: 4 id: 20 unit: def run() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 37 LOC McCabe index: 12 number of parameters: 4 id: 21 unit: def run() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 37 LOC McCabe index: 11 number of parameters: 4 id: 22 unit: def get_dtokens_and_spaces() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 37 LOC McCabe index: 19 number of parameters: 3 id: 23 unit: def main() file: src/datatrove/tools/failed_logs.py start line: 0 end line: 0 size: 37 LOC McCabe index: 6 number of parameters: 0 id: 24 unit: def run() file: src/datatrove/pipeline/readers/huggingface.py start line: 0 end line: 0 size: 36 LOC McCabe index: 15 number of parameters: 4 id: 25 unit: def _run_for_rank() file: src/datatrove/executor/ray.py start line: 0 end line: 0 size: 36 LOC McCabe index: 9 number of parameters: 3 id: 26 unit: def run() file: src/datatrove/pipeline/stats/base.py start line: 0 end line: 0 size: 34 LOC McCabe index: 16 number of parameters: 4 id: 27 unit: def __init__() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 33 LOC McCabe index: 1 number of parameters: 27 id: 28 unit: def _load_file_list() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 32 LOC McCabe index: 10 number of parameters: 1 id: 29 unit: def read_files_shard() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 31 LOC McCabe index: 9 number of parameters: 2 id: 30 unit: def run() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 31 LOC McCabe index: 7 number of parameters: 4 id: 31 unit: def _run_for_rank() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 31 LOC McCabe index: 9 number of parameters: 3 id: 32 unit: def get_duplicate_range() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 30 LOC McCabe index: 11 number of parameters: 2 id: 33 unit: def __call__() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 29 LOC McCabe index: 11 number of parameters: 2 id: 34 unit: def seek_to_start() file: src/datatrove/utils/binaryio.py start line: 0 end line: 0 size: 28 LOC McCabe index: 9 number of parameters: 4 id: 35 unit: def filter() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 27 LOC McCabe index: 14 number of parameters: 2 id: 36 unit: def filter() file: src/datatrove/pipeline/filters/fasttext_filter.py start line: 0 end line: 0 size: 27 LOC McCabe index: 11 number of parameters: 2 id: 37 unit: def _get_badwords() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 27 LOC McCabe index: 10 number of parameters: 2 id: 38 unit: def close() file: src/datatrove/pipeline/writers/huggingface.py start line: 0 end line: 0 size: 27 LOC McCabe index: 6 number of parameters: 2 id: 39 unit: def save_hashes() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 27 LOC McCabe index: 2 number of parameters: 3 id: 40 unit: def run() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 27 LOC McCabe index: 10 number of parameters: 4 id: 41 unit: def split_into_parts() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 27 LOC McCabe index: 12 number of parameters: 3 id: 42 unit: def get_worker_hash_range() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 26 LOC McCabe index: 7 number of parameters: 4 id: 43 unit: def write_unshuffled() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 26 LOC McCabe index: 12 number of parameters: 3 id: 44 unit: def to_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 26 LOC McCabe index: 3 number of parameters: 1 id: 45 unit: def save_hashes() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 25 LOC McCabe index: 6 number of parameters: 3 id: 46 unit: def run() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 25 LOC McCabe index: 6 number of parameters: 4 id: 47 unit: def _get_pos_from_index_file() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 24 LOC McCabe index: 15 number of parameters: 2 id: 48 unit: def __add__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 24 LOC McCabe index: 5 number of parameters: 2 id: 49 unit: def _get_dtokens() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 24 LOC McCabe index: 17 number of parameters: 3 id: 50 unit: def run() file: src/datatrove/pipeline/stats/merger.py start line: 0 end line: 0 size: 23 LOC McCabe index: 9 number of parameters: 4 id: 51 unit: def _get_dataset_shard() file: src/datatrove/pipeline/readers/huggingface.py start line: 0 end line: 0 size: 23 LOC McCabe index: 5 number of parameters: 4 id: 52 unit: def _get_sub_tokens() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 23 LOC McCabe index: 7 number of parameters: 2 id: 53 unit: def run() file: src/datatrove/pipeline/filters/base_filter.py start line: 0 end line: 0 size: 22 LOC McCabe index: 10 number of parameters: 4 id: 54 unit: def extract_stats() file: src/datatrove/pipeline/stats/line_stats.py start line: 0 end line: 0 size: 22 LOC McCabe index: 14 number of parameters: 2 id: 55 unit: def chunk_text_on_bytes() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 22 LOC McCabe index: 7 number of parameters: 2 id: 56 unit: def filter() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 21 LOC McCabe index: 10 number of parameters: 2 id: 57 unit: def extract_stats() file: src/datatrove/pipeline/stats/word_stats.py start line: 0 end line: 0 size: 21 LOC McCabe index: 10 number of parameters: 2 id: 58 unit: def process_document() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 21 LOC McCabe index: 8 number of parameters: 3 id: 59 unit: def download_data() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 20 LOC McCabe index: 4 number of parameters: 1 id: 60 unit: def run() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 20 LOC McCabe index: 6 number of parameters: 4 id: 61 unit: def run() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 20 LOC McCabe index: 6 number of parameters: 4 id: 62 unit: def run() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 20 LOC McCabe index: 6 number of parameters: 4 id: 63 unit: def close() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 20 LOC McCabe index: 5 number of parameters: 1 id: 64 unit: def get_sbatch_args() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 20 LOC McCabe index: 4 number of parameters: 2 id: 65 unit: def filter() file: src/datatrove/pipeline/filters/fineweb_quality_filter.py start line: 0 end line: 0 size: 19 LOC McCabe index: 14 number of parameters: 2 id: 66 unit: def extract_stats() file: src/datatrove/pipeline/stats/paragraph_stats.py start line: 0 end line: 0 size: 19 LOC McCabe index: 11 number of parameters: 2 id: 67 unit: def run() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 19 LOC McCabe index: 4 number of parameters: 4 id: 68 unit: def get_bytearange() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 19 LOC McCabe index: 7 number of parameters: 2 id: 69 unit: def to_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 19 LOC McCabe index: 11 number of parameters: 1 id: 70 unit: def __add__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 19 LOC McCabe index: 2 number of parameters: 2 id: 71 unit: def run() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 19 LOC McCabe index: 8 number of parameters: 1 id: 72 unit: def reader_factory() file: src/datatrove/tools/inspect_data.py start line: 0 end line: 0 size: 19 LOC McCabe index: 3 number of parameters: 3 id: 73 unit: def check_can_skip_sig_writing() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 18 LOC McCabe index: 14 number of parameters: 2 id: 74 unit: def step() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 18 LOC McCabe index: 5 number of parameters: 2 id: 75 unit: def run() file: src/datatrove/pipeline/tokens/context_shuffler.py start line: 0 end line: 0 size: 18 LOC McCabe index: 3 number of parameters: 4 id: 76 unit: def write_final_metadata() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 18 LOC McCabe index: 4 number of parameters: 3 id: 77 unit: def try_sudachi_import() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 18 LOC McCabe index: 2 number of parameters: 1 id: 78 unit: def get_launch_file_contents() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 18 LOC McCabe index: 6 number of parameters: 3 id: 79 unit: def filter() file: src/datatrove/pipeline/filters/language_filter.py start line: 0 end line: 0 size: 17 LOC McCabe index: 10 number of parameters: 2 id: 80 unit: def get_frequencies() file: src/datatrove/pipeline/filters/unigram_log_probs.py start line: 0 end line: 0 size: 17 LOC McCabe index: 4 number of parameters: 1 id: 81 unit: def __init__() file: src/datatrove/pipeline/readers/parquet.py start line: 0 end line: 0 size: 17 LOC McCabe index: 1 number of parameters: 0 id: 82 unit: def read_file() file: src/datatrove/pipeline/readers/jsonl.py start line: 0 end line: 0 size: 17 LOC McCabe index: 5 number of parameters: 2 id: 83 unit: def run() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 17 LOC McCabe index: 7 number of parameters: 4 id: 84 unit: def write_tokens() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 17 LOC McCabe index: 4 number of parameters: 3 id: 85 unit: def __init__() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 17 LOC McCabe index: 9 number of parameters: 10 id: 86 unit: def resolve_pos() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 17 LOC McCabe index: 7 number of parameters: 3 id: 87 unit: def download_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 17 LOC McCabe index: 3 number of parameters: 3 id: 88 unit: def __init__() file: src/datatrove/pipeline/formatters/ftfy.py start line: 0 end line: 0 size: 16 LOC McCabe index: 1 number of parameters: 12 id: 89 unit: def compute_hashes() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 16 LOC McCabe index: 8 number of parameters: 3 id: 90 unit: def read_file() file: src/datatrove/pipeline/readers/parquet.py start line: 0 end line: 0 size: 16 LOC McCabe index: 5 number of parameters: 2 id: 91 unit: def __init__() file: src/datatrove/pipeline/readers/jsonl.py start line: 0 end line: 0 size: 16 LOC McCabe index: 1 number of parameters: 0 id: 92 unit: def __init__() file: src/datatrove/pipeline/readers/csv.py start line: 0 end line: 0 size: 16 LOC McCabe index: 1 number of parameters: 0 id: 93 unit: def __init__() file: src/datatrove/pipeline/readers/ipc.py start line: 0 end line: 0 size: 16 LOC McCabe index: 1 number of parameters: 0 id: 94 unit: def _default_adapter() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 16 LOC McCabe index: 4 number of parameters: 4 id: 95 unit: def __init__() file: src/datatrove/pipeline/readers/warc.py start line: 0 end line: 0 size: 16 LOC McCabe index: 1 number of parameters: 0 id: 96 unit: def remove_duplicate() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 16 LOC McCabe index: 5 number of parameters: 3 id: 97 unit: def simplify_text() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 16 LOC McCabe index: 12 number of parameters: 2 id: 98 unit: def from_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 16 LOC McCabe index: 5 number of parameters: 2 id: 99 unit: def model() file: src/datatrove/pipeline/filters/fasttext_filter.py start line: 0 end line: 0 size: 15 LOC McCabe index: 7 number of parameters: 1 id: 100 unit: def filter() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 15 LOC McCabe index: 5 number of parameters: 2 id: 101 unit: def filter() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 15 LOC McCabe index: 5 number of parameters: 2 id: 102 unit: def extract_stats() file: src/datatrove/pipeline/stats/sentence_stats.py start line: 0 end line: 0 size: 15 LOC McCabe index: 6 number of parameters: 2 id: 103 unit: def __init__() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 15 LOC McCabe index: 1 number of parameters: 0 id: 104 unit: def upload_files() file: src/datatrove/pipeline/writers/huggingface.py start line: 0 end line: 0 size: 15 LOC McCabe index: 5 number of parameters: 2 id: 105 unit: def run_for_rank() file: src/datatrove/executor/ray.py start line: 0 end line: 0 size: 15 LOC McCabe index: 5 number of parameters: 2 id: 106 unit: def __init__() file: src/datatrove/executor/ray.py start line: 0 end line: 0 size: 15 LOC McCabe index: 1 number of parameters: 0 id: 107 unit: def __init__() file: src/datatrove/pipeline/formatters/pii.py start line: 0 end line: 0 size: 14 LOC McCabe index: 1 number of parameters: 7 id: 108 unit: def __init__() file: src/datatrove/pipeline/filters/gopher_quality_filter.py start line: 0 end line: 0 size: 14 LOC McCabe index: 1 number of parameters: 0 id: 109 unit: def find_all_duplicate() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 14 LOC McCabe index: 3 number of parameters: 2 id: 110 unit: def __init__() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 14 LOC McCabe index: 1 number of parameters: 0 id: 111 unit: def load_index_hashes() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 14 LOC McCabe index: 3 number of parameters: 1 id: 112 unit: def __init__() file: src/datatrove/pipeline/readers/huggingface.py start line: 0 end line: 0 size: 14 LOC McCabe index: 1 number of parameters: 0 id: 113 unit: def get_document_from_dict() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 14 LOC McCabe index: 4 number of parameters: 4 id: 114 unit: def __init__() file: src/datatrove/pipeline/tokens/merger.py start line: 0 end line: 0 size: 14 LOC McCabe index: 4 number of parameters: 0 id: 115 unit: def launch_merge_stats() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 14 LOC McCabe index: 1 number of parameters: 1 id: 116 unit: fn read_and_parse_file() file: src/datatrove/tools/fast_mh3/src/local_union_find.rs start line: 85 end line: 100 size: 14 LOC McCabe index: 2 number of parameters: 1 id: 117 unit: def format() file: src/datatrove/pipeline/formatters/symbol_lines_remover.py start line: 0 end line: 0 size: 13 LOC McCabe index: 9 number of parameters: 2 id: 118 unit: def read_file() file: src/datatrove/pipeline/readers/ipc.py start line: 0 end line: 0 size: 13 LOC McCabe index: 5 number of parameters: 2 id: 119 unit: def __init__() file: src/datatrove/pipeline/writers/huggingface.py start line: 0 end line: 0 size: 13 LOC McCabe index: 2 number of parameters: 14 id: 120 unit: def write() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 13 LOC McCabe index: 3 number of parameters: 4 id: 121 unit: def normalize_range() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 13 LOC McCabe index: 3 number of parameters: 4 id: 122 unit: def tokenizer() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 13 LOC McCabe index: 4 number of parameters: 1 id: 123 unit: def get_perplexity() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 13 LOC McCabe index: 4 number of parameters: 3 id: 124 unit: def main() file: src/datatrove/tools/merge_stats.py start line: 0 end line: 0 size: 13 LOC McCabe index: 2 number of parameters: 0 id: 125 unit: def reader_class_from_name() file: src/datatrove/tools/inspect_data.py start line: 0 end line: 0 size: 13 LOC McCabe index: 1 number of parameters: 1 id: 126 unit: def read_file() file: src/datatrove/pipeline/readers/warc.py start line: 0 end line: 0 size: 12 LOC McCabe index: 4 number of parameters: 2 id: 127 unit: def _write() file: src/datatrove/pipeline/writers/parquet.py start line: 0 end line: 0 size: 12 LOC McCabe index: 4 number of parameters: 4 id: 128 unit: def get_hashes() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 12 LOC McCabe index: 7 number of parameters: 3 id: 129 unit: def run() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 12 LOC McCabe index: 2 number of parameters: 4 id: 130 unit: def run() file: src/datatrove/pipeline/tokens/counter.py start line: 0 end line: 0 size: 12 LOC McCabe index: 5 number of parameters: 4 id: 131 unit: def _raise_error_for_missing_dependencies() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 12 LOC McCabe index: 2 number of parameters: 3 id: 132 unit: def _get_positions_from_tokens() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 12 LOC McCabe index: 1 number of parameters: 2 id: 133 unit: def __init__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 12 LOC McCabe index: 1 number of parameters: 0 id: 134 unit: def chunk_doc_ends() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 12 LOC McCabe index: 5 number of parameters: 2 id: 135 unit: def get_repr() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 12 LOC McCabe index: 5 number of parameters: 2 id: 136 unit: def model() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 12 LOC McCabe index: 2 number of parameters: 1 id: 137 unit: def __init__() file: src/datatrove/executor/local.py start line: 0 end line: 0 size: 12 LOC McCabe index: 1 number of parameters: 0 id: 138 unit: def get_datafolder() file: src/datatrove/io.py start line: 0 end line: 0 size: 12 LOC McCabe index: 9 number of parameters: 1 id: 139 unit: def safely_create_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 12 LOC McCabe index: 8 number of parameters: 3 id: 140 unit: def find_duplicates() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 1 id: 141 unit: def __init__() file: src/datatrove/pipeline/filters/fineweb_quality_filter.py start line: 0 end line: 0 size: 11 LOC McCabe index: 1 number of parameters: 0 id: 142 unit: def get_all_files() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 11 LOC McCabe index: 1 number of parameters: 3 id: 143 unit: def get_shingles() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 11 LOC McCabe index: 2 number of parameters: 2 id: 144 unit: def close() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 11 LOC McCabe index: 5 number of parameters: 1 id: 145 unit: def _cleanup_process() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 1 id: 146 unit: def _worker() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 3 id: 147 unit: def check_required_dependencies() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 11 LOC McCabe index: 7 number of parameters: 3 id: 148 unit: def _get_input_ids() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 2 id: 149 unit: def __getitem__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 2 id: 150 unit: def tokenizer() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 11 LOC McCabe index: 2 number of parameters: 1 id: 151 unit: def __repr__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 2 id: 152 unit: def get_repr() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 2 id: 153 unit: def update() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 11 LOC McCabe index: 3 number of parameters: 3 id: 154 unit: def __init__() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 8 id: 155 unit: def __init__() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 0 id: 156 unit: def __init__() file: src/datatrove/pipeline/stats/word_stats.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 7 id: 157 unit: def extract_stats() file: src/datatrove/pipeline/stats/doc_stats.py start line: 0 end line: 0 size: 10 LOC McCabe index: 12 number of parameters: 2 id: 158 unit: def __init__() file: src/datatrove/pipeline/writers/parquet.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 0 id: 159 unit: def sequence_reader() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 10 LOC McCabe index: 3 number of parameters: 2 id: 160 unit: def get_shingles() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 10 LOC McCabe index: 2 number of parameters: 2 id: 161 unit: def __init__() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 0 id: 162 unit: def copy() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 10 LOC McCabe index: 1 number of parameters: 0 id: 163 unit: def get_loss_values() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 10 LOC McCabe index: 6 number of parameters: 3 id: 164 unit: def read_tuples_from_file() file: src/datatrove/utils/binaryio.py start line: 0 end line: 0 size: 10 LOC McCabe index: 6 number of parameters: 3 id: 165 unit: def simple_span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 10 LOC McCabe index: 3 number of parameters: 2 id: 166 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 10 LOC McCabe index: 3 number of parameters: 3 id: 167 unit: def tokenizer() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 10 LOC McCabe index: 3 number of parameters: 1 id: 168 unit: def _do_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 10 LOC McCabe index: 6 number of parameters: 2 id: 169 unit: def load_word_tokenizer() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 10 LOC McCabe index: 4 number of parameters: 1 id: 170 unit: def __repr__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 10 LOC McCabe index: 4 number of parameters: 1 id: 171 unit: def from_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 10 LOC McCabe index: 2 number of parameters: 2 id: 172 unit: def _launch_run_for_rank() file: src/datatrove/executor/local.py start line: 0 end line: 0 size: 10 LOC McCabe index: 3 number of parameters: 5 id: 173 unit: fn from_path() file: src/datatrove/tools/fast_mh3/src/s3_union_find.rs start line: 67 end line: 76 size: 10 LOC McCabe index: 2 number of parameters: 1 id: 174 unit: def get_fs_with_filepath() file: src/datatrove/io.py start line: 0 end line: 0 size: 10 LOC McCabe index: 8 number of parameters: 1 id: 175 unit: def __init__() file: src/datatrove/pipeline/filters/fasttext_filter.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 176 unit: def __init__() file: src/datatrove/pipeline/stats/line_stats.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 5 id: 177 unit: def __init__() file: src/datatrove/pipeline/stats/contamination_stats.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 4 id: 178 unit: def __init__() file: src/datatrove/pipeline/stats/paragraph_stats.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 7 id: 179 unit: def __init__() file: src/datatrove/pipeline/stats/sentence_stats.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 7 id: 180 unit: def read_file() file: src/datatrove/pipeline/readers/csv.py start line: 0 end line: 0 size: 9 LOC McCabe index: 3 number of parameters: 2 id: 181 unit: def __init__() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 9 LOC McCabe index: 2 number of parameters: 0 id: 182 unit: def read_sigs() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 183 unit: def get_signature() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 9 LOC McCabe index: 3 number of parameters: 2 id: 184 unit: def __init__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 185 unit: def __init__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 0 id: 186 unit: def cleanup() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 9 LOC McCabe index: 6 number of parameters: 1 id: 187 unit: def __init__() file: src/datatrove/pipeline/extractors/modular.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 5 id: 188 unit: def extract() file: src/datatrove/pipeline/extractors/trafilatura.py start line: 0 end line: 0 size: 9 LOC McCabe index: 1 number of parameters: 2 id: 189 unit: def model() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 9 LOC McCabe index: 2 number of parameters: 1 id: 190 unit: def replace() file: src/datatrove/pipeline/formatters/pii.py start line: 0 end line: 0 size: 8 LOC McCabe index: 3 number of parameters: 2 id: 191 unit: def __init__() file: src/datatrove/pipeline/filters/language_filter.py start line: 0 end line: 0 size: 8 LOC McCabe index: 1 number of parameters: 0 id: 192 unit: def paragraph_filter() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 8 LOC McCabe index: 4 number of parameters: 2 id: 193 unit: def __init__() file: src/datatrove/pipeline/stats/perplexity_stats.py start line: 0 end line: 0 size: 8 LOC McCabe index: 1 number of parameters: 6 id: 194 unit: def __init__() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 8 LOC McCabe index: 1 number of parameters: 0 id: 195 unit: def close() file: src/datatrove/pipeline/writers/parquet.py start line: 0 end line: 0 size: 8 LOC McCabe index: 3 number of parameters: 1 id: 196 unit: def __init__() file: src/datatrove/pipeline/writers/jsonl.py start line: 0 end line: 0 size: 8 LOC McCabe index: 2 number of parameters: 0 id: 197 unit: def parameters() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 8 LOC McCabe index: 2 number of parameters: 1 id: 198 unit: def parameters() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 8 LOC McCabe index: 2 number of parameters: 1 id: 199 unit: def __init__() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 8 LOC McCabe index: 3 number of parameters: 0 id: 200 unit: def write_bytes() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 8 LOC McCabe index: 2 number of parameters: 3 id: 201 unit: def _ensure_process() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 8 LOC McCabe index: 4 number of parameters: 2 id: 202 unit: def __init__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 8 LOC McCabe index: 1 number of parameters: 0 id: 203 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 8 LOC McCabe index: 4 number of parameters: 2 id: 204 unit: def create_hash_func() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 8 LOC McCabe index: 5 number of parameters: 1 id: 205 unit: def get_incomplete_ranks() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 8 LOC McCabe index: 3 number of parameters: 2 id: 206 unit: def default() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 8 LOC McCabe index: 6 number of parameters: 2 id: 207 unit: fn new() file: src/datatrove/tools/fast_mh3/src/s3_union_find.rs start line: 95 end line: 102 size: 8 LOC McCabe index: 1 number of parameters: 0 id: 208 unit: fn new() file: src/datatrove/tools/fast_mh3/src/local_union_find.rs start line: 54 end line: 61 size: 8 LOC McCabe index: 1 number of parameters: 0 id: 209 unit: def open_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 8 LOC McCabe index: 3 number of parameters: 3 id: 210 unit: def __init__() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 211 unit: def __init__() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 0 id: 212 unit: def __init__() file: src/datatrove/pipeline/stats/lang_stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 4 id: 213 unit: def extract_stats() file: src/datatrove/pipeline/stats/lang_stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 3 number of parameters: 2 id: 214 unit: def extract_stats() file: src/datatrove/pipeline/stats/contamination_stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 3 number of parameters: 2 id: 215 unit: def __init__() file: src/datatrove/pipeline/stats/token_stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 216 unit: def extract_stats() file: src/datatrove/pipeline/stats/token_stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 2 id: 217 unit: def run() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 4 id: 218 unit: def __init__() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 219 unit: def __init__() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 220 unit: def get_hashes() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 3 number of parameters: 3 id: 221 unit: def run() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 4 id: 222 unit: def __init__() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 223 unit: def __init__() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 224 unit: def reset() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 1 id: 225 unit: def __init__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 226 unit: def __init__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 227 unit: def __init__() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 228 unit: def query() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 7 LOC McCabe index: 3 number of parameters: 2 id: 229 unit: def __init__() file: src/datatrove/pipeline/tokens/context_shuffler.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 230 unit: def _sequence_pointers() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 2 id: 231 unit: def write() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 2 id: 232 unit: def run() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 4 id: 233 unit: def extract() file: src/datatrove/pipeline/extractors/modular.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 2 id: 234 unit: def __init__() file: src/datatrove/pipeline/extractors/trafilatura.py start line: 0 end line: 0 size: 7 LOC McCabe index: 1 number of parameters: 0 id: 235 unit: def model() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 1 id: 236 unit: def from_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 2 id: 237 unit: def predict() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 7 LOC McCabe index: 4 number of parameters: 2 id: 238 unit: def dependency() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 7 LOC McCabe index: 4 number of parameters: 1 id: 239 unit: def load_dataset_bytes() file: src/datatrove/tools/check_dataset.py start line: 0 end line: 0 size: 7 LOC McCabe index: 2 number of parameters: 3 id: 240 unit: fn format_duration() file: src/datatrove/tools/fast_mh3/src/s3_union_find.rs start line: 17 end line: 23 size: 7 LOC McCabe index: 1 number of parameters: 1 id: 241 unit: def run() file: src/datatrove/pipeline/formatters/base.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 4 id: 242 unit: def public_ip_validator() file: src/datatrove/pipeline/formatters/pii.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 2 id: 243 unit: def format() file: src/datatrove/pipeline/formatters/pii.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 2 id: 244 unit: def find_top_duplicate() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 1 id: 245 unit: def __init__() file: src/datatrove/pipeline/filters/base_filter.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 3 id: 246 unit: def get_logprob() file: src/datatrove/pipeline/filters/unigram_log_probs.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 2 id: 247 unit: def __init__() file: src/datatrove/pipeline/decont/n_grams.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 248 unit: def __init__() file: src/datatrove/pipeline/stats/base.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 249 unit: def __init__() file: src/datatrove/pipeline/stats/doc_stats.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 3 id: 250 unit: def __init__() file: src/datatrove/pipeline/stats/merger.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 251 unit: def _iter_file_batches() file: src/datatrove/pipeline/readers/ipc.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 2 id: 252 unit: def _iter_stream_batches() file: src/datatrove/pipeline/readers/ipc.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 2 id: 253 unit: def _write_batch() file: src/datatrove/pipeline/writers/parquet.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 2 id: 254 unit: def run() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 4 id: 255 unit: def __init__() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 256 unit: def read_sigs() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 257 unit: def __init__() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 258 unit: def __lt__() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 2 id: 259 unit: def read_sigs() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 260 unit: def __init__() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 261 unit: def get_sequence_bytes_offset() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 1 id: 262 unit: def __init__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 263 unit: def __init__() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 264 unit: def _set_oom_score_adj() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 2 id: 265 unit: def batched() file: src/datatrove/utils/batching.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 2 id: 266 unit: def ngrams() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 2 id: 267 unit: def _is_distribution_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 1 id: 268 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 2 id: 269 unit: def _try_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 6 LOC McCabe index: 2 number of parameters: 2 id: 270 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 6 LOC McCabe index: 3 number of parameters: 2 id: 271 unit: def __add__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 2 id: 272 unit: def to_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 1 id: 273 unit: def __init__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 6 LOC McCabe index: 6 number of parameters: 2 id: 274 unit: def from_disk() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 4 id: 275 unit: def launch_slurm_job() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 3 id: 276 unit: def __init__() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 277 unit: def __init__() file: src/datatrove/io.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 278 unit: def list_files() file: src/datatrove/io.py start line: 0 end line: 0 size: 6 LOC McCabe index: 1 number of parameters: 0 id: 279 unit: def resolve_paths() file: src/datatrove/io.py start line: 0 end line: 0 size: 6 LOC McCabe index: 4 number of parameters: 2 id: 280 unit: def __init__() file: src/datatrove/pipeline/filters/sampler_filter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 0 id: 281 unit: def __init__() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 282 unit: def get_filter_result() file: src/datatrove/pipeline/filters/base_filter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 1 id: 283 unit: def __new__() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 3 id: 284 unit: def get_document_from_dict() file: src/datatrove/pipeline/readers/huggingface.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 4 id: 285 unit: def get_document_from_dict() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 4 id: 286 unit: def _default_adapter() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 5 LOC McCabe index: 5 number of parameters: 2 id: 287 unit: def __init__() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 0 id: 288 unit: def __init__() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 0 id: 289 unit: def prepare_doc() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 4 id: 290 unit: def __init__() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 4 id: 291 unit: def get_optimal_k() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 292 unit: def update_bf() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 2 id: 293 unit: def __init__() file: src/datatrove/pipeline/tokens/counter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 0 id: 294 unit: def run() file: src/datatrove/pipeline/tokens/counter.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 4 id: 295 unit: def __init__() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 296 unit: def __del__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 1 id: 297 unit: def load_tokenizer() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 1 id: 298 unit: def __init__() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 0 id: 299 unit: def tokenizer() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 1 id: 300 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 2 id: 301 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 2 id: 302 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 3 id: 303 unit: def tokenizer() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 1 id: 304 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 1 id: 305 unit: def wt() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 1 id: 306 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 2 id: 307 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 1 id: 308 unit: def __add__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 2 id: 309 unit: def __init__() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 3 id: 310 unit: def from_bytes() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 3 id: 311 unit: def requeue_handler() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 2 id: 312 unit: def main() file: src/datatrove/tools/launch_pickled_pipeline.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 0 id: 313 unit: def __init__() file: src/datatrove/io.py start line: 0 end line: 0 size: 5 LOC McCabe index: 1 number of parameters: 4 id: 314 unit: def get_shard() file: src/datatrove/io.py start line: 0 end line: 0 size: 5 LOC McCabe index: 2 number of parameters: 4 id: 315 unit: def get_shard_from_paths_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 5 LOC McCabe index: 3 number of parameters: 3 id: 316 unit: def __init__() file: src/datatrove/pipeline/formatters/symbol_lines_remover.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 317 unit: def filter() file: src/datatrove/pipeline/filters/c4_filters.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 318 unit: def update_doc_stats() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 319 unit: def track_time() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 320 unit: def is_bullet_line() file: src/datatrove/pipeline/stats/line_stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 1 id: 321 unit: def extract_stats() file: src/datatrove/pipeline/stats/perplexity_stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 322 unit: def _get_output_filename() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 4 id: 323 unit: def _get_filename_with_file_id() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 324 unit: def read_duplicates() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 325 unit: def get_indexes() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 326 unit: def write() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 3 id: 327 unit: def get_endpoint() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 1 id: 328 unit: def get_doc_ends_from_boundary() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 3 number of parameters: 2 id: 329 unit: def get_output_filename() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 6 number of parameters: 4 id: 330 unit: def __init__() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 331 unit: def read_np_from_file() file: src/datatrove/utils/binaryio.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 332 unit: def add_task_logger() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 333 unit: def close_task_logger() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 1 id: 334 unit: def __getitem__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 4 LOC McCabe index: 3 number of parameters: 2 id: 335 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 336 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 337 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 338 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 3 number of parameters: 2 id: 339 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 340 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 341 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 342 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 343 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 344 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 345 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 346 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 1 id: 347 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 348 unit: def __init__() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 349 unit: def __init__() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 350 unit: def tokenizer() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 1 id: 351 unit: def from_pretrained() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 352 unit: def __init__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 4 id: 353 unit: def __init__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 354 unit: def __add__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 3 number of parameters: 2 id: 355 unit: def update() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 3 id: 356 unit: def create_tokenizer() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 1 id: 357 unit: def to_disk() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 4 id: 358 unit: def __init__() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 3 id: 359 unit: def to_json() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 4 LOC McCabe index: 4 number of parameters: 2 id: 360 unit: def get_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 2 id: 361 unit: def pop() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 2 id: 362 unit: def close() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 1 id: 363 unit: def open() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 4 number of parameters: 5 id: 364 unit: def do_download_file() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 1 number of parameters: 0 id: 365 unit: def _get_true_fs() file: src/datatrove/io.py start line: 0 end line: 0 size: 4 LOC McCabe index: 2 number of parameters: 1 id: 366 unit: def format() file: src/datatrove/pipeline/formatters/ftfy.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 367 unit: def __init__() file: src/datatrove/pipeline/filters/regex_filter.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 368 unit: def __init__() file: src/datatrove/pipeline/filters/lambda_filter.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 369 unit: def get_list() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 370 unit: def __init__() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 371 unit: def stat_update() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 4 id: 372 unit: def run() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 4 id: 373 unit: def _on_file_switch() file: src/datatrove/pipeline/writers/huggingface.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 374 unit: def _on_file_switch() file: src/datatrove/pipeline/writers/parquet.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 375 unit: def _write() file: src/datatrove/pipeline/writers/jsonl.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 376 unit: def read_duplicates() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 377 unit: def __init__() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 378 unit: def save_sizes() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 379 unit: def __post_init__() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 1 id: 380 unit: def get_false_positive_prob() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 3 id: 381 unit: def get_ordering() file: src/datatrove/pipeline/tokens/context_shuffler.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 382 unit: def write_loss_bytes() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 383 unit: def get_ordering() file: src/datatrove/pipeline/tokens/merger.py start line: 0 end line: 0 size: 3 LOC McCabe index: 3 number of parameters: 2 id: 384 unit: def load_doc_ends() file: src/datatrove/pipeline/tokens/merger.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 385 unit: def __exit__() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 386 unit: def get_env_bool() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 387 unit: def setup_default_logger() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 0 id: 388 unit: def log_pipeline() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 3 LOC McCabe index: 3 number of parameters: 1 id: 389 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 390 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 391 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 392 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 393 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 394 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 395 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 2 id: 396 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 397 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 398 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 399 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 400 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 401 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 402 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 403 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 404 unit: def tokenize() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 405 unit: def normalize() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 0 id: 406 unit: def __post_init__() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 1 id: 407 unit: def __post_init__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 3 LOC McCabe index: 2 number of parameters: 1 id: 408 unit: def score() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 409 unit: def to_bytes() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 410 unit: def __init__() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 4 id: 411 unit: def save_executor_as_json() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 412 unit: def load_doc_ends() file: src/datatrove/tools/check_dataset.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 413 unit: fn with_key() file: src/datatrove/tools/fast_mh3/src/s3_union_find.rs start line: 78 end line: 80 size: 3 LOC McCabe index: 1 number of parameters: 2 id: 414 unit: def file_exists() file: src/datatrove/io.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 415 unit: def file_is_local() file: src/datatrove/io.py start line: 0 end line: 0 size: 3 LOC McCabe index: 1 number of parameters: 1 id: 416 unit: def __init__() file: src/datatrove/pipeline/formatters/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 417 unit: def format() file: src/datatrove/pipeline/formatters/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 418 unit: def __init__() file: src/datatrove/pipeline/formatters/pii.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 419 unit: def filter() file: src/datatrove/pipeline/filters/regex_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 420 unit: def get_n_grams() file: src/datatrove/pipeline/filters/gopher_repetition_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 421 unit: def filter() file: src/datatrove/pipeline/filters/lambda_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 422 unit: def filter() file: src/datatrove/pipeline/filters/sampler_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 423 unit: def normalize() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 424 unit: def parse_list() file: src/datatrove/pipeline/filters/url_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 4 number of parameters: 2 id: 425 unit: def filter() file: src/datatrove/pipeline/filters/base_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 426 unit: def filter_batch() file: src/datatrove/pipeline/filters/base_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 427 unit: def __init__() file: src/datatrove/pipeline/filters/unigram_log_probs.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 428 unit: def filter() file: src/datatrove/pipeline/filters/unigram_log_probs.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 429 unit: def __repr__() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 430 unit: def __call__() file: src/datatrove/pipeline/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 431 unit: def get_max_chars_per_line_ratio() file: src/datatrove/pipeline/stats/line_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 432 unit: def get_min_chars_per_line_ratio() file: src/datatrove/pipeline/stats/line_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 433 unit: def get_short_word_ratio() file: src/datatrove/pipeline/stats/word_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 434 unit: def get_long_word_ratio() file: src/datatrove/pipeline/stats/word_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 435 unit: def get_short_paragraph_ratio() file: src/datatrove/pipeline/stats/paragraph_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 436 unit: def get_long_paragraph_ratio() file: src/datatrove/pipeline/stats/paragraph_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 437 unit: def get_short_sentence_ratio() file: src/datatrove/pipeline/stats/sentence_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 438 unit: def get_long_sentence_ratio() file: src/datatrove/pipeline/stats/sentence_stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 2 id: 439 unit: def extract_stats() file: src/datatrove/pipeline/stats/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 440 unit: def get_kv() file: src/datatrove/pipeline/stats/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 441 unit: def get_leaf_non_empty_folders() file: src/datatrove/pipeline/stats/merger.py start line: 0 end line: 0 size: 2 LOC McCabe index: 4 number of parameters: 1 id: 442 unit: def run() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 443 unit: def read_file() file: src/datatrove/pipeline/readers/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 444 unit: def __enter__() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 445 unit: def close() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 446 unit: def __exit__() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 447 unit: def _write() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 448 unit: def _on_file_switch() file: src/datatrove/pipeline/writers/disk_base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 449 unit: def is_from_index() file: src/datatrove/pipeline/dedup/sentence_dedup.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 450 unit: def is_from_index() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 451 unit: def get_sig_dtype() file: src/datatrove/pipeline/dedup/url_dedup.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 452 unit: def read_bytes() file: src/datatrove/pipeline/dedup/exact_substrings.py start line: 0 end line: 0 size: 2 LOC McCabe index: 4 number of parameters: 1 id: 453 unit: def __str__() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 454 unit: def is_from_index() file: src/datatrove/pipeline/dedup/minhash.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 455 unit: def m() file: src/datatrove/pipeline/dedup/bloom_filter.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 456 unit: def __len__() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 457 unit: def get_output_filename() file: src/datatrove/pipeline/tokens/megatron_tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 4 id: 458 unit: def __len__() file: src/datatrove/pipeline/tokens/tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 459 unit: def get_data_reader() file: src/datatrove/pipeline/tokens/merger.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 460 unit: def extract() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 461 unit: def __enter__() file: src/datatrove/pipeline/extractors/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 462 unit: def split_into_words() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 463 unit: def split_into_sentences() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 464 unit: def split_into_paragraphs() file: src/datatrove/utils/text.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 465 unit: def get_timestamp() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 466 unit: def get_random_str() file: src/datatrove/utils/logging.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 467 unit: def _is_package_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 468 unit: def is_rich_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 469 unit: def is_pyarrow_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 470 unit: def is_tokenizers_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 471 unit: def is_fasteners_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 472 unit: def is_boto3_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 473 unit: def is_s3fs_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 474 unit: def is_moto_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 475 unit: def is_torch_available() file: src/datatrove/utils/_import_utils.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 476 unit: def __len__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 477 unit: def __len__() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 478 unit: def current_file_path() file: src/datatrove/utils/dataset.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 479 unit: def token_size() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 480 unit: def token_format() file: src/datatrove/utils/tokenization.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 481 unit: def strip_strings() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 1 id: 482 unit: def __init__() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 483 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 484 unit: def sent_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 485 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 486 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 487 unit: def preprocess() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 488 unit: def span_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 489 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 490 unit: def _spacy_xx() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 491 unit: def word_tokenize() file: src/datatrove/utils/word_tokenizers.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 492 unit: def pp() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 3 id: 493 unit: def replace_unicode_punct() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 494 unit: def remove_non_printing_char() file: src/datatrove/utils/perplexity.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 495 unit: def np_dtype() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 496 unit: def np_descr() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 497 unit: def struct_format() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 498 unit: def max() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 499 unit: def min() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 500 unit: def __str__() file: src/datatrove/utils/hashing.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 501 unit: def topk() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 502 unit: def __repr__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 503 unit: def to_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 3 number of parameters: 1 id: 504 unit: def from_dict() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 505 unit: def __getitem__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 506 unit: def __setitem__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 3 id: 507 unit: def to_json() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 508 unit: def save_to_disk() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 509 unit: def total_time() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 510 unit: def total_std_dev() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 511 unit: def __repr__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 512 unit: def to_json() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 513 unit: def from_json() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 514 unit: def save_to_disk() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 515 unit: def variance() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 516 unit: def standard_deviation() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 517 unit: def __enter__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 518 unit: def __exit__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 519 unit: def _get_time_frac() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 520 unit: def __repr__() file: src/datatrove/utils/stats.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 521 unit: def xxhash32() file: src/datatrove/utils/hashes/xxhash.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 522 unit: def xxhash64() file: src/datatrove/utils/hashes/xxhash.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 523 unit: def sha1_hash32() file: src/datatrove/utils/hashes/sha1.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 524 unit: def sha1_hash64() file: src/datatrove/utils/hashes/sha1.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 525 unit: def __reduce__() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 526 unit: def _get_config() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 527 unit: def _set_config() file: src/datatrove/utils/japanese_tokenizer.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 3 id: 528 unit: def __init__() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 529 unit: def predict() file: src/datatrove/utils/lid.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 530 unit: def world_size() file: src/datatrove/executor/ray.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 531 unit: def world_size() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 532 unit: def default_job_id_retriever() file: src/datatrove/executor/slurm.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 533 unit: def run() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 534 unit: def world_size() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 535 unit: def is_rank_completed() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 2 id: 536 unit: def mark_rank_as_completed() file: src/datatrove/executor/base.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 537 unit: def world_size() file: src/datatrove/executor/local.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 538 unit: def check_dataset() file: src/datatrove/tools/check_dataset.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0 id: 539 unit: def open_file() file: src/datatrove/tools/check_dataset.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 540 unit: def get_filter_expr() file: src/datatrove/tools/inspect_data.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 1 id: 541 unit: def get_open_files() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 542 unit: def write() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 3 id: 543 unit: def __enter__() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 544 unit: def __exit__() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 4 id: 545 unit: def get_output_file_manager() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 2 id: 546 unit: def open_files() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 2 number of parameters: 4 id: 547 unit: def is_local() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 1 id: 548 unit: def cached_asset_path_or_download() file: src/datatrove/io.py start line: 0 end line: 0 size: 2 LOC McCabe index: 1 number of parameters: 0