Path Lines of Code pyproject.toml 152 src/datatrove/__init__.py 1 src/datatrove/data.py 19 src/datatrove/executor/__init__.py 3 src/datatrove/executor/base.py 98 src/datatrove/executor/local.py 101 src/datatrove/executor/ray.py 160 src/datatrove/executor/slurm.py 235 src/datatrove/io.py 188 src/datatrove/pipeline/__init__.py 1 src/datatrove/pipeline/base.py 35 src/datatrove/pipeline/decont/__init__.py 1 src/datatrove/pipeline/decont/n_grams.py 150 src/datatrove/pipeline/dedup/__init__.py 12 src/datatrove/pipeline/dedup/bloom_filter.py 143 src/datatrove/pipeline/dedup/exact_substrings.py 229 src/datatrove/pipeline/dedup/minhash.py 499 src/datatrove/pipeline/dedup/sentence_dedup.py 370 src/datatrove/pipeline/dedup/url_dedup.py 301 src/datatrove/pipeline/extractors/__init__.py 2 src/datatrove/pipeline/extractors/base.py 117 src/datatrove/pipeline/extractors/modular.py 23 src/datatrove/pipeline/extractors/trafilatura.py 28 src/datatrove/pipeline/filters/__init__.py 11 src/datatrove/pipeline/filters/base_filter.py 49 src/datatrove/pipeline/filters/c4_filters.py 197 src/datatrove/pipeline/filters/fasttext_filter.py 78 src/datatrove/pipeline/filters/fineweb_quality_filter.py 47 src/datatrove/pipeline/filters/gopher_quality_filter.py 76 src/datatrove/pipeline/filters/gopher_repetition_filter.py 90 src/datatrove/pipeline/filters/lambda_filter.py 11 src/datatrove/pipeline/filters/language_filter.py 43 src/datatrove/pipeline/filters/regex_filter.py 11 src/datatrove/pipeline/filters/sampler_filter.py 17 src/datatrove/pipeline/filters/unigram_log_probs.py 46 src/datatrove/pipeline/filters/url_filter.py 91 src/datatrove/pipeline/formatters/__init__.py 3 src/datatrove/pipeline/formatters/base.py 17 src/datatrove/pipeline/formatters/ftfy.py 43 src/datatrove/pipeline/formatters/pii.py 68 src/datatrove/pipeline/formatters/symbol_lines_remover.py 25 src/datatrove/pipeline/readers/__init__.py 6 src/datatrove/pipeline/readers/base.py 143 src/datatrove/pipeline/readers/csv.py 50 src/datatrove/pipeline/readers/huggingface.py 95 src/datatrove/pipeline/readers/ipc.py 64 src/datatrove/pipeline/readers/jsonl.py 57 src/datatrove/pipeline/readers/parquet.py 57 src/datatrove/pipeline/readers/warc.py 94 src/datatrove/pipeline/stats/__init__.py 11 src/datatrove/pipeline/stats/base.py 91 src/datatrove/pipeline/stats/config.py 9 src/datatrove/pipeline/stats/contamination_stats.py 33 src/datatrove/pipeline/stats/doc_stats.py 30 src/datatrove/pipeline/stats/lang_stats.py 26 src/datatrove/pipeline/stats/line_stats.py 57 src/datatrove/pipeline/stats/merger.py 51 src/datatrove/pipeline/stats/paragraph_stats.py 51 src/datatrove/pipeline/stats/perplexity_stats.py 25 src/datatrove/pipeline/stats/sentence_stats.py 47 src/datatrove/pipeline/stats/token_stats.py 25 src/datatrove/pipeline/stats/word_stats.py 56 src/datatrove/pipeline/tokens/__init__.py 5 src/datatrove/pipeline/tokens/context_shuffler.py 46 src/datatrove/pipeline/tokens/counter.py 36 src/datatrove/pipeline/tokens/megatron_tokenizer.py 107 src/datatrove/pipeline/tokens/merger.py 153 src/datatrove/pipeline/tokens/tokenizer.py 298 src/datatrove/pipeline/writers/__init__.py 3 src/datatrove/pipeline/writers/disk_base.py 83 src/datatrove/pipeline/writers/huggingface.py 104 src/datatrove/pipeline/writers/jsonl.py 28 src/datatrove/pipeline/writers/parquet.py 67 src/datatrove/tools/__init__.py 1 src/datatrove/tools/check_dataset.py 58 src/datatrove/tools/failed_logs.py 55 src/datatrove/tools/fast_mh3/Cargo.toml 26 src/datatrove/tools/fast_mh3/src/local_union_find.rs 272 src/datatrove/tools/fast_mh3/src/s3_union_find.rs 477 src/datatrove/tools/inspect_data.py 129 src/datatrove/tools/jobs_status.py 71 src/datatrove/tools/launch_pickled_pipeline.py 13 src/datatrove/tools/merge_stats.py 33 src/datatrove/utils/__init__.py 1 src/datatrove/utils/_import_utils.py 54 src/datatrove/utils/batching.py 7 src/datatrove/utils/binaryio.py 54 src/datatrove/utils/dataset.py 194 src/datatrove/utils/hashes/sha1.py 6 src/datatrove/utils/hashes/xxhash.py 5 src/datatrove/utils/hashing.py 37 src/datatrove/utils/japanese_tokenizer.py 216 src/datatrove/utils/lid.py 45 src/datatrove/utils/logging.py 37 src/datatrove/utils/perplexity.py 138 src/datatrove/utils/stats.py 297 src/datatrove/utils/text.py 254 src/datatrove/utils/tokenization.py 58 src/datatrove/utils/typeshelper.py 4106 src/datatrove/utils/word_tokenizers.py 361