Path Lines of Code tests/__init__.py 1 tests/executor/__init__.py 1 tests/executor/test_local.py 53 tests/executor/test_ray.py 44 tests/pipeline/__init__.py 1 tests/pipeline/test_adapter_reader.py 20 tests/pipeline/test_base.py 15 tests/pipeline/test_bloom_filter.py 86 tests/pipeline/test_exact_substrings.py 236 tests/pipeline/test_extractors.py 14 tests/pipeline/test_filters.py 108 tests/pipeline/test_hf_reader.py 54 tests/pipeline/test_ipc_reader.py 46 tests/pipeline/test_jsonl_zstd_compression.py 25 tests/pipeline/test_minhash.py 240 tests/pipeline/test_ngrams_decont.py 55 tests/pipeline/test_parquet_reader.py 57 tests/pipeline/test_parquet_writer.py 27 tests/pipeline/test_parquet_zstd_compression.py 25 tests/pipeline/test_pii_removal.py 14 tests/pipeline/test_sentence_deduplication.py 197 tests/pipeline/test_stats.py 235 tests/pipeline/test_symbollines.py 8 tests/pipeline/test_text.py 15 tests/pipeline/test_tokenization.py 230 tests/pipeline/test_url_deduplication.py 162 tests/pipeline/test_word_tokenizers.py 117 tests/test_io.py 58 tests/utils.py 103