// Undirected graph whose nodes are bracketed file paths and whose edges carry
// a small integer weight shown as the edge label.
// NOTE(review): the weights look like pairwise coupling counts emitted by an
// analysis tool -- confirm against whatever generated this file.
graph G {
    // ---- graph-wide settings -------------------------------------------
    compound="true";
    rankdir="TB";        // ranks flow top to bottom
    bgcolor="white";
    fontname="Tahoma";

    // ---- defaults applied to every node --------------------------------
    node [
        fixedsize="false"
        fontname="Tahoma"
        color="white"
        fillcolor="deepskyblue2"
        fontcolor="black"
        shape="box"
        style="filled"
        penwidth="1.0"
    ]

    // ---- defaults applied to every edge --------------------------------
    // Each edge below overrides penwidth and color; the color override adds
    // an alpha byte (#RRGGBBAA) to the same base hue used here.
    edge [
        fontname="Arial"
        color="#00688b"
        fontcolor="black"
        fontsize="12"
        arrowsize="0.5"
        penwidth="1.0"
    ]

    // ---- edges ---------------------------------------------------------
    // On every edge, penwidth equals the number in the label, and the
    // weight-2 edge uses a larger alpha (70) than the weight-1 edges (5E).
    // Statement order is kept exactly as generated.
    "[src/datatrove/utils/tokenization.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 2 ", penwidth="2", color="#00688b70"];
    "[src/datatrove/pipeline/tokens/counter.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/jobs_status.py]" -- "[src/datatrove/pipeline/decont/n_grams.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/url_dedup.py]" -- "[src/datatrove/pipeline/dedup/exact_substrings.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/url_dedup.py]" -- "[src/datatrove/pipeline/dedup/minhash.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/executor/slurm.py]" -- "[src/datatrove/executor/local.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/jobs_status.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/perplexity.py]" -- "[src/datatrove/tools/jobs_status.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/filters/language_filter.py]" -- "[src/datatrove/pipeline/filters/fasttext_filter.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/decont/n_grams.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/failed_logs.py]" -- "[src/datatrove/executor/ray.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/check_dataset.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/filters/fasttext_filter.py]" -- "[src/datatrove/utils/_import_utils.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/exact_substrings.py]" -- "[src/datatrove/executor/local.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/executor/ray.py]" -- "[pyproject.toml]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/sentence_dedup.py]" -- "[src/datatrove/pipeline/dedup/url_dedup.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/exact_substrings.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/__init__.py]" -- "[src/datatrove/pipeline/tokens/megatron_tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/exact_substrings.py]" -- "[src/datatrove/utils/tokenization.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/check_dataset.py]" -- "[src/datatrove/executor/local.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/exact_substrings.py]" -- "[src/datatrove/pipeline/dedup/minhash.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/executor/ray.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/decont/n_grams.py]" -- "[src/datatrove/executor/ray.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/counter.py]" -- "[src/datatrove/pipeline/dedup/exact_substrings.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/extractors/trafilatura.py]" -- "[src/datatrove/pipeline/extractors/base.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/_import_utils.py]" -- "[src/datatrove/io.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/stats/token_stats.py]" -- "[src/datatrove/pipeline/tokens/megatron_tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/readers/base.py]" -- "[src/datatrove/pipeline/decont/n_grams.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/perplexity.py]" -- "[src/datatrove/tools/failed_logs.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/writers/parquet.py]" -- "[src/datatrove/pipeline/writers/huggingface.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/megatron_tokenizer.py]" -- "[src/datatrove/utils/tokenization.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/dataset.py]" -- "[src/datatrove/io.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/perplexity.py]" -- "[src/datatrove/pipeline/readers/base.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/counter.py]" -- "[src/datatrove/utils/tokenization.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/url_dedup.py]" -- "[src/datatrove/executor/local.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/megatron_tokenizer.py]" -- "[src/datatrove/pipeline/dedup/exact_substrings.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/check_dataset.py]" -- "[src/datatrove/pipeline/dedup/exact_substrings.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/jobs_status.py]" -- "[src/datatrove/executor/ray.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/merger.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/executor/__init__.py]" -- "[pyproject.toml]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/dataset.py]" -- "[src/datatrove/utils/_import_utils.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/stats/token_stats.py]" -- "[src/datatrove/utils/tokenization.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/dedup/exact_substrings.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/stats/token_stats.py]" -- "[src/datatrove/pipeline/tokens/tokenizer.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/failed_logs.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/perplexity.py]" -- "[src/datatrove/executor/slurm.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/tools/failed_logs.py]" -- "[src/datatrove/pipeline/readers/base.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/pipeline/tokens/counter.py]" -- "[src/datatrove/pipeline/stats/token_stats.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/utils/perplexity.py]" -- "[src/datatrove/pipeline/decont/n_grams.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
    "[src/datatrove/executor/local.py]" -- "[src/datatrove/pipeline/dedup/minhash.py]" [label=" 1 ", penwidth="1", color="#00688b5E"];
}