pyproject.toml
[project]
name = "datatrove"
version = "0.5.0" # expected format is one of x.y.z.dev0, x.y.z.rc1 or x.y.z (no dashes, dots only)
description = "HuggingFace library to process and filter large amounts of web data"
readme = "README.md"
authors = [
{name = "HuggingFace Inc.", email = "guilherme@huggingface.co"}
]
license = {text = "Apache-2.0"}
classifiers = [
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
keywords = ["data", "machine", "learning", "processing"]
requires-python = ">=3.10.0"
dependencies = [
"dill>=0.3.0",
"fsspec>=2023.12.2",
"huggingface-hub>=0.17.0",
"humanize",
"loguru>=0.7.0",
"multiprocess",
"numpy>=2.0.0",
"tqdm",
]
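
# Optional feature sets ("extras") follow. They can be combined at install time,
# e.g. `pip install "datatrove[io,processing,s3]"`.
# The aggregate extras further down (testing, all, dev) reference the package itself
# ("datatrove[...]") to compose the smaller sets instead of duplicating their lists.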
[project.optional-dependencies]
cli = [
"rich",
]
io = [
"faust-cchardet",
"pyarrow",
"python-magic",
"warcio",
"datasets>=3.1.0",
"orjson",
"zstandard"
]
s3 = [
"s3fs>=2023.12.2",
]
processing = [
"fasttext-numpy2-wheel",
"nltk",
"inscriptis",
# "readability-lxml @ git+https://github.com/huggingface/python-readability.git@speedup",
"tldextract",
"trafilatura>=1.8.0,<1.12.0",
"tokenizers",
"ftfy",
"fasteners",
"regex",
"xxhash",
"kenlm",
"pyahocorasick"
]
decont = [
"lighteval>=0.3.0"
]
multilingual = [
"spacy[ja]>=3.8",
"stanza",
"pyvi", # vietnamese
"pythainlp", # thai
"jieba", # chinese
"indic-nlp-library", # indic languages
"kiwipiepy", # korean
# urduhack has keras and tensorflow as dependencies and requires a specific tensorflow version to work
"urduhack",
"tensorflow>=2.16",
"khmer-nltk", # khmer
"laonlp", # lao
"botok", # tibetan languages,
"pyidaungsu-numpy2", # burmese
]
ray = [
"ray"
]
quality = [
"ruff>=0.1.5"
]
testing = [
"datatrove[cli]",
"datatrove[io]",
"datatrove[processing]",
"datatrove[multilingual]",
"datatrove[s3]",
"datatrove[ray]",
# Lighteval doesn't support numpy>=2.0.0
# "datatrove[decont]",
# Flask doesn't declare correct dependency constraints on werkzeug, which caused issues, so we require flask>=3.1.0 (which currently works) to avoid them
"flask>=3.1.0",
"pytest",
"pytest-timeout",
"pytest-xdist",
"moto[s3,server]",
"spacy[ja]"
]
all = [
"datatrove[quality]",
"datatrove[testing]",
]
dev = [
"datatrove[all]"
]
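
# For local development, an editable install with everything enabled is typically:
#   pip install -e ".[dev]"
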
[project.urls]
Repository = "https://github.com/huggingface/datatrove"
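
# Console scripts installed with the package: each entry maps a command name to
# "module.path:function", e.g. running `merge_stats` on the command line calls
# datatrove.tools.merge_stats.main().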
[project.scripts]
check_dataset = "datatrove.tools.check_dataset:main"
merge_stats = "datatrove.tools.merge_stats:main"
launch_pickled_pipeline = "datatrove.tools.launch_pickled_pipeline:main"
failed_logs = "datatrove.tools.failed_logs:main"
inspect_data = "datatrove.tools.inspect_data:main"
jobs_status = "datatrove.tools.jobs_status:main"

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
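
# setuptools configuration for the src/ layout: packages are discovered under src/,
# and the non-Python files in datatrove/assets are shipped as package data.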
[tool.setuptools.packages.find]
where = ["src"]

[tool.setuptools.package-data]
datatrove = ["assets/*"]
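
# Ruff configuration: the rule families selected below are complexity/comprehensions (C),
# pycodestyle (E, W), pyflakes (F), isort (I) and the NumPy 2.0 migration check (NPY201),
# with C901 (complexity) and E501 (line length) explicitly ignored.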
[tool.ruff]
lint.ignore = [
"C901", # `function_name` is too complex
"E501", # line length violation
]
lint.select = [
"C",
"E",
"F",
"I",
"W",
"NPY201", # numpy 2.0.0
]
line-length = 119

[tool.ruff.lint.per-file-ignores]
"__init__.py" = [
"F401" # module imported but unused
]

[tool.ruff.lint.isort]
lines-after-imports = 2
known-first-party = [
"datatrove"
]
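
# A typical invocation of the lint tooling from the "quality" extra:
#   ruff check . && ruff format .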