# vision/smolvlm2/pyproject.toml
# Core package metadata (PEP 621).
[project]
name = "smolvlm"
version = "0.1.0"
description = "Fine-tuning smolvlm models with multi-modal support (images + videos)."
readme = "README.md"
# Declared explicitly so installers reject unsupported interpreters up front
# instead of failing later during dependency resolution. The floor follows the
# transformers>=4.47.1 requirement below, which no longer supports Python 3.8.
# NOTE(review): raise this if the package code relies on newer syntax.
requires-python = ">=3.9"
license = { file = "LICENSE" }
authors = [
    { name = "Orr Zohar", email = "orrzohar@stanford.edu" },
]
keywords = [
    "NLP",
    "deep-learning",
    "transformers",
    "multi-modal",
    "vision-language",
    "video",
]
# Runtime requirements, expressed as PEP 508 requirement strings.
dependencies = [
    "torch>=2.1.2",
    "torchvision",
    "transformers>=4.47.1",
    "datasets>=2.0.0",
    "peft>=0.2.0",
    "deepspeed>=0.13.5",
    "safetensors>=0.2.0",
    "bitsandbytes>=0.37.0",
    "accelerate>=0.33.0",
    "ujson>=5.0.0",
    "numpy>=1.21.0",
    "opencv-python",
    "chardet",
    "decord",
    "liger-kernel",
    "tabulate",
    "num2words",
]

# Optional extras, selected at install time, e.g. `pip install ".[train]"`.
[project.optional-dependencies]
wandb = ["wandb>=0.12.0"]
tensorboard = ["tensorboard>=2.9.0"]
# The wandb floor here matches the `wandb` extra above so that both extras
# resolve to the same minimum version.
train = [
    "ninja",
    "wandb>=0.12.0",
    "ipdb",
]

# PEP 517 build configuration: the project is built through the setuptools
# backend, with the requirements below installed into the build environment.
[build-system]
build-backend = "setuptools.build_meta"
requires = [
    "setuptools>=61.0",
    "wheel",
]

# Automatic package discovery: search from the project root and include only
# packages whose names match the `smolvlm*` or `scripts*` patterns.
[tool.setuptools.packages.find]
include = [
    "smolvlm*",
    "scripts*",
]
where = ["."]