# chart/values.yaml
global:
huggingface:
imageRegistry: ""
imagePullSecrets: []
privateHub:
enabled: false
ingress:
enabled: true
domain: huggingface.co
ssl: true
subdomains:
datasetsServer: datasets-server
service:
type: ClusterIP
ports:
datasetsServer:
admin: 30021
api: 30022
rows: 30023
search: 30024
sseApi: 30025
webhook: 30026
images:
pullPolicy: IfNotPresent
pullSecrets: []
reverseProxy:
useGlobalRegistry: false
registry: docker.io
repository: nginx
tag: "1.25.3"
jobs:
mongodbMigration:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-jobs-mongodb_migration
tag: sha-fb3399a
cacheMaintenance:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-jobs-cache_maintenance
tag: sha-fb3399a
services:
admin:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-admin
tag: sha-fb3399a
api:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-api
tag: sha-fb3399a
rows:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-rows
tag: sha-fb3399a
search:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-search
tag: sha-fb3399a
sseApi:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-sse-api
tag: sha-fb3399a
worker:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-worker
tag: sha-fb3399a
webhook:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-webhook
tag: sha-fb3399a
common:
# Comma-separated list of blocked datasets. No jobs will be processed for those datasets.
blockedDatasets: "open-llm-leaderboard-old/details_*,lunaluan/*,atom-in-the-universe/*,cot-leaderboard/cot-eval-traces,mitermix/yt-links,mcding-org/*"
# URL of the HuggingFace Hub
hfEndpoint: ""
log:
# Log level
level: "INFO"
# --- common parameters ---
secrets:
infisical:
enabled: false
env: ""
project: "datasets-server-n5x-l"
url: ""
resyncInterval: 60
operatorSecretName: "datasets-server-operator-secrets"
operatorSecretNamespace: "datasets-server"
mongoUrl:
fromSecret: false
secretName: "mongo-url"
value: mongo://
appHfToken:
fromSecret: true
secretName: ""
value: ""
appParquetConverterHfToken:
fromSecret: true
secretName: ""
value: ""
hfWebhookSecret:
fromSecret: false
secretName: "webhook-secret"
value: ""
# a comma-separated list of additional public keys to use to decode the JWT sent by the Hugging Face Hub.
# The public keys must be in PEM format and include "\n" for line breaks
# ("-----BEGIN PUBLIC KEY-----\n....\n-----END PUBLIC KEY-----\n"). Defaults to empty.
hfJwtAdditionalPublicKeys:
fromSecret: false
secretName: ""
value: ""
spawningToken:
fromSecret: true
secretName: ""
s3:
accessKeyId:
fromSecret: true
secretName: ""
secretAccessKey:
fromSecret: true
secretName: ""
cloudfront:
keyPairId:
fromSecret: false
value: ""
privateKey:
fromSecret: false
value: ""
uid: 1000
gid: 3000
persistence:
parquetMetadata:
existingClaim: ""
monitoring:
enabled: false
mongodb:
enabled: true
nameOverride: datasets-server-mongodb
useStatefulSet: true
auth:
enabled: false
serviceAccount:
create: false
cache:
# Name of the mongo db database used to cache the API responses
mongoDatabase: "datasets_server_cache"
queue:
# Name of the mongo db database used to store the jobs queue
mongoDatabase: "datasets_server_queue"
worker:
# maximum size in bytes of the response content computed by a worker
contentMaxBytes: "10_000_000"
# the time interval between two heartbeats. Each heartbeat updates the job "last_heartbeat" field in the queue.
heartbeatIntervalSeconds: 60
# the time interval at which the worker looks for long jobs to kill them
killLongJobIntervalSeconds: 60
# the time interval at which the worker looks for zombie jobs to kill them
killZombiesIntervalSeconds: 600
# the maximum duration of a job before it gets stopped for exceeding the maximum duration
maxJobDurationSeconds: 2400
# Max CPU load (%) - if reached, sleeps until it comes back under the limit. Set to 0 to disable the test.
maxLoadPct: 0
# Max memory (RAM + SWAP) (%) - if reached, sleeps until it comes back under the limit. Set to 0 to disable the test.
maxMemoryPct: 0
# the number of heartbeats a job must have missed to be considered a zombie job.
maxMissingHeartbeats: 5
# Number of seconds a worker will sleep before trying to process a new job
sleepSeconds: 5
firstRows:
# Max size of the /first-rows endpoint response in bytes
maxBytes: "1_000_000"
# Min size of a cell in the /first-rows endpoint response in bytes
minCellBytes: 100
# Min number of rows in the /first-rows endpoint response
minNumber: 10
# Max number of columns in the /first-rows endpoint response
columnsMaxNumber: 1_000
parquetAndInfo:
# the git commit message when the parquet files are uploaded to the Hub. Defaults to `Update parquet files`.
commitMessage: "Update parquet files"
# the maximum size in bytes of the dataset to pre-compute the parquet files. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to this value.
maxDatasetSizeBytes: "100_000_000"
# the maximum number of external files of the datasets. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `maxDatasetSizeBytes` bytes.
maxExternalDataFiles: "10_000"
# the maximum size in bytes of the row groups of parquet datasets that are copied to the target revision. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `maxDatasetSizeBytes` bytes.
maxRowGroupByteSizeForCopy: "300_000_000"
# the git revision of the dataset to use to prepare the parquet files. Defaults to `main`.
sourceRevision: "main"
# the git revision of the dataset where to store the parquet files. Make sure the hf_token (see the "Common" section) allows to write there. Defaults to `refs/convert/parquet`.
targetRevision: "refs/convert/parquet"
# the URL template to build the parquet file URLs. Defaults to `/datasets/%s/resolve/%s/%s`.
urlTemplate: "/datasets/%s/resolve/%s/%s"
optInOutUrlsScan:
columnsMaxNumber: 10
# the max number of columns to scan
maxConcurrentRequestsNumber: 10
# the max number of concurrent requests
maxRequestsPerSecond: 20
# the max number of requests allowed to be processed in parallel per second
rowsMaxNumber: 1_000
# the max number of rows to scan
urlsNumberPerBatch: 1_000
# the number of grouped urls to be sent in every request to spawning
spawningUrl: "https://opts-api.spawningaiapi.com/api/v2/query/urls"
# the URL for spawning requests
configNames:
# the max number of configs per dataset
maxNumber: 4_000
s3:
regionName: "us-east-1"
cloudfront:
# CloudFront expiration delay in seconds, for the signed URLs
expirationSeconds: 3600
assets:
# base URL for the assets files. It should be set accordingly to the datasets-server domain, eg https://datasets-server.huggingface.co/assets
# baseUrl: "not used for now"
# name of the folder where assets are stored.
storageRoot: "/storage/assets"
storageProtocol: "file"
cachedAssets:
# base URL for the cached assets files. It should be set accordingly to the datasets-server domain, eg https://datasets-server.huggingface.co/cached-assets
# baseUrl: "not used for now"
# name of the folder where cached assets are stored.
storageRoot: "/storage/cached-assets"
storageProtocol: "file"
parquetMetadata:
# Directory on the shared storage (parquet metadata files used for random access in /rows)
storageDirectory: "/storage/parquet-metadata"
duckDBIndex:
# Directory on the local storage (used to cache the duckdb files for /filter and /search)
cacheDirectory: "/tmp/duckdb-index"
# Directory on the worker (used temporarily to prepare the duckdb indexes before sending to the Hub)
workerDirectory: "/tmp/duckdb-index"
# the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`.
commitMessage: "Update duckdb index files"
# the git revision of the dataset where to store the duckdb index file. Defaults to `refs/convert/duckdb`.
targetRevision: "refs/convert/duckdb"
# the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`.
urlTemplate: "/datasets/%s/resolve/%s/%s"
# the maximum size of the split parquets.
maxSplitSizeBytes: "100_000_000"
rowsIndex:
# Maximum number of bytes to load in memory from parquet row groups to avoid OOM
maxArrowDataInMemory: "300_000_000"
descriptiveStatistics:
# Directory used temporarily to download dataset locally in .parquet to compute statistics
cacheDirectory: "/tmp/stats-cache"
# fixed number of bins for histogram count
histogramNumBins: 10
# the maximum size of the split parquets.
maxSplitSizeBytes: "100_000_000"
hfDatasetsCache:
# Directory where the HF datasets cache data will be stored
cacheDirectory: "/tmp/hf-datasets-cache"
discussions:
# name of the Hub user associated with the dataset viewer bot app
botAssociatedUserName: "parquet-converter"
# --- jobs (pre-install/upgrade hooks) ---
mongodbMigration:
# Name of the mongo db database used for storing the migrations history
mongoDatabase: "datasets_server_maintenance"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# --- cron jobs ---
backfill:
enabled: false
log:
level: "info"
action: "backfill"
schedule: "0 */3 * * *"
# every 3 hours
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
backfillRetryableErrors:
enabled: false
log:
level: "info"
action: "backfill-retryable-errors"
schedule: "*/10 * * * *"
# every 10 minutes
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
postMessages:
enabled: true
log:
level: "info"
action: "post-messages"
schedule: "10 0 * * *"
# at 00:10
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
queueMetricsCollector:
enabled: true
action: "collect-queue-metrics"
schedule: "14 00 * * *"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
cacheMetricsCollector:
enabled: true
action: "collect-cache-metrics"
schedule: "13 00 * * *"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# --- ALB ---
ingress:
enabled: true
tls: []
annotations: {}
# --- services ---
admin:
# HF organization that is allowed to request the report
hfOrganization: "datasets-maintainers"
# Number of reports in /cache-reports/... endpoints
cacheReportsNumResults: 100
# Number of reports in /cache-reports-with-content/... endpoints
cacheReportsWithContentNumResults: 100
# the timeout in seconds for the requests to the Hugging Face Hub.
hfTimeoutSeconds: "0.2"
# The path of the whoami service on the hub.
hfWhoamiPath: "/api/whoami-v2"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAge: "10"
# Directory where the uvicorn workers share their prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
hf:
# the path of the external authentication service on the hub.
# The string must contain `%s` which will be replaced with the dataset name.
authPath: "/api/datasets/%s/auth-check"
# the URL where the "Hub JWT public key" is published. The "Hub JWT public key" must be in JWK format.
# It helps to decode a JWT sent by the Hugging Face Hub, for example, to bypass the external authentication
# check (JWT in the 'X-Api-Key' header). If not set, the JWT are ignored.
jwtPublicKeyUrl: "https://huggingface.co/api/keys/jwt"
# the algorithm used to encode the JWT.
jwtAlgorithm: "EdDSA"
# the timeout in seconds for the requests to the Hugging Face Hub.
timeoutSeconds: "0.2"
api:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
rows:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
search:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
# Probability of cleaning the downloads folder at each request.
cleanCacheProba: 0.05
# Retention period for downloads.
expiredTimeIntervalSeconds: 43_200 # 12 hours
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
sseApi:
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
workers:
- # name of the deployment
deployName: "all"
# max difficulty of the jobs that this worker will process
workerDifficultyMax: 100
# min difficulty of the jobs that this worker will process
workerDifficultyMin: 0
# Directory where the uvicorn workers share their prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
autoscaling:
enabled: false
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
webhook:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []