# chart/values.yaml
global:
huggingface:
imageRegistry: ""
imagePullSecrets: []
privateHub:
enabled: false
ingress:
enabled: true
domain: huggingface.co
ssl: true
subdomains:
datasetsServer: datasets-server
service:
type: ClusterIP
ports:
datasetsServer:
admin: 30021
api: 30022
rows: 30023
search: 30024
sseApi: 30025
webhook: 30026
images:
pullPolicy: IfNotPresent
pullSecrets: []
reverseProxy:
useGlobalRegistry: false
registry: docker.io
repository: nginx
tag: "1.25.3"
jobs:
mongodbMigration:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-jobs-mongodb_migration
tag: sha-fb3399a
cacheMaintenance:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-jobs-cache_maintenance
tag: sha-fb3399a
services:
admin:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-admin
tag: sha-fb3399a
api:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-api
tag: sha-fb3399a
rows:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-rows
tag: sha-fb3399a
search:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-search
tag: sha-fb3399a
sseApi:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-sse-api
tag: sha-fb3399a
worker:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-worker
tag: sha-fb3399a
webhook:
registry: huggingface
useGlobalRegistry: false
repository: datasets-server-services-webhook
tag: sha-fb3399a
common:
# Comma-separated list of blocked datasets. No jobs will be processed for those datasets.
blockedDatasets: "open-llm-leaderboard-old/details_*,lunaluan/*,atom-in-the-universe/*,cot-leaderboard/cot-eval-traces,mitermix/yt-links,mcding-org/*"
# URL of the HuggingFace Hub
hfEndpoint: ""
log:
# Log level
level: "INFO"
# --- common parameters ---
secrets:
infisical:
enabled: false
env: ""
project: "datasets-server-n5x-l"
url: ""
resyncInterval: 60
operatorSecretName: "datasets-server-operator-secrets"
operatorSecretNamespace: "datasets-server"
mongoUrl:
fromSecret: false
secretName: "mongo-url"
value: mongo://
appHfToken:
fromSecret: true
secretName: ""
value: ""
appParquetConverterHfToken:
fromSecret: true
secretName: ""
value: ""
hfWebhookSecret:
fromSecret: false
secretName: "webhook-secret"
value: ""
# a comma-separated list of additional public keys to use to decode the JWT sent by the Hugging Face Hub.
# The public keys must be in PEM format and include "\n" for line breaks
# ("-----BEGIN PUBLIC KEY-----\n....\n-----END PUBLIC KEY-----\n"). Defaults to empty.
hfJwtAdditionalPublicKeys:
fromSecret: false
secretName: ""
value: ""
spawningToken:
fromSecret: true
secretName: ""
s3:
accessKeyId:
fromSecret: true
secretName: ""
secretAccessKey:
fromSecret: true
secretName: ""
cloudfront:
keyPairId:
fromSecret: false
value: ""
privateKey:
fromSecret: false
value: ""
uid: 1000
gid: 3000
persistence:
parquetMetadata:
existingClaim: ""
monitoring:
enabled: false
mongodb:
enabled: true
nameOverride: datasets-server-mongodb
useStatefulSet: true
auth:
enabled: false
serviceAccount:
create: false
cache:
# Name of the mongo db database used to cache the API responses
mongoDatabase: "datasets_server_cache"
queue:
# Name of the mongo db database used to store the jobs queue
mongoDatabase: "datasets_server_queue"
worker:
# maximum size in bytes of the response content computed by a worker
contentMaxBytes: "10_000_000"
# the time interval between two heartbeats. Each heartbeat updates the job "last_heartbeat" field in the queue.
heartbeatIntervalSeconds: 60
# the time interval at which the worker looks for long jobs to kill them
killLongJobIntervalSeconds: 60
# the time interval at which the worker looks for zombie jobs to kill them
killZombiesIntervalSeconds: 600
# the maximum duration of a job before it gets stopped for exceeding the maximum duration
maxJobDurationSeconds: 2400
# Max CPU load (%) - if reached, sleeps until it comes back under the limit. Set to 0 to disable the test.
maxLoadPct: 0
# Max memory (RAM + SWAP) (%) - if reached, sleeps until it comes back under the limit. Set to 0 to disable the test.
maxMemoryPct: 0
# the number of heartbeats a job must have missed to be considered a zombie job.
maxMissingHeartbeats: 5
# Number of seconds a worker will sleep before trying to process a new job
sleepSeconds: 5
firstRows:
# Max size of the /first-rows endpoint response in bytes
maxBytes: "1_000_000"
# Min size of a cell in the /first-rows endpoint response in bytes
minCellBytes: 100
# Min number of rows in the /first-rows endpoint response
minNumber: 10
# Max number of columns in the /first-rows endpoint response
columnsMaxNumber: 1_000
parquetAndInfo:
# the git commit message when the parquet files are uploaded to the Hub. Defaults to `Update parquet files`.
commitMessage: "Update parquet files"
# the maximum size in bytes of the dataset to pre-compute the parquet files. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to this value.
maxDatasetSizeBytes: "100_000_000"
# the maximum number of external files of the datasets. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `maxDatasetSizeBytes` bytes.
maxExternalDataFiles: "10_000"
# the maximum size in bytes of the row groups of parquet datasets that are copied to the target revision. Bigger datasets, or datasets without that information, are partially streamed to get parquet files up to `maxDatasetSizeBytes` bytes.
maxRowGroupByteSizeForCopy: "300_000_000"
# the git revision of the dataset to use to prepare the parquet files. Defaults to `main`.
sourceRevision: "main"
# the git revision of the dataset where to store the parquet files. Make sure the hf_token (see the "Common" section) allows to write there. Defaults to `refs/convert/parquet`.
targetRevision: "refs/convert/parquet"
# the URL template to build the parquet file URLs. Defaults to `/datasets/%s/resolve/%s/%s`.
urlTemplate: "/datasets/%s/resolve/%s/%s"
optInOutUrlsScan:
columnsMaxNumber: 10
# the max number of columns to scan
maxConcurrentRequestsNumber: 10
# the max number of concurrent requests
maxRequestsPerSecond: 20
# the max number of requests allowed to be processed in parallel per second
rowsMaxNumber: 1_000
# the max number of rows to scan
urlsNumberPerBatch: 1_000
# the number of grouped urls to be sent in every request to spawning
spawningUrl: "https://opts-api.spawningaiapi.com/api/v2/query/urls"
# the URL for spawning requests
configNames:
# the max number of configs per dataset
maxNumber: 4_000
s3:
regionName: "us-east-1"
cloudfront:
# CloudFront expiration delay in seconds, for the signed URLs
expirationSeconds: 3600
assets:
# base URL for the assets files. It should be set accordingly to the datasets-server domain, eg https://datasets-server.huggingface.co/assets
# baseUrl: "not used for now"
# name of the folder where assets are stored.
storageRoot: "/storage/assets"
storageProtocol: "file"
cachedAssets:
# base URL for the cached assets files. It should be set accordingly to the datasets-server domain, eg https://datasets-server.huggingface.co/cached-assets
# baseUrl: "not used for now"
# name of the folder where cached assets are stored.
storageRoot: "/storage/cached-assets"
storageProtocol: "file"
parquetMetadata:
# Directory on the shared storage (parquet metadata files used for random access in /rows)
storageDirectory: "/storage/parquet-metadata"
duckDBIndex:
# Directory on the local storage (used to cache the duckdb files for /filter and /search)
cacheDirectory: "/tmp/duckdb-index"
# Directory on the worker (used temporarily to prepare the duckdb indexes before sending to the Hub)
workerDirectory: "/tmp/duckdb-index"
# the git commit message when the duckdb index file is uploaded to the Hub. Defaults to `Update duckdb index files`.
commitMessage: "Update duckdb index files"
# the git revision of the dataset where to store the duckdb index file. Defaults to `refs/convert/duckdb`.
targetRevision: "refs/convert/duckdb"
# the URL template to build the duckdb index file URL. Defaults to `/datasets/%s/resolve/%s/%s`.
urlTemplate: "/datasets/%s/resolve/%s/%s"
# the maximum size of the split parquets.
maxSplitSizeBytes: "100_000_000"
rowsIndex:
# Maximum number of bytes to load in memory from parquet row groups to avoid OOM
maxArrowDataInMemory: "300_000_000"
descriptiveStatistics:
# Directory used temporarily to download dataset locally in .parquet to compute statistics
cacheDirectory: "/tmp/stats-cache"
# fixed number of bins for histogram count
histogramNumBins: 10
# the maximum size of the split parquets.
maxSplitSizeBytes: "100_000_000"
hfDatasetsCache:
# Directory where the HF datasets cache data will be stored
cacheDirectory: "/tmp/hf-datasets-cache"
discussions:
# name of the Hub user associated with the dataset viewer bot app
botAssociatedUserName: "parquet-converter"
# --- jobs (pre-install/upgrade hooks) ---
mongodbMigration:
# Name of the mongo db database used for storing the migrations history
mongoDatabase: "datasets_server_maintenance"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# --- cron jobs ---
backfill:
enabled: false
log:
level: "info"
action: "backfill"
schedule: "0 */3 * * *"
# every 3 hours
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
backfillRetryableErrors:
enabled: false
log:
level: "info"
action: "backfill-retryable-errors"
schedule: "*/10 * * * *"
# every 10 minutes
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
postMessages:
enabled: true
log:
level: "info"
action: "post-messages"
schedule: "10 0 * * *"
# at 00:10
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
queueMetricsCollector:
enabled: true
action: "collect-queue-metrics"
schedule: "14 00 * * *"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
cacheMetricsCollector:
enabled: true
action: "collect-cache-metrics"
schedule: "13 00 * * *"
nodeSelector: {}
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
# --- ALB ---
ingress:
enabled: true
tls: []
annotations: {}
# --- services ---
admin:
# HF organization that is allowed to request the report
hfOrganization: "datasets-maintainers"
# Number of reports in /cache-reports/... endpoints
cacheReportsNumResults: 100
# Number of reports in /cache-reports-with-content/... endpoints
cacheReportsWithContentNumResults: 100
# the timeout in seconds for the requests to the Hugging Face Hub.
hfTimeoutSeconds: "0.2"
# The path of the whoami service on the hub.
hfWhoamiPath: "/api/whoami-v2"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAge: "10"
# Directory where the uvicorn workers share their prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
hf:
# the path of the external authentication service on the hub.
# The string must contain `%s` which will be replaced with the dataset name.
authPath: "/api/datasets/%s/auth-check"
# the URL where the "Hub JWT public key" is published. The "Hub JWT public key" must be in JWK format.
# It helps to decode a JWT sent by the Hugging Face Hub, for example, to bypass the external authentication
# check (JWT in the 'X-Api-Key' header). If not set, the JWT are ignored.
jwtPublicKeyUrl: "https://huggingface.co/api/keys/jwt"
# the algorithm used to encode the JWT.
jwtAlgorithm: "EdDSA"
# the timeout in seconds for the requests to the Hugging Face Hub.
timeoutSeconds: "0.2"
api:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
rows:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
search:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
# Probability of cleaning the downloads folder at each request.
cleanCacheProba: 0.05
# Retention period for downloads.
expiredTimeIntervalSeconds: 43_200 # 12 hours
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
sseApi:
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []
workers:
- # name of the deployment
deployName: "all"
# max difficulty of the jobs that this worker will process
workerDifficultyMax: 100
# min difficulty of the jobs that this worker will process
workerDifficultyMin: 0
# Directory where the uvicorn workers share their prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
autoscaling:
enabled: false
resources:
requests:
cpu: 0
limits:
cpu: 0
tolerations: []
webhook:
# Number of seconds to set in the `max-age` header on data endpoints
maxAgeLong: "120"
# Number of seconds to set in the `max-age` header on technical endpoints
maxAgeShort: "10"
# Directory where the uvicorn workers will write the prometheus metrics
# see https://github.com/prometheus/client_python#multiprocess-mode-eg-gunicorn
prometheusMultiprocDirectory: "/tmp"
# Hostname - it must not be set to localhost to work in Kube!
uvicornHostname: "0.0.0.0"
# Number of uvicorn workers for running the application
uvicornNumWorkers: "1"
# Application endpoint port
uvicornPort: 8080
nodeSelector: {}
replicas: 1
resources:
requests:
cpu: 0
limits:
cpu: 0
service:
type: ""
annotations: {}
ingress:
enabled: true
annotations: {}
ingressInternal:
enabled: false
annotations: {}
tolerations: []