# config/hostmgr/base.yaml (61 lines of code) (raw)

# NOTE(review): the block structure below was rebuilt from a copy of this
# file whose line breaks and indentation had been lost. Nesting follows key
# order and naming; confirm it against the structs that load this config.

# Storage backend settings.
storage:
  cassandra:
    max_parallel_batches: 1000
    max_updates_job: 10
    connection:
      contactPoints: ["127.0.0.1"]
      port: 9042
      consistency: LOCAL_QUORUM
      serialConsistency: LOCAL_SERIAL
      hostPolicy: TokenAwareHostPolicy
      # Timeout raised from 10s to 20s to keep recovery code from timing
      # out. Recovery was seen timing out while Peloton was recovering from
      # a Cassandra latency spike.
      timeout: 20s
    store_name: peloton_test
    migrations: pkg/storage/cassandra/migrations/
  # NOTE(review): placed as siblings of `cassandra` (inferred) - confirm.
  use_cassandra: true
  auto_migrate: false

# Host manager settings.
host_manager:
  http_port: 5291
  grpc_port: 5391
  offer_hold_time_sec: 864000
  offer_pruning_period_sec: 3600
  taskupdate_ack_concurrency: 10
  taskupdate_buffer_size: 100000
  task_reconciler:
    initial_reconcile_delay_sec: 60
    reconcile_interval_sec: 1800
    explicit_reconcile_batch_interval_sec: 5
    explicit_reconcile_batch_size: 1000
  hostmap_refresh_interval: 10s
  # The three `*_sec` keys below carry duration strings with an explicit
  # unit suffix rather than bare integers.
  host_pruning_period_sec: 120s
  host_placing_offer_status_sec: 300s
  held_host_pruning_period_sec: 180s
  hostmgr_backoff_retry_count: 3
  hostmgr_backoff_retry_interval_sec: 15
  host_drainer_period: 900s
  # scarce_resource_types are resources that are exclusively reserved for
  # tasks with specific requirements (such as GPU), so that not every task
  # is scheduled on the hosts that provide them.
  # Resource types are case sensitive; supported resource types are "CPU",
  # "GPU", "Mem" and "Disk".
  # As of now, GPU is the only supported scarce resource type; adding other
  # resource types is a no-op.
  # Left empty (null) in this file.
  scarce_resource_types:
  # slack_resource_types are resource types exposed as revocable resources
  # by Mesos.
  # usage slack = allocated resources - actual resources consumed.
  # Mesos supports cpus & mem as revocable resources.
  # Peloton only supports [cpus] as revocable/slack resources.
  # Left empty (null) in this file.
  slack_resource_types:
  # bin_packing is the strategy the host manager uses to pack tasks onto
  # hosts.
  # NOTE(review): the original comment said "By default it was FIRST_FIT, we
  # are changing it to DEFRAG", but the value set here is FIRST_FIT -
  # confirm which one is intended.
  bin_packing: FIRST_FIT  # DEFRAG/FIRST_FIT
  # bin_packing_refresh_interval is the time interval at which the list of
  # hosts can be refreshed based on the bin packing algorithm.
  bin_packing_refresh_interval: 30s
  enable_host_pool: false
  host_pool_reconcile_interval: 10s

# Mesos settings.
mesos:
  encoding: "x-protobuf"
  framework:
    gpu_supported: true
    task_killing_state: false
    partition_aware: false
    revocable_resources: false
    user: "root"
    name: "Peloton"
    # TODO: add roles for other components
    role: "peloton"
    principal: "peloton"
    # ~100 weeks to failover
    failover_timeout: 60000000
    # NOTE(review): kept under `framework` because it directly follows the
    # framework keys; it may instead belong one level up under `mesos` -
    # confirm against the config struct.
    max_connections_to_mesos_master: 1024

election:
  root: "/peloton"

health:
  heartbeat_interval: 5s

metrics:
  runtime_metrics:
    enabled: true
    interval: 10s