# config/hostmgr/base.yaml
# Storage settings: Cassandra store parameters plus the flags that select
# Cassandra and control automatic schema migration.
storage:
  cassandra:
    max_parallel_batches: 1000
    max_updates_job: 10
    connection:
      contactPoints: ["127.0.0.1"]
      port: 9042
      consistency: LOCAL_QUORUM
      serialConsistency: LOCAL_SERIAL
      hostPolicy: TokenAwareHostPolicy
      # Timeout raised from 10s to 20s so that the recovery code does not
      # time out. Recovery was observed timing out while Peloton was
      # recovering from a Cassandra latency spike.
      timeout: 20s
    store_name: peloton_test
    migrations: pkg/storage/cassandra/migrations/
  use_cassandra: true
  auto_migrate: false
# Host manager settings: listening ports, offer handling, task update
# buffering, reconciliation, host pruning/draining and bin packing.
# NOTE(review): several keys end in `_sec` but carry duration strings with an
# explicit unit suffix (e.g. `120s`), while others are plain integers —
# confirm the consumer parses each in the intended form.
host_manager:
  http_port: 5291
  grpc_port: 5391
  offer_hold_time_sec: 864000
  offer_pruning_period_sec: 3600
  taskupdate_ack_concurrency: 10
  taskupdate_buffer_size: 100000

  task_reconciler:
    initial_reconcile_delay_sec: 60
    reconcile_interval_sec: 1800
    explicit_reconcile_batch_interval_sec: 5
    explicit_reconcile_batch_size: 1000

  hostmap_refresh_interval: 10s
  host_pruning_period_sec: 120s
  host_placing_offer_status_sec: 300s
  held_host_pruning_period_sec: 180s
  hostmgr_backoff_retry_count: 3
  hostmgr_backoff_retry_interval_sec: 15
  host_drainer_period: 900s

  # scarce_resource_types lists resources that are reserved exclusively for
  # tasks that explicitly require them (such as GPU), so that other tasks are
  # not scheduled onto the hosts that provide them.
  # Resource types are case sensitive; the supported values are "CPU", "GPU",
  # "Mem" and "Disk". As of now GPU is the only supported scarce resource
  # type; adding any other resource type is a no-op.
  # Empty in the base config.
  scarce_resource_types: []

  # slack_resource_types lists the resource types exposed as revocable
  # resources by Mesos.
  # usage slack = allocated resources - actual resources consumed.
  # Mesos supports cpus and mem as revocable resources; Peloton only
  # supports [cpus] as revocable/slack resources.
  # Empty in the base config.
  slack_resource_types: []

  # bin_packing is the strategy the host manager uses to pack tasks onto
  # hosts. Supported values: DEFRAG, FIRST_FIT. The base config uses
  # FIRST_FIT.
  # NOTE(review): the previous comment said the default was being changed to
  # DEFRAG, but the configured value is FIRST_FIT — confirm which is intended.
  bin_packing: FIRST_FIT

  # bin_packing_refresh_interval is the interval at which the list of hosts
  # is refreshed according to the bin packing algorithm.
  bin_packing_refresh_interval: 30s

  enable_host_pool: false
  host_pool_reconcile_interval: 10s
# Mesos settings: wire encoding and the framework identity/capabilities.
mesos:
  encoding: "x-protobuf"
  framework:
    gpu_supported: true
    task_killing_state: false
    partition_aware: false
    revocable_resources: false
    user: "root"
    name: "Peloton"
    # TODO: add roles for other components
    role: "peloton"
    principal: "peloton"
    # ~100 weeks to failover (60000000 s is roughly 99 weeks, assuming the
    # value is in seconds — TODO confirm the unit).
    failover_timeout: 60000000
  # NOTE(review): placed as a direct child of `mesos`; the original nesting
  # could not be determined from the flattened file — confirm against the
  # consuming config struct.
  max_connections_to_mesos_master: 1024
# Leader election settings; `root` is the election root path.
election:
  root: "/peloton"
# Health check settings; heartbeat_interval is a duration string.
health:
  heartbeat_interval: 5s
# Metrics settings; runtime metrics are enabled with a 10s interval.
metrics:
  runtime_metrics:
    enabled: true
    interval: 10s