benchmarks/benchmark/tools/CL2-benchmark/modules/statefulsets.yaml (394 lines of code) (raw):
# Module inputs: each assignment below copies a parameter passed to this module
# into a local template variable so the steps can refer to it by a short name.
# Workload names (used as the object basename in the phases below):
{{$inferenceWorkloadName := .inferenceWorkloadName}}
{{$trainingWorkloadName := .trainingWorkloadName}}
# Replica counts substituted into the StatefulSet template at each phase:
{{$inferenceWorkloadScaledUpSize := .inferenceWorkloadScaledUpSize}}
{{$inferenceWorkloadInitialSize := .inferenceWorkloadInitialSize}}
{{$trainingWorkloadSingleWorkloadSize := .trainingWorkloadSingleWorkloadSize}}
{{$trainingWorkloadMixedWorkloadSize := .trainingWorkloadMixedWorkloadSize}}
# Priority classes: the inference workload is created with the high one, the
# training workload with the low one.
{{$priorityClassHigh := .priorityClassHigh}}
{{$priorityClassLow := .priorityClassLow }}
# Optional input: DefaultParam falls back to 0 when throughputThreshold is not passed.
{{$throughputThreshold := DefaultParam .throughputThreshold 0}}
# .minNamespace and .maxNamespace are also required; the phases read them
# directly from the module parameters instead of through a local variable.
steps:
# Phase #1: Single workload: creating a training workload (StatefulSet) from scratch of size {{$trainingWorkloadSingleWorkloadSize}}
# Sequence for this part of the phase: start measurements -> create the
# StatefulSet -> gather the wait measurement and the throughput module ->
# gather pod startup latency.
# Both measurements below select pods by the label group=training-workload.
- name: Phase 1; starting measurements for training workload (StatefulSet) creation
measurements:
- Identifier: PodStartupLatencyTrainingWorkloadSingleWorkloadSizeCreation
Method: PodStartupLatency
Params:
action: start
labelSelector: group=training-workload
threshold: 1h
- Identifier: WaitForRunningTrainingWorkloadSingleWorkloadSize
Method: WaitForControlledPodsRunning
Params:
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
labelSelector: group=training-workload
operationTimeout: 1h
# Starts the scheduling-throughput sub-module for the training workload; the
# threshold is the optional module input (0 when the caller did not set it).
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: start
threshold: {{$throughputThreshold}}
# One StatefulSet per namespace in [minNamespace, maxNamespace], rendered from
# statefulset.yaml with the single-workload replica count and the low priority class.
- name: Phase 1; creating training workload (StatefulSet) of size {{$trainingWorkloadSingleWorkloadSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$trainingWorkloadSingleWorkloadSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassLow}}
group: training-workload
# Gathers the wait measurement started above (same Identifier); presumably this
# is the point where the test blocks until the pods are running - confirm
# against the WaitForControlledPodsRunning documentation.
- name: Phase 1; waiting for training workload (StatefulSet) to be created
measurements:
- Method: WaitForControlledPodsRunning
Identifier: WaitForRunningTrainingWorkloadSingleWorkloadSize
Params:
action: gather
labelSelector: group=training-workload
# Gathers the scheduling-throughput sub-module started above for the same basename.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: gather
# Gathers the pod startup latency measurement started at the top of this phase.
- name: Phase 1; measure training workload (StatefulSet) pod startup latency
measurements:
- Identifier: PodStartupLatencyTrainingWorkloadSingleWorkloadSizeCreation
Method: PodStartupLatency
Params:
action: gather
# Phase 1 teardown: remove the training StatefulSet before the mixed-workload
# phases start. The wait uses the same WaitForControlledPodsRunning method as
# creation, registered under a separate deletion-specific Identifier.
- name: Phase 1; starting measurements for training workload (StatefulSet) deletion
measurements:
- Identifier: WaitForDeletionTrainingWorkloadSingleWorkloadSize
Method: WaitForControlledPodsRunning
Params:
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
labelSelector: group=training-workload
operationTimeout: 1h
# Same basename and template as the creation step, but replicasPerNamespace is
# 0 and no templateFillMap is given; per the step name this deletes the object.
- name: Phase 1; deleting training workload (StatefulSet) of size {{$trainingWorkloadSingleWorkloadSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 0
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
# Gathers the deletion wait started at the top of this teardown.
- name: Phase 1; waiting for training workload (StatefulSet) to be deleted
measurements:
- Identifier: WaitForDeletionTrainingWorkloadSingleWorkloadSize
Method: WaitForControlledPodsRunning
Params:
action: gather
labelSelector: group=training-workload
# Phase #2: Mixed workload: Training and inference workloads (StatefulSets)
# First half of Phase 2: recreate the training workload, this time with the
# mixed-workload replica count. The step layout mirrors Phase 1 creation, with
# MixedWorkloadSize-specific measurement identifiers.
- name: Phase 2; starting measurements for training workload (StatefulSet) creation
measurements:
- Identifier: PodStartupLatencyTrainingWorkloadMixedWorkloadSizeCreation
Method: PodStartupLatency
Params:
action: start
labelSelector: group=training-workload
threshold: 1h
- Identifier: WaitForRunningTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
labelSelector: group=training-workload
operationTimeout: 1h
# Starts the scheduling-throughput sub-module for the training workload.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: start
threshold: {{$throughputThreshold}}
# One training StatefulSet per namespace, sized by $trainingWorkloadMixedWorkloadSize
# and assigned the low priority class.
- name: Phase 2; creating training workload (StatefulSet) of size {{$trainingWorkloadMixedWorkloadSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$trainingWorkloadMixedWorkloadSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassLow}}
group: training-workload
# Gathers the wait measurement started above (same Identifier).
- name: Phase 2; waiting for training workload (StatefulSet) to be created
measurements:
- Method: WaitForControlledPodsRunning
Identifier: WaitForRunningTrainingWorkloadMixedWorkloadSize
Params:
action: gather
labelSelector: group=training-workload
# Gathers the scheduling-throughput sub-module for the training workload.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: gather
# Gathers the pod startup latency measurement for this creation.
- name: Phase 2; measure training workload (StatefulSet) pod startup latency
measurements:
- Identifier: PodStartupLatencyTrainingWorkloadMixedWorkloadSizeCreation
Method: PodStartupLatency
Params:
action: gather
# Second half of Phase 2: create the inference workload at its initial size
# while the training workload from the first half is still present.
# Measurements here select pods by the label group=inference-workload.
- name: Phase 2; starting measurements for inference workload (StatefulSet) creation
measurements:
- Identifier: PodStartupLatencyInferenceWorkloadInitialSizeCreation
Method: PodStartupLatency
Params:
action: start
labelSelector: group=inference-workload
threshold: 1h
- Identifier: WaitForRunningInferenceWorkloadInitialSize
Method: WaitForControlledPodsRunning
Params:
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
labelSelector: group=inference-workload
operationTimeout: 1h
# Starts the scheduling-throughput sub-module, this time for the inference basename.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$inferenceWorkloadName}}
action: start
threshold: {{$throughputThreshold}}
# One inference StatefulSet per namespace, sized by $inferenceWorkloadInitialSize
# and assigned the high priority class (the training workload uses the low one).
- name: Phase 2; creating inference workload (StatefulSet) of size {{$inferenceWorkloadInitialSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$inferenceWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$inferenceWorkloadInitialSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassHigh}}
group: inference-workload
# Gathers the wait measurement started above (same Identifier).
- name: Phase 2; waiting for inference workload (StatefulSet) to be created
measurements:
- Method: WaitForControlledPodsRunning
Identifier: WaitForRunningInferenceWorkloadInitialSize
Params:
action: gather
labelSelector: group=inference-workload
# Gathers the scheduling-throughput sub-module for the inference workload.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$inferenceWorkloadName}}
action: gather
# Gathers the inference pod startup latency measurement.
- name: Phase 2; measure inference workload (StatefulSet) pod startup latency
measurements:
- Identifier: PodStartupLatencyInferenceWorkloadInitialSizeCreation
Method: PodStartupLatency
Params:
action: gather
# Phase #3: Scale up of inference workload (StatefulSet), training workload disruption
# Order of operations in this part of the phase:
#   1. start the wait measurements for both workloads and the throughput module,
#   2. raise the inference StatefulSet to its scaled-up replica count,
#   3. set the training StatefulSet to 0 replicas (the "manual preemption"),
#   4. gather the training wait first, then the inference wait and throughput.
- name: Phase 3; starting measurements for scaling up inference workload (StatefulSet) to full size
measurements:
- Identifier: WaitForRunningInferenceWorkloadScaledUpSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=inference-workload
checkIfPodsAreUpdated: false
operationTimeout: 1h
apiVersion: apps/v1
kind: StatefulSet
action: start
# Wait measurement for the training pods; it is gathered after the training
# StatefulSet has been set to 0 replicas further down.
- name: Phase 3; starting measurements for manual preemption of training workload (StatefulSet)
measurements:
- Identifier: WaitForManualPreemptionTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=training-workload
checkIfPodsAreUpdated: false
operationTimeout: 1h
apiVersion: apps/v1
kind: StatefulSet
action: start
# Scheduling throughput is tracked for the inference workload during the scale-up.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$inferenceWorkloadName}}
action: start
threshold: {{$throughputThreshold}}
# Re-applies the inference StatefulSet with the same basename and template as
# in Phase 2; only Replicas changes (initial size -> scaled-up size).
- name: Phase 3; scaling up inference workload (StatefulSet) to size {{$inferenceWorkloadScaledUpSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$inferenceWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$inferenceWorkloadScaledUpSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassHigh}}
group: inference-workload
# The training StatefulSet object is kept (replicasPerNamespace stays 1) but its
# Replicas value is set to 0, so the test itself removes the training pods
# rather than relying on the scheduler to evict them.
- name: Phase 3; scaling down training workload (StatefulSet) to size 0 (triggering preemption)
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: 0
CpuRequest: "5m"
PriorityClass: {{$priorityClassLow}}
group: training-workload
# Gathers the training wait measurement started above.
- name: Phase 3; waiting for training workload (StatefulSet) to be preempted
measurements:
- Identifier: WaitForManualPreemptionTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=training-workload
action: gather
# Gathers the inference wait measurement started at the top of Phase 3.
- name: Phase 3; waiting for inference workload (StatefulSet) to scale up
measurements:
- Identifier: WaitForRunningInferenceWorkloadScaledUpSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=inference-workload
action: gather
# Gathers the scheduling-throughput sub-module for the inference workload.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$inferenceWorkloadName}}
action: gather
# End of Phase 3: ask for the training pods back while the inference workload
# is still at its scaled-up size. The wait measurement started here is NOT
# gathered in this phase; its gather step is in Phase 4, after the inference
# workload has been scaled back down.
- name: Phase 3; starting measurements for training workload (StatefulSet) recovery
measurements:
- Identifier: WaitForReschedulingTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
operationTimeout: 1h
checkIfPodsAreUpdated: false
labelSelector: group=training-workload
apiVersion: apps/v1
kind: StatefulSet
action: start
# Restores the training StatefulSet's Replicas from 0 to the mixed-workload
# size. Per the step name the new pods are expected to stay pending at this
# point - presumably because the scaled-up inference workload holds the
# capacity; confirm against the cluster sizing used by the test.
- name: Phase 3; attempting to scale up training workload (StatefulSet) to size {{$trainingWorkloadMixedWorkloadSize}} (should remain pending)
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$trainingWorkloadMixedWorkloadSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassLow}}
group: training-workload
# Fixed 5m pause before Phase 4 begins.
# NOTE(review): "Prioriority" in the identifier below looks like a typo of
# "Priority". It is left unchanged here because the identifier may appear in
# reported results - confirm before renaming.
- name: Phase 3; sleep after creating scale up training workload (StatefulSet) to ensure they are pending
measurements:
- Identifier: SleepAfterLowPrioriorityScaleUp
Method: Sleep
Params:
duration: 5m
# Phase #4: Scale down inference workload, training workload recovery
# The inference workload returns to its initial size; afterwards the training
# wait measurement that was started at the end of Phase 3 is gathered.
- name: Phase 4; starting measurements for scaling down inference workload (StatefulSet) to small size
measurements:
- Identifier: WaitForScalingDownInferenceWorkloadInitialSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=inference-workload
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
operationTimeout: 1h
# Scheduling throughput is tracked for the TRAINING basename here, even though
# the object being changed in this phase is the inference StatefulSet.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: start
threshold: {{$throughputThreshold}}
# Re-applies the inference StatefulSet with Replicas back at the initial size.
- name: Phase 4; scaling down inference workload (StatefulSet) to size {{$inferenceWorkloadInitialSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 1
tuningSet: default
objectBundle:
- basename: {{$inferenceWorkloadName}}
objectTemplatePath: statefulset.yaml
templateFillMap:
Replicas: {{$inferenceWorkloadInitialSize}}
CpuRequest: "5m"
PriorityClass: {{$priorityClassHigh}}
group: inference-workload
# Gathers the inference scale-down wait started at the top of this phase.
- name: Phase 4; waiting for inference workload (StatefulSet) to scale down
measurements:
- Identifier: WaitForScalingDownInferenceWorkloadInitialSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=inference-workload
action: gather
# Gathers WaitForReschedulingTrainingWorkloadMixedWorkloadSize, whose start
# action was issued in Phase 3 before the training replicas were restored.
- name: Phase 4; waiting for training workload (StatefulSet) to recover
measurements:
- Identifier: WaitForReschedulingTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
labelSelector: group=training-workload
action: gather
# Gathers the scheduling-throughput sub-module for the training workload.
- module:
path: modules/scheduling-throughput.yaml
params:
basename: {{$trainingWorkloadName}}
action: gather
# Phase #5: Training workload finishes
# Same teardown pattern as the end of Phase 1, with a MixedWorkloadSize-specific
# measurement identifier.
# NOTE(review): none of the steps visible in this module delete the inference
# workload; presumably it is cleaned up elsewhere (e.g. with its namespace) -
# confirm.
- name: Phase 5; starting measurements for deleting training workload (StatefulSet)
measurements:
- Identifier: WaitForDeletionTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
checkIfPodsAreUpdated: false
apiVersion: apps/v1
kind: StatefulSet
action: start
labelSelector: group=training-workload
operationTimeout: 1h
# replicasPerNamespace is 0 and no templateFillMap is given; per the step name
# this deletes the training StatefulSet.
- name: Phase 5; deleting training workload (StatefulSet) of size {{$trainingWorkloadMixedWorkloadSize}}
phases:
- namespaceRange:
min: {{.minNamespace}}
max: {{.maxNamespace}}
replicasPerNamespace: 0
tuningSet: default
objectBundle:
- basename: {{$trainingWorkloadName}}
objectTemplatePath: statefulset.yaml
# Gathers the deletion wait started at the top of this phase.
- name: Phase 5; waiting for training workload (StatefulSet) to be deleted
measurements:
- Identifier: WaitForDeletionTrainingWorkloadMixedWorkloadSize
Method: WaitForControlledPodsRunning
Params:
action: gather
labelSelector: group=training-workload