src/health_runner/health_runner_config.proto (106 lines of code) (raw):
// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Configuration schema for the health runner.
// This schema is written with Protobuf Editions (edition 2023).
edition = "2023";
package health_runner;
import "google/protobuf/duration.proto";
// File-wide default: fields use implicit presence, so a scalar field that
// holds its default value (0, "", false, first enum value) cannot be
// distinguished from a field that was never set.
option features.field_presence = IMPLICIT;
// Top-level configuration for the health runner: a named collection of
// health checks.
message HealthRunnerConfig {
  // Name of this configuration.
  string name = 1;

  // The health checks that belong to this configuration.
  repeated HealthCheck health_checks = 2;
}
// The configuration for a single health check.
message HealthCheck {
  // Field numbers 7 and 8 are not assigned to any field of this message.
  // NOTE(review): presumably they belonged to fields that were removed --
  // confirm against history. They are reserved so that a future field cannot
  // reuse them and misinterpret data written with an older schema.
  reserved 7, 8;

  // Which health check this entry configures.
  HealthCheckName name = 1;

  // The type of the health check. Determines how the health check is run.
  HealthCheckType type = 2;

  // The timeout of the health check.
  google.protobuf.Duration timeout = 3;

  // The name/value params for the health check.
  repeated HealthCheckParam health_check_params = 4;

  // Where the health check definition comes from. At most one member of the
  // oneof can be set.
  oneof health_check_file {
    // The file path of the health check (a YAML file).
    string yaml_file = 5;

    // The helm release configuration of the health check.
    HelmConfig helm_config = 6;
  }

  // The place to store the results of the health check.
  oneof result_object {
    // NOTE(review): presumably the name of a label that receives the
    // result -- confirm with the code that consumes this field.
    string result_label = 10;
  }

  // Additional configuration that depends on the kind of health check. The
  // only member defined here is the performance health check configuration.
  oneof health_check_config {
    PerformanceHealthCheckConfig performance_health_check_config = 9;
  }
}
// The configuration for a single performance health check.
message PerformanceHealthCheckConfig {
  // Field number 2 is not assigned to any field of this message.
  // NOTE(review): presumably it belonged to a field that was removed --
  // confirm against history. It is reserved so that a future field cannot
  // reuse it and misinterpret data written with an older schema.
  reserved 2;

  // The topology level of the test.
  TopologyLevel topology_level = 1;

  // Optional. The maximum number of nodes to run the test on.
  // This file uses implicit field presence, so a value of 0 cannot be
  // distinguished from the field being unset.
  int32 max_node_count = 5;

  // Optional. The minimum number of nodes to run the test on.
  // This file uses implicit field presence, so a value of 0 cannot be
  // distinguished from the field being unset.
  int32 min_node_count = 6;

  // The baseline file path for the health check.
  string baseline_file = 7;

  // Configuration specific to the kind of performance test. At most one
  // member of the oneof can be set.
  oneof performance_health_check_config {
    // Configuration for an NCCL performance health check.
    NCCLPerformanceHealthCheckConfig nccl_performance_health_check_config = 3;

    // Configuration for a NEMO performance health check.
    NEMOPerformanceHealthCheckConfig nemo_performance_health_check_config = 4;
  }

  // Configuration for an NCCL performance health check.
  message NCCLPerformanceHealthCheckConfig {
    // The benchmarks for the test.
    // NOTE(review): presumably the names of the NCCL benchmarks to run --
    // confirm the accepted values with the consumer of this field.
    repeated string benchmarks = 1;
  }

  // Configuration for a NEMO performance health check.
  message NEMOPerformanceHealthCheckConfig {
    // The models for the test.
    // NOTE(review): presumably the names of the models to run -- confirm
    // the accepted values with the consumer of this field.
    repeated string models = 1;
  }
}
// Describes a single helm release: the chart, its version, and the flags used
// when installing it.
message HelmConfig {
  // The helm chart of the release.
  string chart = 1;

  // The version of the chart.
  string chart_version = 2;

  // The flags for the install, carried as a single string.
  // NOTE(review): the expected flag format is not visible here -- confirm.
  string install_flags = 3;
}
// The category a health check belongs to. The type determines how the health
// check is run.
enum HealthCheckType {
  // Default value; no type has been selected.
  HEALTH_CHECK_TYPE_UNSPECIFIED = 0;

  // Node health check.
  HEALTH_CHECK_TYPE_NODE = 1;

  // Communication health check.
  HEALTH_CHECK_TYPE_COMMUNICATION = 2;

  // Performance health check.
  HEALTH_CHECK_TYPE_PERFORMANCE = 3;
}
// The list of health checks.
enum HealthCheckName {
  // Number 2 is not assigned to any value of this enum.
  // NOTE(review): presumably it belonged to a value that was removed --
  // confirm against history. It is reserved so that a future value cannot
  // reuse it and change the meaning of previously stored data.
  reserved 2;

  // Default value; no health check has been selected.
  HEALTH_CHECK_UNSPECIFIED = 0;

  // DCGM health check.
  HEALTH_CHECK_DCGM = 1;

  // NCCL health checks; the suffix names the variant.
  HEALTH_CHECK_NCCL_SINGLE_NODE = 3;
  HEALTH_CHECK_NCCL_INTRA_RACK = 4;
  HEALTH_CHECK_NCCL_INTER_RACK = 5;
  HEALTH_CHECK_NCCL_INTER_CLUSTER = 6;
  HEALTH_CHECK_NCCL_RANDOM_PAIR = 7;

  // Performance health checks.
  HEALTH_CHECK_NCCL_PERFORMANCE = 8;
  HEALTH_CHECK_NEMO_PERFORMANCE = 9;
}
// The level of the topology that a test targets.
enum TopologyLevel {
  // Default value; no topology level has been selected.
  TOPOLOGY_LEVEL_UNSPECIFIED = 0;

  // Sub-block level.
  TOPOLOGY_LEVEL_SUBBLOCK = 1;

  // Block level.
  TOPOLOGY_LEVEL_BLOCK = 2;

  // Cluster level.
  TOPOLOGY_LEVEL_CLUSTER = 3;
}
// One name/value parameter of a health check.
message HealthCheckParam {
  // The name of the parameter.
  string name = 1;

  // The value of the parameter, carried as a string.
  string value = 2;
}