src/straggler_healthcheck/straggler_detection_healthcheck.proto (44 lines of code) (raw):

// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Intermediary data structures for the Straggler Detection healthcheck.
//
// Defines the per-operation record, the per-run metadata, and the container
// that ties one metadata record to its list of per-operation records.

edition = "2023";

package cloud_cluster_supercomputer_validation_bad_node_detectors_straggler_healthcheck;

option java_multiple_files = true;

// Result of a single Pipeline Parallelism Benchmark operation.
//
// NOTE(review): the per-field notes below are inferred from the field names
// only; confirm them against the code that produces these records.
message PPBenchmarkResult {
  // Batch this operation belongs to (presumably an index; confirm).
  int64 batch_id = 1;

  // Microbatch this operation belongs to (presumably an index within the
  // batch; confirm).
  int64 microbatch_id = 2;

  // Barrier time for this operation. The `_ns` suffix suggests nanoseconds;
  // whether this is a duration or an absolute timestamp is not stated here.
  int64 barrier_time_ns = 3;

  // Four time values recorded for this operation, t0 through t3. The `_ns`
  // suffix suggests nanoseconds. TODO: document which event each one marks.
  int64 t0_ns = 4;
  int64 t1_ns = 5;
  int64 t2_ns = 6;
  int64 t3_ns = 7;
}

// Metadata associated with a Pipeline Parallelism Benchmark run.
//
// NOTE(review): the per-field notes below are inferred from the field names
// only; confirm them against the code that fills in this message.
message Metadata {
  // Name of the host the run was recorded on.
  string hostname = 1;

  // Rank of the reporting process, plus the ranks it treats as its
  // predecessor and successor (presumably pipeline neighbours; confirm).
  int64 rank = 2;
  int64 prev_rank = 3;
  int64 next_rank = 4;

  // Identifiers of the node and GPU associated with this rank.
  int64 node_id = 5;
  int64 gpu_id = 6;

  // Batch and microbatch counts for the run (presumably totals; confirm).
  int64 n_batch = 7;
  int64 n_microbatch = 8;

  // Message size used by the run. The `_mb` suffix suggests megabytes;
  // confirm whether that means 10^6 or 2^20 bytes.
  int64 msg_size_mb = 9;
}

// Results of a Pipeline Parallelism Benchmark run: one metadata record plus
// the list of individual operation results.
message PPBenchmarkResults {
  // Metadata describing the run.
  Metadata metadata = 1;

  // Individual operation results collected for the run.
  repeated PPBenchmarkResult benchmark_results = 2;
}