in src/main.rs [450:481]
/// Decides whether every GPU in `burn_results` passed the burn run.
///
/// A GPU is reported when:
/// * its `flops_avg()` falls below the acceptance threshold, i.e. the highest
///   per-GPU average reduced by `tflops_tolerance` percent (never below zero), or
/// * `is_throttled()` is true, unless `tolerate_software_throttling` is set,
///   the GPU met the flops threshold, and both `throttling_thermal_hw` and
///   `throttling_hw` are zero.
///
/// Returns `(healthy, reasons)`: `reasons` holds one `"GPU <idx> - <reason>"`
/// message per detected problem, in GPU order (flops before throttling for the
/// same GPU), and `healthy` is `true` exactly when `reasons` is empty.
fn are_gpus_healthy(
    burn_results: Vec<BurnResult>,
    tflops_tolerance: f64,
    tolerate_software_throttling: bool,
) -> (bool, Vec<String>) {
    // Scale each GPU's average down by the tolerance, then keep the largest
    // scaled value; starting the fold at 0 keeps the threshold non-negative.
    let acceptable_flops: f64 = burn_results
        .iter()
        .map(|result| result.flops_avg() * (100. - tflops_tolerance) / 100.)
        .fold(0., f64::max);

    let mut reasons: Vec<String> = Vec::new();
    for result in &burn_results {
        let below_threshold = result.flops_avg() < acceptable_flops;
        if below_threshold {
            reasons.push(format!("GPU {} - ", result.gpu_idx) + GPU_FLOPS_REASON);
        }

        if !result.is_throttled() {
            continue;
        }

        // Throttling is only forgiven when it was not hardware-triggered, the
        // caller opted in, and the GPU still delivered acceptable flops.
        let hardware_throttled = result.throttling_thermal_hw != 0 || result.throttling_hw != 0;
        let forgiven = !below_threshold && tolerate_software_throttling && !hardware_throttled;
        if !forgiven {
            reasons.push(format!("GPU {} - ", result.gpu_idx) + GPU_THROTTLING_REASON);
        }
    }

    let healthy = reasons.is_empty();
    (healthy, reasons)
}