in src/health_runner/health_runner.py [0:0]
def determine_test_iterations(num_nodes: int | None = None) -> int:
"""Determine the number of tests to run.
This function will calculate the number of tests to deploy for a given test
run. The behavior is determined by three environment variables:
* BLAST_MODE_ENABLED - If set to "true", more than one test can be deployed.
By default this will fill the cluster and run a test on every compatible node.
* BLAST_MODE_NUM_TESTS_LIMIT - If set to an integer, places a limit on the
number of tests that will be deployed.
* NODES_CHECKED_PER_TEST - This should be set to the number of discrete nodes
that will be consumed in the YAML_PATH yaml.
Used to calculate the number of tests to deploy. (Defaults to 1)
Args:
num_nodes: The number of nodes in the cluster. If not provided, it will be
determined automatically.
Returns:
Integer of the number of tests that will be deployed.
"""
is_blast_mode = str(os.environ.get("BLAST_MODE_ENABLED")).lower() in [
"true",
"1",
]
if is_blast_mode:
logging.info("Running blast mode")
get_nodes_output = checker_common.run_command(
_K_NUM_GPU_NODES_IN_CLUSTER_COMMAND
)
num_nodes = num_nodes if num_nodes else int(get_nodes_output.stdout)
nodes_per_test = int(os.environ.get("NODES_CHECKED_PER_TEST", "1"))
if num_nodes % nodes_per_test != 0:
logging.warning(
"Not all nodes can be checked. %d are present on the"
" cluster. %d will be checked per test."
" %d node(s) will be unchecked.",
num_nodes,
nodes_per_test,
num_nodes % nodes_per_test,
)
max_num_tests = num_nodes // nodes_per_test
manual_limit_str = os.environ.get("BLAST_MODE_NUM_TESTS_LIMIT")
if manual_limit_str is not None:
return min(int(manual_limit_str), max_num_tests)
else:
return max_num_tests
else:
logging.info("Running single test mode")
return 1