in src/tinymax_healthcheck/tinymax_runner.py [0:0]
def main() -> int:
    """Run the tinymax health check on this node and record the outcome.

    Steps, in order:
      1. Verify the required environment variables via
         ``ensure_env_variables()``.
      2. Run the tinymax test via ``run_tinymax_test()``.
      3. Label the node named by the ``NODE_NAME`` environment variable with
         "pass" or "fail" under ``_K_RESULT_LABEL_KEY``.
      4. On failure, call ``taint_node()``.
      5. Call ``label_node()`` and print a completion message.

    Returns:
        Always 0. The test outcome is reported through the node label (and
        taint) rather than through the return value, so a failed test does
        not change the value returned here.
    """
    # Check to make sure all the needed env variables are set.
    ensure_env_variables()

    # Run a tiny max training. This will train a 7b LLM over 30 steps.
    success = run_tinymax_test()

    # NODE_NAME is read with .get(), so it is None if the variable is unset;
    # presumably ensure_env_variables() guards against that -- confirm.
    node_name = os.environ.get("NODE_NAME")
    checker_common.add_label(
        node_name,
        _K_RESULT_LABEL_KEY,
        "pass" if success else "fail",
        _K_ADD_LABEL_FORMAT,
    )

    if success:
        print(f"Node {node_name} passed tinymax test")
    else:
        # If the job was not successful, taint the node. No dry-run check is
        # visible here; presumably taint_node() handles it -- confirm.
        print(f"Node {node_name} failed tinymax test")
        taint_node()

    # NOTE(review): label_node() is assumed to run for both outcomes -- confirm
    # against the original indentation.
    label_node()
    print("Finished running. Bye!")

    # Explicit status so the function honours its ``-> int`` annotation
    # (previously it fell off the end and returned None).
    return 0