in generate/run_ioi_slurm.py [0:0]
def create_slurm_script(args, logs_dir):
    # Resolve per-model defaults, overridden by custom values if provided
    concurrency = get_concurrency(args.model, args.concurrency)
    tp = get_tp(args.model, args.revision)
    context_length = get_context_length(args.model, args.revision)
    # Create a sanitized model name for the job name
    job_name = f"ioi-eval-{args.model.replace('/', '-')}"
    log_dir = logs_dir / job_name
    log_dir.mkdir(parents=True, exist_ok=True)
    # Enough 8-GPU nodes to cover the tensor-parallel degree
    n_nodes = ceil(tp / 8)
    tasks = n_nodes
    revision_arg = f"--revision {args.revision}" if args.revision else ""
    slurm_script = f"""#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --partition={args.partition}
#SBATCH --qos={args.qos}
#SBATCH --nodes={n_nodes}
#SBATCH --gpus-per-node=8
#SBATCH --exclusive
#SBATCH --output={log_dir}/%j-%x.out
#SBATCH --error={log_dir}/%j-%x.out
#SBATCH --time={args.time}
#SBATCH --ntasks-per-node=1
set -exuo pipefail
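# Fixed ports: SERVER_PORT for the SGLang OpenAI-compatible HTTP server, DIST_PORT for the multi-node init rendezvous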
SERVER_PORT=39877
DIST_PORT=45000
# random sleep (1-100 s) so concurrent jobs do not hammer the server all at once
sleep $((RANDOM % 100 + 1))
# Environment configuration
export OUTLINES_CACHE_DIR=/scratch/serve_r1/ocache/
export TRITON_HOME=/scratch/serve_r1/triton/
export GLOO_SOCKET_IFNAME="enp71s0"
export NCCL_SOCKET_IFNAME="enp71s0"
# Evaluation script path
EVAL_SCRIPT_PATH="/fsx/hynek_kydlicek/projects/ioi/generate/evaluate.py"
module load cuda/12.4
source ~/.bashrc
# Activate uv
source {args.uv_env or UV_ENV}/bin/activate
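# The first node hosts rank 0; its IP is used as the dist-init rendezvous address and as the HTTP endpoint polled below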
FIRST_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1)
FIRST_NODE_IP=$(srun --nodes=1 --ntasks=1 -w "$FIRST_NODE" hostname --ip-address)
# Launch servers synchronously across all nodes
srun --nodes={n_nodes} --ntasks={tasks} --ntasks-per-node=1 \\
    bash -c "python -m sglang.launch_server \\
        --model-path '{args.model}' \\
        --tp {tp} \\
        --dist-init-addr '$FIRST_NODE_IP:$DIST_PORT' \\
        {revision_arg} \\
        --nnodes {n_nodes} \\
        --node-rank \\$SLURM_PROCID \\
        --port '$SERVER_PORT' \\
        --host 0.0.0.0 \\
        --trust-remote-code \\
        --max-running-requests {concurrency} \\
        --context-length {context_length}" &
# Wait for server with timeout
TIMEOUT={args.startup_delay} # seconds (typically 1h; model loading alone should take ~30min)
START_TIME=$(date +%s)
echo "Waiting for SGLang server (http://$FIRST_NODE_IP:$SERVER_PORT)..."
while true; do
    # -f makes curl fail on non-2xx responses, so readiness requires a healthy /health endpoint
    if curl -sf -o /dev/null "http://$FIRST_NODE_IP:$SERVER_PORT/health" 2>/dev/null; then
        echo "Server is ready at http://$FIRST_NODE_IP:$SERVER_PORT"
        break
    fi
    CURRENT_TIME=$(date +%s)
    if [ $((CURRENT_TIME - START_TIME)) -gt $TIMEOUT ]; then
        echo "Error: Server failed to start within $TIMEOUT seconds"
        exit 1
    fi
    echo "Still waiting... ($((CURRENT_TIME - START_TIME)) seconds elapsed)"
    sleep 60
done
echo "Checking available models..."
curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/models"
sleep 10
echo "Executing sanity check..."
curl "http://$FIRST_NODE_IP:$SERVER_PORT/v1/completions" \\
-H "Content-Type: application/json" \\
-d '{{
"model": "default",
"prompt": "hi, how are you?",
"max_tokens": 2048,
"temperature": 0.6
}}'
python "$EVAL_SCRIPT_PATH" \\
--model_id "sglang/{args.model}" \\
{revision_arg} \\
--api_base "http://localhost:$SERVER_PORT/v1" \\
--concurrency {concurrency} \\
{args.eval_args}
# Kill the server and exit
pkill -f "python -m sglang.launch_server"
exit 0
"""
return slurm_script, job_name
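
# Usage sketch (illustrative; not part of the original file). It assumes `args`
# carries the fields referenced above (model, partition, qos, time, ...) and that
# the caller submits the rendered script with sbatch; `submit_ioi_job` is a
# hypothetical helper name.
def submit_ioi_job(args, logs_dir):
    import subprocess
    import tempfile

    script, job_name = create_slurm_script(args, logs_dir)
    with tempfile.NamedTemporaryFile("w", suffix=".slurm", prefix=f"{job_name}-", delete=False) as f:
        f.write(script)
        script_path = f.name
    # sbatch prints "Submitted batch job <id>" on success
    result = subprocess.run(["sbatch", script_path], capture_output=True, text=True, check=True)
    print(f"{job_name}: {result.stdout.strip()}")
    return script_path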