ablations/evaluation/launch_evals.py [189:269]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        deps = []
        if args.d:
            deps.append(f"afterok:{args.d}")
        if job_id:
            deps.append(f"afterany:{job_id}")

        launch_script = f"""#!/bin/bash
#SBATCH --job-name={args.job_prefix}eval-{model_name}
#SBATCH --nodes={NODES}
#SBATCH --ntasks-per-node=1
#SBATCH --partition={PARTITION}
{f'#SBATCH --qos={args.qos}' if args.qos else ''}
#SBATCH --array=0-{len(selected_checkpoints) - 1}%{args.parallel}
#SBATCH --gres=gpu:{args.gpus}
#SBATCH --time={args.time_limit}
#SBATCH --cpus-per-task={CPUS_PER_NODE}
#SBATCH --output={EVAL_LOGS_PATH}/{model_name}/{args.language}/eval-%A_%a.out
#SBATCH --error={EVAL_LOGS_PATH}/{model_name}/{args.language}/eval-%A_%a.out
{"#SBATCH --dependency=" + ",".join(deps) if deps else ""}
#SBATCH --requeue
###########################################
# [BEGINNING] ADAPT TO YOUR ENVIRONMENT
source /path/to/.bashrc
source /path/to/miniconda3/etc/profile.d/conda.sh
conda activate /path/to/miniconda3/envs/exp/

BRRR_FOLDER=/path/to/brrr
# Keep the Hugging Face cache on FSx, not on the admin home
export HUGGINGFACE_HUB_CACHE=/path/to/.cache/huggingface
export HF_DATASETS_CACHE=/path/to/.cache/huggingface
export HF_MODULES_CACHE=/path/to/.cache/huggingface
export HF_HOME=/path/to/.cache/huggingface
export HF_DATASETS_OFFLINE={1 if args.offline_datasets else 0}

# [END] ADAPT TO YOUR ENVIRONMENT
###########################################


set -x -e
echo "START TIME: $(date)"
echo python3 version = `python3 --version`

# SLURM stuff
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=6000
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`

export CUBLAS_WORKSPACE_CONFIG=":4096:8"
export CUDA_DEVICE_MAX_CONNECTIONS="1"

module load cuda/12.1

echo "Node count: $COUNT_NODE"
echo "Hostnames: $HOSTNAMES"
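# Pick this array task's checkpoint step from the bash array rendered by the launcher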
CHECKPOINTS_LIST={bash_ckpts_list}
NSTEP=$((SLURM_ARRAY_TASK_ID))
STEP=${{CHECKPOINTS_LIST[$NSTEP]}}


export TMPDIR=/scratch/USER/{model_name}/{args.language}/$STEP
mkdir -p $TMPDIR

LOCAL_DOWNLOAD_CHECKPOINT_FOLDER=/scratch/USER/checkpoint/{model_name}/$STEP
# Copy the checkpoint (excluding optimizer states) from S3 to local node storage
mkdir -p $LOCAL_DOWNLOAD_CHECKPOINT_FOLDER
s5cmd cp --exclude "optimizer/*" {s3_path}/$STEP/* $LOCAL_DOWNLOAD_CHECKPOINT_FOLDER

torch_dist_args="--nproc_per_node {args.gpus} \\
    --nnodes $COUNT_NODE \\
    --max_restarts 0 \\
    --tee 3 \\
    --node_rank $SLURM_PROCID \\
    --role $SLURMD_NODENAME: "

launch_args="$torch_dist_args $BRRR_FOLDER/run_evals_nanotron.py \\
    --checkpoint-config-path ${{LOCAL_DOWNLOAD_CHECKPOINT_FOLDER}}/config.yaml --lighteval-override {EVAL_LOGS_PATH}/{model_name}/{args.language}.yml"

# Random 0-59 s delay so concurrent array tasks don't all start at the same instant
sleep $((RANDOM % 60))
srun -u bash -c "python3 -u -m torch.distributed.run ${{launch_args}}" """
        launched_id = launch_slurm_job(launch_script)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
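Both launchers hand the rendered script to launch_slurm_job and chain the returned job id into the next submission's --dependency list. The helper itself sits outside this excerpt; the sketch below is a minimal, assumed version that pipes the script to sbatch --parsable on stdin and returns the job id (the real helper may instead write the script to a temporary file first).

import subprocess

def launch_slurm_job(launch_file_contents: str) -> str:
    """Hypothetical sketch: submit a rendered SBATCH script and return its job id."""
    # sbatch reads the script from stdin when no file argument is given;
    # --parsable makes it print just "jobid" (or "jobid;cluster").
    result = subprocess.run(
        ["sbatch", "--parsable"],
        input=launch_file_contents,
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip().split(";")[0]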



ablations/evaluation/launch_random_evals.py [58:106]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    deps = []
    if args.d:
        deps.append(f"afterok:{args.d}")
    if job_id:
        deps.append(f"afterany:{job_id}")

    launch_script = f"""#!/bin/bash
#SBATCH --job-name=eval-{model_name}-{args.language}
#SBATCH --ntasks=1
#SBATCH --partition=partition
#SBATCH --qos={args.qos}
#SBATCH --array=0-{len(selected_checkpoints)-1}%{args.parallel}
#SBATCH --time={args.time_limit}
#SBATCH --cpus-per-task=4
#SBATCH --output=/path/to/logs/train/multilingual/eval-logs/{model_name}/{args.language}/eval-%A_%a.out
#SBATCH --error=/path/to/logs/train/multilingual/eval-logs/{model_name}/{args.language}/eval-%A_%a.out
{"#SBATCH --dependency=" + ",".join(deps) if deps else ""}
#SBATCH --requeue
###########################################
# [BEGINNING] ADAPT TO YOUR ENVIRONMENT
source /admin/home/{USER}/.bashrc
source /path/to/miniconda3/etc/profile.d/conda.sh
conda activate /path/to/miniconda3/envs/exp/


LIGHTEVAL_FOLDER=/path/to/ml-lighteval
export HUGGINGFACE_HUB_CACHE=/path/to/.cache/huggingface
export HF_DATASETS_CACHE=/path/to/.cache/huggingface
export HF_MODULES_CACHE=/path/to/.cache/huggingface
export HF_HOME=/path/to/.cache/huggingface
export HF_DATASETS_OFFLINE={1 if args.offline_datasets else 0}
# [END] ADAPT TO YOUR ENVIRONMENT
###########################################
set -x -e
echo "START TIME: $(date)"
echo python3 version = `python3 --version`

# SLURM stuff
export TMPDIR=/scratch/{USER}/{model_name}/{args.language}
mkdir -p $TMPDIR
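# Pick this array task's checkpoint step from the bash array rendered by the launcher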
CHECKPOINTS_LIST={bash_ckpts_list}
NSTEP=$((SLURM_ARRAY_TASK_ID))
STEP=${{CHECKPOINTS_LIST[$NSTEP]}}

launch_args="$LIGHTEVAL_FOLDER/run_evals_accelerate.py --model_args='dummy,name=dummy-{args.language}-/${{STEP}},tokenizer={args.tokenizer}' --max_samples=1000 --custom_tasks=lighteval.community_tasks.multilingual.configs.{args.language} --tasks={args.tasks} --save_results --logging_dir={args.logging_dir}"
# Random 0-59 s delay so concurrent array tasks don't all start at the same instant
sleep $((RANDOM % 60))
srun -u bash -c "python3 -u ${{launch_args}}" """

    launched_id = launch_slurm_job(launch_script)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
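Both templates index CHECKPOINTS_LIST with SLURM_ARRAY_TASK_ID, so bash_ckpts_list must render as a bash array literal whose length matches the --array=0-N range. That rendering happens before these excerpts; a hypothetical sketch of what it could look like:

# Hypothetical example: turn the selected checkpoint steps into a bash array literal.
selected_checkpoints = [1000, 5000, 10000]  # example step numbers
bash_ckpts_list = "(" + " ".join(str(step) for step in selected_checkpoints) + ")"
# Renders as "(1000 5000 10000)", so the template emits CHECKPOINTS_LIST=(1000 5000 10000)
# and STEP=${CHECKPOINTS_LIST[$SLURM_ARRAY_TASK_ID]} selects one step per array task.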



