# create_config.py
"""
python create_config.py --out_dir tmp --exp_name test_2_node --tp 2 --cp 2 --pp 2 --dp 2 --model_name HuggingFaceTB/SmolLM-360M-Instruct --num_attention_heads 16 --num_key_value_heads 4 --grad_acc_steps 1 --mbs 32 --seq_len 4096 --use_wandb
"""
import argparse
import json
import os
import shutil
from copy import deepcopy
from typing import Optional

from transformers import AutoConfig

from picotron.utils import download_model


def create_single_config(
    out_dir: str,
    tp: int,
    cp: int,
    dp: int,
    pp: int,
    pp_engine: str,
    model_name: str,
    num_hidden_layers: Optional[int],
    num_attention_heads: Optional[int],
    num_key_value_heads: Optional[int],
    grad_acc_steps: int,
    mbs: int,
    seq_len: int,
    subset_name: Optional[str],
    exp_name: str,
    use_wandb: bool = False,
    use_cpu: bool = False,
    use_fused_adam: bool = False,
    hf_token: Optional[str] = None,
):
    run_path = os.path.join(out_dir, exp_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Start from the template config and override the experiment-specific fields.
    with open("template/base_config.json", "r") as f:
        base_config = json.load(f)

    config_content = deepcopy(base_config)
    config_content["environment"]["HF_TOKEN"] = hf_token
    config_content["training"]["seq_length"] = seq_len
    config_content["checkpoint"]["save_dir"] = run_path
    config_content["dataset"]["subset_name"] = subset_name
    config_content["model"]["name"] = model_name

    # Architecture fields default to the pretrained model's values unless overridden.
    tmp_model_config = AutoConfig.from_pretrained(model_name)
    config_content["model"]["num_hidden_layers"] = tmp_model_config.num_hidden_layers if num_hidden_layers is None else num_hidden_layers
    config_content["model"]["num_attention_heads"] = tmp_model_config.num_attention_heads if num_attention_heads is None else num_attention_heads
    config_content["model"]["num_key_value_heads"] = tmp_model_config.num_key_value_heads if num_key_value_heads is None else num_key_value_heads
    config_content["model"]["use_fused_adam"] = use_fused_adam
    del tmp_model_config
    # Parallelism layout: tensor, context, data and pipeline parallel sizes.
    config_content["distributed"]["tp_size"] = tp
    config_content["distributed"]["cp_size"] = cp
    config_content["distributed"]["dp_size"] = dp
    config_content["distributed"]["pp_size"] = pp
    config_content["distributed"]["pp_engine"] = pp_engine
    config_content["distributed"]["use_cpu"] = use_cpu

    if use_cpu:
        # CPU runs: disable flash attention and switch the distributed backend to gloo.
        config_content["environment"]["FLASH_ATTEN"] = "0"
        config_content["distributed"]["backend"] = "gloo"
    config_content["logging"]["use_wandb"] = use_wandb
    config_content["logging"]["run_name"] = exp_name

    # Global batch size = data-parallel size * micro batch size * gradient accumulation steps.
    gbs = dp * mbs * grad_acc_steps
    gbs_token = gbs * seq_len
    print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, grad_acc_steps: {grad_acc_steps}, mbs: {mbs}")
    config_content["training"]["gradient_accumulation_steps"] = grad_acc_steps
    config_content["training"]["micro_batch_size"] = mbs

    # Recreate the run directory from scratch and write the final config.
    if os.path.exists(run_path):
        shutil.rmtree(run_path)
    os.makedirs(run_path)

    with open(os.path.join(run_path, "config.json"), "w") as new_config:
        json.dump(config_content, new_config, indent=4)

    del config_content

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--out_dir", type=str, help="Output directory for the generated configs", default="tmp")
    parser.add_argument("--tp", type=int, help="Tensor parallel size", default=1)
    parser.add_argument("--cp", type=int, help="Context parallel size", default=1)
    parser.add_argument("--dp", type=int, help="Data parallel size", default=1)
    parser.add_argument("--pp", type=int, help="Pipeline parallel size", default=1)
    parser.add_argument("--pp_engine", type=str, help="Pipeline parallel engine", default="1f1b")
    parser.add_argument("--model_name", type=str, help="Model name to create the config for", default="HuggingFaceTB/SmolLM-360M-Instruct")
    parser.add_argument("--num_hidden_layers", type=int, help="Number of hidden layers (defaults to the pretrained model's value)", default=None)
    parser.add_argument("--num_attention_heads", type=int, help="Number of attention heads (defaults to the pretrained model's value)", default=None)
    parser.add_argument("--num_key_value_heads", type=int, help="Number of key/value heads (defaults to the pretrained model's value)", default=None)
    parser.add_argument("--grad_acc_steps", type=int, help="Gradient accumulation steps", default=1)
    parser.add_argument("--mbs", type=int, help="Micro batch size", default=1)
    parser.add_argument("--seq_len", type=int, help="Sequence length", default=1024)
    parser.add_argument("--subset_name", type=str, help="Dataset subset name", default=None)
    parser.add_argument("--exp_name", type=str, help="Experiment name", default="dummy_exp")
    parser.add_argument("--use_wandb", action="store_true", help="Log to Weights & Biases")
    parser.add_argument("--use_cpu", action="store_true", help="Use CPU for training")
    parser.add_argument("--use_fused_adam", action="store_true", help="Use fused Adam")
    parser.add_argument("--hf_token", type=str, help="Hugging Face token", default=None)
    args = parser.parse_args()
    create_single_config(
        out_dir=args.out_dir,
        tp=args.tp,
        cp=args.cp,
        dp=args.dp,
        pp=args.pp,
        pp_engine=args.pp_engine,
        model_name=args.model_name,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        num_key_value_heads=args.num_key_value_heads,
        grad_acc_steps=args.grad_acc_steps,
        mbs=args.mbs,
        seq_len=args.seq_len,
        subset_name=args.subset_name,
        exp_name=args.exp_name,
        use_wandb=args.use_wandb,
        use_cpu=args.use_cpu,
        use_fused_adam=args.use_fused_adam,
        hf_token=args.hf_token,
    )
    print("Configs created successfully! ✅")

    # Download the model's safetensors files.
    download_model(args.model_name, args.hf_token)
    print("SafeTensors files downloaded successfully! ✅")