in fastchat/train/train_baichuan.py [0:0]
def train():
    """Run supervised fine-tuning driven by command-line arguments.

    Steps, in order:
      1. Parse ``ModelArguments``, ``DataArguments`` and ``TrainingArguments``
         and publish ``training_args.local_rank`` to the module-level
         ``local_rank``.
      2. Load the model config; when the requested ``model_max_length``
         exceeds the config's ``max_position_embeddings``, set a linear
         ``rope_scaling`` entry whose factor is the rounded-up ratio.
      3. Load the model and the (slow, right-padding) tokenizer, reuse the
         unk token as pad token, and resize the embeddings to the tokenizer
         length.
      4. Build the data module and the ``Trainer``, then train, resuming when
         the output directory already contains ``checkpoint-*`` entries.
      5. Save the trainer state and the model into the output directory.
    """
    global local_rank

    arg_parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()
    local_rank = training_args.local_rank

    model_path = model_args.model_name_or_path
    max_len = training_args.model_max_length
    # Keyword arguments shared by every from_pretrained call below.
    hub_kwargs = {
        "trust_remote_code": True,
        "cache_dir": training_args.cache_dir,
    }

    model_config = transformers.AutoConfig.from_pretrained(model_path, **hub_kwargs)

    # Set the RoPE scaling factor only when the target length is longer than
    # the context length recorded in the config (if the config records one).
    native_ctx_len = getattr(model_config, "max_position_embeddings", None)
    if native_ctx_len and max_len > native_ctx_len:
        model_config.rope_scaling = {
            "type": "linear",
            "factor": float(math.ceil(max_len / native_ctx_len)),
        }
    model_config.use_cache = False

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_path, config=model_config, **hub_kwargs
    )
    # Tie the weights
    model.tie_weights()

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_path,
        config=model_config,
        **hub_kwargs,
        model_max_length=max_len,
        padding_side="right",
        use_fast=False,
    )
    # NOTE: a token id that exceeds the vocab size makes training fail, so the
    # embedding matrix is resized to the tokenizer length below. The pad token
    # reuses the unk token; no new token is added here.
    tokenizer.pad_token = tokenizer.unk_token
    print(f"tokens len: {len(tokenizer)}")
    model.resize_token_embeddings(len(tokenizer))

    data_module = make_supervised_data_module(
        tokenizer=tokenizer, train_ratio=0.98, data_args=data_args
    )
    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )

    # Resume only when a previous run left checkpoint-* entries in output_dir.
    output_dir = pathlib.Path(training_args.output_dir)
    resume_kwargs = (
        {"resume_from_checkpoint": True}
        if list(output_dir.glob("checkpoint-*"))
        else {}
    )
    trainer.train(**resume_kwargs)

    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)