in src/autotrain/trainers/seq2seq/__main__.py [0:0]
def train(config):
if isinstance(config, dict):
config = Seq2SeqParams(**config)
train_data = None
valid_data = None
# check if config.train_split.csv exists in config.data_path
if config.train_split is not None:
if config.data_path == f"{config.project_name}/autotrain-data":
logger.info("loading dataset from disk")
train_data = load_from_disk(config.data_path)[config.train_split]
else:
if ":" in config.train_split:
dataset_config_name, split = config.train_split.split(":")
train_data = load_dataset(
config.data_path,
name=dataset_config_name,
split=split,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
train_data = load_dataset(
config.data_path,
split=config.train_split,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
)
if config.valid_split is not None:
if config.data_path == f"{config.project_name}/autotrain-data":
logger.info("loading dataset from disk")
valid_data = load_from_disk(config.data_path)[config.valid_split]
else:
if ":" in config.valid_split:
dataset_config_name, split = config.valid_split.split(":")
valid_data = load_dataset(
config.data_path,
name=dataset_config_name,
split=split,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
valid_data = load_dataset(
config.data_path,
split=config.valid_split,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
)
tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE)
train_data = Seq2SeqDataset(data=train_data, tokenizer=tokenizer, config=config)
if config.valid_split is not None:
valid_data = Seq2SeqDataset(data=valid_data, tokenizer=tokenizer, config=config)
if config.logging_steps == -1:
if config.valid_split is not None:
logging_steps = int(0.2 * len(valid_data) / config.batch_size)
else:
logging_steps = int(0.2 * len(train_data) / config.batch_size)
if logging_steps == 0:
logging_steps = 1
if logging_steps > 25:
logging_steps = 25
config.logging_steps = logging_steps
else:
logging_steps = config.logging_steps
logger.info(f"Logging steps: {logging_steps}")
training_args = dict(
output_dir=config.project_name,
per_device_train_batch_size=config.batch_size,
per_device_eval_batch_size=2 * config.batch_size,
learning_rate=config.lr,
num_train_epochs=config.epochs,
eval_strategy=config.eval_strategy if config.valid_split is not None else "no",
logging_steps=logging_steps,
save_total_limit=config.save_total_limit,
save_strategy=config.eval_strategy if config.valid_split is not None else "no",
gradient_accumulation_steps=config.gradient_accumulation,
report_to=config.log,
auto_find_batch_size=config.auto_find_batch_size,
lr_scheduler_type=config.scheduler,
optim=config.optimizer,
warmup_ratio=config.warmup_ratio,
weight_decay=config.weight_decay,
max_grad_norm=config.max_grad_norm,
push_to_hub=False,
load_best_model_at_end=True if config.valid_split is not None else False,
ddp_find_unused_parameters=False,
predict_with_generate=True,
seed=config.seed,
)
if config.mixed_precision == "fp16":
training_args["fp16"] = True
if config.mixed_precision == "bf16":
training_args["bf16"] = True
if config.valid_split is not None:
early_stop = EarlyStoppingCallback(
early_stopping_patience=config.early_stopping_patience,
early_stopping_threshold=config.early_stopping_threshold,
)
callbacks_to_use = [early_stop]
else:
callbacks_to_use = []
callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()])
args = Seq2SeqTrainingArguments(**training_args)
model_config = AutoConfig.from_pretrained(
config.model,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
use_cache=False,
)
if config.peft:
if config.quantization == "int4":
raise NotImplementedError("int4 quantization is not supported")
# if config.use_int4:
# bnb_config = BitsAndBytesConfig(
# load_in_4bit=config.use_int4,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype=torch.float16,
# bnb_4bit_use_double_quant=False,
# )
# config.fp16 = True
if config.quantization == "int8":
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
else:
bnb_config = None
model = AutoModelForSeq2SeqLM.from_pretrained(
config.model,
config=model_config,
token=config.token,
quantization_config=bnb_config,
trust_remote_code=ALLOW_REMOTE_CODE,
)
else:
model = AutoModelForSeq2SeqLM.from_pretrained(
config.model,
config=model_config,
token=config.token,
trust_remote_code=ALLOW_REMOTE_CODE,
)
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
model.resize_token_embeddings(len(tokenizer))
if config.peft:
target_modules = config.target_modules.split(",") if config.target_modules is not None else None
if target_modules:
target_modules = [module.strip() for module in target_modules]
if len(target_modules) == 1 and target_modules[0] == "all-linear":
target_modules = "all-linear"
lora_config = LoraConfig(
r=config.lora_r,
lora_alpha=config.lora_alpha,
target_modules=target_modules,
lora_dropout=config.lora_dropout,
bias="none",
task_type=TaskType.SEQ_2_SEQ_LM,
)
if config.quantization is not None:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
_s2s_metrics = partial(utils._seq2seq_metrics, tokenizer=tokenizer)
trainer_args = dict(
args=args,
model=model,
callbacks=callbacks_to_use,
compute_metrics=_s2s_metrics,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
**trainer_args,
train_dataset=train_data,
eval_dataset=valid_data,
data_collator=data_collator,
tokenizer=tokenizer,
)
for name, module in trainer.model.named_modules():
if "norm" in name:
module = module.to(torch.float32)
trainer.remove_callback(PrinterCallback)
trainer.train()
logger.info("Finished training, saving model...")
trainer.model.config.use_cache = True
trainer.save_model(config.project_name)
model_card = utils.create_model_card(config, trainer)
# save model card to output directory as README.md
with open(f"{config.project_name}/README.md", "w", encoding="utf-8") as f:
f.write(model_card)
if config.push_to_hub:
if PartialState().process_index == 0:
remove_autotrain_data(config)
save_training_params(config)
logger.info("Pushing model to hub...")
api = HfApi(token=config.token)
api.create_repo(
repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True
)
api.upload_folder(
folder_path=config.project_name,
repo_id=f"{config.username}/{config.project_name}",
repo_type="model",
)
if PartialState().process_index == 0:
pause_space(config)