in torchbenchmark/e2e_models/hf_bert/__init__.py
def prep(self, hf_args):
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator(fp16=(self.tb_args.fp16 == "amp"))
accelerator.wait_for_everyone()
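# Load the raw dataset for the configured task and derive the label metadata
# (number of labels, label names, and whether the task is a regression).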
raw_datasets = prep_dataset(hf_args)
num_labels, label_list, is_regression = prep_labels(hf_args, raw_datasets)
# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
config = AutoConfig.from_pretrained(hf_args.model_name_or_path, num_labels=num_labels, finetuning_task=hf_args.task_name)
tokenizer = AutoTokenizer.from_pretrained(hf_args.model_name_or_path, use_fast=not hf_args.use_slow_tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
hf_args.model_name_or_path,
from_tf=bool(".ckpt" in hf_args.model_name_or_path),
config=config,
)
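# Tokenize the raw datasets into model inputs; MNLI additionally keeps its own
# evaluation set (stored as self.mnli_eval_dataset).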
train_dataset, eval_dataset, self.mnli_eval_dataset = preprocess_dataset(
hf_args, config, model, tokenizer, raw_datasets, num_labels, label_list, is_regression, accelerator)
# DataLoaders creation:
if hf_args.pad_to_max_length:
# If padding was already done to max length, we use the default data collator that will just convert everything
# to tensors.
self.data_collator = default_data_collator
else:
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
self.data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=self.data_collator, batch_size=hf_args.per_device_train_batch_size)
eval_dataloader = DataLoader(eval_dataset, collate_fn=self.data_collator, batch_size=hf_args.per_device_eval_batch_size)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": hf_args.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=hf_args.learning_rate)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
# Note: the training dataloader needs to be prepared before we grab its length below (because its length will
# be shorter in a multi-process setup).
# Scheduler and math around the number of training steps.
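# Example: with 8,000 training batches and gradient_accumulation_steps=4, each epoch
# performs ceil(8000 / 4) = 2,000 optimizer updates.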
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / hf_args.gradient_accumulation_steps)
if hf_args.max_train_steps is None:
hf_args.max_train_steps = hf_args.num_train_epochs * num_update_steps_per_epoch
else:
hf_args.num_train_epochs = math.ceil(hf_args.max_train_steps / num_update_steps_per_epoch)
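# Build the learning-rate schedule (e.g. linear warmup then decay, depending on
# hf_args.lr_scheduler_type) over the total number of update steps.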
lr_scheduler = get_scheduler(
name=hf_args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=hf_args.num_warmup_steps,
num_training_steps=hf_args.max_train_steps,
)
# Set up metrics: get the metric function for the task.
if hf_args.task_name is not None:
self.metric = load_metric("glue", hf_args.task_name)
else:
self.metric = load_metric("accuracy")
# Setup class members
self.hf_args = hf_args
self.is_regression = is_regression
self.model = model
self.optimizer = optimizer
self.train_dataloader = train_dataloader
self.eval_dataloader = eval_dataloader
self.lr_scheduler = lr_scheduler
self.accelerator = accelerator
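# Illustrative sketch only (not part of this benchmark): a training step that
# consumes the members prepared above would look roughly like this, assuming the
# standard accelerate pattern with gradient accumulation; the actual train() in
# this file may differ.
#
#   for step, batch in enumerate(self.train_dataloader):
#       outputs = self.model(**batch)
#       loss = outputs.loss / self.hf_args.gradient_accumulation_steps
#       self.accelerator.backward(loss)
#       if step % self.hf_args.gradient_accumulation_steps == 0 or step == len(self.train_dataloader) - 1:
#           self.optimizer.step()
#           self.lr_scheduler.step()
#           self.optimizer.zero_grad()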