in src/autofill_model.py [0:0]
def train(self, save_path=DEFAULT_SAVE_PATH):
    """Fine-tune a sequence classifier on ``self.dataset``, then save and evaluate it.

    The method builds the classifier and tokenizer from ``self.model_name``,
    tokenizes the ``html_cleaned`` column of ``self.dataset``, trains with a
    ``Trainer`` on the ``"train"`` split (evaluating on ``"eval"``), and
    finally calls ``self._save_model(push_to_hub=False)`` and
    ``self.evaluate()``.

    Side effects: sets ``self.save_path``, ``self.classifier``,
    ``self.tokenizer``, ``self.tokenized_datasets`` and ``self.trainer``.

    Args:
        save_path: Path appended to the current working directory to form
            ``self.save_path`` (presumably read by ``_save_model`` -- confirm).
    """
    self.save_path = f'{os.path.abspath(os.getcwd())}/{save_path}'

    # Size the classification head from the label names of the train split;
    # ignore_mismatched_sizes lets a checkpoint with a differently sized head load.
    label_names = self.dataset['train'].features["labels"].names
    self.classifier = AutoModelForSequenceClassification.from_pretrained(
        self.model_name,
        num_labels=len(label_names),
        ignore_mismatched_sizes=True,
    )
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    # Tokenize in batches; padding is deferred to the data collator below.
    self.tokenized_datasets = self.dataset.map(
        lambda batch: self.tokenizer(batch['html_cleaned'], truncation=True),
        batched=True,
    )
    # Only keep necessary columns for the trainer. ``remove_columns`` returns
    # a new object rather than modifying in place, so the result must be
    # assigned back (previously it was discarded and the columns were kept).
    self.tokenized_datasets = self.tokenized_datasets.remove_columns(
        ['html_cleaned', 'ml_dataset', 'language']
    )

    # Training args
    data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
    training_args = TrainingArguments(
        self.model_name,
        num_train_epochs=self.epochs,
        per_device_train_batch_size=self.batch_size,
        weight_decay=self.weight_decay,
        learning_rate=self.learning_rate,
        # No Trainer checkpoints; the model is saved explicitly after training.
        save_strategy="no",
        report_to="none",
        evaluation_strategy="epoch",
        logging_strategy='epoch',
    )
    self.trainer = Trainer(
        self.classifier,
        training_args,
        train_dataset=self.tokenized_datasets["train"],
        eval_dataset=self.tokenized_datasets["eval"],
        data_collator=data_collator,
        tokenizer=self.tokenizer,
        compute_metrics=self._compute_metrics,
    )
    self.trainer.train()
    logger.info('Done Training \n')

    self._save_model(push_to_hub=False)
    self.evaluate()