def train()

in src/autofill_model.py


    def train(self, save_path=DEFAULT_SAVE_PATH):
        self.save_path = f'{os.path.abspath(os.getcwd())}/{save_path}'
        # Load the pretrained checkpoint with a sequence-classification head
        # sized to the number of label names in the training split
        self.classifier = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.dataset['train'].features["labels"].names),
            ignore_mismatched_sizes=True
        )

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Tokenize the cleaned HTML text; padding is deferred to the data collator
        self.tokenized_datasets = self.dataset.map(
            lambda batch: self.tokenizer(batch['html_cleaned'], truncation=True), batched=True
        )

        # Keep only the columns the Trainer needs; remove_columns returns a new
        # dataset rather than mutating in place, so the result must be reassigned
        self.tokenized_datasets = self.tokenized_datasets.remove_columns(
            ['html_cleaned', 'ml_dataset', 'language']
        )
        # Dynamically pad each batch to its longest sequence
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        # Training arguments; the first positional argument is output_dir, but
        # nothing is written there because checkpoint saving is disabled
        training_args = TrainingArguments(
            self.model_name,
            num_train_epochs=self.epochs,
            per_device_train_batch_size=self.batch_size,
            weight_decay=self.weight_decay,
            learning_rate=self.learning_rate,
            save_strategy="no",
            report_to="none",
            evaluation_strategy="epoch",
            logging_strategy='epoch',
        )

        self.trainer = Trainer(
            self.classifier,
            training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["eval"],
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=self._compute_metrics,
        )
        self.trainer.train()
        logger.info('Done training\n')
        self._save_model(push_to_hub=False)
        self.evaluate()
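
The Trainer is handed `self._compute_metrics`, which is not shown in this section. A minimal sketch of such a helper, assuming single-label classification and the usual `(logits, labels)` contents of the `EvalPrediction` the Trainer passes in; the metric choices (accuracy and macro F1 via scikit-learn) are assumptions, not the project's actual implementation:

    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score

    def _compute_metrics(self, eval_pred):
        # eval_pred unpacks into model logits and gold label ids
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {
            'accuracy': accuracy_score(labels, predictions),
            'f1_macro': f1_score(labels, predictions, average='macro'),
        }

Invoking the method could look like the following; the class name and constructor arguments are hypothetical, inferred from the attributes used above rather than taken from the source:

    # Hypothetical usage sketch
    model = AutofillModel(
        model_name='distilbert-base-uncased',
        epochs=3,
        batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
    )
    model.train(save_path='autofill_model')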