# optimum/graphcore/trainer.py
# Copyright 2021 The HuggingFace Team. All rights reserved.
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import copy
import functools
import inspect
import math
import os
import random
import re
import shutil
import sys
import time
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
# Integrations must be imported before ML frameworks:
import numpy as np
import poptorch
import torch
from huggingface_hub import Repository
from packaging import version
from peft import PeftModel
from poptorch import DataLoaderMode, PoplarExecutor
from poptorch.optim import LAMB, AdamW
from torch import nn, optim
from torch.utils.data import Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.auto import tqdm
from transformers.configuration_utils import PretrainedConfig
from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
from transformers.integrations import ( # isort: split
get_reporting_integration_callbacks,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.optimization import get_scheduler
from transformers.pytorch_utils import is_torch_less_than_1_11
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer import OPTIMIZER_NAME, SCHEDULER_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME
from transformers.trainer_callback import (
CallbackHandler,
DefaultFlowCallback,
PrinterCallback,
ProgressCallback,
TrainerCallback,
TrainerControl,
TrainerState,
)
from transformers.trainer_pt_utils import (
IterableDatasetShard,
LabelSmoother,
LengthGroupedSampler,
find_batch_size,
get_parameter_names,
nested_concat,
nested_detach,
nested_numpify,
nested_truncate,
reissue_pt_warnings,
)
from transformers.trainer_utils import (
PREFIX_CHECKPOINT_DIR,
EvalLoopOutput,
EvalPrediction,
HubStrategy,
IntervalStrategy,
PredictionOutput,
RemoveColumnsCollator,
TrainerMemoryTracker,
TrainOutput,
denumpify_detensorize,
get_last_checkpoint,
has_length,
set_seed,
speed_metrics,
)
from transformers.utils import (
CONFIG_NAME,
WEIGHTS_INDEX_NAME,
WEIGHTS_NAME,
find_labels,
get_full_repo_name,
is_datasets_available,
)
from optimum.graphcore.version import __version__
from optimum.utils import logging
from .data.data_collator import pad_on_batch_axis
from .ipu_configuration import IPU_CONFIG_NAME, IPUConfig
from .modelcard import IPUTrainingSummary
from .modeling_utils import to_pipelined
from .trainer_utils import _WorkerInit
from .training_args import IPUTrainingArguments
if is_datasets_available():
import datasets
if TYPE_CHECKING:
import optuna
logger = logging.get_logger(__name__)
_is_torch_generator_available = False
DEFAULT_CALLBACKS = [DefaultFlowCallback]
DEFAULT_PROGRESS_CALLBACK = ProgressCallback
# TODO: Import from transformers.utils when updating transformers version.
ADAPTER_WEIGHTS_NAME = "adapter_model.bin"
@dataclass
class IPUTrainerState(TrainerState):
start_time: float = -1.0
class IPUTrainer:
"""
`IPUTrainer` is a simple but feature-complete training and evaluation
loop on Graphcore IPUs for PyTorch, optimized for 🤗 Transformers.
Args:
model ([`transformers.PreTrainedModel`] or `torch.nn.Module`, *optional*):
The model to train, evaluate or use for predictions. If not provided, a `model_init` function must be passed.
<Tip>
[`IPUTrainer`] is optimized to work with the [`transformers.PreTrainedModel`] class provided by the 🤗 Transformers
library. You can still use your own models defined as `torch.nn.Module` as long as they work in the same way as
the 🤗 Transformers models.
</Tip>
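        ipu_config ([`IPUConfig`]):
            The `IPUConfig` specifying how the model should be put on and run on IPUs (pipelining, replication,
            device iterations and related execution options).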
args ([`IPUTrainingArguments`], *optional*):
The arguments to tweak for training. Will default to a basic
instance of [`IPUTrainingArguments`] with `output_dir` set to a
directory named *tmp_trainer* in the current directory if not
provided.
data_collator ([`transformers.data.data_collator.DataCollator`], *optional*):
The function to use to form a batch from a list of elements of
`train_dataset` or `eval_dataset`. Will default to
[`transformers.data.default_data_collator`] if no `tokenizer` is
provided, or an instance of
[`~transformers.data.DataCollatorWithPadding`] otherwise.
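        eval_data_collator ([`transformers.data.data_collator.DataCollator`], *optional*):
            The function to use to form a batch from a list of elements of `eval_dataset`. Will default to
            `data_collator` if not provided.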
train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
The dataset to use for training. If it is a [`~datasets.Dataset`]
dataset, the columns not accepted by the
`model.forward()` method are automatically removed.
Note that if it's a `torch.utils.data.IterableDataset` dataset with
some randomization and you are training in a distributed fashion,
your iterable dataset should either use an internal attribute
`generator` that is a `torch.Generator` object for the randomization that
must be identical on all processes (and the trainer will manually
set the seed of this `generator` at each epoch) or have a
`set_epoch()` method that internally sets the seed of the RNGs used.
        eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]], *optional*):
The dataset to use for evaluation. If it is a [`~datasets.Dataset`] dataset, the columns not accepted by the
`model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
dataset prepending the dictionary key to the metric name.
tokenizer ([`transformers.PreTrainedTokenizerBase`], *optional*):
The tokenizer used to preprocess the data. If provided, it will be
used to automatically pad the inputs to the maximum length when
batching inputs, and it will be saved along the model to make it
easier to rerun an interrupted training or reuse the fine-tuned
model.
model_init (`Callable[[], transformers.PreTrainedModel]`, *optional*):
A function that instantiates the model to be used. If provided, each call to [`IPUTrainer.train`] will start
from a new instance of the model as given by this function.
The function may have no arguments, or a single argument containing the optuna/Ray Tune/SigOpt trial object, to
            be able to choose different architectures according to hyperparameters (such as layer count, sizes of
inner layers and dropout probabilities). **Note: this feature is not supported for now.**
compute_metrics (`Callable[[~transformers.trainer_utils.EvalPrediction], Dict]`, *optional*):
The function that will be used to compute metrics at evaluation. Must take a
[`~transformers.trainer_utils.EvalPrediction`] and return a dictionary of strings to metric values.
callbacks (List of [`transformers.trainer_callback.TrainerCallback`], *optional*):
A list of callbacks to customize the training loop. Will add those to the list of default callbacks
detailed in [here](callback).
If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple
            containing the optimizer and the scheduler to use. Will default to an instance of `poptorch.optim.AdamW` on your model
and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
A function that preprocesses the logits right before caching them at each evaluation step. Must take two
tensors, the logits and the labels, and return the logits once processed as desired. The modifications made
by this function will be reflected in the predictions received by `compute_metrics`.
Note that the labels (second parameter) will be `None` if the dataset does not have them.
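
    Example:

    A minimal usage sketch. The checkpoint names and the `train_dataset` variable below are illustrative
    placeholders, not values mandated by this class:

    ```python
    >>> from transformers import AutoModelForSequenceClassification
    >>> from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments

    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    >>> ipu_config = IPUConfig.from_pretrained("Graphcore/bert-base-ipu")
    >>> training_args = IPUTrainingArguments(output_dir="./out", per_device_train_batch_size=2)
    >>> trainer = IPUTrainer(
    ...     model=model,
    ...     ipu_config=ipu_config,
    ...     args=training_args,
    ...     train_dataset=train_dataset,  # any map-style dataset of tokenized model inputs
    ... )
    >>> trainer.train()
    ```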
"""
from transformers.trainer_pt_utils import log_metrics, metrics_format, save_metrics, save_state
from .trainer_pt_utils import _get_learning_rate
def __init__(
self,
model: Union[PreTrainedModel, nn.Module] = None,
ipu_config: IPUConfig = None,
args: IPUTrainingArguments = None,
data_collator: Optional[DataCollator] = None,
eval_data_collator: Optional[DataCollator] = None,
train_dataset: Optional[Dataset] = None,
eval_dataset: Optional[Dataset] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
model_init: Callable[[], PreTrainedModel] = None,
compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
callbacks: Optional[List[TrainerCallback]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
force_to_pipelined: bool = False,
):
if args is None:
output_dir = "tmp_trainer"
logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
args = IPUTrainingArguments(output_dir=output_dir)
self.args = args
        # Seed must be set before instantiating the model when using model_init.
set_seed(self.args.seed)
self.is_in_train = False
# memory metrics - must set up as early as possible
self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
self._memory_tracker.start()
# set the correct log level depending on the node
log_level = args.get_process_log_level()
logging.set_verbosity(log_level)
# force device and distributed setup init explicitly
args._setup_devices
if model is None:
if model_init is not None:
raise RuntimeError("`model_init` is not supported by `IPUTrainer` yet")
else:
raise RuntimeError("`IPUTrainer` requires either a `model` or `model_init` argument")
else:
if model_init is not None:
warnings.warn(
"`IPUTrainer` requires either a `model` or `model_init` argument, but not both. "
"`model_init` will overwrite your model when calling the `train` method. This will become a fatal error in the next release.",
FutureWarning,
)
self.model_init = model_init
# TODO: not sure about setting the data_collator?
default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)
self.data_collator = data_collator if data_collator is not None else default_collator
# If no eval_data_collator is specified then use the train data_collator
self.eval_data_collator = eval_data_collator if eval_data_collator is not None else self.data_collator
self.train_dataset = train_dataset
self.eval_dataset = eval_dataset
self.tokenizer = tokenizer
self.ipu_config = copy.deepcopy(ipu_config)
# set replication factor using n_ipu (can be overruled by ipu_config_overrides)
if (n_ipu := self.args.n_ipu) is not None:
if self.ipu_config.replication_factor > 1 or self.ipu_config.inference_replication_factor > 1:
warnings.warn(
"IPUTrainer is overwriting the replication factors set in self.ipu_config because `--n_ipu` was provided."
)
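            # e.g. with `--n_ipu 16` and ipus_per_replica=4, the replication factor below becomes 16 // 4 = 4
            # (illustrative values).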
self.ipu_config.replication_factor = n_ipu // self.ipu_config.ipus_per_replica
self.ipu_config.inference_replication_factor = n_ipu // self.ipu_config.inference_ipus_per_replica
if self.ipu_config.replication_factor > 1 or self.ipu_config.inference_replication_factor > 1:
os.environ["TOKENIZERS_PARALLELISM"] = "true"
if args.ipu_config_overrides:
logger.info(f"Overriding IPU config: {args.ipu_config_overrides}")
self.ipu_config.update_from_string(args.ipu_config_overrides)
self.ipu_config.seed = self.args.seed
self.opts = self.ipu_config.to_options(compile_only=args.compile_only)
self.eval_opts = self.ipu_config.to_options(for_inference=True, compile_only=args.compile_only)
# If batch axis padding enabled, wrap train/eval data collators with `pad_on_batch_axis` wrapper
if self.args.pad_on_batch_axis:
            logger.info(
                "Padding on batch axis enabled. Each batch fed to the compiled model will be padded up to the expected batch size."
            )
if self.args.do_train:
data_collator_wrapper = pad_on_batch_axis(
self.args.per_device_train_batch_size * self.ipu_config.batch_size_factor()
)
self.data_collator = data_collator_wrapper(self.data_collator)
if self.args.do_eval:
data_collator_wrapper = pad_on_batch_axis(
self.args.per_device_eval_batch_size * self.ipu_config.batch_size_factor(for_inference=True),
)
self.eval_data_collator = data_collator_wrapper(self.eval_data_collator)
self.model = to_pipelined(model, self.ipu_config, force=force_to_pipelined)
self.model.parallelize(**self.ipu_config.parallelize_kwargs)
self.original_model = model
if not self.args.fp32:
self.model = self.model.half()
self.training_model = None
self.inference_model = None
self.compute_metrics = compute_metrics
self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
self.optimizer, self.lr_scheduler = optimizers
if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None):
raise RuntimeError(
"Passing a `model_init` is incompatible with providing the `optimizers` argument. "
"You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
)
if self.optimizer is not None and not isinstance(self.optimizer, poptorch.optim.Optimizer):
self.optimizer = self.pytorch_optimizer_to_poptorch(self.optimizer, model, self.model)
default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
self.callback_handler = CallbackHandler(
callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler
)
self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
# Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
self._loggers_initialized = False
# Create clone of distant repo and output directory if needed
if self.args.push_to_hub:
self.init_git_repo()
if self.args.should_save:
os.makedirs(self.args.output_dir, exist_ok=True)
if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
raise ValueError("`data_collator` should be a simple callable (function, class with `__call__`).")
if not callable(self.eval_data_collator) and callable(getattr(self.eval_data_collator, "collate_batch", None)):
raise ValueError("`eval_data_collator` should be a simple callable (function, class with `__call__`).")
if args.max_steps > 0:
logger.info("max_steps is given. It will override any value given in num_train_epochs")
if train_dataset is not None and not isinstance(train_dataset, collections.abc.Sized) and args.max_steps <= 0:
raise ValueError("train_dataset does not implement __len__. max_steps has to be specified")
self._signature_columns = None
# Label smoothing
if self.args.label_smoothing_factor != 0:
self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
else:
self.label_smoother = None
self.state = IPUTrainerState()
self.control = TrainerControl()
        # Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then
        # reset to 0 every time flos need to be logged
self.current_flos = 0
self.hp_search_backend = None
self.use_tune_checkpoints = False
default_label_names = find_labels(model.__class__)
self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)
# very last
self._memory_tracker.stop_and_update_metrics()
# If compile-only then compile and exit
if args.compile_only:
logger.info("Called with compile_only=True. Compiling models then exiting.")
if args.do_train:
train_dl = self.get_train_dataloader()
model = self.wrap_model(self.model)
try:
model_inputs = next(iter(train_dl))
except StopIteration:
raise ValueError(
"Couldn't get first sample from dataloader, please check for warnings "
"during dataloader construction."
)
self.compile_model(model, model_inputs, log=True)
if args.do_eval:
                # Same thing for evaluation, using _wrap_and_compile_model_for_evaluation
eval_dl = self.get_eval_dataloader()
model = self._wrap_and_compile_model_for_evaluation(eval_dl, False)
logger.info("Exiting after compiling models with compile_only=True")
sys.exit(0)
def pytorch_optimizer_to_poptorch(
self,
optimizer: optim.Optimizer,
model: Union[PreTrainedModel, nn.Module],
pipelined_model: Union[PreTrainedModel, nn.Module],
) -> poptorch.optim.Optimizer:
"""
Converts a PyTorch optimizer to a PopTorch optimizer.
Args:
optimizer (`torch.optim.Optimizer`):
The PyTorch optimizer to convert.
            model ([`transformers.PreTrainedModel`] or `torch.nn.Module`):
                The original model whose parameters the optimizer currently references.
            pipelined_model ([`transformers.PreTrainedModel`] or `torch.nn.Module`):
                The pipelined version of the model. Its parameters will be used by the PopTorch optimizer.
Returns:
`poptorch.optim.Optimizer`: The converted PopTorch optimizer.
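
        Example:

        A minimal sketch, assuming an already constructed `IPUTrainer` named `trainer` (the learning rate is
        illustrative):

        ```python
        >>> import torch

        >>> torch_optimizer = torch.optim.AdamW(trainer.original_model.parameters(), lr=1e-5)
        >>> poptorch_optimizer = trainer.pytorch_optimizer_to_poptorch(
        ...     torch_optimizer, trainer.original_model, trainer.model
        ... )
        ```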
"""
first_order_type = torch.float32 if self.args.fp32 else torch.float16
optimizer_kwargs = {
"loss_scaling": self.args.loss_scaling,
"accum_type": first_order_type,
"first_order_momentum_accum_type": first_order_type,
"second_order_momentum_accum_type": torch.float32,
}
        # TODO: disabled max_grad_norm because it makes things fail; fix it.
max_grad_norm = self.args.max_grad_norm
self.args.max_grad_norm = None
pytorch_to_poptorch_mapping = {
optim.SGD: (poptorch.optim.SGD, {"loss_scaling": self.args.loss_scaling}),
optim.Adam: (poptorch.optim.Adam, {"max_grad_norm": self.args.max_grad_norm, **optimizer_kwargs}),
optim.AdamW: (poptorch.optim.AdamW, {"max_grad_norm": self.args.max_grad_norm, **optimizer_kwargs}),
optim.RMSprop: (poptorch.optim.RMSprop, optimizer_kwargs),
}
self.args.max_grad_norm = max_grad_norm
poptorch_optimizer_cls, kwargs = pytorch_to_poptorch_mapping.get(optimizer.__class__, (None, {}))
if poptorch_optimizer_cls is None:
raise KeyError(f"Could not find a PopTorch counterpart for optimizer {optimizer.__class__.__name__}")
        # Use a dummy learning rate that should be overridden by the real value via .load_state_dict; pick an
        # absurd value so that it is obvious if the override did not happen.
dummy_lr = 1e4
poptorch_optimizer = poptorch_optimizer_cls(optimizer.param_groups, lr=dummy_lr, **kwargs)
poptorch_optimizer.load_state_dict({"ipu_state": None, "ipu_param": None, **optimizer.state_dict()})
# Currently poptorch_optimizer contains references to the original model parameters, so we need to change those
# to references to the pipelined model parameters.
id2name = {id(param): name for name, param in model.named_parameters()}
name2param = dict(pipelined_model.named_parameters())
for group in poptorch_optimizer.param_groups:
for idx, param in enumerate(group["params"]):
group["params"][idx] = name2param[id2name[id(param)]]
return poptorch_optimizer
def compile_model(
self,
model: poptorch.PoplarExecutor,
sample_batch: Union[Dict[str, torch.Tensor], Tuple[torch.Tensor]],
log: bool = False,
):
"""
Compiles the model with PopTorch.
Args:
model (`poptorch.PoplarExecutor`):
The model to compile (already wrapped).
sample_batch (`Dict[str, torch.Tensor]` or `Tuple[torch.Tensor]`):
The inputs to use for the compilation. This will set the input shapes that the compiled model can accept.
log (`bool`, *optional*, defaults to `False`):
If `True`, logs that the compilation is in progress.
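
        Example:

        A minimal sketch, assuming an `IPUTrainer` named `trainer` with a training dataset set up; this mirrors
        the `compile_only` path of `__init__`:

        ```python
        >>> train_dl = trainer.get_train_dataloader()
        >>> wrapped = trainer.wrap_model(trainer.model)
        >>> trainer.compile_model(wrapped, next(iter(train_dl)), log=True)
        ```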
"""
# Skipping compilation if the model was already compiled.
if model.isCompiled():
return
if log:
logger.info("Compiling Model...")
sample_batch = self._prepare_inputs(sample_batch)
start_compile = time.perf_counter()
if isinstance(sample_batch, tuple):
model.compile(*sample_batch)
else:
model.compile(**sample_batch)
duration_compilation = time.perf_counter() - start_compile
if log:
logger.info(f"Compiled/Loaded model in {duration_compilation} secs")
def add_callback(self, callback):
"""
        Adds a callback to the current list of [`~transformers.TrainerCallback`].
Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of [`~transformers.TrainerCallback`]. In the
first case, will instantiate a member of that class.
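
        Example:

        For instance, with a stock `transformers` callback (an illustrative sketch):

        ```python
        >>> from transformers import EarlyStoppingCallback

        >>> trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))
        ```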
"""
self.callback_handler.add_callback(callback)
def pop_callback(self, callback):
"""
        Removes a callback from the current list of [`~transformers.TrainerCallback`] and returns it.
If the callback is not found, returns `None` (and no error is raised).
Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of [`~transformers.TrainerCallback`]. In the
first case, will pop the first member of that class found in the list of callbacks.
Returns:
            [`~transformers.TrainerCallback`]: The callback removed, if found.
"""
return self.callback_handler.pop_callback(callback)
def remove_callback(self, callback):
"""
        Removes a callback from the current list of [`~transformers.TrainerCallback`].
Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of [`~transformers.TrainerCallback`]. In the
first case, will remove the first member of that class found in the list of callbacks.
"""
self.callback_handler.remove_callback(callback)
def _set_signature_columns_if_needed(self):
if self._signature_columns is None:
# Inspect model forward signature to keep only the arguments it accepts.
signature = inspect.signature(self.model.forward)
self._signature_columns = list(signature.parameters.keys())
# Labels may be named label or label_ids, the default data collator handles that.
self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
if not self.args.remove_unused_columns:
return dataset
self._set_signature_columns_if_needed()
signature_columns = self._signature_columns
ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
if len(ignored_columns) > 0:
dset_description = "" if description is None else f"in the {description} set "
logger.info(
f"The following columns {dset_description} don't have a corresponding argument in "
f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
)
columns = [k for k in signature_columns if k in dataset.column_names]
if version.parse(datasets.__version__) < version.parse("1.4.0"):
dataset.set_format(
type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"]
)
return dataset
else:
return dataset.remove_columns(ignored_columns)
def _get_collator_with_removed_columns(
self, data_collator: Callable, description: Optional[str] = None
) -> Callable:
"""Wraps the data collator in a callable removing unused columns."""
if not self.args.remove_unused_columns:
return data_collator
self._set_signature_columns_if_needed()
signature_columns = self._signature_columns
remove_columns_collator = RemoveColumnsCollator(
data_collator=data_collator,
signature_columns=signature_columns,
logger=logger,
description=description,
model_name=self.model.__class__.__name__,
)
return remove_columns_collator
def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
if not isinstance(self.train_dataset, collections.abc.Sized):
return None
generator = None
if _is_torch_generator_available:
generator = torch.Generator()
# for backwards compatibility, we generate a seed here (which is sampled from a generator seeded with
# `args.seed`) if data_seed isn't provided.
# Further on in this method, we default to `args.seed` instead.
if self.args.data_seed is None:
seed = int(torch.empty((), dtype=torch.int64).random_().item())
else:
seed = self.args.data_seed
generator.manual_seed(seed)
seed = self.args.data_seed if self.args.data_seed is not None else self.args.seed
combined_batch_size = self.args.per_device_train_batch_size * self.ipu_config.batch_size_factor()
# Build the sampler.
if self.args.group_by_length:
if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset):
lengths = (
self.train_dataset[self.args.length_column_name]
if self.args.length_column_name in self.train_dataset.column_names
else None
)
else:
lengths = None
model_input_name = self.tokenizer.model_input_names[0] if self.tokenizer is not None else None
return LengthGroupedSampler(
combined_batch_size,
dataset=self.train_dataset,
lengths=lengths,
model_input_name=model_input_name,
generator=generator,
)
else:
return RandomSampler(self.train_dataset)
def _check_dataset_can_fill_batch(self, dataset: torch.utils.data.Dataset, for_inference: bool = False) -> None:
replication_factor = (
self.ipu_config.inference_replication_factor if for_inference else self.ipu_config.replication_factor
)
gradient_accumulation_steps = 1 if for_inference else self.ipu_config.gradient_accumulation_steps
device_iterations = (
self.ipu_config.inference_device_iterations if for_inference else self.ipu_config.device_iterations
)
micro_batch_size = (
self.args.per_device_eval_batch_size if for_inference else self.args.per_device_train_batch_size
)
global_batch_size = micro_batch_size * replication_factor * gradient_accumulation_steps * device_iterations
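        # e.g. micro_batch_size=2, replication_factor=4, gradient_accumulation_steps=8 and device_iterations=16
        # give a global batch size of 2 * 4 * 8 * 16 = 1024 samples (illustrative values).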
try:
len(dataset)
except Exception:
            # If the length of the dataset cannot be determined, skip the checks.
return
if len(dataset) < global_batch_size:
mode_str = "inference_" if for_inference else ""
logger.warning(
f"The provided dataset is of length {len(dataset)}, but the total dataset batch size is {global_batch_size}. "
f"This batch size is calculated as:\n"
f" per_device_{'eval' if for_inference else 'train'}_batch_size={micro_batch_size}\n"
f"* {mode_str}{replication_factor=}\n"
f"* {mode_str}{gradient_accumulation_steps=}\n"
f"* {mode_str}{device_iterations=}\n"
"Please disregard this warning if you believe the dataset is reporting an incorrect length, such as 1."
)
def get_train_dataloader(self) -> poptorch.DataLoader:
"""
Returns the training `poptorch.DataLoader`.
Will not use a sampler if `train_dataset` does not implement `__len__` and will use a random sampler (adapted to distributed
training if necessary) otherwise.
Subclass and override this method if you want to inject some custom behavior.
"""
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
train_dataset = self.train_dataset
data_collator = self.data_collator
if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
train_dataset = self._remove_unused_columns(train_dataset, description="training")
else:
data_collator = self._get_collator_with_removed_columns(data_collator, description="training")
poptorch_specific_kwargs = {
"auto_distributed_partitioning": not isinstance(train_dataset, torch.utils.data.IterableDataset),
"mode": self.args.dataloader_mode,
"worker_init_fn": _WorkerInit(123),
}
if isinstance(train_dataset, torch.utils.data.IterableDataset):
return poptorch.DataLoader(
self.opts,
train_dataset,
batch_size=self.args.train_batch_size,
                collate_fn=data_collator,
num_workers=self.args.dataloader_num_workers,
drop_last=self.args.dataloader_drop_last,
pin_memory=self.args.dataloader_pin_memory,
**poptorch_specific_kwargs,
)
train_sampler = self._get_train_sampler()
combined_batch_size = self.args.per_device_train_batch_size * self.ipu_config.batch_size_factor()
rebatched_worker_size = (
2 * (combined_batch_size // self.args.dataloader_num_workers)
if self.args.dataloader_num_workers
else combined_batch_size
)
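        # e.g. combined_batch_size=64 with dataloader_num_workers=4 gives a rebatched worker size of
        # 2 * (64 // 4) = 32 (illustrative values).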
self._check_dataset_can_fill_batch(train_dataset, for_inference=False)
return poptorch.DataLoader(
self.opts,
train_dataset,
batch_size=self.args.per_device_train_batch_size,
sampler=train_sampler,
            collate_fn=data_collator,
num_workers=self.args.dataloader_num_workers,
pin_memory=self.args.dataloader_pin_memory,
rebatched_worker_size=rebatched_worker_size,
**poptorch_specific_kwargs,
)
def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]:
return SequentialSampler(eval_dataset)
def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> poptorch.DataLoader:
"""
Returns the evaluation `poptorch.DataLoader`.
Subclass and override this method if you want to inject some custom behavior.
Args:
eval_dataset (`torch.utils.data.Dataset`, *optional*):
If provided, will override `self.eval_dataset`. If it is a [`~datasets.Dataset`] dataset, the columns not accepted
by the `model.forward()` method are automatically removed. It must implement `__len__`.
"""
poptorch_specific_kwargs = {
"auto_distributed_partitioning": not isinstance(eval_dataset, torch.utils.data.IterableDataset),
"mode": DataLoaderMode.Sync,
"worker_init_fn": _WorkerInit(123),
}
if eval_dataset is None and self.eval_dataset is None:
raise ValueError("Trainer: evaluation requires an eval_dataset.")
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
data_collator = self.eval_data_collator
if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation")
else:
data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation")
if isinstance(eval_dataset, torch.utils.data.IterableDataset):
return poptorch.DataLoader(
self.eval_opts,
eval_dataset,
batch_size=self.args.per_device_eval_batch_size,
collate_fn=data_collator,
num_workers=self.args.dataloader_num_workers,
pin_memory=self.args.dataloader_pin_memory,
**poptorch_specific_kwargs,
)
eval_sampler = self._get_eval_sampler(eval_dataset)
self._check_dataset_can_fill_batch(eval_dataset, for_inference=True)
return poptorch.DataLoader(
self.eval_opts,
eval_dataset,
sampler=eval_sampler,
batch_size=self.args.per_device_eval_batch_size,
collate_fn=data_collator,
drop_last=self.args.dataloader_drop_last,
num_workers=self.args.dataloader_num_workers,
pin_memory=self.args.dataloader_pin_memory,
**poptorch_specific_kwargs,
)
def get_test_dataloader(self, test_dataset: Dataset) -> poptorch.DataLoader:
"""
Returns the test `poptorch.DataLoader`.
Subclass and override this method if you want to inject some custom behavior.
Args:
test_dataset (`torch.utils.data.Dataset`, *optional*):
The test dataset to use. If it is a [`~datasets.Dataset`] dataset, the columns not accepted by the
`model.forward()` method are automatically removed. It must implement `__len__`.
"""
poptorch_specific_kwargs = {
"auto_distributed_partitioning": not isinstance(test_dataset, torch.utils.data.IterableDataset),
"mode": DataLoaderMode.Sync,
"worker_init_fn": _WorkerInit(123),
}
data_collator = self.eval_data_collator
if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
test_dataset = self._remove_unused_columns(test_dataset, description="test")
else:
data_collator = self._get_collator_with_removed_columns(data_collator, description="test")
if isinstance(test_dataset, torch.utils.data.IterableDataset):
return poptorch.DataLoader(
self.eval_opts,
test_dataset,
batch_size=self.args.per_device_eval_batch_size,
collate_fn=data_collator,
num_workers=self.args.dataloader_num_workers,
pin_memory=self.args.dataloader_pin_memory,
**poptorch_specific_kwargs,
)
test_sampler = self._get_eval_sampler(test_dataset)
self._check_dataset_can_fill_batch(test_dataset, for_inference=True)
# We use the same batch_size as for eval.
return poptorch.DataLoader(
self.eval_opts,
test_dataset,
sampler=test_sampler,
batch_size=self.args.per_device_eval_batch_size,
collate_fn=data_collator,
drop_last=self.args.dataloader_drop_last,
pin_memory=self.args.dataloader_pin_memory,
**poptorch_specific_kwargs,
)
def create_optimizer_and_scheduler(self, num_training_steps: int):
"""
Sets up the optimizer and the learning rate scheduler.
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
`create_scheduler`) in a subclass.
"""
self.create_optimizer()
self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
def create_optimizer(self):
"""
Sets up the optimizer.
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
trainer's init through `optimizers`, or subclass and override this method in a subclass.
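
        Example:

        A hedged sketch of overriding this method in a subclass to use a different PopTorch optimizer (SGD here
        is an illustrative choice, not the default):

        ```python
        >>> class MyIPUTrainer(IPUTrainer):
        ...     def create_optimizer(self):
        ...         if self.optimizer is None:
        ...             self.optimizer = poptorch.optim.SGD(
        ...                 self.model.parameters(),
        ...                 lr=self.args.learning_rate,
        ...                 loss_scaling=self.args.loss_scaling,
        ...             )
        ...         return self.optimizer
        ```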
"""
if self.optimizer is None:
decay_parameters = get_parameter_names(self.model, [nn.LayerNorm])
decay_parameters = {name for name in decay_parameters if "bias" not in name}
if self.args.lamb or self.args.lamb_no_bias_correction:
bias_parameters = {n for n, _ in self.model.named_parameters() if "bias" in n}
optimizer_grouped_parameters = [
{
"params": [
p for n, p in self.model.named_parameters() if (n in decay_parameters and p.requires_grad)
],
"weight_decay": self.args.weight_decay,
},
{
# Disable LAMB updates for bias parameters
"params": [
p for n, p in self.model.named_parameters() if (n in bias_parameters and p.requires_grad)
],
"weight_decay": 0.0,
"max_weight_norm": 0.0,
},
{
"params": [
p
for n, p in self.model.named_parameters()
if n not in decay_parameters and n not in bias_parameters and p.requires_grad
],
"weight_decay": 0.0,
},
]
optimizer_cls = LAMB
optimizer_kwargs = {
"max_weight_norm": None,
"bias_correction": not self.args.lamb_no_bias_correction,
"eps": self.args.adam_epsilon,
}
else:
optimizer_grouped_parameters = [
{
"params": [
p for n, p in self.model.named_parameters() if (n in decay_parameters and p.requires_grad)
],
"weight_decay": self.args.weight_decay,
},
{
"params": [
p
for n, p in self.model.named_parameters()
if (n not in decay_parameters and p.requires_grad)
],
"weight_decay": 0.0,
},
]
optimizer_cls = AdamW
optimizer_kwargs = {
                    # TODO: disabled max_grad_norm because it makes things fail; fix it.
# "max_grad_norm": self.args.max_grad_norm,
"betas": (self.args.adam_beta1, self.args.adam_beta2),
"eps": self.args.adam_epsilon,
"bias_correction": False,
}
first_order_type = torch.float32 if self.args.fp32 else torch.float16
optimizer_kwargs["lr"] = self.args.learning_rate
optimizer_kwargs["loss_scaling"] = self.args.loss_scaling
optimizer_kwargs["accum_type"] = first_order_type
optimizer_kwargs["first_order_momentum_accum_type"] = first_order_type
optimizer_kwargs["second_order_momentum_accum_type"] = torch.float32
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
if self.args.lamb or self.args.lamb_no_bias_correction:
self.optimizer.variable_attrs.markAsConstant("max_weight_norm")
self.optimizer.variable_attrs.markAsConstant("weight_decay")
return self.optimizer
def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
"""
Sets up the scheduler. The optimizer of the trainer must have been set up either before this method is called or is
passed as an argument.
Args:
num_training_steps (int): The number of training steps to execute.
"""
optimizer = self.optimizer if optimizer is None else optimizer
if self.lr_scheduler is None:
self.lr_scheduler = get_scheduler(
self.args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
optimizer._step_count = 1
elif isinstance(self.lr_scheduler, functools.partial):
self.lr_scheduler = self.lr_scheduler(optimizer)
return self.lr_scheduler
def num_examples(self, dataloader: poptorch.DataLoader) -> int:
"""
Returns the number of samples in a `poptorch.DataLoader` object by accessing its dataset. When
        `poptorch.DataLoader.dataset` does not exist or has no length, returns the best estimate it can.
"""
return len(dataloader.dataset)
def wrap_model(self, model: Union[PreTrainedModel, PoplarExecutor], training=True) -> PoplarExecutor:
"""
Wraps a model for PopTorch, either for training or for inference.
Args:
model ([`transformers.PreTrainedModel`] or `poptorch.PoplarExecutor`):
The model to wrap.
training (`bool`, *optional*, defaults to `True`):
                If `True`, wraps the model for training. If `False`, wraps it for inference.
Returns:
`poptorch.PoplarExecutor`: The wrapped model.
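
        Example:

        A minimal sketch; note that switching between training and inference wrapping re-parallelizes the model:

        ```python
        >>> training_model = trainer.wrap_model(trainer.model)  # poptorch.trainingModel
        >>> inference_model = trainer.wrap_model(trainer.model, training=False)  # poptorch.inferenceModel
        ```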
"""
wrapped = None
if isinstance(model, PoplarExecutor):
wrapped = model
elif training:
if self.training_model is None:
model.deparallelize()
model.ipu_config.train()
model.parallelize(**model.ipu_config.parallelize_kwargs)
self.create_optimizer()
self.training_model = poptorch.trainingModel(
model.train(), options=self.opts, optimizer=self.optimizer
)
wrapped = self.training_model
else:
if self.inference_model is None:
model.deparallelize()
model.ipu_config.eval()
model.parallelize(**model.ipu_config.inference_parallelize_kwargs)
self.inference_model = poptorch.inferenceModel(model.eval(), options=self.eval_opts)
wrapped = self.inference_model
        # Attach to the device when the model being accessed was already compiled but was detached in a previous loop.
if wrapped.isCompiled() and not wrapped.isAttachedToDevice():
wrapped.attachToDevice()
return wrapped
def _detach_training_model(self):
"""
Detach the training model from IPUs.
"""
self.training_model.detachFromDevice()
def _detach_inference_model(self):
"""
Detach the inference model from IPUs.
"""
self.inference_model.detachFromDevice()
def _reattach_training_model(self):
"""
Reattach the training model to IPUs.
"""
self.training_model.attachToDevice()
def train(
self,
resume_from_checkpoint: Optional[Union[str, bool]] = None,
trial: Union["optuna.Trial", Dict[str, Any]] = None,
ignore_keys_for_eval: Optional[List[str]] = None,
**kwargs,
):
"""
Main training entry point.
Args:
resume_from_checkpoint (`str` or `bool`, *optional*):
Indicates that training will resume from the model, optimizer or
scheduler states loaded here. If `str`, local path to a saved
checkpoint as saved by a previous instance of [`IPUTrainer`]. If
`bool` and `True`, load the last checkpoint in *args.output_dir*
as saved by a previous instance of [`IPUTrainer`].
trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
The trial run or the hyperparameter dictionary for a
hyperparameter search. **Note**: Feature not supported.
            ignore_keys_for_eval (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a
dictionary) that should be ignored when gathering predictions
for evaluation during the training.
kwargs:
Additional keyword arguments used to hide deprecated arguments.
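
        Example:

        Typical calls, as a sketch:

        ```python
        >>> trainer.train()  # train from the model passed at init
        >>> trainer.train(resume_from_checkpoint=True)  # resume from the last checkpoint in args.output_dir
        ```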
"""
if resume_from_checkpoint is False:
resume_from_checkpoint = None
# memory metrics - must set up as early as possible
self._memory_tracker.start()
args = self.args
self.is_in_train = True
if "model_path" in kwargs:
resume_from_checkpoint = kwargs.pop("model_path")
warnings.warn(
"`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` "
"instead.",
FutureWarning,
)
if len(kwargs) > 0:
raise TypeError(f"train() received got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.")
# Load potential model checkpoint
if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
resume_from_checkpoint = get_last_checkpoint(args.output_dir)
if resume_from_checkpoint is None:
raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")
if resume_from_checkpoint is not None:
self._load_from_checkpoint(resume_from_checkpoint)
return self._inner_training_loop(
args=args, resume_from_checkpoint=resume_from_checkpoint, ignore_keys_for_eval=ignore_keys_for_eval
)
def _inner_training_loop(
self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
# For now, it will always be None.
if batch_size is None:
batch_size = args.per_device_train_batch_size
# Data loader and number of training steps
train_dataloader = self.get_train_dataloader()
# Setting up training control variables:
# number of training epochs: num_train_epochs
# number of training steps per epoch: num_update_steps_per_epoch
# total number of training steps to execute: max_steps
total_train_batch_size = batch_size * self.ipu_config.batch_size_factor()
len_dataloader = None
if has_length(train_dataloader):
# No need to divide by the number of gradient accumulation steps as poptorch already accounts for that.
len_dataloader = len(train_dataloader)
num_update_steps_per_epoch = len_dataloader
num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
            if args.max_steps > 0:
                max_steps = args.max_steps
                num_train_epochs = args.max_steps // num_update_steps_per_epoch + int(
                    args.max_steps % num_update_steps_per_epoch > 0
                )
                # May be slightly incorrect if the last batch in the training dataloader has a smaller size, but
                # it's the best we can do.
                num_train_samples = args.max_steps * total_train_batch_size
            else:
                max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
                num_train_epochs = math.ceil(args.num_train_epochs)
                num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size
max_steps = args.max_steps
# Setting a very large number of epochs so we go as many times as necessary over the iterator.
num_train_epochs = sys.maxsize
num_update_steps_per_epoch = max_steps
num_train_samples = args.max_steps * total_train_batch_size
else:
raise ValueError(
"args.max_steps must be set to a positive value if dataloader does not have a length, was"
f" {args.max_steps}"
)
if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
debug_overflow = DebugUnderflowOverflow(self.model) # noqa
self.state = IPUTrainerState()
if trial is not None:
raise ValueError("Hyperparameter tuning is not supported by the IPUTrainer.")
trial = None
self.state.is_hyper_param_search = trial is not None
self.training_model = self.wrap_model(self.model)
self.create_scheduler(num_training_steps=max_steps)
# TODO: handle optimizer and scheduler creation
# if delay_optimizer_creation:
# self.create_optimizer_and_scheduler(num_training_steps=max_steps)
# Check if saved optimizer or scheduler states exist
self._load_optimizer_and_scheduler(resume_from_checkpoint)
try:
model_inputs = next(iter(train_dataloader))
except StopIteration:
raise ValueError(
"Couldn't get first sample from dataloader, please check for warnings "
"during dataloader construction."
)
self.compile_model(self.training_model, model_inputs, log=True)
# Train!
num_examples = (
self.num_examples(train_dataloader)
if has_length(train_dataloader)
else total_train_batch_size * args.max_steps
)
logger.info("***** Running training *****")
logger.info(f" Num examples = {num_examples}")
logger.info(f" Num epochs = {num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {batch_size}")
logger.info(
f" Total training batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}"
)
logger.info(f" Gradient accumulation steps = {self.ipu_config.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {max_steps}")
self.state.epoch = 0
start_time = time.time()
epochs_trained = 0
steps_trained_in_current_epoch = 0
steps_trained_progress_bar = None
# Check if continuing training from a checkpoint
if resume_from_checkpoint is not None and os.path.isfile(
os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
):
self.state = IPUTrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
if self.state.start_time < 0:
self.state.start_time = start_time
start_time = self.state.start_time
epochs_trained = self.state.global_step // num_update_steps_per_epoch
if not args.ignore_data_skip:
steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
# No need to multiply by the number of gradient accumulation steps as poptorch already accounts for that.
# steps_trained_in_current_epoch *= self.ipu_config.gradient_accumulation_steps
else:
steps_trained_in_current_epoch = 0
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(f" Continuing training from epoch {epochs_trained}")
logger.info(f" Continuing training from global step {self.state.global_step}")
if not args.ignore_data_skip:
logger.info(
f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} "
"batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` "
"flag to your launch command, but you will resume the training on data already seen by your model."
)
                if not args.disable_tqdm:
steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch)
steps_trained_progress_bar.set_description("Skipping the first batches")
# Update the references
self.callback_handler.model = self.model
self.callback_handler.optimizer = self.optimizer
self.callback_handler.lr_scheduler = self.lr_scheduler
self.callback_handler.train_dataloader = train_dataloader
self.state.trial_name = None
self.state.trial_params = None
# This should be the same if the state has been saved but in case the training arguments changed, it's safer
# to set this after the load.
self.state.max_steps = max_steps
self.state.num_train_epochs = num_train_epochs
self.state.start_time = start_time
        # tr_loss is a tensor to avoid device synchronization through .item()
tr_loss = torch.tensor(0.0).to(args.device)
        # _total_loss_scalar is updated every time .item() has to be called on tr_loss and stores the sum of all losses
self._total_loss_scalar = 0.0
self._globalstep_last_logged = self.state.global_step
self.control = self.callback_handler.on_train_begin(args, self.state, self.control)
# Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
if not args.ignore_data_skip:
for epoch in range(epochs_trained):
is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance(
train_dataloader.sampler, RandomSampler
)
if is_torch_less_than_1_11 or not is_random_sampler:
                    # We just need to begin an iteration to create the randomization of the sampler.
                    # This was sufficient before PyTorch 1.11.
for _ in train_dataloader:
break
else:
                    # Otherwise we need to iterate over the whole sampler, because a random operation is
                    # added at the very end.
_ = list(train_dataloader.sampler)
for epoch in range(epochs_trained, num_train_epochs):
if isinstance(train_dataloader, poptorch.DataLoader) and isinstance(
train_dataloader.sampler, DistributedSampler
):
train_dataloader.sampler.set_epoch(epoch)
elif isinstance(train_dataloader.dataset, IterableDatasetShard):
train_dataloader.dataset.set_epoch(epoch)
epoch_iterator = train_dataloader
# Reset the past mems state at the beginning of each epoch if necessary.
if args.past_index >= 0:
self._past = None
steps_in_epoch = (
len(epoch_iterator)
if has_length(train_dataloader)
else args.max_steps * self.ipu_config.gradient_accumulation_steps
)
self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
self._load_rng_state(resume_from_checkpoint)
step = -1
for step, inputs in enumerate(epoch_iterator):
# Skip past any already trained steps if resuming training
if steps_trained_in_current_epoch > 0:
steps_trained_in_current_epoch -= 1
if steps_trained_progress_bar is not None:
steps_trained_progress_bar.update(1)
if steps_trained_in_current_epoch == 0:
self._load_rng_state(resume_from_checkpoint)
continue
elif steps_trained_progress_bar is not None:
steps_trained_progress_bar.close()
steps_trained_progress_bar = None
self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
tr_loss_step = self.training_step(self.training_model, inputs)
if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)):
# if loss is nan or inf simply add the average of previous logged losses
tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
else:
tr_loss += tr_loss_step
# TODO: see how to enable this (if necessary), slows down training a lot.
self.current_flos += float(self.floating_point_ops(inputs))
# Optimizer step
optimizer_was_run = True
if optimizer_was_run:
self.lr_scheduler.step()
self.training_model.setOptimizer(self.optimizer)
self.state.global_step += 1
self.state.epoch = epoch + (step + 1) / steps_in_epoch
self.control = self.callback_handler.on_step_end(args, self.state, self.control)
self._maybe_log_save_evaluate(tr_loss, self.training_model, epoch, ignore_keys_for_eval)
if self.control.should_epoch_stop or self.control.should_training_stop:
break
if step < 0:
logger.warning(
"There seems to be not a single sample in your epoch_iterator, stopping training at step"
f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
f" num_steps ({max_steps}) higher than the number of available samples."
)
self.control.should_training_stop = True
self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
self._maybe_log_save_evaluate(tr_loss, self.training_model, epoch, ignore_keys_for_eval)
if self.control.should_training_stop:
break
if args.past_index and hasattr(self, "_past"):
# Clean the state at the end of training
delattr(self, "_past")
logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
self._load_best_model()
# add remaining tr_loss
self._total_loss_scalar += tr_loss.item()
train_loss = self._total_loss_scalar / self.state.global_step
metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
self.store_flos()
metrics["total_flos"] = self.state.total_flos
metrics["train_loss"] = train_loss
self.is_in_train = False
self._memory_tracker.stop_and_update_metrics(metrics)
self.log(metrics)
self.control = self.callback_handler.on_train_end(args, self.state, self.control)
# Detaching model from device to let the inference model attach itself
self._detach_training_model()
return TrainOutput(self.state.global_step, train_loss, metrics)
def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
if model is None:
model = self.model
config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME)
weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME)
weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
adapter_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME)
if not any(
os.path.isfile(f)
for f in [
weights_file,
weights_index_file,
adapter_weights_file,
]
):
raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")
logger.info(f"Loading model from {resume_from_checkpoint}.")
if os.path.isfile(config_file):
config = PretrainedConfig.from_json_file(config_file)
checkpoint_version = config.transformers_version
if checkpoint_version is not None and checkpoint_version != __version__:
logger.warning(
f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
f"Transformers but your current version is {__version__}. This is not recommended and could "
"yield to errors or unwanted behavior."
)
if os.path.isfile(weights_file):
# We load the model state dict on the CPU to avoid an OOM error.
state_dict = torch.load(weights_file, map_location="cpu")
# workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
# which takes *args instead of **kwargs
load_result = model.load_state_dict(state_dict, False)
# release memory
del state_dict
self._issue_warnings_after_load(load_result)
        # Load adapters following PR #24096 (transformers > 4.29.2)
elif isinstance(model, PeftModel):
# If training a model using PEFT & LoRA, assume that adapter has been saved properly.
if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
if os.path.exists(resume_from_checkpoint):
model.load_adapter(resume_from_checkpoint, model.active_adapter)
else:
logger.warning(
"The intermediate checkpoints of PEFT may not be saved correctly, "
f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. "
"Check some examples here: https://github.com/huggingface/peft/issues/96"
)
else:
logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed")
def _load_best_model(self):
logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
model = self.model
best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME)
if os.path.exists(best_model_path) or os.path.exists(best_adapter_model_path):
if isinstance(model, PeftModel):
# If training a model using PEFT & LoRA, assume that adapter has been saved properly.
if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"):
if os.path.exists(best_adapter_model_path):
model.load_adapter(self.state.best_model_checkpoint, model.active_adapter)
else:
# We load the model state dict on the CPU to avoid an OOM error.
state_dict = torch.load(best_model_path, map_location="cpu")
self._load_state_dict_in_model(state_dict)
else:
logger.warning(
f"Could not locate the best model at {best_model_path}. If you are running a distributed training "
"on multiple nodes, you should activate `--save_on_each_node`."
)
def _load_state_dict_in_model(self, state_dict):
self.model.deparallelize()
load_result = self.model.load_state_dict(state_dict, strict=False)
        self.model.parallelize(**self.model.ipu_config.parallelize_kwargs)
if not self.args.fp32:
self.model.half()
if len(load_result.missing_keys) != 0:
if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) != set(
self.model._keys_to_ignore_on_save
):
logger.warn(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
if len(load_result.unexpected_keys) != 0:
logger.warn(f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}.")
def _issue_warnings_after_load(self, load_result):
if len(load_result.missing_keys) != 0:
if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
self.model._keys_to_ignore_on_save
):
self.model.tie_weights()
else:
logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
if len(load_result.unexpected_keys) != 0:
logger.warning(
f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
)
def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval):
if self.control.should_log:
logs: Dict[str, float] = {}
tr_loss_scalar = tr_loss.mean().item()
# reset tr_loss to zero
tr_loss -= tr_loss
logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
logs["learning_rate"] = self._get_learning_rate()
self._total_loss_scalar += tr_loss_scalar
self._globalstep_last_logged = self.state.global_step
self.store_flos()
self.log(logs)
metrics = None
if self.control.should_evaluate:
self._detach_training_model()
if isinstance(self.eval_dataset, dict):
for eval_dataset_name, eval_dataset in self.eval_dataset.items():
metrics = self.evaluate(
eval_dataset=eval_dataset,
ignore_keys=ignore_keys_for_eval,
metric_key_prefix=f"eval_{eval_dataset_name}",
)
else:
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
self._reattach_training_model()
if self.control.should_save:
self._save_checkpoint(model, metrics=metrics)
self.control = self.callback_handler.on_save(self.args, self.state, self.control)
def _load_rng_state(self, checkpoint):
# Load RNG states from `checkpoint`
if checkpoint is None:
return
# TODO: validate that.
local_rank = -1
if local_rank != -1:
rng_file = os.path.join(checkpoint, f"rng_state_{local_rank}.pth")
            if not os.path.isfile(rng_file):
logger.info(
f"Didn't find an RNG file for process {local_rank}. If you are resuming a training that "
"wasn't launched in a distributed fashion, reproducibility is not guaranteed."
)
return
else:
rng_file = os.path.join(checkpoint, "rng_state.pth")
if not os.path.isfile(rng_file):
logger.info(
"Didn't find an RNG file. If you are resuming a training that was launched in a distributed "
"fashion, reproducibility is not guaranteed."
)
return
checkpoint_rng_state = torch.load(rng_file)
random.setstate(checkpoint_rng_state["python"])
np.random.set_state(checkpoint_rng_state["numpy"])
torch.random.set_rng_state(checkpoint_rng_state["cpu"])
# TODO: enable this when SDK 2.5 is out.
# self.training_model.rng_state = checkpoint_rng_state["ipu"]
def _save_checkpoint(self, model, metrics=None):
# Save model checkpoint
checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
run_dir = self.args.output_dir
self.store_flos()
output_dir = os.path.join(run_dir, checkpoint_folder)
self.save_model(output_dir, _internal_call=True)
if self.args.should_save:
torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
with warnings.catch_warnings(record=True) as caught_warnings:
torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
reissue_pt_warnings(caught_warnings)
# Determine the new best metric / best model checkpoint
if metrics is not None and self.args.metric_for_best_model is not None:
metric_to_check = self.args.metric_for_best_model
if not metric_to_check.startswith("eval_"):
metric_to_check = f"eval_{metric_to_check}"
metric_value = metrics[metric_to_check]
operator = np.greater if self.args.greater_is_better else np.less
if (
self.state.best_metric is None
or self.state.best_model_checkpoint is None
or operator(metric_value, self.state.best_metric)
):
self.state.best_metric = metric_value
self.state.best_model_checkpoint = output_dir
# Save the Trainer state
if self.args.should_save:
self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
# Save RNG state in non-distributed training
rng_states = {
"python": random.getstate(),
"numpy": np.random.get_state(),
"cpu": torch.random.get_rng_state(),
# TODO: enable this when SDK 2.5 is out.
# "ipu": self.training_model.rng_state,
}
# A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may
# not yet exist.
os.makedirs(output_dir, exist_ok=True)
torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
if self.args.push_to_hub:
self._push_from_checkpoint(output_dir)
# Maybe delete some older checkpoints.
if self.args.should_save:
self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
def _load_optimizer_and_scheduler(self, checkpoint):
"""If optimizer and scheduler states exist, load them."""
if checkpoint is None:
return
if os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) and os.path.isfile(
os.path.join(checkpoint, SCHEDULER_NAME)
):
self.optimizer.load_state_dict(torch.load(os.path.join(checkpoint, OPTIMIZER_NAME)))
with warnings.catch_warnings(record=True) as caught_warnings:
self.lr_scheduler.load_state_dict(torch.load(os.path.join(checkpoint, SCHEDULER_NAME)))
reissue_pt_warnings(caught_warnings)
self.training_model.setOptimizer(self.optimizer)
def log(self, logs: Dict[str, float]) -> None:
"""
Log `logs` on the various objects watching the training.
Subclass and override this method to inject custom behavior.
Args:
logs (`Dict[str, float]`):
The values to log.
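Example (a minimal sketch of a subclass that filters what gets logged; trainer construction is omitted):
```python
from optimum.graphcore import IPUTrainer

class QuietTrainer(IPUTrainer):
    def log(self, logs):
        # Drop the learning rate from the logs before they reach the callbacks.
        logs = {k: v for k, v in logs.items() if k != "learning_rate"}
        super().log(logs)
```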
"""
if self.state.epoch is not None:
logs["epoch"] = round(self.state.epoch, 2)
output = {**logs, **{"step": self.state.global_step}}
self.state.log_history.append(output)
self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]:
"""
Prepares a single data sample before feeding it to the model.
The data sample can be a tensor or a nested list or dictionary of tensors.
"""
if isinstance(data, dict):
return type(data)(**{k: self._prepare_input(v) for k, v in data.items()})
elif isinstance(data, (tuple, list)):
return type(data)(self._prepare_input(v) for v in data)
elif isinstance(data, torch.Tensor):
# No device placement is needed here: poptorch takes care of moving tensors to the IPU.
return data
return data
def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
"""
Prepares inputs before feeding them to the model.
This method recursively prepares the inputs and handles potential past state (such as `mems`) via `self.args.past_index`.
"""
inputs = self._prepare_input(inputs)
if len(inputs) == 0:
raise ValueError(
"The batch received was empty. Your model won't be able to train on it. Double-check that your "
f"training dataset contains the keys expected by the model: {','.join(self._signature_columns)}."
)
if self.args.past_index >= 0 and self._past is not None:
inputs["mems"] = self._past
return inputs
def training_step(
self, model: poptorch.PoplarExecutor, inputs: Dict[str, Union[torch.Tensor, Any]]
) -> torch.Tensor:
"""
Performs a training step on a batch of inputs.
Subclass and override to inject custom behavior.
Args:
model (`poptorch.PoplarExecutor`):
The model to train.
inputs (`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument `labels`. Check your model's documentation for all accepted arguments.
Return:
`torch.Tensor`: The tensor with the training loss on this batch.
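Example (a minimal sketch of an override; on IPU the forward and backward passes run on device inside `model(**inputs)`, so an override here mainly adds host-side behavior such as logging):
```python
from optimum.graphcore import IPUTrainer

class VerboseTrainer(IPUTrainer):
    def training_step(self, model, inputs):
        loss = super().training_step(model, inputs)
        # Print the loss every 100 optimization steps.
        if self.state.global_step % 100 == 0:
            print(f"step {self.state.global_step}: loss = {loss.item():.4f}")
        return loss
```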
"""
inputs = self._prepare_inputs(inputs)
loss = self.compute_loss(model, inputs)
loss = loss.mean()
return loss
def compute_loss(self, model, inputs, return_outputs=False):
"""
Computes the loss on a batch of training inputs.
Args:
model:
The model to train.
inputs:
The inputs and targets of the model.
return_outputs (defaults to `False`):
If `True`, returns the outputs with the loss. If `False`, only returns the loss.
By default, all models return the loss in the first element.
Subclass and override for custom behavior.
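Example (illustrative usage only; `trainer`, `model` and `inputs` are placeholders for objects from an existing training setup):
```python
# Only the loss:
loss = trainer.compute_loss(model, inputs)

# The loss together with the raw model outputs:
loss, outputs = trainer.compute_loss(model, inputs, return_outputs=True)
```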
"""
if self.label_smoother is not None and "labels" in inputs:
labels = inputs.pop("labels")
else:
labels = None
outputs = model(**inputs)
# Save past state if it exists
# TODO: this needs to be fixed and made cleaner later.
if self.args.past_index >= 0:
self._past = outputs[self.args.past_index]
if labels is not None:
loss = self.label_smoother(outputs, labels)
else:
if isinstance(outputs, dict) and "loss" not in outputs:
raise ValueError(
"The model did not return a loss from the inputs, only the following keys: "
f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
)
# We don't use .loss here since the model may return tuples instead of ModelOutput.
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
return (loss, outputs) if return_outputs else loss
def is_world_process_zero(self) -> bool:
# Needed only because log_metrics uses it.
return True
def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
"""
Saves the model, so you can reload it using `from_pretrained()`.
Will only save the model from the main process.
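Example (a minimal sketch; `trainer` is an existing `IPUTrainer` and the Auto class is a placeholder for whatever matches your model):
```python
trainer.save_model("./my-checkpoint")

# Reload later with `from_pretrained()`:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./my-checkpoint")
```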
"""
if output_dir is None:
output_dir = self.args.output_dir
if self.args.should_save:
self._save(output_dir)
# Push to the Hub when `save_model` is called by the user.
if self.args.push_to_hub and not _internal_call:
self.push_to_hub(commit_message="Model save")
def _save(self, output_dir: Optional[str] = None, state_dict=None):
# If we are executing this function, we are the process zero, so we don't check for that.
output_dir = output_dir if output_dir is not None else self.args.output_dir
os.makedirs(output_dir, exist_ok=True)
logger.info(f"Saving model checkpoint to {output_dir}")
# Updating self.model weights with the weights stored on device.
# TODO: can this be deleted? It would make things faster.
if self.training_model is not None and self.training_model.isAttachedToDevice():
self.training_model.copyWeightsToHost()
# Save a trained model and configuration using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
if not isinstance(self.model, (PreTrainedModel, PeftModel)):
logger.info(
"Trainer.model is not a `transformers.PreTrainedModel` or `peft.PeftModel`, only saving its state dict."
)
if state_dict is None:
state_dict = self.model.state_dict()
torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
else:
rng_state = torch.random.get_rng_state()
self.model.deparallelize()
self.model.save_pretrained(output_dir, state_dict=state_dict)
self.model.parallelize(**self.model.ipu_config.parallelize_kwargs)
torch.random.set_rng_state(rng_state)
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
self.ipu_config.save_pretrained(output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
def store_flos(self):
# Storing the number of floating-point operations that went into the model
# TODO: Validate that this is right. (It's most likely wrong)
self.state.total_flos += self.current_flos * self.ipu_config.batch_size_factor()
self.current_flos = 0
def _sorted_checkpoints(
self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False
) -> List[str]:
ordering_and_checkpoint_path = []
glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*")]
for path in glob_checkpoints:
if use_mtime:
ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
else:
regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
if regex_match is not None and regex_match.groups() is not None:
ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
checkpoints_sorted = sorted(ordering_and_checkpoint_path)
checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
# Make sure we don't delete the best model.
if self.state.best_model_checkpoint is not None:
best_model_index = checkpoints_sorted.index(str(Path(self.state.best_model_checkpoint)))
for i in range(best_model_index, len(checkpoints_sorted) - 2):
checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i]
return checkpoints_sorted
def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
if self.args.save_total_limit is None or self.args.save_total_limit <= 0:
return
# Check if we should delete older checkpoint(s)
checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir)
if len(checkpoints_sorted) <= self.args.save_total_limit:
return
# If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which
# we don't do to allow resuming.
save_total_limit = self.args.save_total_limit
if (
self.state.best_model_checkpoint is not None
and self.args.save_total_limit == 1
and checkpoints_sorted[-1] != self.state.best_model_checkpoint
):
save_total_limit = 2
number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
for checkpoint in checkpoints_to_be_deleted:
logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
shutil.rmtree(checkpoint)
def evaluate(
self,
eval_dataset: Optional[Dataset] = None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
) -> Dict[str, float]:
"""
Runs an evaluation and returns metrics.
The calling script will be responsible for providing a method to compute the metrics, as they are task-dependent
(pass it to the init `compute_metrics` argument).
You can also subclass and override this method to inject custom behavior.
Args:
eval_dataset (`Dataset`, *optional*):
Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`] dataset, the columns
not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
method.
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For example the metric "bleu" will be named
"eval_bleu" if the prefix is "eval" (default)
Returns:
A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
dictionary also contains the epoch number which comes from the training state.
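Example (a minimal sketch assuming the usual `Trainer`-style constructor; `model`, `ipu_config` and `eval_ds` are placeholders for objects created elsewhere):
```python
from optimum.graphcore import IPUTrainer, IPUTrainingArguments

args = IPUTrainingArguments(output_dir="./out", per_device_eval_batch_size=4)
trainer = IPUTrainer(model=model, ipu_config=ipu_config, args=args, eval_dataset=eval_ds)
metrics = trainer.evaluate()
print(metrics["eval_loss"])
```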
"""
# memory metrics - must set up as early as possible
self._memory_tracker.start()
eval_dataloader = self.get_eval_dataloader(eval_dataset)
prediction_loss_only = True if self.compute_metrics is None else self.args.prediction_loss_only
# Running this here (even though it is called again in self.evaluation_loop) makes compilation happen now,
# so that compilation time does not skew the inference speed metrics.
_ = self._wrap_and_compile_model_for_evaluation(eval_dataloader, prediction_loss_only)
start_time = time.time()
output = self.evaluation_loop(
eval_dataloader,
description="Evaluation",
# No point gathering the predictions if there are no metrics, otherwise we defer to
# self.args.prediction_loss_only
prediction_loss_only=prediction_loss_only,
ignore_keys=ignore_keys,
metric_key_prefix=metric_key_prefix,
)
# If we are using the padded data collator, drop the padded part of the output.
if self.args.pad_on_batch_axis:
eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
dataset_len = len(eval_dataset)
output = output._replace(predictions=tuple(pred[:dataset_len] for pred in output.predictions))
output = output._replace(num_samples=dataset_len)
total_batch_size = self.args.per_device_eval_batch_size * self.ipu_config.batch_size_factor(for_inference=True)
output.metrics.update(
speed_metrics(
metric_key_prefix,
start_time,
num_samples=output.num_samples,
num_steps=math.ceil(output.num_samples / total_batch_size),
)
)
self.log(output.metrics)
self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)
self._memory_tracker.stop_and_update_metrics(output.metrics)
return output.metrics
def predict(
self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test"
) -> PredictionOutput:
"""
Returns predictions and potential metrics.
Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
will also return metrics, like in `evaluate()`.
Args:
test_dataset (`Dataset`):
Dataset to run the predictions on. If it is a `datasets.Dataset`, the columns not accepted by the
`model.forward()` method are automatically removed. It must implement the `__len__` method.
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a dictionary) that should be ignored when
gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"test"`):
An optional prefix to be used as the metrics key prefix. For example the metric "bleu" will be named
"test_bleu" if the prefix is "test" (default)
<Tip>
If your predictions or labels have different sequence lengths (for instance because you're doing dynamic padding
in a token classification task) the predictions will be padded (on the right) to allow for concatenation into
one array. The padding index is -100.
</Tip>
Returns: *NamedTuple* A namedtuple with the following keys:
- predictions (`np.ndarray`): The predictions on `test_dataset`.
- label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
- metrics (`Dict[str, float]`, *optional*): The dictionary of potential metrics (if the dataset contained
labels).
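Example (a minimal sketch for a classification task; `trainer` and `test_ds` are placeholders):
```python
import numpy as np

output = trainer.predict(test_ds)
preds = np.argmax(output.predictions, axis=-1)
if output.metrics is not None:
    print(output.metrics)
```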
"""
# memory metrics - must set up as early as possible
self._memory_tracker.start()
test_dataloader = self.get_test_dataloader(test_dataset)
start_time = time.time()
output = self.evaluation_loop(
test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
)
# If we are using the padded data collator, drop the padded part of the output.
if self.args.pad_on_batch_axis:
dataset_len = len(test_dataset)
output = output._replace(predictions=tuple(pred[:dataset_len] for pred in output.predictions))
output = output._replace(num_samples=dataset_len)
total_batch_size = self.args.eval_batch_size * self.ipu_config.batch_size_factor(for_inference=True)
output.metrics.update(
speed_metrics(
metric_key_prefix,
start_time,
num_samples=output.num_samples,
num_steps=math.ceil(output.num_samples / total_batch_size),
)
)
self._memory_tracker.stop_and_update_metrics(output.metrics)
return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
def _wrap_and_compile_model_for_evaluation(self, dataloader, prediction_loss_only):
model = self.wrap_model(self.model, training=False)
try:
model_inputs = next(iter(dataloader))
except StopIteration:
raise ValueError(
"Couldn't get first sample from dataloader, please check for warnings "
"during dataloader construction."
)
self.compile_model(model, model_inputs, log=True)
return model
def evaluation_loop(
self,
dataloader: poptorch.DataLoader,
description: str,
prediction_loss_only: Optional[bool] = None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
) -> EvalLoopOutput:
"""
Prediction/evaluation loop, shared by [`IPUTrainer.evaluate`] and [`IPUTrainer.predict`].
Works both with or without labels.
Args:
dataloader (`poptorch.DataLoader`):
The dataset to be used.
description (`str`):
The description of what is being run.
prediction_loss_only (`bool`):
If `True`, only returns the loss. If `False`, returns loss,
logits and labels (if present).
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a
dictionary) that should be ignored when gathering predictions.
metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
An optional prefix to be used as the metrics key prefix. For
example the metric "bleu" will be named "eval_bleu" if the
prefix is "eval" (default).
"""
prediction_loss_only = (
prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
)
self.inference_model = self._wrap_and_compile_model_for_evaluation(dataloader, prediction_loss_only)
batch_size = dataloader.batch_size
logger.info(f"***** Running {description} *****")
if has_length(dataloader):
logger.info(f" Num examples = {self.num_examples(dataloader)}")
else:
logger.info(" Num examples: Unknown")
logger.info(f" Batch size = {batch_size}")
self.callback_handler.eval_dataloader = dataloader
# eval_dataset = getattr(dataloader, "dataset", None)
if self.args.past_index >= 0:
self._past = None
# Initialize containers
# losses/preds/labels on IPU (accumulated for eval_accumulation_steps, legacy code for IPUs)
losses_host = None
preds_host = None
labels_host = None
# losses/preds/labels on CPU (final containers)
all_losses = None
all_preds = None
all_labels = None
# Will be useful when we have an iterable dataset, whose length we don't know in advance.
observed_num_examples = 0
# Main evaluation loop
for step, inputs in enumerate(dataloader):
# Update the observed num examples
observed_batch_size = find_batch_size(inputs)
if observed_batch_size is not None:
observed_num_examples += observed_batch_size
# For batch samplers, batch_size is not known by the dataloader in advance.
if batch_size is None:
batch_size = observed_batch_size
# Prediction step
# If dataset is not sized, is_last_batch is False because we cannot know.
is_last_batch = (
step == len(dataloader) - 1 if isinstance(dataloader.dataset, collections.abc.Sized) else False
)
loss, logits, labels = self.prediction_step(
self.inference_model,
inputs,
prediction_loss_only,
ignore_keys=ignore_keys,
is_last_batch=is_last_batch,
)
# Update containers on host
if loss is not None:
loss = loss.mean(dim=0, keepdim=True)
# If only one IPU is used, loss is a zero dimensional tensor, we unsqueeze to be able to concatenate.
if loss.dim() == 0:
loss = loss.unsqueeze(0)
losses_host = loss if losses_host is None else torch.cat((losses_host, loss), dim=0)
if logits is not None:
if self.preprocess_logits_for_metrics is not None:
logits = self.preprocess_logits_for_metrics(logits, labels)
preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
if labels is not None:
labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)
if self.args.past_index and hasattr(self, "_past"):
# Clean the state at the end of the evaluation loop
delattr(self, "_past")
if losses_host is not None:
losses = nested_numpify(losses_host)
all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
if preds_host is not None:
logits = nested_numpify(preds_host)
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
if labels_host is not None:
labels = nested_numpify(labels_host)
all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100)
# Number of samples
# In the original Trainer, TODO: should we use this instead?
# if has_length(eval_dataset):
# num_samples = len(eval_dataset)
# # The instance check is weird and does not actually check for the type, but whether the dataset has the right
# # methods. Therefore we need to make sure it also has the attribute.
# elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0:
# num_samples = eval_dataset.num_examples
# else:
# if has_length(dataloader):
# num_samples = self.num_examples(dataloader)
# else: # both len(dataloader.dataset) and len(dataloader) fail
# num_samples = observed_num_examples
# if num_samples == 0 and observed_num_examples > 0:
# num_samples = observed_num_examples
num_samples = observed_num_examples
# The number of losses has been rounded to a multiple of batch_size, and in distributed training the number
# of samples has been rounded to a multiple of batch_size as well, so we truncate.
if all_losses is not None:
all_losses = all_losses[:num_samples]
if all_preds is not None:
all_preds = nested_truncate(all_preds, num_samples)
if all_labels is not None:
all_labels = nested_truncate(all_labels, num_samples)
# Metrics!
if self.compute_metrics is not None and all_preds is not None and all_labels is not None:
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
else:
metrics = {}
# To be JSON-serializable, we need to remove numpy types or zero-d tensors
metrics = denumpify_detensorize(metrics)
if all_losses is not None:
metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()
# Prefix all keys with metric_key_prefix + '_'
for key in list(metrics.keys()):
if not key.startswith(f"{metric_key_prefix}_"):
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
# Detaching model from device to let the training model attach itself
self._detach_inference_model()
return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples)
def prediction_step(
self,
model: poptorch.PoplarExecutor,
inputs: Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only: bool,
ignore_keys: Optional[List[str]] = None,
is_last_batch: bool = False,
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Performs an evaluation step.
Subclass and override to inject custom behavior.
Args:
model (`poptorch.PoplarExecutor`):
The model to evaluate.
inputs (`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model.
Most models expect the targets under the argument `labels`.
Check your model's documentation for all accepted arguments.
prediction_loss_only (`bool`):
If `True`, only returns the loss. If `False`, returns loss,
logits and labels (if present).
ignore_keys (`List[str]`, *optional*):
A list of keys in the output of your model (if it is a
dictionary) that should be ignored when gathering predictions.
Return:
Tuple[Optional[torch.Tensor], Optional[torch.Tensor],
Optional[torch.Tensor]]:
A tuple with the loss, logits and labels (each being optional).
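Example (a minimal sketch of an override that discards logits and labels, keeping only the loss):
```python
from optimum.graphcore import IPUTrainer

class LossOnlyTrainer(IPUTrainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None, is_last_batch=False):
        loss, _, _ = super().prediction_step(
            model, inputs, True, ignore_keys=ignore_keys, is_last_batch=is_last_batch
        )
        return (loss, None, None)
```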
"""
has_labels = all(inputs.get(k) is not None for k in self.label_names)
inputs = self._prepare_inputs(inputs)
if ignore_keys is None:
if hasattr(self.model, "config"):
ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
else:
ignore_keys = []
# labels may be popped when computing the loss (label smoothing for instance) so we grab them first.
if has_labels:
labels = nested_detach(tuple(inputs.get(name) for name in self.label_names))
if len(labels) == 1:
labels = labels[0]
else:
labels = None
with torch.no_grad():
if has_labels:
loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
# If the last batch is incomplete, some losses might be NaN because nothing was computed on the
# corresponding Pod; they must be ignored so they do not skew the evaluation loss.
if is_last_batch:
loss = loss[~loss.isnan()]
loss = loss.detach()
if isinstance(outputs, dict):
logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])
else:
logits = outputs[1:]
else:
loss = None
outputs = model(**inputs)
if isinstance(outputs, dict):
logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
else:
logits = outputs
# TODO: this needs to be fixed and made cleaner later.
if self.args.past_index >= 0:
self._past = outputs[self.args.past_index - 1]
if prediction_loss_only:
return (loss, None, None)
logits = nested_detach(logits)
if len(logits) == 1:
logits = logits[0]
return (loss, logits, labels)
def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
"""
For models that inherit from [`transformers.PreTrainedModel`], uses that class's `floating_point_ops` method to compute the number of
floating point operations for every backward and every forward pass.
If using another model, either implement a `floating_point_ops`
method in the model or subclass and override this method.
Args:
inputs (`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
Returns:
`int`: The number of floating-point operations.
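Example (a minimal sketch of a custom model exposing `floating_point_ops`; the 6 * tokens * parameters figure is a common rough heuristic for a forward + backward pass, not an exact count):
```python
import torch

class MyModel(torch.nn.Module):
    def floating_point_ops(self, inputs):
        num_tokens = inputs["input_ids"].numel()
        num_params = sum(p.numel() for p in self.parameters())
        return 6 * num_tokens * num_params
```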
"""
# Using self.original_model because self.model is the underlying model used by the IPUs
# and calling floating_point_ops on it slows things down a lot.
if hasattr(self.original_model, "floating_point_ops"):
return self.original_model.floating_point_ops(inputs)
else:
return 0
def init_git_repo(self, at_init: bool = False):
"""
Initializes a Git repo in `self.args.hub_model_id`.
Args:
at_init (`bool`, *optional*, defaults to `False`):
If `True`, this function is called before any training. If
`self.args.overwrite_output_dir` is `True` and `at_init` is
`True`, the path to the repo (which is `self.args.output_dir`)
might be wiped out.
"""
if not self.is_world_process_zero():
return
use_auth_token = True if self.args.hub_token is None else self.args.hub_token
if self.args.hub_model_id is None:
repo_name = Path(self.args.output_dir).absolute().name
else:
repo_name = self.args.hub_model_id
if "/" not in repo_name:
repo_name = get_full_repo_name(repo_name, token=self.args.hub_token)
try:
self.repo = Repository(
self.args.output_dir,
clone_from=repo_name,
use_auth_token=use_auth_token,
private=self.args.hub_private_repo,
)
except EnvironmentError:
if self.args.overwrite_output_dir and at_init:
# Try again after wiping output_dir
shutil.rmtree(self.args.output_dir)
self.repo = Repository(
self.args.output_dir,
clone_from=repo_name,
use_auth_token=use_auth_token,
)
else:
raise
self.repo.git_pull()
# By default, ignore the checkpoint folders
if (
not os.path.exists(os.path.join(self.args.output_dir, ".gitignore"))
and self.args.hub_strategy != HubStrategy.ALL_CHECKPOINTS
):
with open(os.path.join(self.args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
writer.writelines(["checkpoint-*/"])
self.push_in_progress = None
def create_model_card(
self,
language: Optional[str] = None,
license: Optional[str] = None,
tags: Union[str, List[str], None] = None,
model_name: Optional[str] = None,
finetuned_from: Optional[str] = None,
tasks: Union[str, List[str], None] = None,
dataset_tags: Union[str, List[str], None] = None,
dataset: Union[str, List[str], None] = None,
dataset_args: Union[str, List[str], None] = None,
):
"""
Creates a draft of a model card using the information available to
[`IPUTrainer`].
Args:
language (`str`, *optional*):
The language of the model (if applicable)
license (`str`, *optional*):
The license of the model. Will default to the license of the pretrained model used, if the original
model given to [`IPUTrainer`] comes from a repo on the Hub.
tags (`str` or `List[str]`, *optional*):
Some tags to be included in the metadata of the model card.
model_name (`str`, *optional*):
The name of the model.
finetuned_from (`str`, *optional*):
The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
of the original model given to [`IPUTrainer`] (if it comes from the Hub).
tasks (`str` or `List[str]`, *optional*):
One or several task identifiers, to be included in the metadata of the model card.
dataset_tags (`str` or `List[str]`, *optional*):
One or several dataset tags, to be included in the metadata of the model card.
dataset (`str` or `List[str]`, *optional*):
One or several dataset identifiers, to be included in the metadata of the model card.
dataset_args (`str` or `List[str]`, *optional*):
One or several dataset arguments, to be included in the metadata of the model card.
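Example (all values are illustrative):
```python
trainer.create_model_card(
    model_name="bert-base-ipu-sst2",
    language="en",
    finetuned_from="bert-base-uncased",
    tasks="text-classification",
    dataset="sst2",
)
```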
"""
if not self.is_world_process_zero():
return
training_summary = IPUTrainingSummary.from_trainer(
self,
language=language,
license=license,
tags=tags,
model_name=model_name,
finetuned_from=finetuned_from,
tasks=tasks,
dataset_tags=dataset_tags,
dataset=dataset,
dataset_args=dataset_args,
)
model_card = training_summary.to_model_card()
with open(os.path.join(self.args.output_dir, "README.md"), "w") as f:
f.write(model_card)
def _push_from_checkpoint(self, checkpoint_folder):
# Only push from one node.
if self.args.hub_strategy == HubStrategy.END:
return
# If we haven't finished the last push, we don't do this one.
if self.push_in_progress is not None and not self.push_in_progress.is_done:
return
output_dir = self.args.output_dir
# To avoid a new synchronization of all model weights, we just copy the modeling files from the checkpoint folder.
modeling_files = [CONFIG_NAME, WEIGHTS_NAME, IPU_CONFIG_NAME]
for modeling_file in modeling_files:
if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))
# Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure.
if self.tokenizer is not None:
self.tokenizer.save_pretrained(output_dir)
# Same for the training arguments
torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
try:
if self.args.hub_strategy == HubStrategy.CHECKPOINT:
# Temporarily move the checkpoint just saved for the push
tmp_checkpoint = os.path.join(output_dir, "last-checkpoint")
# We have to remove the "last-checkpoint" dir if it exists, otherwise the checkpoint is moved as a
# subfolder.
if os.path.isdir(tmp_checkpoint):
shutil.rmtree(tmp_checkpoint)
shutil.move(checkpoint_folder, tmp_checkpoint)
if self.args.save_strategy == IntervalStrategy.STEPS:
commit_message = f"Training in progress, step {self.state.global_step}"
else:
commit_message = f"Training in progress, epoch {int(self.state.epoch)}"
_, self.push_in_progress = self.repo.push_to_hub(
commit_message=commit_message, blocking=False, auto_lfs_prune=True
)
finally:
if self.args.hub_strategy == HubStrategy.CHECKPOINT:
# Move back the checkpoint to its place
shutil.move(tmp_checkpoint, checkpoint_folder)
def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str:
"""
Uploads *self.model* and *self.tokenizer* to the 🤗 Models Hub on the repo *self.args.hub_model_id*.
Parameters:
commit_message (`str`, *optional*, defaults to `"End of training"`):
Message for the commit.
blocking (`bool`, *optional*, defaults to `True`):
If `True` (default), the function only returns when the `git push` command has completed. If `False`, returns immediately.
kwargs:
Additional keyword arguments passed along to [`~Trainer.create_model_card`].
Returns:
If `blocking=True` (default), returns the URL of the commit of your model in the given repository. If `blocking=False`, returns a tuple with that URL and an object tracking the progress of the push.
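Example (assumes `args.hub_model_id` and Hub credentials are configured; `trainer` is a placeholder):
```python
url = trainer.push_to_hub(commit_message="End of training")
print(url)
```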
"""
# If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, we try to create the repo but
# it might fail.
if not hasattr(self, "repo"):
self.init_git_repo()
if self.args.should_save:
if self.args.hub_model_id is None:
model_name = Path(self.args.output_dir).name
else:
model_name = self.args.hub_model_id.split("/")[-1]
# Needs to be executed on all processes, but only the process determined by self.args.should_save will
# actually save.
self.save_model(_internal_call=True)
# Cancel any async push in progress if blocking=True. The commits will all be pushed together.
if blocking and self.push_in_progress is not None and not self.push_in_progress.is_done:
self.push_in_progress._process.kill()
self.push_in_progress = None
git_head_commit_url = self.repo.push_to_hub(
commit_message=commit_message, blocking=blocking, auto_lfs_prune=True
)
# Push the model card separately so it is independent from the rest of the model.
if self.args.should_save:
self.create_model_card(model_name=model_name, **kwargs)
try:
self.repo.push_to_hub(
commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True
)
except EnvironmentError as exc:
logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")
return git_head_commit_url