in optimum/neuron/training_args.py [0:0]
def __post_init__(self):
    if self.neuron_cc_flags_model_type is not None:
        os.environ["OPTIMUM_NEURON_COMMON_FLAGS_MODEL_TYPE"] = self.neuron_cc_flags_model_type

    # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available`.
    patch_accelerate_is_torch_xla_available()

    if self.fsdp not in ["", []]:
        raise RuntimeError("FSDP is not supported.")

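    # Half precision on Neuron must use bf16; fp16 is rejected with an explicit error.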
    if self.fp16:
        raise ValueError("The fp16 data type is not supported in Neuron, please use bf16 instead.")

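    # Resolve the checkpoint to resume from; it is passed to the TrainingNeuronConfig below as `checkpoint_dir`.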
    resume_from_checkpoint = self.resume_from_checkpoint
    if resume_from_checkpoint is None and self.output_dir is not None and os.path.isdir(self.output_dir):
        # If `checkpoint` is None there was no checkpoint in the output dir, otherwise we resume from it.
        checkpoint = get_last_checkpoint(self.output_dir)
        resume_from_checkpoint = checkpoint

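    # When pipeline parallelism is enabled, gradient accumulation is handled through pipeline microbatches:
    # the accumulation steps are folded into the per-device batch sizes and gradient_accumulation_steps is
    # reset to 1.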
    if self.pipeline_parallel_size > 1:
        if self.gradient_accumulation_steps > 1:
            if is_main_worker():
                logger.info(
                    "Pipeline parallel used, setting gradient_accumulation_steps to 1 and scaling the pipeline batch size."
                )
            self.per_device_train_batch_size *= self.gradient_accumulation_steps
            self.per_device_eval_batch_size *= self.gradient_accumulation_steps
            self.gradient_accumulation_steps = 1
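        # Default to one microbatch per sample: the per-device train batch is split into batch-size-1 microbatches.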
        if self.pipeline_parallel_num_microbatches == -1:
            self.pipeline_parallel_num_microbatches = self.per_device_train_batch_size
        if self.per_device_train_batch_size % self.pipeline_parallel_num_microbatches != 0:
            raise ValueError(
                f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) must divide the "
                f"total per-device train batch size ({self.per_device_train_batch_size})."
            )
        if self.per_device_eval_batch_size % self.pipeline_parallel_num_microbatches != 0:
            raise ValueError(
                f"The number of pipeline microbatches ({self.pipeline_parallel_num_microbatches}) must divide the "
                f"total per-device eval batch size ({self.per_device_eval_batch_size})."
            )

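    # Bundle the distributed-training settings (tensor/pipeline parallelism, serialization, checkpointing)
    # into a single TrainingNeuronConfig.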
    self.trn_config = TrainingNeuronConfig(
        self.tensor_parallel_size,
        parallelize_embeddings=not self.disable_embedding_parallelization,
        sequence_parallel_enabled=not self.disable_sequence_parallel,
        kv_size_multiplier=self.kv_size_multiplier,
        pipeline_parallel_size=self.pipeline_parallel_size,
        pipeline_parallel_num_microbatches=self.pipeline_parallel_num_microbatches,
        pipeline_parallel_use_zero1_optimizer=self.zero_1,
        checkpoint_dir=resume_from_checkpoint,
        num_local_ranks_per_step=self.num_local_ranks_per_step,
        use_xser=self.use_xser,
        async_save=self.async_save,
        fuse_qkv=self.fuse_qkv,
        recompute_causal_mask=self.recompute_causal_mask,
        gradient_checkpointing=self.gradient_checkpointing,
    )

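    # Let `accelerate` know whether mixed-precision autocast (AMP) should be used for bf16 training.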
    if self.bf16 and self.half_precision_backend == "amp":
        os.environ["ACCELERATE_USE_AMP"] = "true"
    else:
        os.environ["ACCELERATE_USE_AMP"] = "false"

    if self.neuron_cc_optlevel is not None:
        set_neuron_cc_optlevel(self.neuron_cc_optlevel)

    self._world_size_should_behave_as_dp_size = False

    # This is required to be able to use bf16, otherwise a check in super().__post_init__() fails.
    with Patcher([("transformers.training_args.get_xla_device_type", lambda _: "GPU")]):
        super().__post_init__()
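
Below is a minimal, hypothetical usage sketch of the pipeline-parallel bookkeeping performed above. It assumes this `__post_init__` belongs to the `NeuronTrainingArguments` class exposed by `optimum.neuron` (class name, import path, and constructor arguments are assumptions not shown in this excerpt), and it can only run in a Trainium/Neuron environment where the parent `TrainingArguments.__post_init__` completes.

# Hypothetical sketch, not part of training_args.py: illustrates how __post_init__
# rewrites the batch-size / microbatch settings when pipeline parallelism is enabled.
from optimum.neuron import NeuronTrainingArguments  # assumed import path

args = NeuronTrainingArguments(
    output_dir="out",
    bf16=True,                       # fp16=True would raise ValueError above
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    tensor_parallel_size=2,
    pipeline_parallel_size=4,        # > 1 triggers the microbatch bookkeeping
)

# Gradient accumulation is folded into the pipeline batch:
assert args.gradient_accumulation_steps == 1
assert args.per_device_train_batch_size == 4 * 2
# Assuming pipeline_parallel_num_microbatches was left at its -1 default, it now
# equals the scaled per-device train batch size (one sample per microbatch):
assert args.pipeline_parallel_num_microbatches == 8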