# vision/m4/training/config.py
def __post_init__(self, should_verify: bool = True):
"""Post-initialization code"""
self.verify(should_verify=should_verify)
# copy select_n_examples to the more specific ones if the latter haven't been preset
if self.data_param.select_n_examples is not None:
if self.data_param.select_n_examples_train is None:
self.data_param.select_n_examples_train = self.data_param.select_n_examples
if self.data_param.select_n_examples_validation is None:
self.data_param.select_n_examples_validation = self.data_param.select_n_examples
# Get commit id
if self.hparams.repo_commit_id is None:
self.hparams.repo_commit_id = git.Repo(search_parent_directories=True).head.object.hexsha
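# A named LoRA adapter only makes sense when LoRA training is enabled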
if self.hparams.lora_name is not None and not self.hparams.use_lora:
raise ValueError("Can't have a lora_name if use_lora is False")
# If processing on the fly, with the current implementation, we can't have `num_workers=0`
if self.data_param.realtime_processing and self.data_param.num_workers == 0:
raise ValueError(
"If doing processing on the fly (and thus using the `IterableDataset`), you can't have `num_workers`"
" equal to 0."
)
# batch_size deprecation
if self.hparams.batch_size is not None:
if self.hparams.batch_size_per_gpu > 1:
raise ValueError(
"as hparams.batch_size is deprecated - don't know how to proceed with both hparams.batch_size>1"
" and hparams.batch_size_per_gpu > 1"
)
else:
logger.warning(
"will use the deprecated hparams.batch_size, but transition to hparams.batch_size_per_gpu instead"
)
self.hparams.batch_size_per_gpu = self.hparams.batch_size
self.hparams.batch_size = None
# Assign batch size to data_param as well for dataloaders
self.data_param.batch_size = self.hparams.batch_size_per_gpu
# note: all global batch_size-related configs including hparams.grad_acc_size will be
# checked/set in trainer's setup_batch_size_related_configs since we need to know the value
# of num_processes
# Convert the activation logging flags to LoggingTypes enums
self.hparams.train_logging_activations = [LoggingTypes(val) for val in self.hparams.train_logging_activations]
# Check that proba_interleaving_dataset is mutually exclusive with loss_weights_per_dataset
if self.data_param.proba_interleaving_dataset and self.hparams.loss_weights_per_dataset:
raise ValueError(
"Can't have hparams.loss_weights_per_dataset and proba_interleaving_dataset. If we have"
" loss_weights_per_dataset, it means the gradients are accumulated over datasets. Therefore a batch of"
" each given at each update and there is no use of proba_interleaving_dataset"
)
if (
    self.data_param.proba_interleaving_dataset is not None
    # Allow a tolerance for floating point rounding errors.
    and abs(sum(self.data_param.proba_interleaving_dataset) - 1) > 0.001
):
    raise ValueError("proba_interleaving_dataset must sum to 1")
if self.hparams.use_lora:
has_vision_lora = any(["vision" in pattern for pattern in self.hparams.patterns_to_loraify])
has_text_lora = any(["model.layers" in pattern for pattern in self.hparams.patterns_to_loraify])
if has_vision_lora and not self.hparams.model_config["freeze_vision_layers"]:
raise ValueError(
"hparams.patterns_to_loraify suggests Lora is applied on the vision backbone, so"
" model_config.freeze_vision_layers should be True, but it is set to False"
)
if has_text_lora and not self.hparams.model_config["freeze_text_layers"]:
raise ValueError(
"hparams.patterns_to_loraify,suggests Lora is applied on the text backbone, so"
" model_config.freeze_text_layers should be True, but it is set to False"
)
self.hparams.train_logging_grad_param_deepspeed = [
LoggingTypes(val) for val in self.hparams.train_logging_grad_param_deepspeed
]
# Resume run if there is already an existing folder for this run
if self.hparams.save_dir is not None and self.hparams.save_dir.exists():
save_dir_has_checkpoints = any(
    d.is_dir() and "opt_step" in str(d) for d in self.hparams.save_dir.iterdir()
)
if self.hparams.resume_run is not None and not self.hparams.resume_run and save_dir_has_checkpoints:
logger.warning(
"`resume_run` was explicitely set to False (i.e. starting from scratch), but the experiment"
" folder already has been populated with previous runs.\nAlready saved checkpoints will be"
" overwritten (at best, when `train_saving_opt_steps` is the same) or will be mixed with the new"
" checkpoints of a potentially brand new experiment. Would it make sense to create a new"
" `save_dir`?"
)
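# Auto-resume whenever save_dir already contains checkpoints (this can override an explicit resume_run=False)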
self.hparams.resume_run = save_dir_has_checkpoints
# Set up all args needed to resume a run
if self.hparams.resume_run:
# Get last step directory
if self.resume_param.opt_step_dir is None and not self.resume_param.resume_last:
raise ValueError(
"`opt_step_dir` cannot be None while `resume_last` is False. Choose which dir you want to resume"
" from..."
)
if self.resume_param.resume_last:
if self.resume_param.opt_step_dir is not None:
raise ValueError(
"`resume_last` cannot be True while `opt_step_dir` is not None. Choose which dir you want to"
" resume from..."
)
latest_path = self.hparams.save_dir / "latest_opt_step_dir"
with open(latest_path, "r") as fd:
self.resume_param.opt_step_dir = Path(fd.read().strip())
if not (self.resume_param.opt_step_dir.exists() and self.resume_param.opt_step_dir.is_dir()):
raise ValueError(
f"It appears that the path in the `latest_opt_step_dir` file {latest_path} is invalid. It's"
" either does not exist or is not a directory. Please fix that."
)
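# Sketch of the expected resume_run_infos.json layout, inferred from the keys read below
# (presumably written by the trainer when the checkpoint is saved):
# {
#   "train_logs": {...},
#   "resume_opt_step": ...,
#   "resume_epoch": ...,
#   "resume_dataset_state": [...],   # optional, defaults to []
#   "gbs_running": {
#     "global_batch_size_current": ...,
#     "global_seen_samples": ...,
#     "next_goal_samples": ...,
#     "grad_acc_size_current": ...
#   },
#   "wandb_run_id": ...,
#   "seed": ...
# }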
with open(self.resume_param.opt_step_dir / "resume_run_infos.json", "r") as f:
resume_infos = json.load(f)
logger.info(f"Resuming from {self.resume_param.opt_step_dir}")
self.resume_param.accelerator_state_dir = self.resume_param.opt_step_dir / "accelerator_state"
self.resume_param.model_file = self.resume_param.opt_step_dir / "unwrapped_model"
self.resume_param.lora_file = self.resume_param.opt_step_dir / "unwrapped_adapter"
self.resume_param.model_config_file = self.resume_param.opt_step_dir / "unwrapped_model/config.json"
self.resume_param.tokenizer = self.resume_param.opt_step_dir / "tokenizer"
self.resume_param.train_logs = resume_infos["train_logs"]
self.resume_param.resume_opt_step = resume_infos["resume_opt_step"]
self.resume_param.resume_epoch = resume_infos["resume_epoch"]
self.resume_param.resume_dataset_state = resume_infos.get("resume_dataset_state", list())
gbs_running = resume_infos["gbs_running"]
self.resume_param.gbs_running.global_batch_size_current = gbs_running["global_batch_size_current"]
self.resume_param.gbs_running.global_seen_samples = gbs_running["global_seen_samples"]
self.resume_param.gbs_running.next_goal_samples = gbs_running["next_goal_samples"]
self.resume_param.gbs_running.grad_acc_size_current = gbs_running["grad_acc_size_current"]
self.hparams.wandb_run_id = resume_infos["wandb_run_id"]
self.hparams.seed = resume_infos["seed"]
if not self.hparams.wandb_enable:
self.hparams.wandb_run_id = ""