in vision/m4/models/custom_modules.py
def from_pretrained(cls, *model_args, is_resume=False, new_model=False, **kwargs):
"""
Use this method when loading an already pretrained vloom model - either from a checkpoint or from hub.
For creating an untrained model use `pretrained_models` instead.
"""
    # config is:
    # 1. either not passed, in which case we use the model's default config (used by tests)
    # 2. passed, in which case it's one of:
    #    2a. a `PretrainedConfig` object (a new m4 model)
    #    2b. a path to a json config (an already pretrained m4 model, usually resumed training)
config = kwargs.get("config", None)
if config is None:
config = cls.config_class.from_pretrained(*model_args, **kwargs, return_unused_kwargs=False)
elif not isinstance(config, PretrainedConfig):
# adapted from https://github.com/huggingface/transformers/blob/d0acc9537829e7d067edbb791473bbceb2ecf056/src/transformers/modeling_utils.py#L1920
assert isinstance(config, os.PathLike)
config_path = str(config)
config = cls.config_class.from_pretrained(
config_path,
return_unused_kwargs=False,
**kwargs,
)
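    # Illustrative note (an assumption about typical usage, not from the original source):
    # in case 2b `config` is typically a `pathlib.Path` to a saved `config.json`; the
    # assert above requires an `os.PathLike`, so a plain `str` path would not pass it.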
    is_untrained_vloom_model = False
    is_pretrained_vloom_model_resumed = False
    is_pretrained_vloom_model_from_hub_or_path = False
    # Three cases:
    # 1. The model has never been trained. We need a `vision_model_name` to start training from,
    #    since we never create the vision model from scratch.
    # 2. The model has been trained and is resuming. We load a randomly-initialized empty model
    #    and let deepspeed fill in the weights from the checkpoint.
    # 3. The model has been trained and saved to a path or to the hub, and carries a
    #    `vision_model_name`. We initialize the vision model from the `vision_model_name` class.
if new_model:
is_untrained_vloom_model = True
elif is_resume:
is_pretrained_vloom_model_resumed = True
else:
is_pretrained_vloom_model_from_hub_or_path = True
    # torch_dtype is crucial for keeping memory use to a minimum at load time (e.g. torch.bfloat16)
torch_dtype = kwargs.get("torch_dtype", None)
vision_model_name = config.vision_config.vision_model_name
# Create an uninitialized vision_model to insert into the main model.
vision_model_config = AutoConfig.from_pretrained(vision_model_name, trust_remote_code=True)
# Override image_size if we want to increase it compared to pretraining
if hasattr(vision_model_config, "vision_config"):
vision_model_config.vision_config.image_size = config.vision_config.image_size
else:
vision_model_config.image_size = config.vision_config.image_size
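    # For instance (illustrative, not from the original source): CLIP-style configs nest
    # their vision settings under a `vision_config` attribute, while single-tower configs
    # such as a plain ViT expose `image_size` at the top level -- hence the two branches.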
# model_with_vision_component = AutoModel.from_config(
# vision_model_config, torch_dtype=torch_dtype, trust_remote_code=True
# )
# Extracts the desired submodule if the part we want is nested (e.g. as in clip)
# kwargs["vision_model"] = vision_model_name_to_model(vision_model_name, model_with_vision_component)
    # 1. We load a trained checkpoint but we are not resuming a training run:
    #    if the model comes from the hub or from a path, the language model is loaded as well, and
    #    the uninitialized vision_model is overridden by the checkpoint's weights (i.e. idefics' weights)
if is_pretrained_vloom_model_from_hub_or_path:
model = super().from_pretrained(*model_args, **kwargs)
    # 2. We resume under deepspeed:
    #    we create an empty model and get deepspeed to load the weights from the checkpoint.
    #    Not all models have these keys, so handle the case where they don't.
elif is_pretrained_vloom_model_resumed:
_ = kwargs.pop("config", None)
model = super().from_pretrained(None, config=config, state_dict={}, **kwargs)
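        # Note (an interpretation of the comment above, not from the original source):
        # `state_dict={}` keeps from_pretrained from materializing any pretrained weights
        # here; the parameters stay randomly initialized until deepspeed restores them
        # from the checkpoint.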
    # 3. If is_untrained_vloom_model, we load the language model first, then override
    #    the uninitialized vision_model with one carrying pretrained weights from `vision_model_name`
elif is_untrained_vloom_model:
model = super().from_pretrained(*model_args, **kwargs)
cls.override_vision_model_wrapper(
model, config, vision_model_name, vision_model_config.to_dict(), torch_dtype
)
return model
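
# --- Usage sketch (illustrative addition, not part of the original file) ---
# A minimal sketch of the three call paths above, assuming a concrete subclass
# `MyVLOOMModel` that inherits this `from_pretrained`; every checkpoint name,
# path, and config object below is a placeholder, not a real artifact.
#
#   from pathlib import Path
#   import torch
#
#   # Case 3: load a fully trained checkpoint from the hub or a local path
#   model = MyVLOOMModel.from_pretrained(
#       "org/trained-vloom-checkpoint", torch_dtype=torch.bfloat16
#   )
#
#   # Case 2: resume training under deepspeed -- pass the saved config as an
#   # os.PathLike (a plain str would fail the assert above); the weights are
#   # restored by deepspeed afterwards, not by this call
#   model = MyVLOOMModel.from_pretrained(
#       config=Path("checkpoints/step_1000/config.json"), is_resume=True
#   )
#
#   # Case 1: start a brand-new training run -- the base language model comes
#   # from `model_args`, and the vision tower is then overridden with pretrained
#   # weights from `config.vision_config.vision_model_name`
#   model = MyVLOOMModel.from_pretrained(
#       "org/base-language-model", new_model=True, config=new_m4_config
#   )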