in optimum/exporters/openvino/model_configs.py [0:0]
def get_model_for_behavior(model, behavior: Union[str, Phi4MMConfigBehavior]):
    """Return the Phi-4-multimodal submodule matching the requested export behavior.

    Each returned submodule gets a ``config`` attribute attached so the export
    pipeline can inspect it. An unrecognized behavior falls through and yields
    ``None``, matching the original dispatch chain.

    Args:
        model: the loaded Phi-4-multimodal model.
        behavior: a ``Phi4MMConfigBehavior`` member or its string value.
    """
    if isinstance(behavior, str) and not isinstance(behavior, Phi4MMConfigBehavior):
        behavior = Phi4MMConfigBehavior(behavior)

    if behavior == Phi4MMConfigBehavior.LANGUAGE:
        return model

    def _vision_tower():
        # image_embed hosts the vision transformer and its projection head
        return model.model.embed_tokens_extend.image_embed

    def _audio_tower():
        return model.model.embed_tokens_extend.audio_embed

    def _with_config(module, config):
        # every exported submodule must expose a config for the export machinery
        module.config = config
        return module

    if behavior == Phi4MMConfigBehavior.VISION_EMBEDDINGS:
        tower = _vision_tower()
        return _with_config(tower, tower.img_processor.config)

    if behavior == Phi4MMConfigBehavior.VISION_PROJECTION:
        tower = _vision_tower()
        if hasattr(tower, "img_projection"):
            projection = tower.img_projection
        else:
            # checkpoint variant stores the projection as split up/down layers;
            # rebuild it as a single module for export
            import torch

            projection = torch.nn.Sequential(
                tower.img_projection_up, torch.nn.GELU(), tower.img_projection_down
            )
        return _with_config(projection, tower.img_processor.config)

    if behavior == Phi4MMConfigBehavior.TEXT_EMBEDDINGS:
        if hasattr(model.model, "_require_grads_hook"):
            # the input-require-grads hook interferes with export tracing
            model.model.disable_input_require_grads()
        return _with_config(model.model.embed_tokens, model.config)

    if behavior == Phi4MMConfigBehavior.AUDIO_EMBEDDINGS:
        return _with_config(_audio_tower().encoder.encoder_embedding, model.config)

    # both behaviors map to the same encoder submodule
    if behavior in (Phi4MMConfigBehavior.AUDIO_ENCODER, Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS):
        return _with_config(_audio_tower().encoder, model.config)

    if behavior == Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION:
        audio = _audio_tower()
        if hasattr(audio, "audio_projection"):
            projection = audio.audio_projection["speech"]
        else:
            # checkpoint variant: assemble the speech projection from split layers
            import torch

            projection = torch.nn.Sequential(
                audio.up_proj_for_speech, torch.nn.GELU(), audio.down_proj_for_speech
            )
        return _with_config(projection, model.config)

    if behavior == Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION:
        audio = _audio_tower()
        if hasattr(audio, "audio_projection"):
            projection = audio.audio_projection["vision"]
        else:
            # checkpoint variant: assemble the vision-speech projection from split layers
            import torch

            projection = torch.nn.Sequential(
                audio.up_proj_for_vision_speech, torch.nn.GELU(), audio.down_proj_for_vision_speech
            )
        return _with_config(projection, model.config)