in ultravox/model/ultravox_model.py [0:0]
def __init__(self, config: UltravoxConfig):
super().__init__()
self.hidden_dim = config.hidden_size
self._pad_and_stack = StackAudioFrames(config.stack_factor)
dim_in = config.audio_config.hidden_size * config.stack_factor
self.ln_pre = RMSNorm(dim_in, init=config.norm_init)
self.linear_1 = nn.Linear(dim_in, self.hidden_dim, bias=False)
dim_mid = self.hidden_dim
self.act = transformers.activations.get_activation(config.projector_act)
dim_mid = dim_mid // 2 if config.projector_act == "swiglu" else dim_mid
dim_out = config.text_config.hidden_size
self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
# Ultravox v0.4.1 and below uses layer_norm after the second linear layer,
# while v0.5.0 and above uses layer_norm after the first linear layer.
if config.projector_ln_mid:
self.ln_mid: nn.Module = RMSNorm(dim_mid, init=config.norm_init)
self.ln_post: nn.Module = nn.Identity()
else:
self.ln_mid = nn.Identity()
self.ln_post = RMSNorm(dim_out, init=config.norm_init)