in models/language_model.py
import torch
import torch.nn as nn

class RotaryEmbedding(nn.Module):  # class name and imports assumed; the excerpt shows only __init__
    def __init__(self, cfg):
        super().__init__()
        assert cfg.lm_hidden_dim % cfg.lm_n_heads == 0, "Hidden dimension must be divisible by number of heads"
        self.dim = cfg.lm_hidden_dim // cfg.lm_n_heads  # per-head dimension
        self.base = cfg.lm_re_base  # RoPE frequency base (commonly 10000)
        self.max_seq_len = cfg.lm_max_position_embeddings
        # Standard RoPE: one inverse frequency per pair of head dimensions,
        # inv_freq_i = 1 / base^(2i/dim) for pair index i = 0 .. dim/2 - 1
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
        self.register_buffer("inv_freq", inv_freq)  # non-trainable, follows the module's device/dtype
        self.original_max_seq_len = cfg.lm_max_position_embeddings  # unscaled training context length
        self.attention_scaling = cfg.lm_attn_scaling  # scaling factor applied downstream (e.g. to the cos/sin tables)
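
For context, here is a minimal sketch of the forward pass that typically accompanies such an __init__, turning inv_freq into the cos/sin tables consumed by the attention layers. The forward method, its signature, and the use of attention_scaling as a cos/sin multiplier are assumptions based on common RoPE implementations, not part of the excerpt.

    def forward(self, position_ids):
        # position_ids: (batch, seq_len) integer token positions
        inv_freq = self.inv_freq[None, :, None].float()   # (1, dim/2, 1)
        pos = position_ids[:, None, :].float()            # (batch, 1, seq_len)
        angles = (inv_freq @ pos).transpose(1, 2)         # (batch, seq_len, dim/2)
        emb = torch.cat((angles, angles), dim=-1)         # duplicate angles for both rotated halves
        cos = emb.cos() * self.attention_scaling
        sin = emb.sin() * self.attention_scaling
        return cos, sin

Registering inv_freq as a buffer rather than a Parameter keeps it out of the optimizer while still letting it follow the module across .to(device) calls and state_dict serialization.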