optimum/tpu/modeling_gemma.py [223:247]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        layer_idx: Optional[int] = None,
        rank: Optional[int] = None,
        world_size: Optional[int] = None,
    ):
        super().__init__()
        if rank is None:
            self.rank = get_model_parallel_rank()
        else:
            self.rank = rank
        if world_size is None:
            self.world_size = get_model_parallel_world_size()
        else:
            self.world_size = world_size
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
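
The constructor resolves the tensor-parallel coordinates first: when `rank` or `world_size` is not passed explicitly, it falls back to the module's process-group helpers (`get_model_parallel_rank` / `get_model_parallel_world_size`). Below is a minimal sketch of that defaulting idiom, with the helper callables injected so the snippet stands alone; the function name here is hypothetical and not part of either file.

```python
from typing import Callable, Optional, Tuple


def resolve_parallel_coords(
    rank: Optional[int],
    world_size: Optional[int],
    get_rank: Callable[[], int],        # e.g. get_model_parallel_rank (assumed import)
    get_world_size: Callable[[], int],  # e.g. get_model_parallel_world_size (assumed import)
) -> Tuple[int, int]:
    # 0 is a valid rank, so compare against None rather than relying on truthiness.
    rank = rank if rank is not None else get_rank()
    world_size = world_size if world_size is not None else get_world_size()
    return rank, world_size
```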



optimum/tpu/modeling_llama.py [267:291]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        layer_idx: Optional[int] = None,
        rank: Optional[int] = None,
        world_size: Optional[int] = None,
    ):
        super().__init__()
        if rank is None:
            self.rank = get_model_parallel_rank()
        else:
            self.rank = rank
        if world_size is None:
            self.world_size = get_model_parallel_world_size()
        else:
            self.world_size = world_size
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
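
The same constructor preamble appears verbatim in `modeling_gemma.py` and `modeling_llama.py`. If the duplication is worth removing, one option is a small shared mixin. The sketch below is only illustrative: the class name, method name, and injected callables are assumptions, and the real modules use `transformers`' logging utility with `warning_once` rather than plain `logging`.

```python
from typing import Callable, Optional
import logging

logger = logging.getLogger(__name__)  # stand-in for transformers' logger with warning_once


class ParallelAttentionInitMixin:
    """Hypothetical mixin factoring out the shared attention-constructor preamble."""

    def _init_parallel_attention(
        self,
        config,
        layer_idx: Optional[int],
        rank: Optional[int],
        world_size: Optional[int],
        get_rank: Callable[[], int],        # e.g. get_model_parallel_rank (assumed import)
        get_world_size: Callable[[], int],  # e.g. get_model_parallel_world_size (assumed import)
    ) -> None:
        # Tensor-parallel coordinates, defaulting to the process-group helpers.
        self.rank = rank if rank is not None else get_rank()
        self.world_size = world_size if world_size is not None else get_world_size()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning(
                "Instantiating %s without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used.",
                self.__class__.__name__,
            )
        # Per-layer attention hyper-parameters read from the model config.
        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
```

Both attention classes could then call `self._init_parallel_attention(...)` at the top of their `__init__`, keeping only the model-specific projection setup in each file.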



