optimum/tpu/modeling_llama.py [40:61]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
)
from transformers.modeling_utils import PreTrainedModel
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)

from optimum.tpu.xla_model_parallel import (
    ColumnParallelLinear,
    RowParallelLinear,
    get_model_parallel_rank,
    get_model_parallel_world_size,
)


if is_flash_attn_2_available():
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
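
The excerpt stops at the is_flash_attn_2_available() guard; the guarded body lies outside the quoted range. In transformers-derived modeling files this guard usually wraps the optional flash-attn imports so the package stays an optional dependency. A minimal sketch of that common pattern (assumed here, not taken verbatim from the excerpted file):

# Assumed continuation of the guard: a common pattern in transformers modeling
# files, not verbatim from the excerpted file. flash-attn is imported lazily so
# that the package remains optional.
from transformers.utils import is_flash_attn_2_available

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input

In the same pattern, is_flash_attn_greater_or_equal_2_10() is commonly checked afterwards to select the causal-mask alignment that matches the installed flash-attn version.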



optimum/tpu/modeling_mistral.py [40:60]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
)
from transformers.modeling_utils import PreTrainedModel
from transformers.models.mistral.configuration_mistral import MistralConfig
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)

from optimum.tpu.xla_model_parallel import (
    ColumnParallelLinear,
    RowParallelLinear,
    get_model_parallel_rank,
    get_model_parallel_world_size,
)


if is_flash_attn_2_available():
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
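
Both excerpts pull the same tensor-parallel building blocks from optimum.tpu.xla_model_parallel, presumably to replace plain linear projections with sharded ones in these TPU ports. A minimal sketch of how such helpers are commonly wired up; the constructor arguments follow the fairscale-style ColumnParallelLinear/RowParallelLinear signature and are an assumption here, not the verified optimum-tpu API:

# Minimal sketch of Megatron-style tensor parallelism with these helpers.
# NOTE: the ColumnParallelLinear/RowParallelLinear constructor arguments below
# follow the fairscale-style signature and are an assumption, not the verified
# optimum-tpu API. The model-parallel process group is assumed to be initialized.
from optimum.tpu.xla_model_parallel import (
    ColumnParallelLinear,
    RowParallelLinear,
    get_model_parallel_rank,
    get_model_parallel_world_size,
)

hidden_size = 4096
world_size = get_model_parallel_world_size()  # number of model-parallel shards
rank = get_model_parallel_rank()              # this device's shard index

# Column-parallel: output features are split across shards, so each device holds
# hidden_size // world_size output columns (typical for q/k/v projections).
q_proj = ColumnParallelLinear(hidden_size, hidden_size, bias=False, gather_output=False)

# Row-parallel: input features are split and the partial results are all-reduced,
# so every shard ends up with the full output (typical for the output projection).
o_proj = RowParallelLinear(hidden_size, hidden_size, bias=False, input_is_parallel=True)

Pairing column-parallel q/k/v projections with a row-parallel output projection keeps the intermediate activations sharded and needs only one all-reduce per attention block, which is the standard Megatron-style layout.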



