backends/gaudi/server/text_generation_server/utils/adapter.py [132:279]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )

        adapters_to_merge.append((module_map, adapter_config))
        merged_weight_names = merged_weight_names.union(adapter_weight_names)
        if tokenizer is None:
            tokenizer = adapter_tokenizer

    if len(adapters_to_merge) == 0:
        raise ValueError("No adapters to merge.")

    module_map, adapter_config = merge_adapters(adapters_to_merge, params)
    return module_map, adapter_config, merged_weight_names, tokenizer


def check_architectures(
    model_id: str,
    adapter_id: str,
    adapter_config: "AdapterConfig",
    trust_remote_code: bool = False,
):
    try:
        if not adapter_config.base_model_name_or_path:
            # Skip the check when no base model is recorded; calling AutoConfig.from_pretrained(None) would only add network retry latency
            return

        expected_config = AutoConfig.from_pretrained(
            model_id, trust_remote_code=trust_remote_code
        )
        model_config = AutoConfig.from_pretrained(
            adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code
        )
    except Exception as e:
        warnings.warn(
            f"Unable to check architecture compatibility for adapter '{adapter_id}' "
            f"against model '{model_id}'. Assuming they are compatible. Error: {e}"
        )
        return

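    # Note: load_module_map only calls this check when the adapter's
    # base_model_name_or_path differs from model_id, so matching architectures
    # still warrant a warning that the adapter targets a different base model.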
    if model_config.architectures == expected_config.architectures:
        warnings.warn(
            f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
            f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )
    else:
        # TODO(travis): revisit this when we support classification heads, which will not use CausalLM
        raise ValueError(
            f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
            f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )


@lru_cache(maxsize=128)
def load_module_map(
    model_id: str,
    revision: str,
    adapter_id: str,
    adapter_path: Optional[str],
    weight_names: Tuple[str, ...],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    adapter_config = LoraConfig.load(adapter_path or adapter_id, None)

    if not adapter_path and adapter_config.base_model_name_or_path != model_id:
        check_architectures(model_id, adapter_id, adapter_config, trust_remote_code)

    adapter_filenames = (
        hub._weight_files_from_dir(adapter_path, extension=".safetensors")
        if adapter_path
        else hub._cached_weight_files(
            adapter_id, revision=revision, extension=".safetensors"
        )
    )

    # raise an error if no adapter weights are found
    if not adapter_filenames:
        raise FileNotFoundError(
            f"No adapter weights found for adapter '{adapter_id}' and revision '{revision}'."
        )

    try:
        adapter_tokenizer = AutoTokenizer.from_pretrained(
            adapter_config.config_path,
            trust_remote_code=trust_remote_code,
        )
    except Exception:
        # The adapter does not ship its own tokenizer, so fall back to the base model tokenizer
        adapter_tokenizer = None

    # load adapter weights from all shards (should have relatively small memory footprint)
    adapter_weights = {}
    for filename in adapter_filenames:
        adapter_weights.update(load_file(filename))

    # map the model weights to the relevant adapter weights (LoRA A and B matrices)
    module_map, adapter_weight_names = adapter_config.map_weights_for_model(
        adapter_weights, weight_names
    )
    return module_map, adapter_config, adapter_weight_names, adapter_tokenizer


def get_attn_weights(i, layer):
    qkv = layer.self_attn.query_key_value
    weights = {}

    for k in ["q", "k", "v"]:
        key = (i, f"{k}_proj")
        value = (f"model.layers.{i}.self_attn.{k}_proj", qkv)
        weights[key] = value

    # also add the qkv_proj weight for the adapter
    weights[(i, "qkv_proj")] = (
        f"model.layers.{i}.self_attn.qkv_proj",
        qkv,
    )

    weights[(i, "o_proj")] = (
        f"model.layers.{i}.self_attn.o_proj",
        layer.self_attn.o_proj,
    )

    return weights


def get_mlp_weights(i, layer):
    weights = {}
    if hasattr(layer, "mlp"):
        mlp = layer.mlp
        if hasattr(mlp, "gate_up_proj"):
            # handle combined gate_up_proj (e.g., for some LLaMA variants)
            weights.update(
                {
                    (i, "gate_proj"): (
                        f"model.layers.{i}.mlp.gate_proj",
                        mlp.gate_up_proj,
                    ),
                    (i, "up_proj"): (f"model.layers.{i}.mlp.up_proj", mlp.gate_up_proj),
                }
            )
        else:
            # handle separate gate_proj, up_proj, and down_proj (e.g., for Gemma)
            if hasattr(mlp, "gate_proj"):
                weights[(i, "gate_proj")] = (
                    f"model.layers.{i}.mlp.gate_proj",
                    mlp.gate_proj,
                )
            if hasattr(mlp, "up_proj"):
                weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
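
Taken together, get_attn_weights and get_mlp_weights build the per-layer lookup from (layer index, projection name) keys to (weight name, module) targets that adapter weights are matched against. The sketch below shows how the two might be combined for a decoder-only model; the collect_layer_weights helper and the model.model.layers attribute path are illustrative assumptions, not part of the file above.

# Hypothetical helper (not part of adapter.py): walk every decoder layer and
# gather the adapter target weights exposed by get_attn_weights and
# get_mlp_weights. The model.model.layers attribute path is an assumed layout
# for a typical decoder-only transformer.
def collect_layer_weights(model):
    layer_weights = {}
    for i, layer in enumerate(model.model.layers):
        layer_weights.update(get_attn_weights(i, layer))
        layer_weights.update(get_mlp_weights(i, layer))
    return layer_weights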



server/text_generation_server/utils/adapter.py [135:282]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        )

        adapters_to_merge.append((module_map, adapter_config))
        merged_weight_names = merged_weight_names.union(adapter_weight_names)
        if tokenizer is None:
            tokenizer = adapter_tokenizer

    if len(adapters_to_merge) == 0:
        raise ValueError("No adapters to merge.")

    module_map, adapter_config = merge_adapters(adapters_to_merge, params)
    return module_map, adapter_config, merged_weight_names, tokenizer


def check_architectures(
    model_id: str,
    adapter_id: str,
    adapter_config: "AdapterConfig",
    trust_remote_code: bool = False,
):
    try:
        if not adapter_config.base_model_name_or_path:
            # Skip the check when no base model is recorded; calling AutoConfig.from_pretrained(None) would only add network retry latency
            return

        expected_config = AutoConfig.from_pretrained(
            model_id, trust_remote_code=trust_remote_code
        )
        model_config = AutoConfig.from_pretrained(
            adapter_config.base_model_name_or_path, trust_remote_code=trust_remote_code
        )
    except Exception as e:
        warnings.warn(
            f"Unable to check architecture compatibility for adapter '{adapter_id}' "
            f"against model '{model_id}'. Assuming they are compatible. Error: {e}"
        )
        return

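    # Note: load_module_map only calls this check when the adapter's
    # base_model_name_or_path differs from model_id, so matching architectures
    # still warrant a warning that the adapter targets a different base model.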
    if model_config.architectures == expected_config.architectures:
        warnings.warn(
            f"Adapter '{adapter_id}' was not trained on base model '{model_id}'. "
            f"If you encounter issues, use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )
    else:
        # TODO(travis): revisit this when we support classification heads, which will not use CausalLM
        raise ValueError(
            f"Adapter '{adapter_id}' is not compatible with model '{model_id}'. "
            f"Architectures differ: {model_config.architectures} != {expected_config.architectures}. "
            f"Use --model-id '{adapter_config.base_model_name_or_path}' instead."
        )


@lru_cache(maxsize=128)
def load_module_map(
    model_id: str,
    revision: str,
    adapter_id: str,
    adapter_path: Optional[str],
    weight_names: Tuple[str, ...],
    trust_remote_code: bool = False,
) -> Tuple["ModuleMap", "AdapterConfig", Set[str], PreTrainedTokenizer]:
    adapter_config = LoraConfig.load(adapter_path or adapter_id, None)

    if not adapter_path and adapter_config.base_model_name_or_path != model_id:
        check_architectures(model_id, adapter_id, adapter_config, trust_remote_code)

    adapter_filenames = (
        hub._weight_files_from_dir(adapter_path, extension=".safetensors")
        if adapter_path
        else hub._cached_weight_files(
            adapter_id, revision=revision, extension=".safetensors"
        )
    )

    # raise an error if no adapter weights are found
    if not adapter_filenames:
        raise FileNotFoundError(
            f"No adapter weights found for adapter '{adapter_id}' and revision '{revision}'."
        )

    try:
        adapter_tokenizer = AutoTokenizer.from_pretrained(
            adapter_config.config_path,
            trust_remote_code=trust_remote_code,
        )
    except Exception:
        # The adapter does not ship its own tokenizer, so fall back to the base model tokenizer
        adapter_tokenizer = None

    # load adapter weights from all shards (should have relatively small memory footprint)
    adapter_weights = {}
    for filename in adapter_filenames:
        adapter_weights.update(load_file(filename))

    # map the model weights to the relevant adapter weights (LoRA A and B matrices)
    module_map, adapter_weight_names = adapter_config.map_weights_for_model(
        adapter_weights, weight_names
    )
    return module_map, adapter_config, adapter_weight_names, adapter_tokenizer


def get_attn_weights(i, layer):
    qkv = layer.self_attn.query_key_value
    weights = {}

    for k in ["q", "k", "v"]:
        key = (i, f"{k}_proj")
        value = (f"model.layers.{i}.self_attn.{k}_proj", qkv)
        weights[key] = value

    # also add the qkv_proj weight for the adapter
    weights[(i, "qkv_proj")] = (
        f"model.layers.{i}.self_attn.qkv_proj",
        qkv,
    )

    weights[(i, "o_proj")] = (
        f"model.layers.{i}.self_attn.o_proj",
        layer.self_attn.o_proj,
    )

    return weights


def get_mlp_weights(i, layer):
    weights = {}
    if hasattr(layer, "mlp"):
        mlp = layer.mlp
        if hasattr(mlp, "gate_up_proj"):
            # handle combined gate_up_proj (e.g., for some LLaMA variants)
            weights.update(
                {
                    (i, "gate_proj"): (
                        f"model.layers.{i}.mlp.gate_proj",
                        mlp.gate_up_proj,
                    ),
                    (i, "up_proj"): (f"model.layers.{i}.mlp.up_proj", mlp.gate_up_proj),
                }
            )
        else:
            # handle separate gate_proj, up_proj, and down_proj (e.g., for Gemma)
            if hasattr(mlp, "gate_proj"):
                weights[(i, "gate_proj")] = (
                    f"model.layers.{i}.mlp.gate_proj",
                    mlp.gate_proj,
                )
            if hasattr(mlp, "up_proj"):
                weights[(i, "up_proj")] = (f"model.layers.{i}.mlp.up_proj", mlp.up_proj)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
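
Because load_module_map is wrapped in @lru_cache, every argument must be hashable, which is why weight_names is passed as a tuple rather than a list. The usage sketch below is illustrative only; the model id, adapter id, revision, and weight names are made-up placeholders, not values taken from this repository.

# Illustrative call only: all ids and weight names below are placeholders.
# Passing weight_names as a tuple keeps the arguments hashable so the
# lru_cache decorator on load_module_map can memoize the result.
module_map, adapter_config, adapter_weight_names, adapter_tokenizer = load_module_map(
    model_id="org/base-model",              # placeholder base model id
    revision="main",
    adapter_id="org/lora-adapter",          # placeholder adapter repo id
    adapter_path=None,                      # or a local directory containing the adapter
    weight_names=("q_proj", "k_proj", "v_proj", "o_proj"),
    trust_remote_code=False,
)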



