src/transformers/modeling_gguf_pytorch_utils.py
def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_load=None):
"""
Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed
tokenizer and config attributes.
Args:
gguf_checkpoint_path (`str`):
The path the to GGUF file to load
return_tensors (`bool`, defaults to `False`):
Whether to read the tensors from the file and return them. Not doing so is faster
and only loads the metadata in memory.
"""
    if is_gguf_available() and is_torch_available():
        from gguf import GGUFReader, dequantize
    else:
        logger.error(
            "Loading a GGUF checkpoint in PyTorch requires both PyTorch and GGUF>=0.10.0 to be installed. Please see "
            "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions."
        )
        raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.")

    reader = GGUFReader(gguf_checkpoint_path)
    fields = reader.fields
    reader_keys = list(fields.keys())

    parsed_parameters = {k: {} for k in GGUF_TO_TRANSFORMERS_MAPPING}
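    # One empty bucket per top-level key of GGUF_TO_TRANSFORMERS_MAPPING (e.g. "config", "tokenizer");
    # the metadata loop below routes every recognized GGUF field into one of these buckets.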

    architecture = read_field(reader, "general.architecture")[0]
    # NOTE: Some GGUF checkpoints may miss `general.name` field in metadata
    model_name = read_field(reader, "general.name")

    updated_architecture = None
    # In llama.cpp, mistral models use the same architecture as llama. We need
    # to add this patch to ensure things work correctly on our side.
    if "llama" in architecture and "mistral" in model_name:
        updated_architecture = "mistral"
    # FIXME: Currently this implementation only covers the flan-t5 architecture.
    # It still needs to be extended to support legacy t5.
    elif "t5" in architecture or "t5encoder" in architecture:
        parsed_parameters["config"]["is_gated_act"] = True
        if "t5encoder" in architecture:
            parsed_parameters["config"]["architectures"] = ["T5EncoderModel"]
        updated_architecture = "t5"
    else:
        updated_architecture = architecture
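
    # llama.cpp names this architecture "qwen2moe", while the transformers model_type is "qwen2_moe".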
if "qwen2moe" in architecture:
updated_architecture = "qwen2_moe"

    # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
    # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors
    # If `use_parallel_residual=False`, ffn_norm will be present in the tensors
    if "stablelm" in architecture:
        attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"}
        ffn_norm_name = "ffn_norm"
        qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name)
        use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors)
        parsed_parameters["config"]["use_qkv_bias"] = qkv_bias
        parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual

    if architecture not in GGUF_SUPPORTED_ARCHITECTURES and updated_architecture not in GGUF_SUPPORTED_ARCHITECTURES:
        raise ValueError(f"GGUF model with architecture {architecture} is not supported yet.")

    # Handle tie_word_embeddings: the LM head is stored as `output.weight` in GGUF, so if that tensor
    # is absent the embeddings are tied. For falcon and bloom it is always set to `True`.
    exceptions = ["falcon", "bloom"]
    parsed_parameters["config"]["tie_word_embeddings"] = (
        all("output.weight" != tensor.name for tensor in reader.tensors) or architecture in exceptions
    )

    # Route every GGUF metadata key/value pair into the matching transformers bucket
    for gguf_key, field in reader.fields.items():
        gguf_key = gguf_key.replace(architecture, updated_architecture)
        split = gguf_key.split(".")
        prefix = split[0]
        config_key = ".".join(split[1:])
        value = [_gguf_parse_value(field.parts[_data_index], field.types) for _data_index in field.data]
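        # Metadata values come back as a list of parts; scalar fields have exactly one part, so unwrap them.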
        if len(value) == 1:
            value = value[0]

        if isinstance(value, str) and architecture in value:
            value = value.replace(architecture, updated_architecture)

        for parameter in GGUF_TO_TRANSFORMERS_MAPPING:
            parameter_renames = GGUF_TO_TRANSFORMERS_MAPPING[parameter]
            if prefix in parameter_renames and config_key in parameter_renames[prefix]:
                renamed_config_key = parameter_renames[prefix][config_key]
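                # Mapping sentinels in GGUF_TO_TRANSFORMERS_MAPPING: -1 means "skip this field entirely",
                # None means "consume the key but store nothing", and a string is the renamed attribute
                # under which the value is stored.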
                if renamed_config_key == -1:
                    continue

                if renamed_config_key is not None:
                    parsed_parameters[parameter][renamed_config_key] = value

                if gguf_key in reader_keys:
                    reader_keys.remove(gguf_key)

        if gguf_key in reader_keys:
            logger.info(f"Some keys were not parsed and taken into account: {gguf_key} | {value}")

    # Gemma3 GGUF checkpoint only contains weights of text backbone
    if parsed_parameters["config"]["model_type"] == "gemma3":
        parsed_parameters["config"]["model_type"] = "gemma3_text"

    # Retrieve the config vocab_size from the tokenizer.
    # Please refer to https://github.com/huggingface/transformers/issues/32526 for more details.
    if "vocab_size" not in parsed_parameters["config"]:
        tokenizer_parameters = parsed_parameters["tokenizer"]
        if "tokens" in tokenizer_parameters:
            parsed_parameters["config"]["vocab_size"] = len(tokenizer_parameters["tokens"])
        else:
            logger.warning(
                "Can't find a way to retrieve missing config vocab_size from tokenizer parameters. "
                "The default value from the model config class will be used instead, which may cause unexpected behavior."
            )

    if return_tensors:
        parsed_parameters["tensors"] = {}

        tensor_key_mapping = get_gguf_hf_weights_map(model_to_load)
        config = parsed_parameters.get("config", {})

        ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
        processor = ProcessorClass(config=config)
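        # Architecture-specific processors (looked up in TENSOR_PROCESSORS) can rename or reshape tensors
        # whose GGUF layout differs from the transformers one; TensorProcessor is the pass-through default.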

        for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."):
            name = tensor.name
            weights = dequantize(tensor.data, tensor.tensor_type)
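            # `dequantize` expands quantized GGUF block formats back to a floating-point numpy array
            # (and is effectively a no-op for tensors that were stored unquantized).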

            result = processor.process(
                weights=weights,
                name=name,
                tensor_key_mapping=tensor_key_mapping,
                parsed_parameters=parsed_parameters,
            )

            weights = result.weights
            name = result.name
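
            # Skip tensors that have no counterpart in the transformers model (i.e. no entry in the mapping).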
            if name not in tensor_key_mapping:
                continue

            name = tensor_key_mapping[name]
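            # The reader memory-maps the checkpoint, so copy the (possibly read-only) array before handing
            # it to torch, ensuring the resulting tensor owns its own writable memory.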
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))

    if len(reader_keys) > 0:
        logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")

    return parsed_parameters