src/peft/tuners/randlora/layer.py
def get_scaled_bases(self, adapter, device=None) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct
    order to fit the target layer's dimensions.

    Args:
        adapter (str):
            The name of the adapter for which the delta weight should be computed.
        device (torch.device, optional):
            The device to place the returned tensors on. Defaults to the device of randlora_B.
    """
    randlora_A = self.randlora_A[adapter]
    randlora_B = self.randlora_B[adapter]

    if device is None:
        device = randlora_B.device
    dtype = randlora_B.dtype

    # In case the user wants to merge adapter weights that are in (b)float16 while being on CPU, we need to cast
    # the weights to float32, perform the merge, and then cast back to (b)float16, because some CPUs have slow
    # bf16/fp16 matmuls.
    cast_to_fp32 = device.type == "cpu" and dtype in (torch.float16, torch.bfloat16)
    randlora_lambda = self.randlora_lambda[adapter].to(device)
    randlora_gamma = self.randlora_gamma[adapter].to(device)

    if cast_to_fp32:
        randlora_A = randlora_A.float()
        randlora_B = randlora_B.float()
        randlora_lambda = randlora_lambda.float()
        randlora_gamma = randlora_gamma.float()
    # The trainable parameters are always applied to randlora_A, the smallest basis.
    min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)

    # As adapted layers may have different shapes and RandLoRA contains a single shared pair of A and B matrices,
    # these matrices are initialized with the largest required size for each dimension. During the forward pass,
    # the required submatrices are sliced out of the shared randlora_A and randlora_B.
    sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
    sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)

    # Flattening the matrices over the rank and number-of-bases dimensions is more memory efficient.
    update_B = sliced_B.flatten(start_dim=1)
    # UniqueBaseGrad applies the trainable randlora_lambda and randlora_gamma scalings to the shared basis.
    update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
    # Since update_A is applied along the smallest dimension (to keep the number of trainable parameters minimal),
    # check whether update_A or update_B should come first so the product matches the target layer's shape.
    if min_dim == self.in_features:
        return update_A, update_B
    return update_B.T, update_A.T
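
To see how the two returned factors are consumed, here is a minimal shape-level sketch, not the PEFT implementation: the function name delta_from_bases and the scaling argument are illustrative assumptions. In both return branches, the first tensor has shape (k, in_features) and the second (out_features, k), where k = rank * num_bases, so their product always yields an (out_features, in_features) update.

import torch

def delta_from_bases(first: torch.Tensor, second: torch.Tensor, scaling: float = 1.0) -> torch.Tensor:
    # Hypothetical helper, not part of PEFT. `first` multiplies the input side and
    # `second` the output side; the return order of get_scaled_bases guarantees
    # these shapes in both branches, so the matmul below is always well-formed.
    return scaling * (second @ first)

Keeping the composition as a plain two-factor product means a caller only depends on the return-order contract of get_scaled_bases, not on which of in_features or out_features happened to be smaller.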