chatlearn/models/megatron/lora/layers.py [114:168]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion

        self.weight = weight
        self.bias = bias

        if lora_dim <= 0:
            raise ValueError(
                "You are trying to use LoRA, whose reduced dim should be larger than 0"
            )

        rows, columns = weight.shape
        self.fan_in = columns
        self.lora_right_weight = nn.Parameter(torch.zeros(
            lora_dim, columns
        ))  # stored pre-transposed so the forward pass does not need to transpose
        self.lora_left_weight = nn.Parameter(torch.zeros(rows, lora_dim))
        self.lora_scaling = lora_scaling / lora_dim

        if lora_dropout > 0:
            self.lora_dropout = nn.Dropout(lora_dropout)
        else:
            self.lora_dropout = nn.Identity()

        self.reset_parameters()
        # disable the original weight gradient
        self.weight.requires_grad = False
        # flag tracking whether the LoRA weights are currently fused into the original weight
        self.fuse_lora = False

    def eval(self):
        self.lora_dropout.eval()
        # self.fuse_lora_weight()

    def train(self, mode=True):
        self.lora_dropout.train(mode)
        # self.unfuse_lora_weight()

    def reset_parameters(self):
        distributed_kaiming_uniform_(self.lora_right_weight, self.fan_in, a=math.sqrt(5))
        nn.init.zeros_(self.lora_left_weight)

    def fuse_lora_weight(self):
        if not self.fuse_lora:
            self.weight.data += self.lora_scaling * torch.matmul(
                self.lora_left_weight, self.lora_right_weight)
        self.fuse_lora = True

    def unfuse_lora_weight(self):
        if self.fuse_lora:
            self.weight.data -= self.lora_scaling * torch.matmul(
                self.lora_left_weight, self.lora_right_weight)
        self.fuse_lora = False

    def forward(self, input_):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
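Note: the body of forward is elided in the excerpt above. As a point of reference only, a LoRA-augmented linear layer typically adds a scaled low-rank path on top of the frozen base projection. The sketch below is an assumption derived from the parameter shapes set up in __init__ (lora_right_weight: lora_dim x columns, lora_left_weight: rows x lora_dim); it is not the chatlearn implementation and it ignores the tensor-parallel gather/reduce that a Megatron column- or row-parallel layer would perform. The helper name lora_linear_forward is hypothetical.

import torch
import torch.nn.functional as F

def lora_linear_forward(input_, weight, bias, lora_left_weight, lora_right_weight,
                        lora_scaling, lora_dropout):
    # frozen base projection: input_ @ weight.T (+ bias)
    base_out = F.linear(input_, weight, bias)
    # low-rank path: dropout(input_) @ right^T @ left^T, scaled by lora_scaling
    lora_out = lora_dropout(input_) @ lora_right_weight.t() @ lora_left_weight.t()
    return base_out + lora_scaling * lora_out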



chatlearn/models/megatron/lora/layers.py [243:297]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion

        self.weight = weight
        self.bias = bias

        if lora_dim <= 0:
            raise ValueError(
                "You are trying to use LoRA, whose reduced dim should be larger than 0"
            )

        rows, columns = weight.shape
        self.fan_in = columns
        self.lora_right_weight = nn.Parameter(torch.zeros(
            lora_dim, columns
        ))  # stored pre-transposed so the forward pass does not need to transpose
        self.lora_left_weight = nn.Parameter(torch.zeros(rows, lora_dim))
        self.lora_scaling = lora_scaling / lora_dim

        if lora_dropout > 0:
            self.lora_dropout = nn.Dropout(lora_dropout)
        else:
            self.lora_dropout = nn.Identity()

        self.reset_parameters()
        # disable the original weight gradient
        self.weight.requires_grad = False
        # flag tracking whether the LoRA weights are currently fused into the original weight
        self.fuse_lora = False

    def eval(self):
        self.lora_dropout.eval()
        # self.fuse_lora_weight()

    def train(self, mode=True):
        self.lora_dropout.train(mode)
        # self.unfuse_lora_weight()

    def reset_parameters(self):
        distributed_kaiming_uniform_(self.lora_right_weight, self.fan_in, a=math.sqrt(5))
        nn.init.zeros_(self.lora_left_weight)

    def fuse_lora_weight(self):
        if not self.fuse_lora:
            self.weight.data += self.lora_scaling * torch.matmul(
                self.lora_left_weight, self.lora_right_weight)
        self.fuse_lora = True

    def unfuse_lora_weight(self):
        if self.fuse_lora:
            self.weight.data -= self.lora_scaling * torch.matmul(
                self.lora_left_weight, self.lora_right_weight)
        self.fuse_lora = False

    def forward(self, input_):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
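A small, self-contained check of the fuse/unfuse arithmetic used by fuse_lora_weight and unfuse_lora_weight above (independent of Megatron): fusing adds lora_scaling * (lora_left_weight @ lora_right_weight) to the base weight, so a plain linear forward with the fused weight matches the base forward plus the scaled low-rank path. All names below are local to this sketch, not identifiers from the file.

import torch

rows, columns, lora_dim = 8, 16, 4
lora_scaling = 2.0 / lora_dim          # mirrors self.lora_scaling = lora_scaling / lora_dim
weight = torch.randn(rows, columns)    # frozen base weight
left = torch.randn(rows, lora_dim)     # lora_left_weight
right = torch.randn(lora_dim, columns) # lora_right_weight
x = torch.randn(3, columns)

# unfused: base projection plus the scaled low-rank path
unfused = x @ weight.t() + lora_scaling * (x @ right.t() @ left.t())
# fused: fold the low-rank product into the base weight, as fuse_lora_weight does
fused_weight = weight + lora_scaling * (left @ right)
fused = x @ fused_weight.t()
assert torch.allclose(unfused, fused, atol=1e-5)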



