chatlearn/models/megatron/lora/initializer.py

# Copyright 2024 Alibaba Group Holding Limited. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """distributed initializer.""" import math import warnings import torch from torch import Tensor from torch.nn.init import calculate_gain def distributed_kaiming_uniform_( tensor: Tensor, fan: int, a: float = 0, nonlinearity: str = 'leaky_relu' ): r"""Fills the input `Tensor` with values according to the method described in `Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification` - He, K. et al. (2015), using a uniform distribution. The resulting tensor will have values sampled from :math:`\mathcal{U}(-\text{bound}, \text{bound})` where .. math:: \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}} Also known as He initialization. Args: tensor: an n-dimensional `torch.Tensor` fan: fan_in or fan_out for W. a: the negative slope of the rectifier used after this layer (only used with ``'leaky_relu'``) nonlinearity: the non-linear function (`nn.functional` name), recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default). Examples: >>> w = torch.empty(3, 5) >>> distributed_kaiming_uniform_(w, fan=3, nonlinearity='relu') """ if torch.overrides.has_torch_function_variadic(tensor): return torch.overrides.handle_torch_function( distributed_kaiming_uniform_, (tensor, fan), tensor=tensor, fan=fan, a=a, nonlinearity=nonlinearity) if 0 in tensor.shape: warnings.warn("Initializing zero-element tensors is a no-op") return tensor assert isinstance(fan, int) and fan > 0, "Expect fan int and greater than 0." gain = calculate_gain(nonlinearity, a) std = gain / math.sqrt(fan) bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation with torch.no_grad(): return tensor.uniform_(-bound, bound)

chatlearn/models/megatron/lora/initializer.py (26 lines of code) (raw):