in picotron/tensor_parallel/tensor_parallel.py
def __init__(self, in_features: int, out_features: int, bias: bool):
    super(RowParallelLinear, self).__init__()
    self.tp_world_size = pgm.process_group_manager.tp_world_size
    self.tp_rank = pgm.process_group_manager.tp_rank
    self.in_features = in_features
    self.out_features = out_features
    assert in_features % self.tp_world_size == 0, "Hidden dimension must be divisible by the tensor parallel world size"
    self.input_size_per_partition = in_features // self.tp_world_size
    # Row parallelism shards the weight along the input dimension: each rank
    # stores an (out_features, in_features // tp_world_size) slice.
    self.weight = nn.Parameter(torch.Tensor(self.out_features, self.input_size_per_partition))
    if bias:
        # The bias is kept whole (unsharded) on every rank.
        self.bias = nn.Parameter(torch.Tensor(self.out_features))
        # Always initialize bias to zero.
        with torch.no_grad():
            self.bias.zero_()
    else:
        self.register_parameter("bias", None)
    # Fill the uninitialized weight shard.
    self.reset_parameters()
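
For context, here is a minimal sketch of the forward pass that pairs with this constructor. Each rank multiplies its input slice by its weight shard, producing a partial result, and an all-reduce over the tensor parallel group sums the partials into the full output. The sketch assumes `import torch.nn.functional as F` and `import torch.distributed as dist`, and it assumes the process group manager exposes a `tp_group` handle (the attribute name is an assumption); a real implementation also wraps the communication in a custom autograd function so gradients flow correctly, which this sketch omits.

def forward(self, x: torch.Tensor) -> torch.Tensor:
    # x arrives already split along its last dimension: each rank holds a
    # (..., in_features // tp_world_size) slice, e.g. the output of a
    # preceding column-parallel linear.
    output = F.linear(x, self.weight)  # partial result on this rank
    # Sum the partial results across the tensor parallel group so every
    # rank ends up with the full (..., out_features) output.
    # Assumption: tp_group is the process group for this TP group.
    dist.all_reduce(output, op=dist.ReduceOp.SUM, group=pgm.process_group_manager.tp_group)
    if self.bias is not None:
        # Add the bias once, after the all-reduce, so it is not summed
        # tp_world_size times.
        output = output + self.bias
    return output

This is also why the constructor keeps the bias unsharded and zero-initialized on every rank: it is applied exactly once to the already-reduced output, rather than being folded into each rank's partial matmul.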