tbsm_pytorch.py [76:87]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            # Set up three trainable weight tensors Q, K and V, each of shape
            # (1, m, self.emb_m) and dtype float32, with Gaussian random values.
            # NOTE(review): presumably the query/key/value projections of a
            # multi-head attention layer -- confirm against the forward pass.

            # Last dimension of Q/K/V: m scaled by the number of heads.
            self.emb_m = self.nheads * m  # mha emb dim
            # All three tensors are sampled from a zero-mean normal distribution.
            mean = 0.0
            # 2 / (m + m) is algebraically 1 / m, so std_dev == sqrt(1 / m).
            # NOTE(review): the tensors are (m, emb_m) rather than (m, m); if a
            # fan-in/fan-out style scaling was intended, the second term would be
            # self.emb_m -- confirm that using m twice is deliberate.
            std_dev = np.sqrt(2 / (m + m))  # np.sqrt(1 / m) # np.sqrt(1 / n)
            # np.random.normal draws from NumPy's global RNG, so with a fixed
            # seed the values of Q, K and V depend on keeping this draw order.
            # The float64 samples are cast to float32 before becoming tensors.
            qm = np.random.normal(mean, std_dev, size=(1, m, self.emb_m)) \
                .astype(np.float32)
            # torch.tensor copies the NumPy data; Parameter marks it trainable.
            self.Q = Parameter(torch.tensor(qm), requires_grad=True)
            # Second draw from the same distribution, same shape.
            km = np.random.normal(mean, std_dev, size=(1, m, self.emb_m))  \
                .astype(np.float32)
            self.K = Parameter(torch.tensor(km), requires_grad=True)
            # Third draw from the same distribution, same shape.
            vm = np.random.normal(mean, std_dev, size=(1, m, self.emb_m)) \
                .astype(np.float32)
            self.V = Parameter(torch.tensor(vm), requires_grad=True)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



tbsm_synthetic.py [151:162]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                    # Set up three trainable weight tensors Q, K and V, each of
                    # shape (1, m, self.emb_m) and dtype float32, with Gaussian
                    # random values.
                    # NOTE(review): presumably the query/key/value projections of
                    # a multi-head attention layer -- confirm against the forward
                    # pass.

                    # Last dimension of Q/K/V: m scaled by the number of heads.
                    self.emb_m = self.nheads * m  # mha emb dim
                    # All three tensors are sampled from a zero-mean normal
                    # distribution.
                    mean = 0.0
                    # 2 / (m + m) is algebraically 1 / m, so std_dev == sqrt(1 / m).
                    # NOTE(review): the tensors are (m, emb_m) rather than (m, m);
                    # if a fan-in/fan-out style scaling was intended, the second
                    # term would be self.emb_m -- confirm that using m twice is
                    # deliberate.
                    std_dev = np.sqrt(2 / (m + m))  # np.sqrt(1 / m) # np.sqrt(1 / n)
                    # np.random.normal draws from NumPy's global RNG, so with a
                    # fixed seed the values of Q, K and V depend on keeping this
                    # draw order. The float64 samples are cast to float32 before
                    # becoming tensors.
                    qm = np.random.normal(mean, std_dev, size=(1, m, self.emb_m)) \
                        .astype(np.float32)
                    # torch.tensor copies the NumPy data; Parameter marks it
                    # trainable.
                    self.Q = Parameter(torch.tensor(qm), requires_grad=True)
                    # Second draw from the same distribution, same shape.
                    km = np.random.normal(mean, std_dev, size=(1, m, self.emb_m))  \
                        .astype(np.float32)
                    self.K = Parameter(torch.tensor(km), requires_grad=True)
                    # Third draw from the same distribution, same shape.
                    vm = np.random.normal(mean, std_dev, size=(1, m, self.emb_m)) \
                        .astype(np.float32)
                    self.V = Parameter(torch.tensor(vm), requires_grad=True)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



