def fit()

in model.py [0:0]


    def fit(self, x, y):
        """Run one optimisation step on a single batch.

        Puts the model in training mode, clears gradients, scores the batch
        with ``self.forward``, and evaluates ``self.loss`` on only the
        ``self.topk_count`` highest scores of each row. The mean of that loss
        is back-propagated, followed by one optimizer step and one scheduler
        step.

        Args:
            x: Input batch; passed to ``self.forward`` (moved to the GPU first
                when ``self.use_cuda`` is set).
            y: Targets; reshaped to a column so each row has one target,
                which is repeated across that row's top-k scores.

        Returns:
            A ``(top_scores, loss)`` tuple of NumPy arrays: the highest score
            of each row, and the mean loss as a 0-d array.
        """
        self.train()
        self.opt.zero_grad()

        # one target per row, as a column vector
        y = y.reshape(-1, 1)

        if self.use_cuda:
            x = x.cuda()
            y = y.cuda()

        # get activations
        # NOTE(review): assumes forward returns one score vector per sample
        # with a singleton axis, e.g. (batch, 1, n) or (batch, n, 1) -- confirm.
        yp = self.forward(x)

        # Drop singleton axes, but keep the batch axis: a bare squeeze() also
        # removes it when the batch holds a single sample, which would make
        # the dim=1 top-k below fail.
        batch_size = yp.shape[0]
        yp = yp.squeeze()
        if batch_size == 1:
            yp = yp.unsqueeze(0)

        # retain the top-k scores of each row; topk returns them sorted in
        # descending order, so column 0 is the row maximum.
        yp_topk = torch.topk(yp, self.topk_count, dim=1)[0]
        yp_max = yp_topk[:, 0]

        # loss over the retained scores only, averaged over rows and k
        loss = self.loss(yp_topk, y.repeat(1, yp_topk.shape[1])).mean()

        # backprop
        loss.backward()

        # apply the parameter update, then advance the LR schedule
        self.opt.step()
        self.sched.step()

        # return predictions and loss as np arrays
        return yp_max.detach().cpu().numpy(), loss.detach().cpu().numpy()