in model.py [0:0]
def fit(self, x, y):
    """Run one optimisation step on a single batch.

    The module is put in training mode, gradients are cleared, a forward
    pass is made, and the loss is computed on the ``self.topk_count``
    largest activations of every sample. Each of those activations is
    compared against the sample's target, and the element-wise losses are
    averaged. The optimiser and the scheduler are then each stepped once.

    Args:
        x: Input batch passed unchanged to ``self.forward``.
        y: Targets, one value per sample; reshaped to a column internally.

    Returns:
        A tuple ``(yp_max, loss)`` of NumPy arrays: the largest activation
        of every sample and the scalar training loss.

    Note:
        Assumes ``self.loss`` returns un-reduced (element-wise) losses and
        that the activations have exactly one non-singleton dimension
        besides the batch dimension -- TODO confirm against the model
        definition.
    """
    self.train()
    self.opt.zero_grad()
    y = y.reshape(-1, 1)
    if self.use_cuda:
        x = x.cuda()
        y = y.cuda()

    # get activations
    yp = self.forward(x)

    # NOTE(review): an earlier revision built a padding mask here,
    # ``(x == 0).float()[:, yp.shape[2]]``, but never applied it. The dead
    # computation was removed because its index could raise IndexError. If
    # padded positions are supposed to be excluded from the top-k selection,
    # the mask has to be applied to ``yp`` before ``torch.topk`` below.

    # Drop singleton dimensions but keep the batch axis: a bare squeeze()
    # would also remove it for a batch of one sample and break dim=1 below.
    batch_size = yp.shape[0]
    yp = yp.squeeze()
    if batch_size == 1:
        yp = yp.unsqueeze(0)

    # Retain the top-k scores per sample; topk returns them sorted in
    # descending order, so column 0 holds the maximum.
    yp_topk = torch.topk(yp, self.topk_count, dim=1)[0]
    yp_max = yp_topk[:, 0]

    # Every retained score is scored against the same per-sample target.
    loss = self.loss(yp_topk, y.repeat(1, yp_topk.shape[1])).mean()

    # backprop
    loss.backward()

    # apply the parameter update, then advance the learning-rate schedule
    self.opt.step()
    self.sched.step()

    # return predictions and loss as np arrays
    return yp_max.detach().cpu().numpy(), loss.detach().cpu().numpy()