def learn()

in qlearn/toys/mnf_agent.py


    def learn(self, states, actions, rewards, next_states, terminals):
        self.online_net.train()
        self.target_net.eval()
        self.online_net.reset_noise()
        states = Variable(self.FloatTensor(states))
        actions = Variable(self.LongTensor(actions))
        next_states = Variable(self.FloatTensor(next_states))
        rewards = Variable(self.FloatTensor(rewards)).view(-1, 1)
        terminals = Variable(self.FloatTensor(terminals)).view(-1, 1)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken

        state_values = self.online_net(states, same_noise=False)
        kldiv = self.online_net.kldiv()
        state_action_values = state_values.gather(1, actions.view(-1, 1))

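        # With double Q-learning, greedy next actions are selected with the online net
        # and evaluated with the target net (reducing overestimation bias); otherwise
        # the target net's own maximum is used.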
        if self.double_q:
            next_actions = self.online_net(next_states, same_noise=False).max(1)[1]
            next_state_values = self.target_net(next_states, same_noise=False).gather(1, next_actions.view(-1, 1))
        else:
            next_state_values = self.target_net(next_states).max(1)[0]

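        # Bellman target: r + discount * (1 - terminal) * bootstrapped next-state value.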
        target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1)

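        # Mean Huber (smooth L1) loss over the TD errors; the target is detached so
        # no gradient flows back through the target network.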
        td_errors = F.smooth_l1_loss(state_action_values, target_state_action_values.detach(), size_average=True)

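        # Total loss: TD loss plus the KL divergence of the network's variational
        # (MNF) weight posterior from its prior, scaled by kl_coeff.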
        loss = td_errors + self.kl_coeff * kldiv

        # Optimize the model
        self.optimiser.zero_grad()
        loss.backward()
        for param in self.online_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimiser.step()

        return loss
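
To make the target construction concrete, below is a small, self-contained sketch (not taken from the repository) of how the double-Q target `rewards + (1 - terminals) * self.discount * next_state_values` works out on a toy batch; the discount value and all Q-values are made-up numbers for illustration.

    import torch

    discount = 0.99
    rewards = torch.tensor([[1.0], [0.0]])
    terminals = torch.tensor([[0.0], [1.0]])   # second transition ends the episode

    # Made-up Q-values for two next states and three actions.
    q_online_next = torch.tensor([[0.2, 0.9, 0.1],
                                  [0.5, 0.4, 0.3]])
    q_target_next = torch.tensor([[0.3, 0.7, 0.2],
                                  [0.6, 0.1, 0.4]])

    # Double Q-learning: argmax under the online net, value under the target net.
    next_actions = q_online_next.max(1)[1]                           # [1, 0]
    next_values = q_target_next.gather(1, next_actions.view(-1, 1))  # [[0.7], [0.6]]

    targets = rewards + (1 - terminals) * discount * next_values
    print(targets)  # [[1.693], [0.0]] -- the terminal transition keeps only its reward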