in qlearn/toys/agent.py [0:0]
def learn(self, states, actions, rewards, next_states, terminals):
    self.online_net.train()
    self.target_net.eval()
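    # Convert the sampled batch to tensors; rewards and terminals are reshaped
    # into column vectors so they broadcast against the (batch, 1) Q-value columns.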
    states = Variable(self.FloatTensor(states))
    actions = Variable(self.LongTensor(actions))
    next_states = Variable(self.FloatTensor(next_states))
    rewards = Variable(self.FloatTensor(rewards)).view(-1, 1)
    terminals = Variable(self.FloatTensor(terminals)).view(-1, 1)
    # Compute Q(s_t, a): the online network outputs Q(s_t, .) and we gather the
    # column corresponding to the action that was actually taken.
    state_action_values = self.online_net(states).gather(1, actions.view(-1, 1))
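    # Double DQN: the online network selects the greedy next action and the
    # target network evaluates it, which reduces overestimation of Q-values.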
    if self.double_q:
        next_actions = self.online_net(next_states).max(1)[1]
        next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1))
    else:
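        # Vanilla DQN: take the maximum Q-value predicted by the target network
        # for each next state.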
        next_state_values = self.target_net(next_states).max(1)[0]
    # TD target: r + discount * V(s_{t+1}), with the bootstrap term zeroed out
    # for terminal transitions.
    target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1)
    # Huber loss between predicted and target Q-values; the target is detached
    # so no gradient flows through it (this supersedes the old `volatile`
    # Variable mechanism for preventing unnecessary gradients).
    loss = F.smooth_l1_loss(state_action_values, target_state_action_values.detach())
    # Optimize the online network
    self.optimiser.zero_grad()
    loss.backward()
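    # Clamp each gradient element to [-1, 1] to keep updates stable.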
    for param in self.online_net.parameters():
        param.grad.data.clamp_(-1, 1)
    self.optimiser.step()
    return loss
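
# Illustrative usage (a sketch, not part of the original file): `memory` and
# `agent` are assumed names for a replay buffer with a sample(batch_size)
# method returning numpy batches and for an instance of this agent class.
#
#     states, actions, rewards, next_states, terminals = memory.sample(batch_size)
#     loss = agent.learn(states, actions, rewards, next_states, terminals)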