in rela/dqn_actor.h [58:106]
FFTransition popTransition() {
  assert((int)obsHistory_.size() == multiStep_ + 1);
  assert((int)actionHistory_.size() == multiStep_ + 1);
  assert((int)rewardHistory_.size() == multiStep_ + 1);
  assert((int)terminalHistory_.size() == multiStep_ + 1);

  // transition to emit: the oldest step in the window plus bookkeeping tensors
  TensorDict obs = obsHistory_.front();
  TensorDict action = actionHistory_.front();
  torch::Tensor terminal = terminalHistory_.front();
  // bootstrap[i] stays 1 unless a terminal is found inside the multi-step window
  torch::Tensor bootstrap = torch::ones(batchsize_, torch::kFloat32);
  auto bootstrapAccessor = bootstrap.accessor<float, 1>();
  std::vector<int> nextObsIndices(batchsize_);
  // compute bootstrap flags and next-state indices
  for (int i = 0; i < batchsize_; i++) {
    for (int step = 0; step < multiStep_; step++) {
      // terminal inside the window: stop bootstrapping; the next state is the
      // terminal step (it is never consumed because bootstrap is 0)
      if (terminalHistory_[step][i].item<bool>()) {
        bootstrapAccessor[i] = 0.0;
        nextObsIndices[i] = step;
        break;
      }
    }
    // no terminal found: the next state is the observation multiStep_ steps ahead
    if (bootstrapAccessor[i] > 1e-6) {
      nextObsIndices[i] = multiStep_;
    }
  }
  // accumulate discounted rewards back-to-front
  torch::Tensor reward = torch::zeros_like(rewardHistory_.front());
  auto accessor = reward.accessor<float, 1>();
  for (int i = 0; i < batchsize_; i++) {
    // if bootstrapping, sum all multiStep_ rewards (indices 0 .. multiStep_ - 1);
    // otherwise stop at the terminal step found above
    int initial = bootstrapAccessor[i] ? multiStep_ - 1 : nextObsIndices[i];
    for (int step = initial; step >= 0; step--) {
      float stepReward = rewardHistory_[step][i].item<float>();
      accessor[i] = stepReward + gamma_ * accessor[i];
    }
  }
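  // The learner typically combines these into an n-step TD target, roughly
  //   target = reward + bootstrap * gamma_^multiStep_ * max_a Q(nextObs, a),
  // so a terminal inside the window both truncates the reward sum and removes
  // the bootstrapped Q term.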
  // the n-step next observation is the newest entry in the window
  TensorDict nextObs = obsHistory_.back();
  // drop the oldest step so the window can slide forward by one
  obsHistory_.pop_front();
  actionHistory_.pop_front();
  rewardHistory_.pop_front();
  terminalHistory_.pop_front();

  return FFTransition(obs, action, reward, terminal, bootstrap, nextObs);
}
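To make the backward reward fold concrete, here is a minimal standalone sketch, not part of rela: the helper nStepReturn and its arguments are hypothetical names, and lastStep plays the role of `initial` above.

// Hypothetical illustration of the n-step return fold used in popTransition().
#include <cassert>
#include <cmath>
#include <vector>

// Fold rewards back-to-front: R = r_0 + gamma * (r_1 + gamma * (r_2 + ...)).
// lastStep mirrors `initial` above: multiStep - 1 when no terminal is hit,
// otherwise the index of the terminal step.
float nStepReturn(const std::vector<float>& rewards, int lastStep, float gamma) {
  float ret = 0.0f;
  for (int step = lastStep; step >= 0; step--) {
    ret = rewards[step] + gamma * ret;
  }
  return ret;
}

int main() {
  const float gamma = 0.99f;
  // no terminal in a 3-step window: all 3 rewards are used, bootstrap stays 1
  float full = nStepReturn({1.0f, 2.0f, 3.0f}, /*lastStep=*/2, gamma);
  assert(std::fabs(full - (1.0f + gamma * 2.0f + gamma * gamma * 3.0f)) < 1e-5f);
  // terminal at step 1: the sum stops there and bootstrap would be set to 0
  float truncated = nStepReturn({1.0f, 2.0f, 3.0f}, /*lastStep=*/1, gamma);
  assert(std::fabs(truncated - (1.0f + gamma * 2.0f)) < 1e-5f);
  return 0;
}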