FFTransition popTransition()

in rela/dqn_actor.h [58:106]


  // Pops the oldest transition off the history deques and assembles an
  // FFTransition with the multi-step discounted reward and bootstrap mask.
  FFTransition popTransition() {
    assert((int)obsHistory_.size() == multiStep_ + 1);
    assert((int)actionHistory_.size() == multiStep_ + 1);
    assert((int)rewardHistory_.size() == multiStep_ + 1);
    assert((int)terminalHistory_.size() == multiStep_ + 1);

    TensorDict obs = obsHistory_.front();
    TensorDict action = actionHistory_.front();
    torch::Tensor terminal = terminalHistory_.front();
    // bootstrap[i] stays 1 only if the episode does not terminate within multiStep_ steps
    torch::Tensor bootstrap = torch::ones(batchsize_, torch::kFloat32);
    auto bootstrapAccessor = bootstrap.accessor<float, 1>();

    std::vector<int> nextObsIndices(batchsize_);
    // calculate bootstrap and nextState indices
    for (int i = 0; i < batchsize_; i++) {
      for (int step = 0; step < multiStep_; step++) {
        // episode ends at this step: next state index is `step` (unused, since bootstrap = 0)
        if (terminalHistory_[step][i].item<bool>()) {
          bootstrapAccessor[i] = 0.0;
          nextObsIndices[i] = step;
          break;
        }
      }
      // no terminal within the window: next state is multiStep_ steps ahead
      if (bootstrapAccessor[i] > 1e-6) {
        nextObsIndices[i] = multiStep_;
      }
    }

    // calculate discounted rewards
    torch::Tensor reward = torch::zeros_like(rewardHistory_.front());
    auto accessor = reward.accessor<float, 1>();
    for (int i = 0; i < batchsize_; i++) {
      // if bootstrapping, sum rewards for steps 0..multiStep_-1;
      // otherwise stop at the terminal step nextObsIndices[i]
      int initial = bootstrapAccessor[i] ? multiStep_ - 1 : nextObsIndices[i];
      for (int step = initial; step >= 0; step--) {
        float stepReward = rewardHistory_[step][i].item<float>();
        accessor[i] = stepReward + gamma_ * accessor[i];
      }
    }

    // next obs is the observation multiStep_ steps ahead; for terminated
    // entries it is never used because bootstrap is 0
    TensorDict nextObs = obsHistory_.back();

    obsHistory_.pop_front();
    actionHistory_.pop_front();
    rewardHistory_.pop_front();
    terminalHistory_.pop_front();
    return FFTransition(obs, action, reward, terminal, bootstrap, nextObs);
  }
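
The reward loop above is a backward accumulation of the multi-step (n-step) discounted return: for each batch element i it computes reward[i] as the sum of gamma_^k * r_k over the window, where the sum covers all multiStep_ rewards when no terminal was seen (bootstrap = 1) and stops at the terminal step otherwise. Below is a minimal standalone sketch of the same per-element computation on plain arrays; the function and variable names (nStepReturn, multiStep, gamma, rewards, terminals) are illustrative only and are not part of rela.

#include <cassert>
#include <vector>

// Sketch: n-step discounted return with bootstrap flag for a single batch
// element, mirroring the reward/bootstrap logic in popTransition().
struct NStepReturn {
  float reward;     // discounted sum of rewards over the window
  float bootstrap;  // 1.0 if no terminal inside the window, else 0.0
};

NStepReturn nStepReturn(const std::vector<float>& rewards,
                        const std::vector<bool>& terminals,
                        int multiStep,
                        float gamma) {
  assert((int)rewards.size() >= multiStep);
  assert((int)terminals.size() >= multiStep);

  // find the first terminal inside the window, if any
  float bootstrap = 1.0f;
  int last = multiStep - 1;
  for (int step = 0; step < multiStep; ++step) {
    if (terminals[step]) {
      bootstrap = 0.0f;
      last = step;  // stop accumulating at the terminal step
      break;
    }
  }

  // accumulate backwards so rewards[step] ends up discounted by gamma^step
  float ret = 0.0f;
  for (int step = last; step >= 0; --step) {
    ret = rewards[step] + gamma * ret;
  }
  return {ret, bootstrap};
}

For example, with rewards = {1, 1, 1}, no terminals, multiStep = 3 and gamma = 0.99, this returns 1 + 0.99 + 0.99^2 = 2.9701 with bootstrap = 1, which is what the per-element loop in popTransition() stores in reward[i] and bootstrap[i].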