in reagent/ope/trainers/rl_tabular_trainers.py
def _state_value(self, state: State):
    """Estimate the value of `state` via Monte Carlo rollouts: keep
    generating episodes until the state has been visited at least
    `_count_threshold` times or `_max_iteration` episodes have been
    drawn, then defer to the base class for the accumulated estimate."""
    i = 0
    state_count = self._state_counts.get(state, 0)
while state_count < self._count_threshold and i < self._max_iteration:
i += 1
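        # Sample a fresh episode (a sequence of transitions) starting
        # from `state`.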
mdp = self._log_generator.generate_log(state)
        if self._first_visit:
            # First-visit MC: pre-count the occurrences of each state in
            # the episode so the backward pass can recognize the first
            # (forward-order) visit when the count drops to one.
            state_counts = {}
            for t in mdp:
                if t.last_state is None:
                    continue
                state_counts[t.last_state] = state_counts.get(t.last_state, 0) + 1
            g = 0.0
            for t in reversed(mdp):
                if t.last_state is None:
                    continue
                # Discounted return of the episode suffix from this step.
                g = self._gamma * g + t.reward
                counts = state_counts[t.last_state]
                if counts == 1:
                    # First forward-order visit: the only return that
                    # first-visit MC records for this state.
                    self._update_state_value(t.last_state, g)
                    del state_counts[t.last_state]
                else:
                    state_counts[t.last_state] = counts - 1
        else:
            # Every-visit MC: record the discounted return at every
            # occurrence of each state.
            g = 0.0
            for t in reversed(mdp):
                if t.last_state is None:
                    continue
                g = self._gamma * g + t.reward
                self._update_state_value(t.last_state, g)
        state_count = self._state_counts.get(state, 0)
return super()._state_value(state)
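
# For reference, a minimal standalone sketch of the same backward-scan,
# first-visit return computation. The `Transition` tuple, `GAMMA`, and the
# example episode below are hypothetical illustrations, not part of this
# repo.

from collections import namedtuple

# Hypothetical stand-in for the trainer's transition objects.
Transition = namedtuple("Transition", ["last_state", "reward"])
GAMMA = 0.9

def first_visit_returns(episode):
    """Map each state to the discounted return of its first visit."""
    remaining = {}
    for t in episode:
        if t.last_state is not None:
            remaining[t.last_state] = remaining.get(t.last_state, 0) + 1
    returns, g = {}, 0.0
    for t in reversed(episode):
        if t.last_state is None:
            continue
        g = GAMMA * g + t.reward
        if remaining[t.last_state] == 1:  # first forward-order visit
            returns[t.last_state] = g
            del remaining[t.last_state]
        else:
            remaining[t.last_state] -= 1
    return returns

# "a" occurs twice; only its first visit's return survives:
# 1.0 + 0.9 * 0.0 + 0.81 * 2.0 = 2.62.
episode = [Transition("a", 1.0), Transition("b", 0.0), Transition("a", 2.0)]
print(first_visit_returns(episode))  # {'b': 1.8, 'a': 2.62}

# Scanning the episode backwards lets a single pass produce the return at
# every step, since the return at step t is r_t + GAMMA * (return at t + 1).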