hucc/agents/hsd3.py [994:1023]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            with th.no_grad():
                backup = q_target(batch)

            # Q-Function update
            q_in = copy(obs)
            q_in[self._dkey] = self.action_hi_d_qinput(
                batch[f'action_hi_{self._dkey}']
            )
            q_in[self._ckey] = batch[f'action_hi_{self._ckey}']
            q = self._q_hi(q_in)
            q1 = q[:, 0]
            q2 = q[:, 1]
            q1_loss = F.mse_loss(q1, backup, reduction='none')
            q2_loss = F.mse_loss(q2, backup, reduction='none')
            q_loss = q1_loss.mean() + q2_loss.mean()
            self._optim.hi.q.zero_grad()
            q_loss.backward()
            if self._clip_grad_norm > 0.0:
                nn.utils.clip_grad_norm_(
                    self._model.hi.q.parameters(), self._clip_grad_norm
                )
            self._optim.hi.q.step()

            # Policy update
            for param in self._model.hi.q.parameters():
                param.requires_grad_(False)

            # The policy receives no time input, and the Q-functions are
            # queried as if the step were 0 (i.e. as if an action were about
            # to be taken)
            obs['time'] = obs['time'] * 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
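
The backup tensor above comes from q_target(batch) under th.no_grad(); its
exact form lies outside this excerpt. For reference, here is a minimal sketch
of the clipped double-Q (SAC-style) target such a function typically computes.
Every name in it (q_hi_tgt, next_q_in, next_log_prob, not_done, gamma, alpha)
is an illustrative assumption, not an identifier from the repo:

    import torch as th

    def compute_backup(q_hi_tgt, next_q_in, next_log_prob, reward,
                       not_done, gamma, alpha):
        # One-step TD target with clipped double Q and an entropy bonus.
        with th.no_grad():
            q_next = q_hi_tgt(next_q_in)              # (N, 2) target heads
            v_next = q_next.min(dim=1).values         # min over the two heads
            v_next = v_next - alpha * next_log_prob   # SAC entropy term
            return reward + not_done * gamma * v_next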



hucc/agents/hsdb.py [601:630]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            with th.no_grad():
                backup = q_target(batch)

            # Q-Function update
            q_in = copy(obs)
            q_in[self._dkey] = self.action_hi_d_qinput(
                batch[f'action_hi_{self._dkey}']
            )
            q_in[self._ckey] = batch[f'action_hi_{self._ckey}']
            q = self._q_hi(q_in)
            q1 = q[:, 0]
            q2 = q[:, 1]
            q1_loss = F.mse_loss(q1, backup, reduction='none')
            q2_loss = F.mse_loss(q2, backup, reduction='none')
            q_loss = q1_loss.mean() + q2_loss.mean()
            self._optim.hi.q.zero_grad()
            q_loss.backward()
            if self._clip_grad_norm > 0.0:
                nn.utils.clip_grad_norm_(
                    self._model.hi.q.parameters(), self._clip_grad_norm
                )
            self._optim.hi.q.step()

            # Policy update
            for param in self._model.hi.q.parameters():
                param.requires_grad_(False)

            # The policy receives no time input, and the Q-functions are
            # queried as if the step were 0 (i.e. as if an action were about
            # to be taken)
            obs['time'] = obs['time'] * 0
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
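
In both files, the requires_grad_(False) loop freezes the critic before the
actor update: the policy loss can still backpropagate through the Q-network to
the policy parameters, but the critic's own weights accumulate no gradients.
A sketch of the full freeze/update/unfreeze cycle follows; model, optim_pi,
and pi_loss_fn are illustrative stand-ins, and the unfreeze step would occur
after the excerpts shown above:

    def actor_step(model, optim_pi, pi_loss_fn, obs):
        # Freeze the critic: gradients flow through it, not into it.
        for p in model.hi.q.parameters():
            p.requires_grad_(False)

        optim_pi.zero_grad()
        pi_loss_fn(obs).backward()
        optim_pi.step()

        # Unfreeze for the next critic update.
        for p in model.hi.q.parameters():
            p.requires_grad_(True)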