def _step()

in gym/gym/envs/mujoco/ant_obstaclesgen.py [0:0]


    def _step(self, a):
        self.count += 1

        # Every 200 steps, re-randomize the joint positions and velocities around
        # the initial state, but keep the current torso (x, y) position and the
        # last 11 qpos entries unchanged.
        if self.count % 200 == 0:
            n_qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
            n_qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
            n_qpos[:2] = self.data.qpos[:2,0]
            n_qpos[-11:] = self.data.qpos[-11:,0]
            self.set_state(n_qpos, n_qvel)

        # Map the sampled task id (self.realgoal) to a goal (x, y) position;
        # fall back to the first goal for any unrecognized id.
        goal_table = {
            0: [8, 24],  1: [8, -24],
            2: [24, 24], 3: [24, -24],
            4: [48, 0],
            5: [40, 24], 6: [40, -24],
            7: [32, 16], 8: [32, -16],
        }
        goal = np.array(goal_table.get(self.realgoal, [8, 24]))

        # Unused alternative reward based on squared distance to the goal:
        # reward = -np.sum(np.square(self.data.qpos[:2,0] - goal)) / 100000

        # Torso (x, y) position before the control step.
        xposbefore = self.data.qpos[0,0]
        yposbefore = self.data.qpos[1,0]

        # Apply the action for frame_skip simulator substeps.
        self.do_simulation(a, self.frame_skip)

        # Torso (x, y) position after the control step.
        xposafter = self.data.qpos[0,0]
        yposafter = self.data.qpos[1,0]

        # Per-axis progress reward: velocity along an axis is rewarded while the
        # torso is still short of the goal coordinate on that axis, and penalized
        # once it has overshot it.
        if xposbefore < goal[0]:
            forward_reward = (xposafter - xposbefore)/self.dt
        else:
            forward_reward = -1*(xposafter - xposbefore)/self.dt
        if yposbefore < goal[1]:
            forward_reward += (yposafter - yposbefore)/self.dt
        else:
            forward_reward += -1*(yposafter - yposbefore)/self.dt

        # Quadratic control cost discourages large torques.
        ctrl_cost = .1 * np.square(a).sum()
        reward = forward_reward - ctrl_cost

        # print(reward)
        done = False  # the environment never signals termination itself
        ob = self._get_obs()
        return ob, reward, done, {}
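
For context, here is a minimal usage sketch of how this step function is typically driven through the public gym API. The environment class name (AntObstaclesGenEnv) is an assumption, since it does not appear in this excerpt, and the sketch relies on the old gym convention in which env.step(a) dispatches to _step(a):

    from gym.envs.mujoco.ant_obstaclesgen import AntObstaclesGenEnv  # class name assumed

    env = AntObstaclesGenEnv()
    ob = env.reset()
    for _ in range(1000):
        a = env.action_space.sample()          # random torques within the control bounds
        ob, reward, done, info = env.step(a)   # dispatches to the _step shown above
        if done:
            ob = env.reset()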