in gym/gym/envs/mujoco/ant_obstaclesgen.py [0:0]
def _step(self, a):
    self.count += 1
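
    # Every 200 steps, re-randomize the joint configuration and velocities
    # (as in a reset), but keep the ant's x/y position and the last 11 qpos
    # entries unchanged.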
    if self.count % 200 == 0:
        n_qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
        n_qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
        n_qpos[:2] = self.data.qpos[:2, 0]
        n_qpos[-11:] = self.data.qpos[-11:, 0]
        self.set_state(n_qpos, n_qvel)
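
    # Select the goal position corresponding to the current task id (self.realgoal).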
    goals = {
        0: np.array([8, 24]),
        1: np.array([8, -24]),
        2: np.array([24, 24]),
        3: np.array([24, -24]),
        4: np.array([48, 0]),
        5: np.array([40, 24]),
        6: np.array([40, -24]),
        7: np.array([32, 16]),
        8: np.array([32, -16]),
    }
    goal = goals.get(self.realgoal, np.array([8, 24]))
    # reward = -np.sum(np.square(self.data.qpos[:2, 0] - goal)) / 100000
    xposbefore = self.data.qpos[0, 0]
    yposbefore = self.data.qpos[1, 0]
    self.do_simulation(a, self.frame_skip)
    xposafter = self.data.qpos[0, 0]
    yposafter = self.data.qpos[1, 0]
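
    # Reward the velocity component that moves the ant toward the goal on each
    # axis; movement away from the goal contributes negatively.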
    if xposbefore < goal[0]:
        forward_reward = (xposafter - xposbefore) / self.dt
    else:
        forward_reward = -(xposafter - xposbefore) / self.dt
    if yposbefore < goal[1]:
        forward_reward += (yposafter - yposbefore) / self.dt
    else:
        forward_reward += -(yposafter - yposbefore) / self.dt
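
    # Penalize large control inputs; this step function never terminates the episode.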
    ctrl_cost = .1 * np.square(a).sum()
    reward = forward_reward - ctrl_cost
    # print(reward)
    done = False
    ob = self._get_obs()
    return ob, reward, done, {}