in main.py

import sys

import gym
import gym_compete  # assumed: importing this package registers the competition envs with gym
import tensorflow as tf

from policy import LSTMPolicy, MlpPolicyValue  # assumed to live in this repo's policy module

# load_from_file and setFromFlat are assumed to be defined elsewhere in this
# file (or importable): they unpickle a flat parameter vector and copy it into
# a policy's TF variables, respectively.


def run(config):
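    # Map the requested task name to its registered gym id and to the policy
    # architecture used for that task: recurrent (LSTM) policies for
    # kick-and-defend and the sumo tasks, feed-forward MLPs for the rest.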
    if config.env == "kick-and-defend":
        env = gym.make("kick-and-defend-v0")
        policy_type = "lstm"
    elif config.env == "run-to-goal-humans":
        env = gym.make("run-to-goal-humans-v0")
        policy_type = "mlp"
    elif config.env == "run-to-goal-ants":
        env = gym.make("run-to-goal-ants-v0")
        policy_type = "mlp"
    elif config.env == "you-shall-not-pass":
        env = gym.make("you-shall-not-pass-humans-v0")
        policy_type = "mlp"
    elif config.env == "sumo-humans":
        env = gym.make("sumo-humans-v0")
        policy_type = "lstm"
    elif config.env == "sumo-ants":
        env = gym.make("sumo-ants-v0")
        policy_type = "lstm"
    else:
        print("unsupported environment")
        print("choose from: run-to-goal-humans, run-to-goal-ants, you-shall-not-pass, sumo-humans, sumo-ants, kick-and-defend")
        sys.exit(1)
    param_paths = config.param_paths
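
    # Single-threaded TF1 session; sess.__enter__() installs it as the default
    # session so the policies below can run ops without an explicit `with` block.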
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()
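
    # Build one policy network per agent, each in its own variable scope, from
    # the per-agent observation/action spaces of the two-player environment.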
    policy = []
    for i in range(2):
        scope = "policy" + str(i)
        if policy_type == "lstm":
            policy.append(LSTMPolicy(scope=scope, reuse=False,
                                     ob_space=env.observation_space.spaces[i],
                                     ac_space=env.action_space.spaces[i],
                                     hiddens=[128, 128], normalize=True))
        else:
            policy.append(MlpPolicyValue(scope=scope, reuse=False,
                                         ob_space=env.observation_space.spaces[i],
                                         ac_space=env.action_space.spaces[i],
                                         hiddens=[64, 64], normalize=True))
    # Initialize all variables, then overwrite the policy weights with the
    # pretrained parameters loaded from the pickled files.
    sess.run(tf.variables_initializer(tf.global_variables()))
    params = [load_from_file(param_pkl_path=path) for path in param_paths]
    for i in range(len(policy)):
        setFromFlat(policy[i].get_variables(), params[i])
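
    # Match bookkeeping: per-agent reward accumulated within the current
    # episode, and cumulative win counts across all episodes.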
    max_episodes = config.max_episodes
    num_episodes = 0
    nstep = 0
    total_reward = [0.0 for _ in range(len(policy))]
    total_scores = [0 for _ in range(len(policy))]
    observation = env.reset()
    print("-" * 5 + " Episode %d " % (num_episodes + 1) + "-" * 5)
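
    # Rollout loop: render, sample an action from each policy (stochastic=True
    # draws from the policy's action distribution), and step the env with the
    # joint action tuple. Rewards, dones, and infos are per-agent tuples.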
    while num_episodes < max_episodes:
        env.render()
        action = tuple([policy[i].act(stochastic=True, observation=observation[i])[0]
                        for i in range(len(policy))])
        observation, reward, done, infos = env.step(action)
        nstep += 1
        for i in range(len(policy)):
            total_reward[i] += reward[i]
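
        # Episode end: the env marks the winner by putting a 'winner' key in
        # that agent's info dict; if neither agent has one, the match is a
        # draw. Reset the env and any recurrent policy state afterwards.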
        if done[0]:
            num_episodes += 1
            draw = True
            for i in range(len(policy)):
                if 'winner' in infos[i]:
                    draw = False
                    total_scores[i] += 1
                    print("Winner: Agent {}, Scores: {}, Total Episodes: {}".format(i, total_scores, num_episodes))
            if draw:
                print("Game Tied: Scores: {}, Total Episodes: {}".format(total_scores, num_episodes))
            observation = env.reset()
            nstep = 0
            total_reward = [0.0 for _ in range(len(policy))]
            for i in range(len(policy)):
                policy[i].reset()
            if num_episodes < max_episodes:
                print("-" * 5 + " Episode %d " % (num_episodes + 1) + "-" * 5)
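

# Minimal driver sketch (hypothetical flags): run() only needs a config object
# exposing .env, .param_paths, and .max_episodes, so any argument parser with
# equivalent options would do.
if __name__ == "__main__":
    import argparse

    p = argparse.ArgumentParser(description="Run competitive multi-agent matches with pretrained policies")
    p.add_argument("--env", default="sumo-humans", type=str)
    p.add_argument("--param-paths", nargs='+', required=True, type=str)
    p.add_argument("--max-episodes", default=10, type=int)
    run(p.parse_args())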