main.py (109 lines of code) (raw):

from policy import LSTMPolicy, MlpPolicyValue import gym import gym_compete import pickle import sys import argparse import tensorflow as tf import numpy as np def load_from_file(param_pkl_path): with open(param_pkl_path, 'rb') as f: params = pickle.load(f) return params def setFromFlat(var_list, flat_params): shapes = list(map(lambda x: x.get_shape().as_list(), var_list)) total_size = np.sum([int(np.prod(shape)) for shape in shapes]) theta = tf.placeholder(tf.float32, [total_size]) start = 0 assigns = [] for (shape, v) in zip(shapes, var_list): size = int(np.prod(shape)) assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) start += size op = tf.group(*assigns) tf.get_default_session().run(op, {theta: flat_params}) def run(config): if config.env == "kick-and-defend": env = gym.make("kick-and-defend-v0") policy_type = "lstm" elif config.env == "run-to-goal-humans": env = gym.make("run-to-goal-humans-v0") policy_type = "mlp" elif config.env == "run-to-goal-ants": env = gym.make("run-to-goal-ants-v0") policy_type = "mlp" elif config.env == "you-shall-not-pass": env = gym.make("you-shall-not-pass-humans-v0") policy_type = "mlp" elif config.env == "sumo-humans": env = gym.make("sumo-humans-v0") policy_type = "lstm" elif config.env == "sumo-ants": env = gym.make("sumo-ants-v0") policy_type = "lstm" else: print("unsupported environment") print("choose from: run-to-goal-humans, run-to-goal-ants, you-shall-not-pass, sumo-humans, sumo-ants, kick-and-defend") sys.exit() param_paths = config.param_paths tf_config = tf.ConfigProto( inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() policy = [] for i in range(2): scope = "policy" + str(i) if policy_type == "lstm": policy.append(LSTMPolicy(scope=scope, reuse=False, ob_space=env.observation_space.spaces[i], ac_space=env.action_space.spaces[i], hiddens=[128, 128], normalize=True)) else: policy.append(MlpPolicyValue(scope=scope, reuse=False, ob_space=env.observation_space.spaces[i], ac_space=env.action_space.spaces[i], hiddens=[64, 64], normalize=True)) # initialize uninitialized variables sess.run(tf.variables_initializer(tf.global_variables())) params = [load_from_file(param_pkl_path=path) for path in param_paths] for i in range(len(policy)): setFromFlat(policy[i].get_variables(), params[i]) max_episodes = config.max_episodes num_episodes = 0 nstep = 0 total_reward = [0.0 for _ in range(len(policy))] total_scores = [0 for _ in range(len(policy))] # total_scores = np.asarray(total_scores) observation = env.reset() print("-"*5 + " Episode %d " % (num_episodes+1) + "-"*5) while num_episodes < max_episodes: env.render() action = tuple([policy[i].act(stochastic=True, observation=observation[i])[0] for i in range(len(policy))]) observation, reward, done, infos = env.step(action) nstep += 1 for i in range(len(policy)): total_reward[i] += reward[i] if done[0]: num_episodes += 1 draw = True for i in range(len(policy)): if 'winner' in infos[i]: draw = False total_scores[i] += 1 print("Winner: Agent {}, Scores: {}, Total Episodes: {}".format(i, total_scores, num_episodes)) if draw: print("Game Tied: Agent {}, Scores: {}, Total Episodes: {}".format(i, total_scores, num_episodes)) observation = env.reset() nstep = 0 total_reward = [0.0 for _ in range(len(policy))] for i in range(len(policy)): policy[i].reset() if num_episodes < max_episodes: print("-"*5 + "Episode %d" % (num_episodes+1) + "-"*5) if __name__ == "__main__": p = argparse.ArgumentParser(description="Environments for Multi-agent competition") p.add_argument("--env", default="sumo-humans", type=str, help="competitive environment: run-to-goal-humans, run-to-goal-ants, you-shall-not-pass, sumo-humans, sumo-ants, kick-and-defend") p.add_argument("--param-paths", nargs='+', required=True, type=str) p.add_argument("--max-episodes", default=10, help="max number of matches", type=int) config = p.parse_args() run(config)