experiments/train.py
import argparse
import numpy as np
import tensorflow as tf
import time
import pickle
import maddpg.common.tf_util as U
from maddpg.trainer.maddpg import MADDPGAgentTrainer
import tensorflow.contrib.layers as layers
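
# NOTE: this script targets TensorFlow 1.x; tf.contrib.layers was removed in TF 2.x.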


def parse_args():
    parser = argparse.ArgumentParser("Reinforcement Learning experiments for multiagent environments")
    # Environment
    parser.add_argument("--scenario", type=str, default="simple", help="name of the scenario script")
    parser.add_argument("--max-episode-len", type=int, default=25, help="maximum episode length")
    parser.add_argument("--num-episodes", type=int, default=60000, help="number of episodes")
    parser.add_argument("--num-adversaries", type=int, default=0, help="number of adversaries")
    parser.add_argument("--good-policy", type=str, default="maddpg", help="policy for good agents")
    parser.add_argument("--adv-policy", type=str, default="maddpg", help="policy of adversaries")
    # Core training parameters
    parser.add_argument("--lr", type=float, default=1e-2, help="learning rate for Adam optimizer")
    parser.add_argument("--gamma", type=float, default=0.95, help="discount factor")
    parser.add_argument("--batch-size", type=int, default=1024, help="number of episodes to optimize at the same time")
    parser.add_argument("--num-units", type=int, default=64, help="number of units in the mlp")
    # Checkpointing
    parser.add_argument("--exp-name", type=str, default=None, help="name of the experiment")
    parser.add_argument("--save-dir", type=str, default="/tmp/policy/", help="directory in which training state and model should be saved")
    parser.add_argument("--save-rate", type=int, default=1000, help="save model once every time this many episodes are completed")
    parser.add_argument("--load-dir", type=str, default="", help="directory from which training state and model are loaded")
    # Evaluation
    parser.add_argument("--restore", action="store_true", default=False, help="restore training state from load-dir (or save-dir) before training")
    parser.add_argument("--display", action="store_true", default=False, help="render saved policies in the environment instead of training")
    parser.add_argument("--benchmark", action="store_true", default=False, help="run saved policies and collect benchmark data instead of training")
    parser.add_argument("--benchmark-iters", type=int, default=100000, help="number of iterations run for benchmarking")
    parser.add_argument("--benchmark-dir", type=str, default="./benchmark_files/", help="directory where benchmark data is saved")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/", help="directory where plot data is saved")
    return parser.parse_args()
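
# Example invocation (flag values below are illustrative, not prescriptive):
#   python train.py --scenario simple_tag --num-adversaries 3 --exp-name demo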


def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # This model takes as input an observation and returns values of all actions
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)
        return out
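
# Note: the accompanying maddpg package appears to reuse this one MLP builder for
# both the policy (actor) and Q (critic) networks; nothing in this file changes that.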


def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    return env
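
# make_env requires OpenAI's multiagent-particle-envs package to be installed;
# it provides the `multiagent` module imported above.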


def get_trainers(env, num_adversaries, obs_shape_n, arglist):
    trainers = []
    model = mlp_model
    trainer = MADDPGAgentTrainer
    # adversaries occupy the first num_adversaries agent slots
    for i in range(num_adversaries):
        trainers.append(trainer(
            "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    # the remaining agents are the "good" agents
    for i in range(num_adversaries, env.n):
        trainers.append(trainer(
            "agent_%d" % i, model, obs_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.good_policy == 'ddpg')))
    return trainers
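
# local_q_func=True gives an agent a DDPG-style critic that sees only its own
# observation and action; with False the critic is centralized over all agents'
# observations and actions, which is the MADDPG setting.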


def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                # env.step returns info_n as a dict of the form {'n': [per-agent data]},
                # so record the whole per-agent list for this timestep
                agent_info[-1][0].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
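
# To replay a previously trained policy (a checkpoint is assumed to exist under
# --save-dir, the default load location):
#   python train.py --scenario simple --display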


if __name__ == '__main__':
    arglist = parse_args()
    train(arglist)