mlsh_code/master.py:

import gym
import test_envs
import tensorflow as tf
import rollouts
from policy_network import Policy
from subpolicy_network import SubPolicy
from observation_network import Features
from learner import Learner
import rl_algs.common.tf_util as U
import numpy as np
# from tinkerbell import logger
import pickle

def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time

    num_batches = 15

    # Observation placeholder shared by the master policy and all sub-policies.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)

    # Master (macro) policy that selects among sub-policies, plus a frozen copy
    # for the PPO probability ratio.
    policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32,
                    num_hid_layers=2, num_subpolicies=num_subs)
    old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32,
                        num_hid_layers=2, num_subpolicies=num_subs)

    # One sub-policy (and its frozen copy) per option.
    sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                              hid_size=32, num_hid_layers=2) for x in range(num_subs)]
    old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space,
                                  hid_size=32, num_hid_layers=2) for x in range(num_subs)]

    learner = Learner(env, policy, old_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10,
                      optim_stepsize=3e-5, optim_batchsize=64)
    rollout = rollouts.traj_segment_generator(policy, sub_policies, env, macro_duration,
                                              num_rollouts, stochastic=True, args=args)

    for x in range(10000):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode: reset the master policy and resample the task.
        policy.reset()
        learner.syncMasterPolicies()

        env.env.randomizeCorrect()
        # Rank 0 samples the goal; broadcast it so every worker trains on the same task.
        shared_goal = comm.bcast(env.env.realgoal, root=0)
        env.env.realgoal = shared_goal

        print("It is iteration %d, so I'm changing the goal to %s" % (x, env.env.realgoal))

        # On the first iteration, stagger each worker's starting offset by a tenth
        # of the inner-loop length so the workers' phases are desynchronized.
        mini_ep = 0 if x > 0 else -(rank % 10) * int((warmup_time + train_time) / 10)

        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # Collect a batch of trajectory segments.
            rolls = next(rollout)
            allrolls = [rolls]
            # Train theta: update the master policy on macro-level advantages.
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            gmean, lmean = learner.updateMasterPolicy(rolls)
            # Train phi: update the sub-policies (enabled once warmup is over).
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98,
                                                 num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # Log progress.
            print("%d: global: %s, local: %s" % (mini_ep, gmean, lmean))
            if args.s:
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)
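
For context, a minimal driver for start() might look like the sketch below. This is an illustration under stated assumptions, not the repository's actual entry point: the flag names, defaults, environment id, seed scheme, and the no-op callback are all hypothetical, chosen only to supply the attributes master.py reads from args (task, num_subs, macro_duration, num_rollouts, warmup_time, train_time, s) and the MPI handles it expects.

# hypothetical_main.py -- illustrative sketch only; every flag and default here
# is an assumption made to match what start() reads, not the repo's real CLI.
import argparse
from mpi4py import MPI
import master

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='MovementBandits-v0')  # assumed env id
    parser.add_argument('--num_subs', type=int, default=2)
    parser.add_argument('--macro_duration', type=int, default=10)
    parser.add_argument('--num_rollouts', type=int, default=1000)
    parser.add_argument('--warmup_time', type=int, default=20)
    parser.add_argument('--train_time', type=int, default=30)
    parser.add_argument('--seed', type=int, default=1401)  # hypothetical base seed
    parser.add_argument('--s', action='store_true')  # save per-iteration means to pickles
    args = parser.parse_args()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    # Derive a distinct seed per MPI worker (illustrative scheme).
    workerseed = args.seed + 10000 * rank
    master.start(callback=lambda it: None, args=args, workerseed=workerseed,
                 rank=rank, comm=comm)

if __name__ == '__main__':
    main()

Launched under MPI (e.g. mpirun -np 10 python hypothetical_main.py), each rank would then run the warmup/joint-training loop above against the shared broadcast goal.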