in mlsh_code/learner.py
def updateSubPolicies(self, test_segs, num_batches, optimize=True):
    # Assumes numpy (np) and the codebase's Dataset minibatch helper are
    # imported at the top of learner.py.
    for i in range(self.num_subpolicies):
        is_optimizing = True
        test_seg = test_segs[i]
        ob, ac, atarg, tdlamret = test_seg["ob"], test_seg["ac"], test_seg["adv"], test_seg["tdlamret"]
        if np.shape(ob)[0] < 1:
            # This subpolicy collected no data during the rollout.
            is_optimizing = False
        else:
            # Standardize advantages; the 1e-6 floor avoids dividing by a
            # zero std when all advantages are equal.
            atarg = (atarg - atarg.mean()) / max(atarg.std(), 1e-6)
            test_d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
            test_batchsize = ob.shape[0] // num_batches
        self.assign_subs[i]()  # copy the current parameters into the "old" policy
        # Run several optimization epochs over the collected data.
        if self.optim_batchsize > 0 and is_optimizing and optimize:
            self.sub_policies[i].ob_rms.update(ob)  # refresh observation-normalization stats
            for _ in range(self.optim_epochs):
                for test_batch in test_d.iterate_times(test_batchsize, num_batches):
                    test_g = self.losses[i](test_batch["ob"], test_batch["ac"],
                                            test_batch["atarg"], test_batch["vtarg"])
                    # The trailing argument scales the applied step (1 = full step).
                    self.adams[i].update(test_g, self.optim_stepsize, 1)
        else:
            # No data for this subpolicy (or optimization disabled): feed zero
            # gradients with a step multiplier of 0 so this worker performs the
            # same number of optimizer calls as workers that did get data.
            self.sub_policies[i].ob_rms.noupdate()
            blank = self.zerograd()
            for _ in range(self.optim_epochs):
                for _ in range(num_batches):
                    self.adams[i].update(blank, self.optim_stepsize, 0)
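
iterate_times is specific to this codebase (the stock baselines Dataset exposes iterate_once), so here is a minimal sketch of what the call site implies: yield exactly `times` minibatches of `batch_size` rows, reshuffling when the data is exhausted. This Dataset class is a hypothetical reconstruction inferred from the usage above, not the MLSH implementation.

# Hypothetical sketch of the Dataset.iterate_times helper, inferred
# from its call site above; not the MLSH implementation.
import numpy as np

class Dataset:
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]
        self._next = 0
        if self.shuffle:
            self._reshuffle()

    def _reshuffle(self):
        # Apply one shared permutation so rows stay aligned across keys.
        perm = np.random.permutation(self.n)
        self.data_map = {k: v[perm] for k, v in self.data_map.items()}
        self._next = 0

    def iterate_times(self, batch_size, times):
        # Yield exactly `times` minibatches, wrapping (and reshuffling)
        # when the data runs out.
        for _ in range(times):
            if self._next + batch_size > self.n:
                if self.shuffle:
                    self._reshuffle()
                else:
                    self._next = 0
            cur = self._next
            self._next += batch_size
            yield {k: v[cur:cur + batch_size] for k, v in self.data_map.items()}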
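
The zero-gradient branch makes sense once you notice that both branches issue the same optim_epochs * num_batches update calls: self.adams[i] is an MpiAdam-style optimizer that averages gradients across workers, so every worker must enter the collective call even when its subpolicy saw no data. The sketch below illustrates that lockstep pattern, assuming mpi4py; SyncedAdam is a hypothetical stand-in, not MLSH's actual optimizer.

# Minimal sketch of an MPI-synchronized Adam step (hypothetical
# SyncedAdam; assumes mpi4py and numpy).
import numpy as np
from mpi4py import MPI

class SyncedAdam:
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.m = np.zeros(size)   # first-moment estimate
        self.v = np.zeros(size)   # second-moment estimate
        self.t = 0                # step counter, kept in lockstep across workers
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
        self.comm = MPI.COMM_WORLD

    def update(self, localg, stepsize, multiplier):
        # Every worker must reach this Allreduce or the collective blocks,
        # which is why idle workers pass a zero gradient instead of skipping.
        globalg = np.empty_like(localg)
        self.comm.Allreduce(localg, globalg, op=MPI.SUM)
        globalg /= self.comm.Get_size()
        self.t += 1
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(globalg)
        # multiplier=0 mirrors the "blank" update above: optimizer state
        # advances in lockstep, but the returned step is zero, so the
        # caller leaves the parameters unchanged.
        return -a * multiplier * self.m / (np.sqrt(self.v) + self.epsilon)

Run under e.g. mpirun -np 4: because both branches of updateSubPolicies call update the same number of times, the Allreduce calls pair up across workers regardless of which subpolicies received data.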