def updateSubPolicies()

in mlsh_code/learner.py [0:0]


    def updateSubPolicies(self, test_segs, num_batches, optimize=True):
        for i in range(self.num_subpolicies):
            is_optimizing = True
            test_seg = test_segs[i]
            ob, ac, atarg, tdlamret = test_seg["ob"], test_seg["ac"], test_seg["adv"], test_seg["tdlamret"]
            if np.shape(ob)[0] < 1:
                is_optimizing = False
            else:
                atarg = (atarg - atarg.mean()) / max(atarg.std(), 0.000001)
            test_d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=True)
            test_batchsize = int(ob.shape[0] / num_batches)

            self.assign_subs[i]() # set old parameter values to new parameter values
            # Here we do a bunch of optimization epochs over the data
            if self.optim_batchsize > 0 and is_optimizing and optimize:
                self.sub_policies[i].ob_rms.update(ob)
                for k in range(self.optim_epochs):
                    m = 0
                    for test_batch in test_d.iterate_times(test_batchsize, num_batches):
                        test_g = self.losses[i](test_batch["ob"], test_batch["ac"], test_batch["atarg"], test_batch["vtarg"])
                        self.adams[i].update(test_g, self.optim_stepsize, 1)
                        m += 1
            else:
                self.sub_policies[i].ob_rms.noupdate()
                blank = self.zerograd()
                for _ in range(self.optim_epochs):
                    for _ in range(num_batches):
                        self.adams[i].update(blank, self.optim_stepsize, 0)