def _step()

in gym/gym/envs/parameter_tuning/convergence.py


    def _step(self, action):
        """
        Perform some action in the environment
        """
        assert self.action_space.contains(action)
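        # the action is a tuple of one-element arrays: lr, decay, momentum,
        # l1 and l2 are given on a log10 scale, batch_size on a log2 scale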

        lr, decay, momentum, batch_size, l1, l2 = action

        # map log-scale action components to the actual hyperparameter ranges
        lr = (10.0 ** lr[0]).astype('float32')
        decay = (10.0 ** decay[0]).astype('float32')
        momentum = (10.0 ** momentum[0]).astype('float32')

        batch_size = int(2 ** batch_size[0])

        l1 = (10.0 ** l1[0]).astype('float32')
        l2 = (10.0 ** l2[0]).astype('float32')

        """
        names = ["lr", "decay", "mom", "batch", "l1", "l2"]
        values = [lr, decay, momentum, batch_size, l1, l2]

        for n,v in zip(names, values):
            print(n,v)
        """

        X, Y, Xv, Yv = self.data
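        # (X, Y) is the training set, (Xv, Yv) the held-out validation set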

        # push the new hyperparameters into the optimizer and regularizer

        self.sgd.lr.set_value(lr)
        self.sgd.decay.set_value(decay)
        self.sgd.momentum.set_value(momentum)

        self.reg.l1.set_value(l1)
        self.reg.l2.set_value(l2)

        # train the model for a single epoch
        H = self.model.fit(X, Y,
                           batch_size=batch_size,
                           nb_epoch=1,
                           shuffle=True)
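        # H.history['loss'] records the training loss of the epoch just run;
        # it is checked below to detect divergence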

        _, acc = self.model.evaluate(Xv, Yv)

        # keep track of the best validation accuracy seen so far
        if acc > self.best_val:
            self.best_val = acc

        self.previous_acc = acc

        self.epoch_idx += 1

        diverged = math.isnan(H.history['loss'][-1])
        done = self.epoch_idx == 20 or diverged
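        # an episode ends after 20 training epochs, or earlier if training diverges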

        if diverged:
            # TODO: a fixed large penalty may be too harsh; a run that reached
            # a good accuracy before diverging is arguably not that bad
            reward = -100.0
        else:
            reward = self.best_val

            # As the number of labels grows, the learning problem becomes
            # harder for a fixed dataset size. To keep the agent from ignoring
            # the more complex datasets (where accuracy is low) and
            # concentrating on simple ones that bring the bulk of the reward,
            # the reward is normalized by the number of classes in the dataset.

            reward = reward * self.nb_classes

            # the quadratic term below favours a higher best validation accuracy

            reward = reward + reward ** 2

        return self._get_obs(), reward, done, {}
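
For illustration, here is a minimal sketch of the reward shaping applied above; the helper name `shaped_reward` and the example numbers are assumptions for this sketch, not part of the environment:

    def shaped_reward(best_val, nb_classes):
        # scale the best validation accuracy by dataset difficulty,
        # then add a quadratic bonus that favours higher accuracies
        r = best_val * nb_classes
        return r + r ** 2

    # e.g. best_val = 0.8 on a 10-class dataset:
    #   r = 0.8 * 10 = 8.0, shaped reward = 8.0 + 8.0 ** 2 = 72.0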