# spinup/exercises/tf1/problem_set_1/exercise1_2.py
import tensorflow as tf
import numpy as np
from spinup.exercises.tf1.problem_set_1 import exercise1_1
"""
Exercise 1.2: PPO Gaussian Policy
Implement an MLP diagonal Gaussian policy for PPO.
Log-likelihoods will be computed using your answer to Exercise 1.1,
so make sure to complete that exercise before beginning this one.
"""
EPS = 1e-8
def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
"""
Builds a multi-layer perceptron in Tensorflow.
Args:
x: Input tensor.
hidden_sizes: Tuple, list, or other iterable giving the number of units
for each hidden layer of the MLP.
activation: Activation function for all layers except last.
output_activation: Activation function for last layer.
Returns:
A TF symbol for the output of an MLP that takes x as an input.
"""
#######################
# #
# YOUR CODE HERE #
# #
#######################
pass
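
# One possible answer for mlp, left commented out so the exercise stays blank.
# This is a minimal sketch using TF1's tf.layers.dense API, following the
# Spinning Up convention that the last entry of hidden_sizes is the output
# layer; it is not necessarily the official solution:
#
#     out = x
#     for h in hidden_sizes[:-1]:
#         out = tf.layers.dense(out, units=h, activation=activation)
#     return tf.layers.dense(out, units=hidden_sizes[-1],
#                            activation=output_activation)
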
def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space):
"""
Builds symbols to sample actions and compute log-probs of actions.
Special instructions: Make log_std a tf variable with the same shape as
the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5].
Args:
x: Input tensor of states. Shape [batch, obs_dim].
a: Input tensor of actions. Shape [batch, act_dim].
hidden_sizes: Sizes of hidden layers for action network MLP.
activation: Activation function for all layers except last.
output_activation: Activation function for last layer (action layer).
action_space: A gym.spaces object describing the action space of the
environment this agent will interact with.
Returns:
pi: A symbol for sampling stochastic actions from a Gaussian
distribution.
logp: A symbol for computing log-likelihoods of actions from a Gaussian
distribution.
logp_pi: A symbol for computing log-likelihoods of actions in pi from a
Gaussian distribution.
"""
#######################
# #
# YOUR CODE HERE #
# #
#######################
# mu =
# log_std =
# pi =
logp = exercise1_1.gaussian_likelihood(a, mu, log_std)
logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std)
return pi, logp, logp_pi
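
# One possible way to fill in mu / log_std / pi above, commented out and
# offered as a sketch rather than the official answer. It follows the special
# instructions: log_std is a state-independent tf variable initialized to
# -0.5, and pi samples via the reparameterization trick:
#
#     act_dim = a.shape.as_list()[-1]
#     mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
#     log_std = tf.get_variable(name='log_std',
#                               initializer=-0.5*np.ones(act_dim,
#                                                        dtype=np.float32))
#     pi = mu + tf.random_normal(tf.shape(mu)) * tf.exp(log_std)
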
if __name__ == '__main__':
"""
Run this file to verify your solution.
"""
from spinup import ppo_tf1 as ppo
from spinup.exercises.common import print_result
import gym
import os
import pandas as pd
import psutil
import time
logdir = "/tmp/experiments/%i"%int(time.time())
ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'),
ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)),
steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir))
# Get scores from last five epochs to evaluate success.
data = pd.read_table(os.path.join(logdir,'progress.txt'))
last_scores = data['AverageEpRet'][-5:]
# Your implementation is probably correct if the agent has a score >500,
# or if it reaches the top possible score of 1000, in the last five epochs.
correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3
print_result(correct)