# spinup/exercises/pytorch/problem_set_1/exercise1_2.py
import torch
import torch.nn as nn
import numpy as np
from spinup.exercises.pytorch.problem_set_1 import exercise1_1
from spinup.exercises.pytorch.problem_set_1 import exercise1_2_auxiliary
"""
Exercise 1.2: PPO Gaussian Policy
You will implement an MLP diagonal Gaussian policy for PPO by
writing an MLP-builder, and a few other key functions.
Log-likelihoods will be computed using your answer to Exercise 1.1,
so make sure to complete that exercise before beginning this one.
"""
def mlp(sizes, activation, output_activation=nn.Identity):
"""
Build a multi-layer perceptron in PyTorch.
Args:
sizes: Tuple, list, or other iterable giving the number of units
for each layer of the MLP.
activation: Activation function for all layers except last.
output_activation: Activation function for last layer.
Returns:
A PyTorch module that can be called to give the output of the MLP.
(Use an nn.Sequential module.)
"""
#######################
# #
# YOUR CODE HERE #
# #
#######################
pass
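
# A hedged reference sketch for `mlp`, kept as a separate helper so the
# exercise blank above stays intact. The name `_mlp_reference` is an
# illustrative addition, not part of the original exercise: it pairs each
# nn.Linear with its activation inside an nn.Sequential, which is one common
# way to satisfy the docstring above.
def _mlp_reference(sizes, activation, output_activation=nn.Identity):
    layers = []
    for j in range(len(sizes) - 1):
        # Hidden layers use `activation`; the final layer uses `output_activation`.
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
    return nn.Sequential(*layers)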
class DiagonalGaussianDistribution:
    def __init__(self, mu, log_std):
        self.mu = mu
        self.log_std = log_std
    def sample(self):
        """
        Returns:
            A PyTorch Tensor of samples from the diagonal Gaussian distribution with
            mean and log_std given by self.mu and self.log_std.
        """
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################
        # (A hedged sampling sketch appears after this class.)
        pass
    #================================(Given, ignore)==========================================#
    def log_prob(self, value):
        return exercise1_1.gaussian_likelihood(value, self.mu, self.log_std)

    def entropy(self):
        return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1)
    #=========================================================================================#
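
# A hedged sketch of one way to implement `sample` above, written as a
# standalone helper so the exercise blank stays intact (the helper name is an
# illustrative addition): reparameterize a standard normal draw by the
# distribution's mean and standard deviation.
def _sample_diagonal_gaussian_reference(mu, log_std):
    # eps ~ N(0, I) with the same shape as mu; sample = mu + exp(log_std) * eps.
    return mu + torch.exp(log_std) * torch.randn_like(mu)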
class MLPGaussianActor(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        """
        Initialize an MLP Gaussian Actor by making a PyTorch module for computing the
        mean of the distribution given a batch of observations, and a log_std parameter.

        Make log_std a PyTorch Parameter with the same shape as the action vector,
        independent of observations, initialized to [-0.5, -0.5, ..., -0.5].
        (Make sure it's trainable!)
        """
        super().__init__()
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################
        # self.log_std =
        # self.mu_net =
        # (A hedged initialization sketch appears after this class.)
        pass
    #================================(Given, ignore)==========================================#
    def forward(self, obs, act=None):
        mu = self.mu_net(obs)
        pi = DiagonalGaussianDistribution(mu, self.log_std)
        logp_a = None
        if act is not None:
            logp_a = pi.log_prob(act)
        return pi, logp_a
    #=========================================================================================#
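
# A hedged sketch of the two attributes MLPGaussianActor.__init__ should set
# (the helper name is an illustrative addition; it calls `_mlp_reference`
# above only so the sketch runs on its own): a trainable log_std initialized
# to -0.5 per action dimension, and an MLP mapping observations to the
# action mean.
def _gaussian_actor_init_reference(obs_dim, act_dim, hidden_sizes, activation):
    # nn.Parameter makes log_std trainable, and it gets registered with the
    # module when assigned as an attribute (e.g. self.log_std = log_std).
    log_std = nn.Parameter(-0.5 * torch.ones(act_dim, dtype=torch.float32))
    mu_net = _mlp_reference([obs_dim] + list(hidden_sizes) + [act_dim], activation)
    return log_std, mu_net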
if __name__ == '__main__':
"""
Run this file to verify your solution.
"""
from spinup import ppo_pytorch as ppo
from spinup.exercises.common import print_result
from functools import partial
import gym
import os
import pandas as pd
import psutil
import time
logdir = "/tmp/experiments/%i"%int(time.time())
ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor)
ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'),
actor_critic=ActorCritic,
ac_kwargs=dict(hidden_sizes=(64,)),
steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir))
# Get scores from last five epochs to evaluate success.
data = pd.read_table(os.path.join(logdir,'progress.txt'))
last_scores = data['AverageEpRet'][-5:]
# Your implementation is probably correct if the agent has a score >500,
# or if it reaches the top possible score of 1000, in the last five epochs.
correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3
print_result(correct)