def learn()

in ppo_ewma/ppo.py [0:0]


def learn(
    *,
    venv: "(VecEnv) vectorized environment",
    model: "(ppo.PpoModel)",
    model_ewma: "(ppg.EwmaModel) alternate model used for clipping or the KL penalty",
    interacts_total: "(float) total timesteps of interaction" = float("inf"),
    nstep: "(int) number of serial timesteps" = 256,
    γ: "(float) discount" = 0.99,
    λ: "(float) GAE parameter" = 0.95,
    clip_param: "(float) PPO parameter for clipping prob ratio" = 0.2,
    vfcoef: "(float) value function coefficient" = 0.5,
    entcoef: "(float) entropy coefficient" = 0.01,
    nminibatch: "(int) number of minibatches to break epoch of data into" = 4,
    n_epoch_vf: "(int) number of epochs to use when training the value function" = 1,
    n_epoch_pi: "(int) number of epochs to use when training the policy" = 1,
    lr: "(float) Adam learning rate" = 5e-4,
    beta1: "(float) Adam beta1" = 0.9,
    beta2: "(float) Adam beta2" = 0.999,
    default_loss_weights: "(dict) default_loss_weights" = {},
    store_segs: "(bool) whether or not to store segments in a buffer" = True,
    verbose: "(bool) print per-epoch loss stats" = True,
    log_save_opts: "(dict) passed into LogSaveHelper" = {},
    rnorm: "(bool) reward normalization" = True,
    kl_penalty: "(int) weight of the KL penalty, which can be used in place of clipping" = 0,
    adv_ewma_decay: "(float) EWMA decay for advantage normalization" = 0.0,
    grad_weight: "(float) relative weight of this worker's gradients" = 1,
    comm: "(MPI.Comm) MPI communicator" = None,
    callbacks: "(seq of function(dict)->bool) to run each update" = (),
    learn_state: "dict with optional keys {'opts', 'roller', 'lsh', 'reward_normalizer', 'curr_interact_count', 'seg_buf', 'segs_delayed', 'adv_moments'}" = None,
    staleness: "(int) number of iterations by which to make data artificially stale, for experimentation" = 0,
    staleness_loss: "(str) one of 'decoupled', 'behavior' or 'proximal', only used if staleness > 0" = "decoupled",
    imp_samp_max: "(float) value at which to clip importance sampling ratio" = 100.0,