qlearn/atari/train_dqn.py [173:221]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        new_obs, rew, done, info = env.step(action)
        # A life loss is treated as a terminal transition for the replay buffer,
        # but the environment is only reset when the episode actually ends.
        death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0)
        prev_lives = info['ale.lives']

        # Store the transition with the reward clipped to its sign ({-1, 0, +1}).
        replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death))
        obs = new_obs
        episode_rewards[-1] += rew

        if done:
            log.add_scalar('reward', episode_rewards[-1], num_iters)
            episode_rewards.append(0.0)
            obs = env.reset()
            num_episodes += 1

        if num_iters > args.learning_starts and num_iters % args.learning_freq == 0:

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
            # Reshape state to (batch, channels, x_dim, y_dim)
            obses_t = np.transpose(obses_t, [0, 3, 1, 2])
            obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2])

            # One gradient step on the online Q-network; learn returns the batch TD error.
            td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones)
            td_errors_list.append(td_errors.item())
            log.add_scalar('td_error', td_errors.item(), num_iters)

            num_updates += 1

        # Periodically copy the online network's weights into the target network.
        if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0:
            agent.update_target_net()

        if start_time is not None:
            steps_per_iter.update(num_iters - start_steps)
            iteration_time_est.update(time.time() - start_time)
        start_time, start_steps = time.time(), num_iters

        if num_iters > args.num_steps:
            break

        if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq:
            steps_left = args.num_steps - num_iters
            completion = np.round(100.0 * num_iters / args.num_steps, 1)
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            logger.record_tabular("% completion", completion)
            logger.record_tabular("total steps", num_iters)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward (100 epi mean)", mean_100ep_reward)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
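
The loop above calls agent.learn and agent.update_target_net without showing the agent itself. As a point of reference, here is a minimal sketch of a PyTorch DQN-style agent exposing that interface; the class name, optimizer, hyperparameters, and the division by 255 for uint8 frames are illustrative assumptions, not this repo's actual implementation.

import copy

import torch
import torch.nn as nn
import torch.nn.functional as F


class DQNAgent:
    """Illustrative agent exposing the learn / update_target_net interface used above."""

    def __init__(self, q_net: nn.Module, lr: float = 1e-4, gamma: float = 0.99, device: str = 'cpu'):
        self.device = torch.device(device)
        self.q_net = q_net.to(self.device)
        self.target_net = copy.deepcopy(self.q_net)
        self.target_net.eval()
        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma

    def learn(self, obses_t, actions, rewards, obses_tp1, dones):
        # Batches arrive as numpy arrays already transposed to (batch, channels, H, W).
        obses_t = torch.as_tensor(obses_t, dtype=torch.float32, device=self.device) / 255.0
        obses_tp1 = torch.as_tensor(obses_tp1, dtype=torch.float32, device=self.device) / 255.0
        actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Q(s, a) for the actions actually taken.
        q_t = self.q_net(obses_t).gather(1, actions.unsqueeze(1)).squeeze(1)

        # One-step bootstrap target from the frozen target network;
        # (1 - dones) zeroes the bootstrap on life-loss / terminal transitions.
        with torch.no_grad():
            q_tp1 = self.target_net(obses_tp1).max(dim=1).values
            target = rewards + self.gamma * (1.0 - dones) * q_tp1

        # Huber loss on the TD error, as is standard for Atari DQN.
        loss = F.smooth_l1_loss(q_t, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.detach()

    def update_target_net(self):
        # Hard copy of the online weights; the loop calls this every target_update_freq steps.
        self.target_net.load_state_dict(self.q_net.state_dict())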



qlearn/atari/train_noisy_agent.py [164:212]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        new_obs, rew, done, info = env.step(action)
        # A life loss is treated as a terminal transition for the replay buffer,
        # but the environment is only reset when the episode actually ends.
        death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0)
        prev_lives = info['ale.lives']

        # Store the transition with the reward clipped to its sign ({-1, 0, +1}).
        replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death))
        obs = new_obs
        episode_rewards[-1] += rew

        if done:
            log.add_scalar('reward', episode_rewards[-1], num_iters)
            episode_rewards.append(0.0)
            obs = env.reset()
            num_episodes += 1

        if num_iters > args.learning_starts and num_iters % args.learning_freq == 0:

            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
            # Reshape state to (batch, channels, x_dim, y_dim)
            obses_t = np.transpose(obses_t, [0, 3, 1, 2])
            obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2])

            # One gradient step on the online Q-network; learn returns the batch TD error.
            td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones)
            td_errors_list.append(td_errors.item())
            log.add_scalar('td_error', td_errors.item(), num_iters)

            num_updates += 1

        # Periodically copy the online network's weights into the target network.
        if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0:
            agent.update_target_net()

        if start_time is not None:
            steps_per_iter.update(num_iters - start_steps)
            iteration_time_est.update(time.time() - start_time)
        start_time, start_steps = time.time(), num_iters

        if num_iters > args.num_steps:
            break

        if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq:
            steps_left = args.num_steps - num_iters
            completion = np.round(100.0 * num_iters / args.num_steps, 1)
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            logger.record_tabular("% completion", completion)
            logger.record_tabular("total steps", num_iters)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward (100 epi mean)", mean_100ep_reward)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
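
Both training loops also assume a replay buffer whose sample returns observation batches in NHWC order, which is why each batch is transposed to NCHW before the update. A minimal sketch of such a buffer, assuming uniform sampling with replacement from a fixed-capacity deque (the class layout and capacity are illustrative, not this repo's actual implementation):

from collections import deque

import numpy as np


class ReplayBuffer:
    """Uniform-sampling replay buffer; observations are stored and returned as NHWC arrays."""

    def __init__(self, capacity=100_000):
        self.storage = deque(maxlen=capacity)

    def add(self, obs, action, reward, new_obs, done):
        self.storage.append((obs, action, reward, new_obs, done))

    def sample(self, batch_size):
        idxs = np.random.randint(0, len(self.storage), size=batch_size)
        batch = [self.storage[i] for i in idxs]
        obses_t, actions, rewards, obses_tp1, dones = zip(*batch)
        # Caller reshapes the (batch, H, W, channels) observations to (batch, channels, H, W).
        return (np.array(obses_t), np.array(actions), np.array(rewards),
                np.array(obses_tp1), np.array(dones))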



