torchbenchmark/models/LearningToPaint/baseline_modelfree/train.py

#!/usr/bin/env python3
# Training entry point for the model-free LearningToPaint baseline: a DDPG
# agent interacts with a batch of painting environments, fills a replay
# buffer, and is periodically validated and checkpointed.
import os
import cv2
import random
import numpy as np
import argparse
import time

# os and torch are used below (os.path, torch.manual_seed); import them
# explicitly instead of relying on names pulled in by the wildcard import.
import torch

from DRL.evaluator import Evaluator
from utils.util import *
from utils.tensorboard import TensorBoard

# Log under ../train_log/<experiment name>, symlink it to ./log, and create
# the default checkpoint directory ./model.
exp = os.path.abspath('.').split('/')[-1]
writer = TensorBoard('../train_log/{}'.format(exp))
os.system('ln -sf ../train_log/{} ./log'.format(exp))
os.system('mkdir ./model')


def train(agent, env, evaluate):
    train_times = args.train_times
    env_batch = args.env_batch
    validate_interval = args.validate_interval
    max_step = args.max_step
    debug = args.debug
    episode_train_times = args.episode_train_times
    resume = args.resume
    output = args.output
    time_stamp = time.time()
    step = episode = episode_steps = 0
    tot_reward = 0.
    observation = None
    noise_factor = args.noise_factor
    while step <= train_times:
        step += 1
        episode_steps += 1
        # reset if it is the start of episode
        if observation is None:
            observation = env.reset()
            agent.reset(observation, noise_factor)
        action = agent.select_action(observation, noise_factor=noise_factor)
        observation, reward, done, _ = env.step(action)
        agent.observe(reward, observation, done, step)
        if (episode_steps >= max_step and max_step):
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    reward, dist = evaluate(env, agent.select_action, debug=debug)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{:.3f} mean_dist:{:.3f} var_dist:{:.3f}'.format(
                            step - 1, np.mean(reward), np.mean(dist), np.var(dist)))
                    writer.add_scalar('validate/mean_reward', np.mean(reward), step)
                    writer.add_scalar('validate/mean_dist', np.mean(dist), step)
                    writer.add_scalar('validate/var_dist', np.var(dist), step)
                    agent.save_model(output)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            tot_Q = 0.
            tot_value_loss = 0.
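            # Policy update phase: once the warmup period has filled the
            # replay memory, run episode_train_times DDPG updates per finished
            # episode with a staged learning-rate schedule (logged below as
            # critic_lr and actor_lr), averaging the Q estimate and critic
            # loss over the updates for logging.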
            if step > args.warmup:
                if step < 10000 * max_step:
                    lr = (3e-4, 1e-3)
                elif step < 20000 * max_step:
                    lr = (1e-4, 3e-4)
                else:
                    lr = (3e-5, 1e-4)
                for i in range(episode_train_times):
                    Q, value_loss = agent.update_policy(lr)
                    tot_Q += Q.data.cpu().numpy()
                    tot_value_loss += value_loss.data.cpu().numpy()
                writer.add_scalar('train/critic_lr', lr[0], step)
                writer.add_scalar('train/actor_lr', lr[1], step)
                writer.add_scalar('train/Q', tot_Q / episode_train_times, step)
                writer.add_scalar('train/critic_loss', tot_value_loss / episode_train_times, step)
            if debug:
                prBlack('#{}: steps:{} interval_time:{:.2f} train_time:{:.2f}'
                        .format(episode, step, train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            # reset
            observation = None
            episode_steps = 0
            episode += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Learning to Paint')

    # hyper-parameters
    parser.add_argument('--warmup', default=400, type=int,
                        help='timesteps without training, used only to fill the replay memory')
    parser.add_argument('--discount', default=0.95**5, type=float, help='discount factor')
    parser.add_argument('--batch_size', default=96, type=int, help='minibatch size')
    parser.add_argument('--rmsize', default=800, type=int, help='replay memory size')
    parser.add_argument('--env_batch', default=96, type=int, help='number of concurrent environments')
    parser.add_argument('--tau', default=0.001, type=float, help='moving average for target network')
    parser.add_argument('--max_step', default=40, type=int, help='max length for episode')
    parser.add_argument('--noise_factor', default=0.05, type=float, help='noise level for parameter space noise')
    parser.add_argument('--validate_interval', default=50, type=int,
                        help='how many episodes between validations')
    parser.add_argument('--validate_episodes', default=5, type=int,
                        help='how many episodes to perform during validation')
    parser.add_argument('--train_times', default=2000000, type=int, help='total train times')
    parser.add_argument('--episode_train_times', default=10, type=int, help='train times for each episode')
    parser.add_argument('--resume', default=None, type=str, help='resuming model path for testing')
    parser.add_argument('--output', default='./model', type=str, help='output path for saving models')
    parser.add_argument('--debug', dest='debug', action='store_true', help='print some info')
    parser.add_argument('--seed', default=1234, type=int, help='random seed')

    args = parser.parse_args()
    args.output = get_output_folder(args.output, "Paint")

    # seed all random number generators for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False

    from DRL.ddpg import DDPG
    from DRL.multi import fastenv
    fenv = fastenv(args.max_step, args.env_batch, writer)
    agent = DDPG(args.batch_size, args.env_batch, args.max_step,
                 args.tau, args.discount, args.rmsize,
                 writer, args.resume, args.output)
    evaluate = Evaluator(args, writer)
    print('observation_space', fenv.observation_space, 'action_space', fenv.action_space)
    train(agent, fenv, evaluate)
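# Example invocation, assuming the DRL/ and utils/ packages from the
# LearningToPaint baseline are importable from the working directory and the
# painting environment can run on the available device:
#   python3 train.py --debug --batch_size 96 --max_step 40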