From a6652b0c1997bb47dd502bf674e0b3b9b2d09d23 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Thu, 16 May 2019 15:06:23 +0100 Subject: [PATCH 01/57] work for tl2 tf2 --- .../tutorial_atari_pong.py | 38 +-- ...ial_bipedalwalker_a3c_continuous_action.py | 266 +++++++++--------- .../tutorial_cartpole_ac.py | 108 +++---- .../tutorial_frozenlake_dqn.py | 45 +-- .../tutorial_frozenlake_q_table.py | 7 +- 5 files changed, 192 insertions(+), 272 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_atari_pong.py b/examples/reinforcement_learning/tutorial_atari_pong.py index ad8e264df..7c8ba1d38 100644 --- a/examples/reinforcement_learning/tutorial_atari_pong.py +++ b/examples/reinforcement_learning/tutorial_atari_pong.py @@ -34,11 +34,7 @@ import gym import tensorlayer as tl -## enable eager mode -tf.enable_eager_execution() - -tf.logging.set_verbosity(tf.logging.DEBUG) # enable logging tl.logging.set_verbosity(tl.logging.DEBUG) # hyper-parameters @@ -52,7 +48,7 @@ render = False # display the game environment # resume = True # load existing policy network model_file_name = "model_pong" -np.set_printoptions(threshold=np.nan) +np.set_printoptions(threshold=np.inf) def prepro(I): @@ -73,10 +69,7 @@ def prepro(I): episode_number = 0 xs, ys, rs = [], [], [] -# observation for training and inference -# t_states = tf.placeholder(tf.float32, shape=[None, D]) # policy network - def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape) nn = tl.layers.Dense(n_units=H, act=tf.nn.relu, name='hidden')(ni) @@ -85,22 +78,9 @@ def get_model(inputs_shape): return M model = get_model([None, D]) train_weights = model.trainable_weights -# probs = model(t_states, is_train=True).outputs -# sampling_prob = tf.nn.softmax(probs) - -# t_actions = tf.placeholder(tf.int32, shape=[None]) -# t_discount_rewards = tf.placeholder(tf.float32, shape=[None]) -# loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards) -optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)#.minimize(loss) - -# with tf.Session() as sess: -# sess.run(tf.global_variables_initializer()) - # if resume: TODO - # load_params = tl.files.load_npz(name=model_file_name+'.npz') - # tl.files.assign_params(sess, load_params, network) - # tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network) - # network.print_params() - # network.print_layers() + +optimizer = tf.optimizers.RMSprop(lr=learning_rate, decay=decay_rate) + model.train() # set model to train mode (in case you add dropout into the model) start_time = time.time() @@ -114,14 +94,12 @@ def get_model(inputs_shape): x = x.reshape(1, D) prev_x = cur_x - # prob = sess.run(sampling_prob, feed_dict={t_states: x}) - _prob = model(x).outputs + _prob = model(x) prob = tf.nn.softmax(_prob) # action. 
1: STOP 2: UP 3: DOWN # action = np.random.choice([1,2,3], p=prob.flatten()) # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3]) - # action = np.random.choice([1,2,3], p=prob.numpy()) action = tl.rein.choice_action_by_probs(prob[0].numpy(), [1, 2, 3]) observation, reward, done, _ = env.step(action) @@ -145,12 +123,8 @@ def get_model(inputs_shape): xs, ys, rs = [], [], [] - # sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR}) - # t_actions = tf.placeholder(tf.int32, shape=[None]) - # t_discount_rewards = tf.placeholder(tf.float32, shape=[None]) - # loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards) with tf.GradientTape() as tape: - _prob = model(epx).outputs + _prob = model(epx) _loss = tl.rein.cross_entropy_reward_loss(_prob, epy, disR) grad = tape.gradient(_loss, train_weights) optimizer.apply_gradients(zip(grad, train_weights)) diff --git a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py b/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py index 2f1f96d67..fbe2dd560 100644 --- a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py +++ b/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py @@ -29,6 +29,13 @@ and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements. There's no coordinates in the state vector. +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 + +&& +pip install box2d box2d-kengz --user + """ import multiprocessing @@ -36,25 +43,29 @@ import numpy as np import tensorflow as tf +import tensorflow_probability as tfp +tfd = tfp.distributions import gym import tensorlayer as tl from tensorlayer.layers import DenseLayer, InputLayer -tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) -GAME = 'BipedalWalker-v2' # BipedalWalkerHardcore-v2 +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +GAME = 'BipedalWalker-v2' # BipedalWalkerHardcore-v2 BipedalWalker-v2 LunarLanderContinuous-v2 OUTPUT_GRAPH = False LOG_DIR = './log' N_WORKERS = multiprocessing.cpu_count() -# N_WORKERS = 4 -MAX_GLOBAL_EP = 20000 # 8000 +# N_WORKERS = 2 +MAX_GLOBAL_EP = 8000 # 8000 GLOBAL_NET_SCOPE = 'Global_Net' UPDATE_GLOBAL_ITER = 10 -GAMMA = 0.999 +GAMMA = 0.99 ENTROPY_BETA = 0.005 -LR_A = 0.00002 # learning rate for actor +LR_A = 0.00005 # learning rate for actor LR_C = 0.0001 # learning rate for critic GLOBAL_RUNNING_R = [] GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP @@ -63,117 +74,93 @@ N_S = env.observation_space.shape[0] N_A = env.action_space.shape[0] -# A_BOUND = [env.action_space.low, env.action_space.high] -A_BOUND = [env.action_space.low, env.action_space.high] -A_BOUND[0] = A_BOUND[0].reshape(1, 4) -A_BOUND[1] = A_BOUND[1].reshape(1, 4) -# print(env.unwrapped.hull.position[0]) -# exit() +A_BOUND = [env.action_space.low, env.action_space.high] +A_BOUND[0] = A_BOUND[0].reshape(1, N_A) +A_BOUND[1] = A_BOUND[1].reshape(1, N_A) +# print(A_BOUND) class ACNet(object): - def __init__(self, scope, globalAC=None): + def __init__(self, scope, globalAC=None): self.scope = scope - if scope == GLOBAL_NET_SCOPE: - ## global network only do inference - with tf.variable_scope(scope): - self.s = tf.placeholder(tf.float32, [None, N_S], 'S') - self._build_net() - self.a_params = tl.layers.get_variables_with_name(scope + '/actor', True, False) - self.c_params = 
tl.layers.get_variables_with_name(scope + '/critic', True, False) - - normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) # for continuous action space - - with tf.name_scope('choose_a'): # use local params to choose action - self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) - - else: - ## worker network calculate gradient locally, update on global network - with tf.variable_scope(scope): - self.s = tf.placeholder(tf.float32, [None, N_S], 'S') - self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') - self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') - - self._build_net() - - td = tf.subtract(self.v_target, self.v, name='TD_error') - with tf.name_scope('c_loss'): - self.c_loss = tf.reduce_mean(tf.square(td)) - - with tf.name_scope('wrap_a_out'): - self.test = self.sigma[0] - self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 - - normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) # for continuous action space - - with tf.name_scope('a_loss'): - log_prob = normal_dist.log_prob(self.a_his) - exp_v = log_prob * td - entropy = normal_dist.entropy() # encourage exploration - self.exp_v = ENTROPY_BETA * entropy + exp_v - self.a_loss = tf.reduce_mean(-self.exp_v) - - with tf.name_scope('choose_a'): # use local params to choose action - self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) - - with tf.name_scope('local_grad'): - self.a_params = tl.layers.get_variables_with_name(scope + '/actor', True, False) - self.c_params = tl.layers.get_variables_with_name(scope + '/critic', True, False) - self.a_grads = tf.gradients(self.a_loss, self.a_params) - self.c_grads = tf.gradients(self.c_loss, self.c_params) - - with tf.name_scope('sync'): - with tf.name_scope('pull'): - self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] - self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] - with tf.name_scope('push'): - self.update_a_op = OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) - self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) - - def _build_net(self): - w_init = tf.contrib.layers.xavier_initializer() - with tf.variable_scope('actor'): # Policy network - nn = InputLayer(self.s, name='in') - nn = DenseLayer(nn, n_units=500, act=tf.nn.relu6, W_init=w_init, name='la') - nn = DenseLayer(nn, n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2') - mu = DenseLayer(nn, n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu') - sigma = DenseLayer(nn, n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma') - self.mu = mu.outputs - self.sigma = sigma.outputs - - with tf.variable_scope('critic'): # we use Value-function here, but not Q-function. 
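        # A state-value critic V(s) is enough for A3C: the actor is trained with the
        # advantage estimate td = r + GAMMA * V(s') - V(s) (built further below from
        # buffer_v_target), so no action-value head Q(s, a) is needed here.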
- nn = InputLayer(self.s, name='in') - nn = DenseLayer(nn, n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc') - nn = DenseLayer(nn, n_units=200, act=tf.nn.relu6, W_init=w_init, name='lc2') - v = DenseLayer(nn, n_units=1, W_init=w_init, name='v') - self.v = v.outputs - - def update_global(self, feed_dict): # run by a local - _, _, t = sess.run( - [self.update_a_op, self.update_c_op, self.test], feed_dict - ) # local grads applies to global net - return t - - def pull_global(self): # run by a local - sess.run([self.pull_a_params_op, self.pull_c_params_op]) + self.save_path = './model' + + w_init = tf.keras.initializers.glorot_normal(seed=None) # initializer, glorot=xavier + def get_actor(input_shape): # policy network + with tf.name_scope(self.scope): + ni = tl.layers.Input(input_shape, name='in') + nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='la')(ni) + nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2')(nn) + mu = tl.layers.Dense(n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu')(nn) + sigma = tl.layers.Dense(n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma')(nn) + return tl.models.Model(inputs=ni, outputs=[mu, sigma], name=scope+'/Actor') + self.actor = get_actor( [None, N_S]) + self.actor.train() # train mode for Dropout, BatchNorm + def get_critic(input_shape): # we use Value-function here, but not Q-function. + with tf.name_scope(self.scope): + ni = tl.layers.Input(input_shape, name='in') + nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc')(ni) + nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='lc2')(nn) + v = tl.layers.Dense(n_units=1, W_init=w_init, name='v')(nn) + return tl.models.Model(inputs=ni, outputs=v, name=scope+'/Critic') + self.critic = get_critic( [None, N_S]) + self.critic.train() # train mode for Dropout, BatchNorm + + @tf.function # convert numpy functions to tf.Operations in the TFgraph, return tensor + def update_global(self, buffer_s, buffer_a, buffer_v_target, globalAC): # refer to the global Actor-Crtic network for updating it with samples + ''' update the global critic ''' + with tf.GradientTape() as tape: + self.v = self.critic(buffer_s) + self.v_target = buffer_v_target + td = tf.subtract(self.v_target, self.v, name='TD_error') + self.c_loss = tf.reduce_mean(tf.square(td)) + self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) + OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net + # del tape # Drop the reference to the tape + + ''' update the global actor ''' + with tf.GradientTape() as tape: + self.mu, self.sigma = self.actor(buffer_s) + self.test = self.sigma[0] + self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 + + normal_dist = tfd.Normal(self.mu, self.sigma) # no tf.contrib for tf2.0 + self.a_his = buffer_a # float32 + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td # td is from the critic part, no gradients for it + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) + OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net + return self.test # for test purpose + + @tf.function + def pull_global(self, globalAC): # run by a local, pull weights from the global nets + for l_p, g_p in 
zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): + l_p.assign(g_p) + for l_p, g_p in zip(self.critic.trainable_weights, globalAC.critic.trainable_weights): + l_p.assign(g_p) def choose_action(self, s): # run by a local s = s[np.newaxis, :] - return sess.run(self.A, {self.s: s})[0] + self.mu, self.sigma = self.actor(s) - def save_ckpt(self): - tl.files.exists_or_mkdir(self.scope) - tl.files.save_ckpt( - sess=sess, mode_name='model.ckpt', var_list=self.a_params + self.c_params, save_dir=self.scope, - printable=True - ) + with tf.name_scope('wrap_a_out'): + self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 + normal_dist = tfd.Normal(self.mu, self.sigma) # for continuous action space + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + return self.A.numpy()[0] - def load_ckpt(self): - tl.files.load_ckpt(sess=sess, var_list=self.a_params + self.c_params, save_dir=self.scope, printable=True) - # tl.files.load_ckpt(sess=sess, mode_name='model.ckpt', var_list=self.a_params+self.c_params, save_dir=self.scope, is_latest=False, printable=True) + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.actor.trainable_weights, name='model_actor.npz') + tl.files.save_npz(self.critic.trainable_weights, name='model_critic.npz') + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_actor.npz', network=self.actor) + tl.files.load_and_assign_npz(name='model_critic.npz', network=self.critic) class Worker(object): @@ -182,7 +169,8 @@ def __init__(self, name, globalAC): self.name = name self.AC = ACNet(name, globalAC) - def work(self): + # def work(self): + def work(self, globalAC): global GLOBAL_RUNNING_R, GLOBAL_EP total_step = 1 buffer_s, buffer_a, buffer_r = [], [], [] @@ -193,9 +181,11 @@ def work(self): # visualize Worker_0 during training if self.name == 'Worker_0' and total_step % 30 == 0: self.env.render() - a = self.AC.choose_action(s) + s = s.astype('float32') # double to float + a = self.AC.choose_action(s) s_, r, done, _info = self.env.step(a) - + + s_ = s_.astype('float32') # double to float # set robot falls reward to -2 instead of -100 if r == -100: r = -2 @@ -209,7 +199,7 @@ def work(self): if done: v_s_ = 0 # terminal else: - v_s_ = sess.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] + v_s_ = self.AC.critic(s_[np.newaxis, :])[0,0] # reduce dim from 2 to 0 buffer_v_target = [] @@ -218,46 +208,45 @@ def work(self): buffer_v_target.append(v_s_) buffer_v_target.reverse() - + buffer_s, buffer_a, buffer_v_target = ( np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) ) - feed_dict = {self.AC.s: buffer_s, self.AC.a_his: buffer_a, self.AC.v_target: buffer_v_target} # update gradients on global network - self.AC.update_global(feed_dict) + self.AC.update_global(buffer_s, buffer_a, buffer_v_target.astype('float32'), globalAC) buffer_s, buffer_a, buffer_r = [], [], [] # update local network from global network - self.AC.pull_global() + self.AC.pull_global(globalAC) s = s_ total_step += 1 if done: if len(GLOBAL_RUNNING_R) == 0: # record running episode reward GLOBAL_RUNNING_R.append(ep_r) - else: + else: # moving average GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) print( self.name, "episode:", GLOBAL_EP, - "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move + # "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move '| reward: %.1f' % ep_r, "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], # '| sigma:', test, # debug - 'WIN ' 
* 5 if self.env.unwrapped.hull.position[0] >= 88 else '', + # 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', ) GLOBAL_EP += 1 break if __name__ == "__main__": - sess = tf.Session() - # ============================= TRAINING =============================== with tf.device("/cpu:0"): - OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') - OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') + + OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') + OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params workers = [] # Create worker @@ -266,31 +255,38 @@ def work(self): workers.append(Worker(i_name, GLOBAL_AC)) COORD = tf.train.Coordinator() - sess.run(tf.global_variables_initializer()) # start TF threading worker_threads = [] for worker in workers: - t = threading.Thread(target=worker.work) + # t = threading.Thread(target=worker.work) + job = lambda: worker.work(GLOBAL_AC) + t = threading.Thread(target=job) t.start() worker_threads.append(t) COORD.join(worker_threads) + import matplotlib.pyplot as plt + plt.plot(GLOBAL_RUNNING_R) + plt.xlabel('episode') + plt.ylabel('global running reward') + plt.savefig('a3c.png') + plt.show() GLOBAL_AC.save_ckpt() # ============================= EVALUATION ============================= # env = gym.make(GAME) # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) - # sess.run(tf.global_variables_initializer()) - # GLOBAL_AC.load_ckpt() - # while True: - # s = env.reset() - # rall = 0 - # while True: - # env.render() - # a = GLOBAL_AC.choose_action(s) - # s, r, d, _ = env.step(a) - # rall += r - # if d: - # print("reward", rall) - # break + GLOBAL_AC.load_ckpt() + while True: + s = env.reset() + rall = 0 + while True: + env.render() + s = s.astype('float32') # double to float + a = GLOBAL_AC.choose_action(s) + s, r, d, _ = env.step(a) + rall += r + if d: + print("reward", rall) + break diff --git a/examples/reinforcement_learning/tutorial_cartpole_ac.py b/examples/reinforcement_learning/tutorial_cartpole_ac.py index 4d8b6f8ea..dca27540a 100644 --- a/examples/reinforcement_learning/tutorial_cartpole_ac.py +++ b/examples/reinforcement_learning/tutorial_cartpole_ac.py @@ -39,15 +39,11 @@ import gym import tensorlayer as tl -## enable eager mode -tf.enable_eager_execution() - -tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) np.random.seed(2) -tf.set_random_seed(2) # reproducible +tf.random.set_seed(2) # reproducible # hyper-parameters OUTPUT_GRAPH = False @@ -65,7 +61,6 @@ N_F = env.observation_space.shape[0] N_A = env.action_space.n -# env.action_space.sample() random sample print("observation dimension: %d" % N_F) # 4 print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] @@ -76,16 +71,6 @@ class Actor(object): def __init__(self, n_features, n_actions, lr=0.001): - # self.sess = sess - # self.s = tf.placeholder(tf.float32, [1, n_features], "state") - # self.a = tf.placeholder(tf.int32, [None], "act") - # self.td_error = tf.placeholder(tf.float32, [None], "td_error") # TD_error - - # with tf.variable_scope('Actor'): # Policy network - # n = InputLayer(self.s, name='in') - # n = DenseLayer(n, n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden') - # # n = DenseLayer(n, n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2') - # n = DenseLayer(n, n_units=n_actions, name='Pi') def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, 
name='state') @@ -93,66 +78,42 @@ def get_model(inputs_shape): nn = tl.layers.Dense(n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2')(nn) nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn) return tl.models.Model(inputs=ni, outputs=nn, name="Actor") - self.model = get_model([1, n_features]) + self.model = get_model([None, n_features]) self.model.train() - # self.acts_logits = n.outputs - # self.acts_prob = tf.nn.softmax(self.acts_logits) - - # Hao Dong - # with tf.variable_scope('loss'): - # self.exp_v = tl.rein.cross_entropy_reward_loss( - # logits=self.acts_logits, actions=self.a, rewards=self.td_error, name='actor_weighted_loss' - # ) - - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(self.exp_v) - self.optimizer = tf.train.AdamOptimizer(lr) - # Morvan Zhou (the same) - # with tf.variable_scope('exp_v'): - # # log_prob = tf.log(self.acts_prob[0, self.a[0]]) - # # self.exp_v = tf.reduce_mean(log_prob * self.td_error[0]) # advantage (TD_error) guided loss - # self.exp_v = tl.rein.log_weight(probs=self.acts_prob[0, self.a[0]], weights=self.td_error) - # - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) + self.optimizer = tf.optimizers.Adam(lr) def learn(self, s, a, td): - # _, exp_v = self.sess.run([self.train_op, self.exp_v], {self.s: [s], self.a: [a], self.td_error: td[0]}) with tf.GradientTape() as tape: - _logits = self.model([s]).outputs - # _probs = tf.nn.softmax(_logits) - _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) + _logits = self.model(np.array([s])) + ## cross-entropy loss weighted by td-error (advantage), + # the cross-entropy mearsures the difference of two probability distributions: the predicted logits and sampled action distribution, + # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. 
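+            # A minimal sketch of what tl.rein.cross_entropy_reward_loss is assumed to compute here
+            # (policy-gradient surrogate: -log pi(a|s) scaled by the advantage td):
+            #   ce     = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=[a], logits=_logits)
+            #   _exp_v = tf.reduce_sum(ce * td[0])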
+ _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) grad = tape.gradient(_exp_v, self.model.trainable_weights) self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) return _exp_v def choose_action(self, s): - # probs = self.sess.run(self.acts_prob, {self.s: [s]}) # get probabilities of all actions - _logits = self.model([s]).outputs + _logits = self.model(np.array([s])) _probs = tf.nn.softmax(_logits).numpy() - return tl.rein.choice_action_by_probs(_probs.ravel()) + return tl.rein.choice_action_by_probs(_probs.ravel()) # sample according to probability distribution def choose_action_greedy(self, s): - # probs = self.sess.run(self.acts_prob, {self.s: [s]}) # get probabilities of all actions - _logits = self.model([s]).outputs + _logits = self.model(np.array([s])) # logits: probability distribution of actions _probs = tf.nn.softmax(_logits).numpy() return np.argmax(_probs.ravel()) + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.model.trainable_weights, name='model_actor.npz') + + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_actor.npz', network=self.model) + class Critic(object): def __init__(self, n_features, lr=0.01): - # self.sess = sess - # self.s = tf.placeholder(tf.float32, [1, n_features], "state") - # self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") - # self.r = tf.placeholder(tf.float32, None, 'r') - - # with tf.variable_scope('Critic'): # we use Value-function here, not Action-Value-function - # n = InputLayer(self.s, name='in') - # n = DenseLayer(n, n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden') - # # n = DenseLayer(n, n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2') - # n = DenseLayer(n, n_units=1, act=None, name='V') - # self.v = n.outputs + def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, name='state') nn = tl.layers.Dense(n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden')(ni) @@ -161,27 +122,25 @@ def get_model(inputs_shape): return tl.models.Model(inputs=ni, outputs=nn, name="Critic") self.model = get_model([1, n_features]) self.model.train() - # with tf.variable_scope('squared_TD_error'): - # # TD_error = r + lambd * V(newS) - V(S) - # self.td_error = self.r + LAMBDA * self.v_ - self.v - # self.loss = tf.square(self.td_error) - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) - self.optimizer = tf.train.AdamOptimizer(lr) + + self.optimizer = tf.optimizers.Adam(lr) def learn(self, s, r, s_): - # v_ = self.sess.run(self.v, {self.s: [s_]}) - v_ = self.model([s_]).outputs - # td_error, _ = self.sess.run([self.td_error, self.train_op], {self.s: [s], self.v_: v_, self.r: r}) + v_ = self.model(np.array([s_])) with tf.GradientTape() as tape: - v = self.model([s]).outputs - # TD_error = r + lambd * V(newS) - V(S) + v = self.model(np.array([s])) + ## TD_error = r + lambd * V(newS) - V(S) td_error = r + LAMBDA * v_ - v loss = tf.square(td_error) grad = tape.gradient(loss, self.model.trainable_weights) self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) return td_error + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.model.trainable_weights, name='model_critic.npz') + + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_critic.npz', network=self.model) actor = Actor(n_features=N_F, n_actions=N_A, 
lr=LR_A) # we need a good teacher, so the teacher should learn faster than the actor @@ -194,6 +153,7 @@ def learn(self, s, r, s_): t = 0 # number of step in this episode all_r = [] # rewards of all steps while True: + if RENDER: env.render() a = actor.choose_action(s) @@ -212,11 +172,16 @@ def learn(self, s, r, s_): all_r.append(r) td_error = critic.learn(s, r, s_new) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] - actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + try: + actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + actor.save_ckpt() + critic.save_ckpt() + # logging s = s_new t += 1 - + if done or t >= MAX_EP_STEPS: ep_rs_sum = sum(all_r) @@ -247,3 +212,4 @@ def learn(self, s, r, s_): s = env.reset().astype(np.float32) rall = 0 break + diff --git a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py b/examples/reinforcement_learning/tutorial_frozenlake_dqn.py index 9411da423..9ab10b742 100644 --- a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py +++ b/examples/reinforcement_learning/tutorial_frozenlake_dqn.py @@ -24,6 +24,10 @@ The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise. + +tensorflow==2.0.0a0 +tensorlayer==2.0.0 + """ import time @@ -33,11 +37,6 @@ import gym import tensorlayer as tl -## enable eager mode -tf.enable_eager_execution() - - -tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) env = gym.make('FrozenLake-v0') @@ -50,37 +49,23 @@ def to_one_hot(i, n_classes=None): render = False # display the game environment running_reward = None - # tf.reset_default_graph() ## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. -# 4x4 grid can be represented by one-hot vector with 16 integers. - # inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32) - # net = InputLayer(inputs, name='observation') - # net = DenseLayer(net, 4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s') - # y = net.outputs # action-value / rewards of 4 actions +# encoding for state: 4x4 grid can be represented by one-hot vector with 16 integers. def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, name='observation') nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s')(ni) return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") -qnetwork = get_model([1, 16]) +qnetwork = get_model([None, 16]) qnetwork.train() train_weights = qnetwork.trainable_weights -# chose action greedily with reward. in Q-Learning, policy is greedy, so we use "max" to select the next action. - # predict = tf.argmax(y, 1) - -## Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. 
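# The TF2 update kept below follows the same rule: only the chosen action's entry of the
# target changes, targetQ[0, a] = r + lambd * maxQ1, and the loss is the summed squared
# error between targetQ and the predicted Q values (mean_squared_error with is_mean=False).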
- # nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32) - # loss = tl.cost.mean_squared_error(nextQ, y, is_mean=False) # tf.reduce_sum(tf.square(nextQ - y)) - # train_op = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss) -optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1) +optimizer = tf.optimizers.SGD(learning_rate=0.1) ## Set learning parameters lambd = .99 # decay factor e = 0.1 # e-Greedy Exploration, the larger the more random num_episodes = 10000 -# with tf.Session() as sess: - # tl.layers.initialize_global_variables(sess) for i in range(num_episodes): ## Reset environment and get first new observation episode_time = time.time() @@ -89,8 +74,7 @@ def get_model(inputs_shape): for j in range(99): # step index, maximum step is 99 if render: env.render() ## Choose an action by greedily (with e chance of random action) from the Q-network - # a, allQ = sess.run([predict, y], feed_dict={inputs: [to_one_hot(s, 16)]}) - allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).outputs.numpy() + allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() a = np.argmax(allQ, 1) ## e-Greedy Exploration !!! sample random action @@ -99,8 +83,7 @@ def get_model(inputs_shape): ## Get new state and reward from environment s1, r, d, _ = env.step(a[0]) ## Obtain the Q' values by feeding the new state through our network - # Q1 = sess.run(y, feed_dict={inputs: [to_one_hot(s1, 16)]}) - Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).outputs.numpy() + Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy() ## Obtain maxQ' and set our target value for chosen action. maxQ1 = np.max(Q1) # in Q-Learning, policy is greedy, so we use "max" to select the next action. @@ -110,11 +93,9 @@ def get_model(inputs_shape): # it is not real target Q value, it is just an estimation, # but check the Q-Learning update formula: # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) - # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equal to force - # Q'(s,a) ≈ Q(s,a) - # _ = sess.run(train_op, {inputs: [to_one_hot(s, 16)], nextQ: targetQ}) + # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equals to force Q'(s,a) ≈ Q(s,a) with tf.GradientTape() as tape: - _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).outputs + _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)) _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) grad = tape.gradient(_loss, train_weights) optimizer.apply_gradients(zip(grad, train_weights)) @@ -128,5 +109,5 @@ def get_model(inputs_shape): ## Note that, the rewards here with random action running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 - print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs %s" % \ - (i, num_episodes, rAll, running_reward, time.time() - episode_time, '' if rAll == 0 else ' !!!!!!!!')) + print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + (i, num_episodes, rAll, running_reward, time.time() - episode_time)) diff --git a/examples/reinforcement_learning/tutorial_frozenlake_q_table.py b/examples/reinforcement_learning/tutorial_frozenlake_q_table.py index a5b44059a..a8decb273 100644 --- a/examples/reinforcement_learning/tutorial_frozenlake_q_table.py +++ b/examples/reinforcement_learning/tutorial_frozenlake_q_table.py @@ -11,6 +11,9 @@ EN: 
https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw CN: https://zhuanlan.zhihu.com/p/25710327 +tensorflow==2.0.0a0 +tensorlayer==2.0.0 + """ import time @@ -52,7 +55,7 @@ break rList.append(rAll) running_reward = r if running_reward is None else running_reward * 0.99 + r * 0.01 - print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs %s" % \ - (i, num_episodes, rAll, running_reward, time.time() - episode_time, '' if rAll == 0 else ' !!!!!!!!')) + print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + (i, num_episodes, rAll, running_reward, time.time() - episode_time)) print("Final Q-Table Values:/n %s" % Q) From 08eca73dd7495cdaed52be3bd61211f72f3ea2db Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sat, 18 May 2019 16:03:50 +0100 Subject: [PATCH 02/57] td3 added --- .../tutorial_cifar10_cnn_static.py | 2 +- .../tutorial_mnist_mlp_dynamic.py | 2 +- .../tutorial_mnist_mlp_dynamic_2.py | 2 +- .../tutorial_mnist_mlp_static.py | 2 +- .../tutorial_mnist_mlp_static_2.py | 2 +- .../basic_tutorials/tutorial_mnist_siamese.py | 2 +- .../basic_tutorials/tutorial_mnist_simple.py | 3 +- .../tutorial_fast_affine_transform.py | 4 +- .../data_process/tutorial_tf_dataset_voc.py | 2 +- examples/data_process/tutorial_tfrecord.py | 3 +- examples/data_process/tutorial_tfrecord2.py | 2 +- examples/data_process/tutorial_tfrecord3.py | 2 +- examples/database/dispatch_tasks.py | 1 - examples/database/task_script.py | 1 - ...torial_imagenet_inceptionV3_distributed.py | 4 +- .../tutorial_mnist_distributed.py | 1 - .../tutorial_cifar10_distributed_trainer.py | 2 +- .../tutorial_mnist_distributed_trainer.py | 2 +- examples/keras_tfslim/tutorial_keras.py | 2 +- .../tutorial_models_mobilenetv1.py | 2 +- .../tutorial_models_squeezenetv1.py | 2 +- .../pretrained_cnn/tutorial_models_vgg16.py | 2 +- .../pretrained_cnn/tutorial_models_vgg19.py | 2 +- .../tutorial_models_vgg_static.py | 2 +- .../tutorial_binarynet_cifar10_tfrecord.py | 1 - .../tutorial_binarynet_mnist_cnn.py | 1 - .../tutorial_dorefanet_cifar10_tfrecord.py | 1 - .../tutorial_dorefanet_mnist_cnn.py | 1 - .../tutorial_quanconv_cifar10.py | 2 +- .../quantized_net/tutorial_quanconv_mnist.py | 1 - ...tutorial_ternaryweight_cifar10_tfrecord.py | 1 - .../tutorial_ternaryweight_mnist_cnn.py | 1 - .../tutorial_atari_pong.py | 3 +- ...ial_bipedalwalker_a3c_continuous_action.py | 8 +- .../tutorial_cartpole_ac.py | 4 +- .../tutorial_frozenlake_dqn.py | 2 +- .../reinforcement_learning/tutorial_sac.py | 381 ++++++++++++++++ .../reinforcement_learning/tutorial_td3.py | 408 ++++++++++++++++++ ...ial_spatial_transformer_network_dynamic.py | 2 + ...rial_spatial_transformer_network_static.py | 2 + .../tutorial_imdb_fasttext.py | 2 +- .../text_generation/tutorial_generate_text.py | 2 +- examples/text_ptb/tutorial_ptb_lstm.py | 2 +- .../tutorial_ptb_lstm_state_is_tuple.py | 2 +- .../tutorial_word2vec_basic.py | 2 +- examples/tutorial_work_with_onnx.py | 4 +- tensorlayer/activation.py | 1 - tensorlayer/cost.py | 3 +- tensorlayer/db.py | 13 +- tensorlayer/distributed.py | 1 - .../files/dataset_loaders/celebA_dataset.py | 3 +- .../files/dataset_loaders/cyclegan_dataset.py | 3 +- .../dataset_loaders/flickr_1M_dataset.py | 6 +- .../dataset_loaders/flickr_25k_dataset.py | 6 +- .../files/dataset_loaders/mpii_dataset.py | 3 +- .../files/dataset_loaders/voc_dataset.py | 5 +- .../dataset_loaders/wmt_en_fr_dataset.py | 1 - 
tensorlayer/files/utils.py | 18 +- tensorlayer/initializers.py | 1 + tensorlayer/layers/activation.py | 1 - tensorlayer/layers/convolution/binary_conv.py | 1 - .../layers/convolution/deformable_conv.py | 1 - .../layers/convolution/depthwise_conv.py | 1 - tensorlayer/layers/convolution/dorefa_conv.py | 1 - tensorlayer/layers/convolution/expert_conv.py | 1 - .../layers/convolution/expert_deconv.py | 1 - tensorlayer/layers/convolution/group_conv.py | 1 - tensorlayer/layers/convolution/quan_conv.py | 4 +- .../layers/convolution/quan_conv_bn.py | 4 +- .../layers/convolution/separable_conv.py | 2 +- .../layers/convolution/simplified_conv.py | 1 - .../layers/convolution/simplified_deconv.py | 2 +- .../layers/convolution/super_resolution.py | 1 - .../layers/convolution/ternary_conv.py | 1 - tensorlayer/layers/core.py | 6 +- tensorlayer/layers/dense/base_dense.py | 2 +- tensorlayer/layers/dense/binary_dense.py | 1 - tensorlayer/layers/dense/dorefa_dense.py | 1 - tensorlayer/layers/dense/dropconnect.py | 4 +- tensorlayer/layers/dense/quan_dense.py | 4 +- tensorlayer/layers/dense/quan_dense_bn.py | 4 +- tensorlayer/layers/dense/ternary_dense.py | 1 - tensorlayer/layers/dropout.py | 1 - tensorlayer/layers/embedding.py | 2 +- tensorlayer/layers/extend.py | 1 - tensorlayer/layers/image_resampling.py | 1 - tensorlayer/layers/inputs.py | 2 +- tensorlayer/layers/lambda_layers.py | 3 +- tensorlayer/layers/merge.py | 1 - tensorlayer/layers/noise.py | 1 - tensorlayer/layers/normalization.py | 3 +- tensorlayer/layers/padding.py | 1 - tensorlayer/layers/pooling.py | 1 - tensorlayer/layers/quantize.py | 1 - tensorlayer/layers/recurrent.py | 1 - tensorlayer/layers/scale.py | 1 - tensorlayer/layers/shape.py | 1 - tensorlayer/layers/spatial_transformer.py | 4 +- tensorlayer/layers/stack.py | 1 - tensorlayer/layers/utils.py | 2 +- tensorlayer/logging/contrib/hyperdash.py | 1 - tensorlayer/models/core.py | 3 +- tensorlayer/models/mobilenetv1.py | 1 - tensorlayer/models/squeezenetv1.py | 3 +- tensorlayer/models/vgg.py | 4 +- tensorlayer/nlp.py | 4 +- tensorlayer/rein.py | 3 +- tensorlayer/utils.py | 2 +- 108 files changed, 896 insertions(+), 147 deletions(-) create mode 100644 examples/reinforcement_learning/tutorial_sac.py create mode 100644 examples/reinforcement_learning/tutorial_td3.py diff --git a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py index c12c791a1..93794c414 100644 --- a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py +++ b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LocalResponseNorm, MaxPool2d) diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py index 1ffa7fbe0..13db1abae 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py index b752012b0..0d94b1dfa 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py +++ 
b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input, LayerList from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static.py b/examples/basic_tutorials/tutorial_mnist_mlp_static.py index c9c15f911..de811a8d8 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py index f0836c528..a9a2c7d48 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_siamese.py b/examples/basic_tutorials/tutorial_mnist_siamese.py index db43f1163..fe4abdc52 100644 --- a/examples/basic_tutorials/tutorial_mnist_siamese.py +++ b/examples/basic_tutorials/tutorial_mnist_siamese.py @@ -14,8 +14,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Flatten, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_simple.py b/examples/basic_tutorials/tutorial_mnist_simple.py index 04e233819..ceaee0c48 100644 --- a/examples/basic_tutorials/tutorial_mnist_simple.py +++ b/examples/basic_tutorials/tutorial_mnist_simple.py @@ -1,9 +1,10 @@ #! 
/usr/bin/python # -*- coding: utf-8 -*- +import numpy as np + import tensorflow as tf import tensorlayer as tl -import numpy as np tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_fast_affine_transform.py b/examples/data_process/tutorial_fast_affine_transform.py index 52452ffd5..71890f5bd 100644 --- a/examples/data_process/tutorial_fast_affine_transform.py +++ b/examples/data_process/tutorial_fast_affine_transform.py @@ -8,10 +8,10 @@ import multiprocessing import time -import cv2 import numpy as np -import tensorflow as tf +import cv2 +import tensorflow as tf import tensorlayer as tl # tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_tf_dataset_voc.py b/examples/data_process/tutorial_tf_dataset_voc.py index fab1612f7..9779b1f60 100644 --- a/examples/data_process/tutorial_tf_dataset_voc.py +++ b/examples/data_process/tutorial_tf_dataset_voc.py @@ -13,8 +13,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl # tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/data_process/tutorial_tfrecord.py b/examples/data_process/tutorial_tfrecord.py index 4cb832c1d..bcf3fe46a 100644 --- a/examples/data_process/tutorial_tfrecord.py +++ b/examples/data_process/tutorial_tfrecord.py @@ -22,9 +22,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl ## Save data ================================================================== @@ -97,4 +97,3 @@ def read_and_decode(filename): print("img_batch : %s" % img_batch.shape) print("label_batch : %s" % label_batch.shape) tl.visualize.images2d(img_batch, second=1, saveable=False, name='batch', dtype=None, fig_idx=2020121) - diff --git a/examples/data_process/tutorial_tfrecord2.py b/examples/data_process/tutorial_tfrecord2.py index be41b697f..22b3d7757 100755 --- a/examples/data_process/tutorial_tfrecord2.py +++ b/examples/data_process/tutorial_tfrecord2.py @@ -14,10 +14,10 @@ import os import numpy as np + # import matplotlib # matplotlib.use('GTK') import tensorflow as tf - import tensorlayer as tl # Download data, and convert to TFRecord format, see ```tutorial_tfrecord.py``` diff --git a/examples/data_process/tutorial_tfrecord3.py b/examples/data_process/tutorial_tfrecord3.py index 9e5751a25..bc8752f2a 100644 --- a/examples/data_process/tutorial_tfrecord3.py +++ b/examples/data_process/tutorial_tfrecord3.py @@ -19,9 +19,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl diff --git a/examples/database/dispatch_tasks.py b/examples/database/dispatch_tasks.py index d1204bcd4..260257e77 100644 --- a/examples/database/dispatch_tasks.py +++ b/examples/database/dispatch_tasks.py @@ -6,7 +6,6 @@ import time import tensorflow as tf - import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/database/task_script.py b/examples/database/task_script.py index ad51dd3ed..58ef60d1a 100644 --- a/examples/database/task_script.py +++ b/examples/database/task_script.py @@ -1,7 +1,6 @@ """Sample task script.""" import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py index 936ae9702..15c0a3f3c 100644 --- 
a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py +++ b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py @@ -19,7 +19,9 @@ from xml.etree import ElementTree import numpy as np + import tensorflow as tf +import tensorlayer as tl from tensorflow.contrib import slim from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, inception_v3_arg_scope) @@ -29,8 +31,6 @@ from tensorflow.python.training.monitored_session import \ SingularMonitoredSession -import tensorlayer as tl - tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_mnist_distributed.py b/examples/deprecated_tutorials/tutorial_mnist_distributed.py index 29d291ba4..18f7cdb92 100644 --- a/examples/deprecated_tutorials/tutorial_mnist_distributed.py +++ b/examples/deprecated_tutorials/tutorial_mnist_distributed.py @@ -13,7 +13,6 @@ """ import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py index 1ddc2d937..ce3aec007 100644 --- a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py +++ b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py @@ -15,8 +15,8 @@ import multiprocessing import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, FlattenLayer, InputLayer, MaxPool2d) diff --git a/examples/distributed_training/tutorial_mnist_distributed_trainer.py b/examples/distributed_training/tutorial_mnist_distributed_trainer.py index 0f1b8b6dd..0cf916370 100755 --- a/examples/distributed_training/tutorial_mnist_distributed_trainer.py +++ b/examples/distributed_training/tutorial_mnist_distributed_trainer.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/keras_tfslim/tutorial_keras.py b/examples/keras_tfslim/tutorial_keras.py index 0622bc745..33a9ca860 100644 --- a/examples/keras_tfslim/tutorial_keras.py +++ b/examples/keras_tfslim/tutorial_keras.py @@ -4,8 +4,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Input, Lambda diff --git a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py index 8d7b35a6b..6b797a075 100644 --- a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py @@ -10,8 +10,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py index 9b6ee4e7f..755d6c28b 100644 --- a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg16.py 
b/examples/pretrained_cnn/tutorial_models_vgg16.py index e6bb1c22e..b1bd3823f 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg16.py +++ b/examples/pretrained_cnn/tutorial_models_vgg16.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg19.py b/examples/pretrained_cnn/tutorial_models_vgg19.py index 850412c38..922c3bdf5 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg19.py +++ b/examples/pretrained_cnn/tutorial_models_vgg19.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg_static.py b/examples/pretrained_cnn/tutorial_models_vgg_static.py index 40a3ed865..a0e056e4d 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg_static.py +++ b/examples/pretrained_cnn/tutorial_models_vgg_static.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py index 98532debb..d3205045a 100644 --- a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py index 248812e23..84fbf7fc9 100644 --- a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py index 9c8ab1239..fe7666bab 100644 --- a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py index 90d7b0893..d8cab9bc8 100644 --- a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_quanconv_cifar10.py b/examples/quantized_net/tutorial_quanconv_cifar10.py index 6eb35ed67..f93368467 100644 --- a/examples/quantized_net/tutorial_quanconv_cifar10.py +++ b/examples/quantized_net/tutorial_quanconv_cifar10.py @@ -41,8 +41,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl bitW = 8 diff --git a/examples/quantized_net/tutorial_quanconv_mnist.py b/examples/quantized_net/tutorial_quanconv_mnist.py index 4060c6137..66d52d13c 100644 --- a/examples/quantized_net/tutorial_quanconv_mnist.py 
+++ b/examples/quantized_net/tutorial_quanconv_mnist.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py index f1ee7b4bb..b695fa88a 100644 --- a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py @@ -42,7 +42,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py index e1c305db6..6850b9591 100644 --- a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py +++ b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/reinforcement_learning/tutorial_atari_pong.py b/examples/reinforcement_learning/tutorial_atari_pong.py index 7c8ba1d38..7e1b28822 100644 --- a/examples/reinforcement_learning/tutorial_atari_pong.py +++ b/examples/reinforcement_learning/tutorial_atari_pong.py @@ -29,12 +29,11 @@ import time import numpy as np -import tensorflow as tf import gym +import tensorflow as tf import tensorlayer as tl - tl.logging.set_verbosity(tl.logging.DEBUG) # hyper-parameters diff --git a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py b/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py index fbe2dd560..796ef9d74 100644 --- a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py +++ b/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py @@ -42,14 +42,16 @@ import threading import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp -tfd = tfp.distributions import gym +import tensorflow as tf +import tensorflow_probability as tfp import tensorlayer as tl from tensorlayer.layers import DenseLayer, InputLayer +tfd = tfp.distributions + + tl.logging.set_verbosity(tl.logging.DEBUG) np.random.seed(2) diff --git a/examples/reinforcement_learning/tutorial_cartpole_ac.py b/examples/reinforcement_learning/tutorial_cartpole_ac.py index dca27540a..119ad7eb7 100644 --- a/examples/reinforcement_learning/tutorial_cartpole_ac.py +++ b/examples/reinforcement_learning/tutorial_cartpole_ac.py @@ -34,12 +34,11 @@ import time import numpy as np -import tensorflow as tf import gym +import tensorflow as tf import tensorlayer as tl - tl.logging.set_verbosity(tl.logging.DEBUG) np.random.seed(2) @@ -212,4 +211,3 @@ def load_ckpt(self): # load trained weights s = env.reset().astype(np.float32) rall = 0 break - diff --git a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py b/examples/reinforcement_learning/tutorial_frozenlake_dqn.py index 9ab10b742..935e3e04b 100644 --- a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py +++ b/examples/reinforcement_learning/tutorial_frozenlake_dqn.py @@ -32,9 +32,9 @@ import time import numpy as np -import tensorflow as tf import gym +import tensorflow as tf import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/reinforcement_learning/tutorial_sac.py b/examples/reinforcement_learning/tutorial_sac.py new file mode 100644 index 000000000..876c726f9 --- /dev/null +++ 
b/examples/reinforcement_learning/tutorial_sac.py @@ -0,0 +1,381 @@ +''' +Soft Actor-Critic version 2 +using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net +add alpha loss compared with version 1 +paper: https://arxiv.org/pdf/1812.05905.pdf +''' + +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# GPU = True +# device_idx = 0 +# if GPU: +# device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") +# else: +# device = torch.device("cpu") +# print(device) + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + + + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity + self.buffer = [] + self.position = 0 + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) + +class NormalizedActions(gym.ActionWrapper): + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + +def plot(frame_idx, rewards, predict_qs): + clear_output(True) + plt.figure(figsize=(20,5)) + plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1])) + plt.plot(rewards) + plt.plot(predict_qs) + plt.savefig('sac.png') + # plt.show() + +class SoftQNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(SoftQNetwork, self).__init__() + input_dim = num_inputs + num_actions + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + + +class PolicyNetwork(nn.Module): + def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): + super(PolicyNetwork, self).__init__() + + self.log_std_min = log_std_min + self.log_std_max = log_std_max + + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.mean_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') + self.log_std_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') + + + # self.mean_linear = nn.Linear(hidden_size, num_actions) + # self.mean_linear.weight.data.uniform_(-init_w, init_w) + # self.mean_linear.bias.data.uniform_(-init_w, init_w) + + # self.log_std_linear = nn.Linear(hidden_size, num_actions) + # self.log_std_linear.weight.data.uniform_(-init_w, init_w) + # self.log_std_linear.bias.data.uniform_(-init_w, init_w) + + self.action_range = action_range + self.num_actions = num_actions + + + def forward(self, state): + # x = F.relu(self.linear1(state)) + # x = F.relu(self.linear2(x)) + # x = F.relu(self.linear3(x)) + # x = F.relu(self.linear4(x)) + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + mean = self.mean_linear(x) + log_std = self.log_std_linear(x) + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + + return mean, log_std + + def evaluate(self, state, epsilon=1e-6): + ''' generate action with state for calculating gradients ''' + state = state.astype(np.float32) + mean, log_std = self.forward(state) + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + + normal = Normal(0, 1) + z = normal.sample() + action_0 = tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range*action_0 + log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. 
- action_0.pow(2) + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + # needs sum up across the features dim to get 1 dim prob; or else use Multivariate Normal. + log_prob = tf.reduce_sum(log_prob, axis=1) + return action, log_prob, z, mean, log_std + + + def get_action(self, state, deterministic): + mean, log_std = self.forward([state]) + std = tf.math.exp(log_std) + + normal = Normal(0, 1) + z = normal.sample() + action = self.action_range * tf.math.tanh(mean + std*z) + + action = self.action_range*mean.detach().cpu().numpy()[0] if deterministic else action.detach().cpu().numpy()[0] + return action + + + def sample_action(self,): + a=torch.FloatTensor(self.num_actions).uniform_(-1, 1) + return self.action_range*a.numpy() + + +class SAC_Trainer(): + def __init__(self, replay_buffer, hidden_dim, action_range): + self.replay_buffer = replay_buffer + + self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) + self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) + self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) + self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range).to(device) + self.log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device) + print('Soft Q Network (1,2): ', self.soft_q_net1) + print('Policy Network: ', self.policy_net) + + for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): + target_param.data.copy_(param.data) + for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): + target_param.data.copy_(param.data) + + self.soft_q_criterion1 = nn.MSELoss() + self.soft_q_criterion2 = nn.MSELoss() + + soft_q_lr = 3e-4 + policy_lr = 3e-4 + alpha_lr = 3e-4 + + self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=soft_q_lr) + self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=soft_q_lr) + self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) + self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2): + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + # print('sample:', state, action, reward, done) + + state = torch.FloatTensor(state).to(device) + next_state = torch.FloatTensor(next_state).to(device) + action = torch.FloatTensor(action).to(device) + reward = torch.FloatTensor(reward).unsqueeze(1).to(device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; + done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) + + predicted_q_value1 = self.soft_q_net1(state, action) + predicted_q_value2 = self.soft_q_net2(state, action) + new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) + new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + + reward = reward_scale * (reward - reward.mean(dim=0)) /reward.std(dim=0) # normalize with batch mean and std + # Updating alpha wrt entropy + # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + alpha_loss = -(self.log_alpha * 
(log_prob + target_entropy).detach()).mean() + # print('alpha loss: ',alpha_loss) + self.alpha_optimizer.zero_grad() + alpha_loss.backward() + self.alpha_optimizer.step() + self.alpha = self.log_alpha.exp() + else: + self.alpha = 1. + alpha_loss = 0 + + # Training Q Function + target_q_min = torch.min(self.target_soft_q_net1(next_state, new_next_action),self.target_soft_q_net2(next_state, new_next_action)) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) # detach: no gradients for the variable + q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) + + + self.soft_q_optimizer1.zero_grad() + q_value_loss1.backward() + self.soft_q_optimizer1.step() + self.soft_q_optimizer2.zero_grad() + q_value_loss2.backward() + self.soft_q_optimizer2.step() + + # Training Policy Function + predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),self.soft_q_net2(state, new_action)) + policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() + + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + # print('q loss: ', q_value_loss1, q_value_loss2) + # print('policy loss: ', policy_loss ) + + + # Soft update the target value net + for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): + target_param.data.copy_( # copy data value into target parameters + target_param.data * (1.0 - soft_tau) + param.data * soft_tau + ) + for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): + target_param.data.copy_( # copy data value into target parameters + target_param.data * (1.0 - soft_tau) + param.data * soft_tau + ) + return predicted_new_q_value.mean() + + +replay_buffer_size = 1e6 +replay_buffer = ReplayBuffer(replay_buffer_size) + +# choose env +ENV = ['Pendulum', 'Reacher'][0] +if ENV == 'Reacher': + NUM_JOINTS=2 + LINK_LENGTH=[200, 140] + INI_JOING_ANGLES=[0.1, 0.1] + # NUM_JOINTS=4 + # LINK_LENGTH=[200, 140, 80, 50] + # INI_JOING_ANGLES=[0.1, 0.1, 0.1, 0.1] + SCREEN_SIZE=1000 + SPARSE_REWARD=False + SCREEN_SHOT=False + action_range = 10.0 + + env=Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths = LINK_LENGTH, \ + ini_joint_angles=INI_JOING_ANGLES, target_pos = [369,430], render=True, change_goal=False) + action_dim = env.num_actions + state_dim = env.num_observations +elif ENV == 'Pendulum': + env = NormalizedActions(gym.make("Pendulum-v0")) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + action_range=1. 
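# Aside: a minimal, self-contained sketch of the tanh-squashed Gaussian
# log-probability that PolicyNetwork.evaluate() above relies on. The batch
# size, action dimension and eps value here are assumptions for illustration
# only; the tutorial additionally rescales the squashed action by action_range.
import tensorflow as tf
import tensorflow_probability as tfp
Normal = tfp.distributions.Normal

mean = tf.zeros([4, 2])                        # assumed batch of 4, action dim 2
log_std = tf.fill([4, 2], -1.0)
std = tf.exp(log_std)

z = Normal(0., 1.).sample(mean.shape)          # reparameterization trick
pre_tanh = mean + std * z
squashed = tf.tanh(pre_tanh)                   # squashed into [-1, 1]

# change of variables: log pi(a|s) = log N(u; mean, std) - sum_i log(1 - tanh(u_i)^2 + eps)
eps = 1e-6
log_prob = Normal(mean, std).log_prob(pre_tanh) - tf.math.log(1. - squashed ** 2 + eps)
log_prob = tf.reduce_sum(log_prob, axis=1)     # sum over action dims -> shape [4]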
+ + +# hyper-parameters for RL training +max_frames = 40000 +max_steps = 20 if ENV == 'Reacher' else 150 # Pendulum needs 150 steps per episode to learn well, cannot handle 20 +frame_idx = 0 +batch_size = 256 +explore_steps = 200 # for random action sampling in the beginning of training +update_itr = 1 +AUTO_ENTROPY=True +DETERMINISTIC=False +hidden_dim = 512 +rewards = [] +predict_qs = [] + + +sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range ) +# training loop +while frame_idx < max_frames: + if ENV == 'Reacher': + state = env.reset(SCREEN_SHOT) + elif ENV == 'Pendulum': + state = env.reset() + episode_reward = 0 + predict_q = 0 + + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + if ENV == 'Reacher': + next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT) + elif ENV == 'Pendulum': + next_state, reward, done, _ = env.step(action) + env.render() + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + predict_q=sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) + + if frame_idx % batch_size == 0: + plot(frame_idx, rewards, predict_qs) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) + predict_qs.append(predict_q) diff --git a/examples/reinforcement_learning/tutorial_td3.py b/examples/reinforcement_learning/tutorial_td3.py new file mode 100644 index 000000000..52ae022ff --- /dev/null +++ b/examples/reinforcement_learning/tutorial_td3.py @@ -0,0 +1,408 @@ +''' +Twin Delayed DDPG (TD3), if no twin no delayed then it's DDPG. +using networks including: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net +original paper: https://arxiv.org/pdf/1802.09477.pdf +Actor policy is deterministic, with Gaussian exploration noise. 
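In code terms the exploration step amounts to (a sketch written with the names
defined further below; the exploration noise is left unclipped, clipping is only
applied to the target-policy smoothing noise in PolicyNetwork.evaluate()):

    a = action_range * policy_net(state)                     # deterministic, tanh-bounded output
    a = a + Normal(0, 1).sample(a.shape) * explore_noise_scale

Compare PolicyNetwork.get_action() below for the actual implementation.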
+ +Env: Openai Gym Pendulum-v0, continuous action space + +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 + +&& +pip install box2d box2d-kengz --user +''' +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + + +# GPU = True +# device_idx = 0 +# if GPU: +# device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") +# else: +# device = torch.device("cpu") +# print(device) + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity + self.buffer = [] + self.position = 0 + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) + +class NormalizedActions(gym.ActionWrapper): + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + +class QNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(QNetwork, self).__init__() + input_dim = num_inputs + num_actions + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + +class PolicyNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3): + super(PolicyNetwork, self).__init__() + + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = 
Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.output_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') + + self.action_range = action_range + self.num_actions = num_actions + + + def forward(self, state): + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + output = tf.nn.tanh(self.output_linear(x)) # unit range output [-1, 1] + + return output + + def evaluate(self, state, eval_noise_scale): + ''' generate action with state for calculating gradients ''' + state = state.astype(np.float32) + action = self.forward(state) + + action = self.action_range*action + + # add noise + normal = Normal(0, 1) + eval_noise_clip = 2*eval_noise_scale + noise = normal.sample(action.shape) * eval_noise_scale + noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip) + action = action + noise + + return action + + + def get_action(self, state, explore_noise_scale): + ''' generate action with state for interaction with envronment ''' + action = self.forward([state]) + action = action.numpy()[0] + + # add noise + normal = Normal(0, 1) + noise = normal.sample(action.shape) * explore_noise_scale + action = self.action_range*action + noise + + return action.numpy() + + def sample_action(self,): + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + + return self.action_range*a.numpy() + + +class TD3_Trainer(): + def __init__(self, replay_buffer, hidden_dim, action_range, policy_target_update_interval=1, q_lr=3e-4, policy_lr=3e-4): + self.replay_buffer = replay_buffer + + # initialize all networks + self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim) + self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim) + self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim) + self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + print('Q Network (1,2): ', self.q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1) + self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2) + self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net) + + self.update_cnt = 0 + self.policy_target_update_interval = policy_target_update_interval + + self.q_optimizer1 = tf.optimizers.Adam(q_lr) + self.q_optimizer2 = tf.optimizers.Adam(q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param 
* soft_tau + ) + return target_net + + def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2): + ''' update all networks in TD3 ''' + self.update_cnt+=1 + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + new_next_action = self.target_policy_net.evaluate(next_state, eval_noise_scale=eval_noise_scale) # clipped normal noise + reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std + + # Training Q Function + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum(self.target_q_net1(target_q_input),self.target_q_net2(target_q_input)) + + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_input = tf.concat([state, action], 1) # input of q_net + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights) + self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights) + self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights)) + + # Training Policy Function + if self.update_cnt%self.policy_target_update_interval==0: + with tf.GradientTape() as p_tape: + new_action = self.policy_net.evaluate(state, eval_noise_scale=0.0) # no noise, deterministic policy gradients + new_q_input = tf.concat([state, new_action], 1) + ''' implementation 1 ''' + # predicted_new_q_value = torch.min(self.q_net1(new_q_input),self.q_net2(new_q_input)) + ''' implementation 2 ''' + predicted_new_q_value = self.q_net1(new_q_input) + policy_loss = - tf.reduce_mean(predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Soft update the target nets + self.target_q_net1=self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) + self.target_q_net2=self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) + self.target_policy_net=self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) + + def save_weights(self): # save trained weights + tl.files.save_npz(self.q_net1.trainable_weights, name='model_q_net1.npz') + tl.files.save_npz(self.q_net2.trainable_weights, name='model_q_net2.npz') + tl.files.save_npz(self.target_q_net1.trainable_weights, name='model_target_q_net1.npz') + tl.files.save_npz(self.target_q_net2.trainable_weights, name='model_target_q_net2.npz') + tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + tl.files.save_npz(self.target_policy_net.trainable_weights, name='model_target_policy_net.npz') + + def load_weights(self): # load trained weights + tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.q_net1) + tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.q_net2) + tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_q_net1) + tl.files.load_and_assign_npz(name='model_target_q_net2.npz', 
network=self.target_q_net2) + tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + tl.files.load_and_assign_npz(name='model_target_policy_net.npz', network=self.target_policy_net) + + + +def plot(frame_idx, rewards): + clear_output(True) + plt.figure(figsize=(20,5)) + plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) + plt.plot(rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + plt.savefig('td3.png') + # plt.show() + + +# choose env +ENV = 'Pendulum-v0' +env = NormalizedActions(gym.make(ENV)) +action_dim = env.action_space.shape[0] +state_dim = env.observation_space.shape[0] +action_range=1. + +replay_buffer_size = 5e5 +replay_buffer = ReplayBuffer(replay_buffer_size) + + +# hyper-parameters for RL training +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 0 # 500 for random action sampling in the beginning of training +update_itr = 3 # delayed steps for updating the policy network and target networks +hidden_dim = 32 # size of hidden layers for networks +q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +explore_noise_scale = 1.0 # range of action noise for exploration +eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. # value range of reward + +td3_trainer=TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval, \ +action_range=action_range, q_lr=q_lr, policy_lr=policy_lr ) +# set train mode +td3_trainer.q_net1.train() +td3_trainer.q_net2.train() +td3_trainer.target_q_net1.train() +td3_trainer.target_q_net2.train() +td3_trainer.policy_net.train() +td3_trainer.target_policy_net.train() + +# training loop +if args.train: + frame_idx = 0 + rewards = [] + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + _=td3_trainer.target_policy_net([state]) + + + for step in range(max_steps): + if frame_idx > explore_steps: + action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) + else: + action = td3_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.) 
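# Aside: what a single td3_trainer.update() call above computes for the critic
# target, spelled out with toy numbers (every value below is made up purely to
# illustrate the clipped double-Q target built inside TD3_Trainer.update()).
import numpy as np
_gamma = 0.9
_reward = np.array([[1.0], [0.5]])        # assumed mini-batch of 2 rewards (already normalized/scaled)
_done = np.array([[0.0], [1.0]])          # second transition is terminal
_q1 = np.array([[10.0], [8.0]])           # target_q_net1 on (s', a' + clipped noise)
_q2 = np.array([[9.0], [11.0]])           # target_q_net2 on the same input
_target_q_min = np.minimum(_q1, _q2)      # twin-Q minimum -> [[9.], [8.]]
_target_q = _reward + (1 - _done) * _gamma * _target_q_min
# -> [[9.1], [0.5]]: non-terminal rows bootstrap through gamma, terminal rows keep only the reward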
+ + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) + td3_trainer.save_weights() + +if args.test: + frame_idx = 0 + rewards = [] + td3_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + _=td3_trainer.target_policy_net([state]) + + + for step in range(max_steps): + action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py index aecc69f61..3170585e4 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py @@ -1,7 +1,9 @@ #! /usr/bin/python # -*- coding: utf8 -*- import time + import numpy as np + import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py index c9a93629f..5f09db68b 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py @@ -1,7 +1,9 @@ #! 
/usr/bin/python # -*- coding: utf8 -*- import time + import numpy as np + import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * diff --git a/examples/text_classification/tutorial_imdb_fasttext.py b/examples/text_classification/tutorial_imdb_fasttext.py index 2c2c7aed0..731d2fce4 100644 --- a/examples/text_classification/tutorial_imdb_fasttext.py +++ b/examples/text_classification/tutorial_imdb_fasttext.py @@ -31,8 +31,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import * diff --git a/examples/text_generation/tutorial_generate_text.py b/examples/text_generation/tutorial_generate_text.py index 22a17ea37..4c42d0b12 100644 --- a/examples/text_generation/tutorial_generate_text.py +++ b/examples/text_generation/tutorial_generate_text.py @@ -28,8 +28,8 @@ import nltk import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * diff --git a/examples/text_ptb/tutorial_ptb_lstm.py b/examples/text_ptb/tutorial_ptb_lstm.py index de08399c9..77c7c3425 100644 --- a/examples/text_ptb/tutorial_ptb_lstm.py +++ b/examples/text_ptb/tutorial_ptb_lstm.py @@ -104,8 +104,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py index 0021a7bfc..9fccca66a 100644 --- a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py +++ b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py @@ -105,8 +105,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_word_embedding/tutorial_word2vec_basic.py b/examples/text_word_embedding/tutorial_word2vec_basic.py index 6310699ad..5a1dc842c 100644 --- a/examples/text_word_embedding/tutorial_word2vec_basic.py +++ b/examples/text_word_embedding/tutorial_word2vec_basic.py @@ -44,9 +44,9 @@ import time import numpy as np -import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf import tensorlayer as tl import wget diff --git a/examples/tutorial_work_with_onnx.py b/examples/tutorial_work_with_onnx.py index 522f2ad8c..46fd0cb42 100644 --- a/examples/tutorial_work_with_onnx.py +++ b/examples/tutorial_work_with_onnx.py @@ -117,13 +117,13 @@ import time import numpy as np -import tensorflow as tf -from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph import onnx +import tensorflow as tf import tensorlayer as tl from onnx_tf.backend import prepare from onnx_tf.frontend import tensorflow_graph_to_onnx_model +from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py index 7c7b833c3..4aef4a429 100644 --- a/tensorlayer/activation.py +++ b/tensorlayer/activation.py @@ -3,7 +3,6 @@ """A file containing various activation functions.""" import tensorflow as tf - from tensorlayer.decorators import deprecated __all__ = [ diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py index 252178502..2664d8d72 100644 --- a/tensorlayer/cost.py +++ b/tensorlayer/cost.py @@ -5,8 +5,7 @@ import tensorflow as tf 
from tensorflow.python.framework import ops -from tensorflow.python.ops import standard_ops, math_ops, nn_ops, array_ops - +from tensorflow.python.ops import array_ops, math_ops, nn_ops, standard_ops from tensorlayer import logging __all__ = [ diff --git a/tensorlayer/db.py b/tensorlayer/db.py index cb8db8e10..1de73bf6a 100644 --- a/tensorlayer/db.py +++ b/tensorlayer/db.py @@ -7,15 +7,16 @@ import time from datetime import datetime -import gridfs import numpy as np + +import gridfs import pymongo import tensorflow as tf - from tensorlayer import logging -from tensorlayer.files import net2static_graph, static_graph2net, assign_weights -from tensorlayer.files import save_weights_to_hdf5, load_hdf5_to_weights -from tensorlayer.files import del_folder, exists_or_mkdir +from tensorlayer.files import ( + assign_weights, del_folder, exists_or_mkdir, load_hdf5_to_weights, net2static_graph, save_weights_to_hdf5, + static_graph2net +) class TensorHub(object): @@ -640,7 +641,7 @@ def run_top_task(self, task_name=None, sort=None, **kwargs): logging.info("[Database] Start Task: key: {} sort: {} push time: {}".format(task_name, sort, _datetime)) _script = _script.decode('utf-8') with tf.Graph().as_default(): # # as graph: # clear all TF graphs - exec(_script, globals()) + exec (_script, globals()) # set status to finished _ = self.db.Task.find_one_and_update({'_id': _id}, {'$set': {'status': 'finished'}}) diff --git a/tensorlayer/distributed.py b/tensorlayer/distributed.py index 544aac87e..d3fbdd38f 100644 --- a/tensorlayer/distributed.py +++ b/tensorlayer/distributed.py @@ -6,7 +6,6 @@ import tensorflow as tf from tensorflow.python.training import session_run_hook - from tensorlayer import logging from tensorlayer.decorators import deprecated from tensorlayer.lazy_imports import LazyImport diff --git a/tensorlayer/files/dataset_loaders/celebA_dataset.py b/tensorlayer/files/dataset_loaders/celebA_dataset.py index d5dc5755f..3563d58f9 100644 --- a/tensorlayer/files/dataset_loaders/celebA_dataset.py +++ b/tensorlayer/files/dataset_loaders/celebA_dataset.py @@ -5,7 +5,8 @@ import zipfile from tensorlayer import logging -from tensorlayer.files.utils import (download_file_from_google_drive, exists_or_mkdir, load_file_list) +from tensorlayer.files.utils import (download_file_from_google_drive, + exists_or_mkdir, load_file_list) __all__ = ['load_celebA_dataset'] diff --git a/tensorlayer/files/dataset_loaders/cyclegan_dataset.py b/tensorlayer/files/dataset_loaders/cyclegan_dataset.py index e327b3b4c..6c465f6c5 100644 --- a/tensorlayer/files/dataset_loaders/cyclegan_dataset.py +++ b/tensorlayer/files/dataset_loaders/cyclegan_dataset.py @@ -6,7 +6,8 @@ import numpy as np from tensorlayer import logging, visualize -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, + maybe_download_and_extract) __all__ = ['load_cyclegan_dataset'] diff --git a/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py b/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py index f2e582ae5..9f466c0eb 100644 --- a/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py +++ b/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py @@ -4,9 +4,9 @@ import os from tensorlayer import logging, visualize -from tensorlayer.files.utils import ( - del_file, folder_exists, load_file_list, load_folder_list, maybe_download_and_extract, read_file -) +from tensorlayer.files.utils import (del_file, folder_exists, 
load_file_list, + load_folder_list, + maybe_download_and_extract, read_file) __all__ = ['load_flickr1M_dataset'] diff --git a/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py b/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py index 8049a0653..0492371b0 100644 --- a/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py +++ b/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py @@ -4,9 +4,9 @@ import os from tensorlayer import logging, visualize -from tensorlayer.files.utils import ( - del_file, folder_exists, load_file_list, maybe_download_and_extract, natural_keys, read_file -) +from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, + maybe_download_and_extract, natural_keys, + read_file) __all__ = ['load_flickr25k_dataset'] diff --git a/tensorlayer/files/dataset_loaders/mpii_dataset.py b/tensorlayer/files/dataset_loaders/mpii_dataset.py index a6f88f609..8b90dcdec 100644 --- a/tensorlayer/files/dataset_loaders/mpii_dataset.py +++ b/tensorlayer/files/dataset_loaders/mpii_dataset.py @@ -4,7 +4,8 @@ import os from tensorlayer import logging -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, + maybe_download_and_extract) __all__ = ['load_mpii_pose_dataset'] diff --git a/tensorlayer/files/dataset_loaders/voc_dataset.py b/tensorlayer/files/dataset_loaders/voc_dataset.py index 5584864ae..c5ccadbcf 100644 --- a/tensorlayer/files/dataset_loaders/voc_dataset.py +++ b/tensorlayer/files/dataset_loaders/voc_dataset.py @@ -4,9 +4,10 @@ import os import tensorflow as tf - from tensorlayer import logging, utils -from tensorlayer.files.utils import (del_file, del_folder, folder_exists, load_file_list, maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, del_folder, folder_exists, + load_file_list, + maybe_download_and_extract) __all__ = ['load_voc_dataset'] diff --git a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py index 0261a8581..77c1f93f9 100644 --- a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py +++ b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py @@ -6,7 +6,6 @@ import tarfile from tensorflow.python.platform import gfile - from tensorlayer import logging from tensorlayer.files.utils import maybe_download_and_extract diff --git a/tensorlayer/files/utils.py b/tensorlayer/files/utils.py index 72fcb1824..e4b0f6f8e 100644 --- a/tensorlayer/files/utils.py +++ b/tensorlayer/files/utils.py @@ -1,8 +1,9 @@ #! 
/usr/bin/python # -*- coding: utf-8 -*- +import base64 import gzip -import importlib +import json import math import os import pickle @@ -14,23 +15,20 @@ import time import zipfile +import cloudpickle import h5py import numpy as np -import progressbar import scipy.io as sio -import tensorflow as tf from six.moves import cPickle -from tensorflow.python.platform import gfile +import progressbar +import tensorflow as tf import tensorlayer as tl -from tensorlayer import logging, nlp, utils, visualize - -import cloudpickle -import base64 from tensorflow.python.keras.saving import model_config as model_config_lib -from tensorflow.python.util.tf_export import keras_export +from tensorflow.python.platform import gfile from tensorflow.python.util import serialization -import json +from tensorflow.python.util.tf_export import keras_export +from tensorlayer import logging, nlp, utils, visualize # from six.moves import zip diff --git a/tensorlayer/initializers.py b/tensorlayer/initializers.py index 666777824..f68c05c1d 100644 --- a/tensorlayer/initializers.py +++ b/tensorlayer/initializers.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import numpy as np + import tensorflow as tf __all__ = [ diff --git a/tensorlayer/layers/activation.py b/tensorlayer/layers/activation.py index 44fcc47a9..9abb19ce7 100644 --- a/tensorlayer/layers/activation.py +++ b/tensorlayer/layers/activation.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.activation import leaky_relu6, leaky_twice_relu6 from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/binary_conv.py b/tensorlayer/layers/convolution/binary_conv.py index 23448cf6f..14e5a8721 100644 --- a/tensorlayer/layers/convolution/binary_conv.py +++ b/tensorlayer/layers/convolution/binary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/deformable_conv.py b/tensorlayer/layers/convolution/deformable_conv.py index 5f75bbe15..b9a8224db 100644 --- a/tensorlayer/layers/convolution/deformable_conv.py +++ b/tensorlayer/layers/convolution/deformable_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/depthwise_conv.py b/tensorlayer/layers/convolution/depthwise_conv.py index d6136ede3..4fe4dc34c 100644 --- a/tensorlayer/layers/convolution/depthwise_conv.py +++ b/tensorlayer/layers/convolution/depthwise_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/dorefa_conv.py b/tensorlayer/layers/convolution/dorefa_conv.py index ed9b32dd8..1f8944382 100644 --- a/tensorlayer/layers/convolution/dorefa_conv.py +++ b/tensorlayer/layers/convolution/dorefa_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_conv.py b/tensorlayer/layers/convolution/expert_conv.py index d7e59a0e8..fb27b9df6 100644 --- a/tensorlayer/layers/convolution/expert_conv.py +++ 
b/tensorlayer/layers/convolution/expert_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_deconv.py b/tensorlayer/layers/convolution/expert_deconv.py index cb5cd6773..a1571b2cb 100644 --- a/tensorlayer/layers/convolution/expert_deconv.py +++ b/tensorlayer/layers/convolution/expert_deconv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/group_conv.py b/tensorlayer/layers/convolution/group_conv.py index 34d8c10e6..2923a10ae 100644 --- a/tensorlayer/layers/convolution/group_conv.py +++ b/tensorlayer/layers/convolution/group_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/quan_conv.py b/tensorlayer/layers/convolution/quan_conv.py index e235dfeb4..662df2661 100644 --- a/tensorlayer/layers/convolution/quan_conv.py +++ b/tensorlayer/layers/convolution/quan_conv.py @@ -2,12 +2,12 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, + quantize_weight_overflow) __all__ = ['QuanConv2d'] diff --git a/tensorlayer/layers/convolution/quan_conv_bn.py b/tensorlayer/layers/convolution/quan_conv_bn.py index ef0f9bfda..1c1593373 100644 --- a/tensorlayer/layers/convolution/quan_conv_bn.py +++ b/tensorlayer/layers/convolution/quan_conv_bn.py @@ -3,11 +3,11 @@ import tensorflow as tf from tensorflow.python.training import moving_averages - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, + quantize_weight_overflow) # from tensorlayer.layers.core import LayersConfig diff --git a/tensorlayer/layers/convolution/separable_conv.py b/tensorlayer/layers/convolution/separable_conv.py index b6ae62446..ff67672ba 100644 --- a/tensorlayer/layers/convolution/separable_conv.py +++ b/tensorlayer/layers/convolution/separable_conv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_conv.py b/tensorlayer/layers/convolution/simplified_conv.py index c00ff8fe7..8c8eebece 100644 --- a/tensorlayer/layers/convolution/simplified_conv.py +++ b/tensorlayer/layers/convolution/simplified_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_deconv.py b/tensorlayer/layers/convolution/simplified_deconv.py index 847062859..569fe0810 100644 --- 
a/tensorlayer/layers/convolution/simplified_deconv.py +++ b/tensorlayer/layers/convolution/simplified_deconv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/super_resolution.py b/tensorlayer/layers/convolution/super_resolution.py index 35fee8722..a3f51e2a8 100644 --- a/tensorlayer/layers/convolution/super_resolution.py +++ b/tensorlayer/layers/convolution/super_resolution.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/ternary_conv.py b/tensorlayer/layers/convolution/ternary_conv.py index 9a97c7bec..512350ba5 100644 --- a/tensorlayer/layers/convolution/ternary_conv.py +++ b/tensorlayer/layers/convolution/ternary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/core.py b/tensorlayer/layers/core.py index ce98f156c..8e13631b7 100644 --- a/tensorlayer/layers/core.py +++ b/tensorlayer/layers/core.py @@ -1,17 +1,15 @@ #! /usr/bin/python # -*- coding: utf-8 -*- +import inspect from abc import abstractmethod import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import (deprecated_alias, private_method, protected_method) -from tensorlayer.layers.utils import (get_variable_with_initializer, list_remove_repeat) from tensorlayer.files import utils - -import inspect +from tensorlayer.layers.utils import (get_variable_with_initializer, list_remove_repeat) __all__ = ['Layer', 'ModelLayer', 'LayerList'] diff --git a/tensorlayer/layers/dense/base_dense.py b/tensorlayer/layers/dense/base_dense.py index a5b800f04..bec9d3f6f 100644 --- a/tensorlayer/layers/dense/base_dense.py +++ b/tensorlayer/layers/dense/base_dense.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/binary_dense.py b/tensorlayer/layers/dense/binary_dense.py index 4067ac4c3..74d5208cd 100644 --- a/tensorlayer/layers/dense/binary_dense.py +++ b/tensorlayer/layers/dense/binary_dense.py @@ -3,7 +3,6 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/dorefa_dense.py b/tensorlayer/layers/dense/dorefa_dense.py index 80ae3365c..73069d478 100644 --- a/tensorlayer/layers/dense/dorefa_dense.py +++ b/tensorlayer/layers/dense/dorefa_dense.py @@ -3,7 +3,6 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/dropconnect.py b/tensorlayer/layers/dense/dropconnect.py index d68e6c762..371ed2e6b 100644 --- a/tensorlayer/layers/dense/dropconnect.py +++ b/tensorlayer/layers/dense/dropconnect.py @@ -1,13 +1,13 @@ #! 
/usr/bin/python # -*- coding: utf-8 -*- +import numbers + import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -import numbers __all__ = [ 'DropconnectDense', diff --git a/tensorlayer/layers/dense/quan_dense.py b/tensorlayer/layers/dense/quan_dense.py index 5a2513259..8d5c594c7 100644 --- a/tensorlayer/layers/dense/quan_dense.py +++ b/tensorlayer/layers/dense/quan_dense.py @@ -3,11 +3,11 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, + quantize_weight_overflow) __all__ = [ 'QuanDense', diff --git a/tensorlayer/layers/dense/quan_dense_bn.py b/tensorlayer/layers/dense/quan_dense_bn.py index e647a7e6e..bcbd70950 100644 --- a/tensorlayer/layers/dense/quan_dense_bn.py +++ b/tensorlayer/layers/dense/quan_dense_bn.py @@ -4,11 +4,11 @@ import tensorflow as tf # from tensorlayer.layers.core import LayersConfig from tensorflow.python.training import moving_averages - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, + quantize_weight_overflow) __all__ = [ 'QuanDenseLayerWithBN', diff --git a/tensorlayer/layers/dense/ternary_dense.py b/tensorlayer/layers/dense/ternary_dense.py index 27efb9090..28d84297e 100644 --- a/tensorlayer/layers/dense/ternary_dense.py +++ b/tensorlayer/layers/dense/ternary_dense.py @@ -3,7 +3,6 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dropout.py b/tensorlayer/layers/dropout.py index 3724d8b43..25fe80a36 100644 --- a/tensorlayer/layers/dropout.py +++ b/tensorlayer/layers/dropout.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/embedding.py b/tensorlayer/layers/embedding.py index 80c5cadfa..a82c1a93b 100644 --- a/tensorlayer/layers/embedding.py +++ b/tensorlayer/layers/embedding.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/extend.py b/tensorlayer/layers/extend.py index 42395a537..09d5508db 100644 --- a/tensorlayer/layers/extend.py +++ b/tensorlayer/layers/extend.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/image_resampling.py b/tensorlayer/layers/image_resampling.py index 3b2a2825a..4713200d3 100644 --- a/tensorlayer/layers/image_resampling.py +++ b/tensorlayer/layers/image_resampling.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import 
deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/inputs.py b/tensorlayer/layers/inputs.py index 0330347fe..4f2544b06 100644 --- a/tensorlayer/layers/inputs.py +++ b/tensorlayer/layers/inputs.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer, LayerNode diff --git a/tensorlayer/layers/lambda_layers.py b/tensorlayer/layers/lambda_layers.py index 13bc3ecbe..9b82ad603 100644 --- a/tensorlayer/layers/lambda_layers.py +++ b/tensorlayer/layers/lambda_layers.py @@ -2,11 +2,10 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias -from tensorlayer.layers.core import Layer from tensorlayer.files import utils +from tensorlayer.layers.core import Layer # from tensorlayer.layers.core import TF_GRAPHKEYS_VARIABLES diff --git a/tensorlayer/layers/merge.py b/tensorlayer/layers/merge.py index 346a65962..2509d35a6 100644 --- a/tensorlayer/layers/merge.py +++ b/tensorlayer/layers/merge.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/noise.py b/tensorlayer/layers/noise.py index bd9c2df9c..c658f8e19 100644 --- a/tensorlayer/layers/noise.py +++ b/tensorlayer/layers/noise.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/normalization.py b/tensorlayer/layers/normalization.py index d8cec274c..0de0e8ed1 100644 --- a/tensorlayer/layers/normalization.py +++ b/tensorlayer/layers/normalization.py @@ -2,11 +2,10 @@ # -*- coding: utf-8 -*- import tensorflow as tf +import tensorlayer as tl from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import moving_averages - -import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/padding.py b/tensorlayer/layers/padding.py index db1bbb304..edcb720a5 100644 --- a/tensorlayer/layers/padding.py +++ b/tensorlayer/layers/padding.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/pooling.py b/tensorlayer/layers/pooling.py index 2046de6c5..a22cea358 100644 --- a/tensorlayer/layers/pooling.py +++ b/tensorlayer/layers/pooling.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/quantize.py b/tensorlayer/layers/quantize.py index 3b5b19635..47ad2a088 100644 --- a/tensorlayer/layers/quantize.py +++ b/tensorlayer/layers/quantize.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/recurrent.py b/tensorlayer/layers/recurrent.py index 16b7208d0..2364c6a7d 100644 --- a/tensorlayer/layers/recurrent.py +++ b/tensorlayer/layers/recurrent.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import 
tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/scale.py b/tensorlayer/layers/scale.py index ac1800529..6546d70af 100644 --- a/tensorlayer/layers/scale.py +++ b/tensorlayer/layers/scale.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.initializers import constant from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/shape.py b/tensorlayer/layers/shape.py index f8e7b47db..e308eb0c4 100644 --- a/tensorlayer/layers/shape.py +++ b/tensorlayer/layers/shape.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/spatial_transformer.py b/tensorlayer/layers/spatial_transformer.py index e456625a7..262108a68 100644 --- a/tensorlayer/layers/spatial_transformer.py +++ b/tensorlayer/layers/spatial_transformer.py @@ -2,11 +2,11 @@ # -*- coding: utf-8 -*- import numpy as np +from six.moves import xrange + import tensorflow as tf import tensorlayer as tl -from six.moves import xrange from tensorflow.python.ops import array_ops - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/stack.py b/tensorlayer/layers/stack.py index c31327989..c35e3837f 100644 --- a/tensorlayer/layers/stack.py +++ b/tensorlayer/layers/stack.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/utils.py b/tensorlayer/layers/utils.py index 10cc1fc18..6d411589f 100644 --- a/tensorlayer/layers/utils.py +++ b/tensorlayer/layers/utils.py @@ -2,10 +2,10 @@ # -*- coding: utf-8 -*- import numpy as np + import tensorflow as tf import tensorlayer as tl from tensorflow.python.ops.rnn_cell import LSTMStateTuple - from tensorlayer import logging from tensorlayer.decorators import deprecated, deprecated_alias diff --git a/tensorlayer/logging/contrib/hyperdash.py b/tensorlayer/logging/contrib/hyperdash.py index 122a8c7e5..6e19c8e9b 100644 --- a/tensorlayer/logging/contrib/hyperdash.py +++ b/tensorlayer/logging/contrib/hyperdash.py @@ -4,7 +4,6 @@ from __future__ import absolute_import import hyperdash as hd - import tensorlayer as tl __all__ = ["HyperDashHandler", "monitor", "Experiment", "IPythonMagicsWrapper"] diff --git a/tensorlayer/models/core.py b/tensorlayer/models/core.py index c811b9648..cbcff4bf3 100644 --- a/tensorlayer/models/core.py +++ b/tensorlayer/models/core.py @@ -3,9 +3,8 @@ from queue import Queue import tensorflow as tf -from tensorflow.python.framework import ops as tf_ops - import tensorlayer as tl +from tensorflow.python.framework import ops as tf_ops from tensorlayer import logging from tensorlayer.files import utils from tensorlayer.layers import Layer, ModelLayer diff --git a/tensorlayer/models/mobilenetv1.py b/tensorlayer/models/mobilenetv1.py index 8065eeef3..4908b3d89 100644 --- a/tensorlayer/models/mobilenetv1.py +++ b/tensorlayer/models/mobilenetv1.py @@ -5,7 +5,6 @@ import os import tensorflow as tf - from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) from tensorlayer.layers import (BatchNorm, Conv2d, DepthwiseConv2d, Flatten, 
GlobalMeanPool2d, Input, Reshape) diff --git a/tensorlayer/models/squeezenetv1.py b/tensorlayer/models/squeezenetv1.py index 6d6a70535..a2d7e4304 100644 --- a/tensorlayer/models/squeezenetv1.py +++ b/tensorlayer/models/squeezenetv1.py @@ -5,10 +5,9 @@ import os import tensorflow as tf - from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) -from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, MaxPool2d, Lambda) +from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, Lambda, MaxPool2d) from tensorlayer.models import Model __all__ = [ diff --git a/tensorlayer/models/vgg.py b/tensorlayer/models/vgg.py index 391878c61..06648cb53 100644 --- a/tensorlayer/models/vgg.py +++ b/tensorlayer/models/vgg.py @@ -30,12 +30,12 @@ import os import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.files import assign_weights, maybe_download_and_extract -from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LayerList, MaxPool2d, Lambda) +from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, Lambda, LayerList, MaxPool2d) from tensorlayer.models import Model __all__ = [ diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py index d96a7acf1..ed1ce975d 100755 --- a/tensorlayer/nlp.py +++ b/tensorlayer/nlp.py @@ -11,11 +11,11 @@ from collections import Counter import numpy as np -import tensorflow as tf from six.moves import urllib, xrange -from tensorflow.python.platform import gfile +import tensorflow as tf import tensorlayer as tl +from tensorflow.python.platform import gfile from tensorlayer.lazy_imports import LazyImport nltk = LazyImport("nltk") diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py index e5cbe6bd4..8ddce7316 100644 --- a/tensorlayer/rein.py +++ b/tensorlayer/rein.py @@ -2,9 +2,10 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf from six.moves import xrange +import tensorflow as tf + __all__ = [ 'discount_episode_rewards', 'cross_entropy_reward_loss', diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py index d6b8e6d78..35e054afb 100644 --- a/tensorlayer/utils.py +++ b/tensorlayer/utils.py @@ -11,9 +11,9 @@ from sys import platform as _platform import numpy as np -import tensorflow as tf from sklearn.metrics import accuracy_score, confusion_matrix, f1_score +import tensorflow as tf import tensorlayer as tl __all__ = [ From 4b104a426a9409b3748fa5eefa1bb5d954b952f5 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sat, 18 May 2019 19:11:00 +0100 Subject: [PATCH 03/57] sac added --- .../reinforcement_learning/tutorial_sac.py | 428 ++++++++++-------- .../reinforcement_learning/tutorial_td3.py | 13 +- 2 files changed, 248 insertions(+), 193 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_sac.py b/examples/reinforcement_learning/tutorial_sac.py index 876c726f9..22c31a1d5 100644 --- a/examples/reinforcement_learning/tutorial_sac.py +++ b/examples/reinforcement_learning/tutorial_sac.py @@ -3,6 +3,19 @@ using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net add alpha loss compared with version 1 paper: https://arxiv.org/pdf/1812.05905.pdf +Actor policy is stochastic. 
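+It samples actions from a tanh-squashed Gaussian whose mean and log-std are produced by the policy net.
+A rough sketch of what `get_action` below does (assuming `Normal` is the tensorflow-probability Normal
+distribution used throughout this file):
+    z = Normal(0, 1).sample()
+    action = action_range * tf.math.tanh(mean + std * z)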
+ +Env: Openai Gym Pendulum-v0, continuous action space + +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 + +&& +pip install box2d box2d-kengz --user + +To run: +python tutorial_sac.py --train/test ''' import argparse @@ -43,7 +56,6 @@ args = parser.parse_args() - class ReplayBuffer: def __init__(self, capacity): self.capacity = capacity @@ -89,21 +101,13 @@ def _reverse_action(self, action): return action -def plot(frame_idx, rewards, predict_qs): - clear_output(True) - plt.figure(figsize=(20,5)) - plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) - plt.plot(rewards) - plt.plot(predict_qs) - plt.savefig('sac.png') - # plt.show() class SoftQNetwork(Model): def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): super(SoftQNetwork, self).__init__() input_dim = num_inputs + num_actions - # w_init = tf.keras.initializers.glorot_normal(seed=None) - w_init = tf.random_uniform_initializer(-init_w, init_w) + w_init = tf.keras.initializers.glorot_normal(seed=None) # glorot initialization is better than uniform in practice + # w_init = tf.random_uniform_initializer(-init_w, init_w) self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') @@ -116,50 +120,37 @@ def forward(self, input): return x -class PolicyNetwork(nn.Module): +class PolicyNetwork(Model): def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): super(PolicyNetwork, self).__init__() self.log_std_min = log_std_min self.log_std_max = log_std_max - # w_init = tf.keras.initializers.glorot_normal(seed=None) - w_init = tf.random_uniform_initializer(-init_w, init_w) + w_init = tf.keras.initializers.glorot_normal(seed=None) + # w_init = tf.random_uniform_initializer(-init_w, init_w) self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') self.mean_linear = Dense(n_units=num_actions, W_init=w_init, \ - b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_mean') self.log_std_linear = Dense(n_units=num_actions, W_init=w_init, \ - b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') - - - # self.mean_linear = nn.Linear(hidden_size, num_actions) - # self.mean_linear.weight.data.uniform_(-init_w, init_w) - # self.mean_linear.bias.data.uniform_(-init_w, init_w) - - # self.log_std_linear = nn.Linear(hidden_size, num_actions) - # self.log_std_linear.weight.data.uniform_(-init_w, init_w) - # self.log_std_linear.bias.data.uniform_(-init_w, init_w) + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_logstd') self.action_range = action_range self.num_actions = num_actions def forward(self, state): - # x = F.relu(self.linear1(state)) - # x = F.relu(self.linear2(x)) - # x = F.relu(self.linear3(x)) - # x = F.relu(self.linear4(x)) x = self.linear1(state) x = self.linear2(x) x = self.linear3(x) mean = self.mean_linear(x) log_std = self.log_std_linear(x) - log_std = torch.clamp(log_std, 
self.log_std_min, self.log_std_max) + log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) return mean, log_std @@ -173,209 +164,270 @@ def evaluate(self, state, epsilon=1e-6): z = normal.sample() action_0 = tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick action = self.action_range*action_0 - log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. - action_0.pow(2) + epsilon) - np.log(self.action_range) + # according to original paper, with an extra last term for normalizing different action range + log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. - action_0**2 + epsilon) - np.log(self.action_range) # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, - # needs sum up across the features dim to get 1 dim prob; or else use Multivariate Normal. - log_prob = tf.reduce_sum(log_prob, axis=1) + # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + return action, log_prob, z, mean, log_std def get_action(self, state, deterministic): + ''' generate action with state for interaction with envronment ''' mean, log_std = self.forward([state]) std = tf.math.exp(log_std) normal = Normal(0, 1) z = normal.sample() - action = self.action_range * tf.math.tanh(mean + std*z) + action = self.action_range * tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick - action = self.action_range*mean.detach().cpu().numpy()[0] if deterministic else action.detach().cpu().numpy()[0] - return action + action = self.action_range*mean if deterministic else action + return action.numpy()[0] def sample_action(self,): - a=torch.FloatTensor(self.num_actions).uniform_(-1, 1) + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + return self.action_range*a.numpy() class SAC_Trainer(): - def __init__(self, replay_buffer, hidden_dim, action_range): + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, policy_lr = 3e-4, alpha_lr = 3e-4): self.replay_buffer = replay_buffer - self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) - self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) - self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) - self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(device) - self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range).to(device) - self.log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device) + # initialize all networks + self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') + self.alpha = tf.math.exp(self.log_alpha) print('Soft Q Network (1,2): ', self.soft_q_net1) print('Policy Network: ', self.policy_net) - for target_param, param in zip(self.target_soft_q_net1.parameters(), 
self.soft_q_net1.parameters()): - target_param.data.copy_(param.data) - for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): - target_param.data.copy_(param.data) - - self.soft_q_criterion1 = nn.MSELoss() - self.soft_q_criterion2 = nn.MSELoss() - - soft_q_lr = 3e-4 - policy_lr = 3e-4 - alpha_lr = 3e-4 - - self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=soft_q_lr) - self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=soft_q_lr) - self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) - self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + # initialize weights of target networks + self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) + self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) + self.soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + self.soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) + # self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2): + ''' update all networks in SAC ''' state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) - # print('sample:', state, action, reward, done) - state = torch.FloatTensor(state).to(device) - next_state = torch.FloatTensor(next_state).to(device) - action = torch.FloatTensor(action).to(device) - reward = torch.FloatTensor(reward).unsqueeze(1).to(device) # reward is single value, unsqueeze() to add one dim to be [reward] at the sample dim; - done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(device) + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] - predicted_q_value1 = self.soft_q_net1(state, action) - predicted_q_value2 = self.soft_q_net2(state, action) - new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) - new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std - reward = reward_scale * (reward - reward.mean(dim=0)) /reward.std(dim=0) # normalize with batch mean and std - # Updating alpha wrt entropy - # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) - if auto_entropy is True: - alpha_loss = -(self.log_alpha * (log_prob + target_entropy).detach()).mean() - # print('alpha loss: ',alpha_loss) - self.alpha_optimizer.zero_grad() - alpha_loss.backward() - self.alpha_optimizer.step() - self.alpha = self.log_alpha.exp() - else: - self.alpha = 1. 
- alpha_loss = 0 # Training Q Function - target_q_min = torch.min(self.target_soft_q_net1(next_state, new_next_action),self.target_soft_q_net2(next_state, new_next_action)) - self.alpha * next_log_prob + new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum(self.target_soft_q_net1(target_q_input),self.target_soft_q_net2(target_q_input)) - self.alpha * next_log_prob target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward - q_value_loss1 = self.soft_q_criterion1(predicted_q_value1, target_q_value.detach()) # detach: no gradients for the variable - q_value_loss2 = self.soft_q_criterion2(predicted_q_value2, target_q_value.detach()) - - - self.soft_q_optimizer1.zero_grad() - q_value_loss1.backward() - self.soft_q_optimizer1.step() - self.soft_q_optimizer2.zero_grad() - q_value_loss2.backward() - self.soft_q_optimizer2.step() + q_input = tf.concat([state, action], 1) # the dim 0 is number of samples + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.soft_q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) + self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.soft_q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) + self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) # Training Policy Function - predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),self.soft_q_net2(state, new_action)) - policy_loss = (self.alpha * log_prob - predicted_new_q_value).mean() - - self.policy_optimizer.zero_grad() - policy_loss.backward() - self.policy_optimizer.step() - - # print('q loss: ', q_value_loss1, q_value_loss2) - # print('policy loss: ', policy_loss ) + with tf.GradientTape() as p_tape: + new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) + new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples + ''' implementation 1 ''' + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input),self.soft_q_net2(new_q_input)) + ''' implementation 2 ''' + # predicted_new_q_value = self.soft_q_net1(new_q_input) + policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + with tf.GradientTape() as alpha_tape: + alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) + alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) + self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) + self.alpha = tf.math.exp(self.log_alpha) + else: # fixed alpha + self.alpha = 1. 
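+            # fixed-alpha branch: no temperature update is performed, so alpha_loss below is only a placeholder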
+ alpha_loss = 0 + # Soft update the target value nets + self.target_soft_q_net1=self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2=self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + + def save_weights(self): # save trained weights + tl.files.save_npz(self.soft_q_net1.trainable_weights, name='model_q_net1.npz') + tl.files.save_npz(self.soft_q_net2.trainable_weights, name='model_q_net2.npz') + tl.files.save_npz(self.target_soft_q_net1.trainable_weights, name='model_target_q_net1.npz') + tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') + tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + + def load_weights(self): # load trained weights + tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) + tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) + tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) + tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_soft_q_net2) + tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + +def plot(frame_idx, rewards): + clear_output(True) + plt.figure(figsize=(20,5)) + plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) + plt.plot(rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + plt.savefig('sac.png') + # plt.show() - # Soft update the target value net - for target_param, param in zip(self.target_soft_q_net1.parameters(), self.soft_q_net1.parameters()): - target_param.data.copy_( # copy data value into target parameters - target_param.data * (1.0 - soft_tau) + param.data * soft_tau - ) - for target_param, param in zip(self.target_soft_q_net2.parameters(), self.soft_q_net2.parameters()): - target_param.data.copy_( # copy data value into target parameters - target_param.data * (1.0 - soft_tau) + param.data * soft_tau - ) - return predicted_new_q_value.mean() +# choose env +ENV = 'Pendulum-v0' +env = NormalizedActions(gym.make(ENV)) +action_dim = env.action_space.shape[0] +state_dim = env.observation_space.shape[0] +action_range=1. -replay_buffer_size = 1e6 +replay_buffer_size = 5e5 replay_buffer = ReplayBuffer(replay_buffer_size) -# choose env -ENV = ['Pendulum', 'Reacher'][0] -if ENV == 'Reacher': - NUM_JOINTS=2 - LINK_LENGTH=[200, 140] - INI_JOING_ANGLES=[0.1, 0.1] - # NUM_JOINTS=4 - # LINK_LENGTH=[200, 140, 80, 50] - # INI_JOING_ANGLES=[0.1, 0.1, 0.1, 0.1] - SCREEN_SIZE=1000 - SPARSE_REWARD=False - SCREEN_SHOT=False - action_range = 10.0 - - env=Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths = LINK_LENGTH, \ - ini_joint_angles=INI_JOING_ANGLES, target_pos = [369,430], render=True, change_goal=False) - action_dim = env.num_actions - state_dim = env.num_observations -elif ENV == 'Pendulum': - env = NormalizedActions(gym.make("Pendulum-v0")) - action_dim = env.action_space.shape[0] - state_dim = env.observation_space.shape[0] - action_range=1. 
- # hyper-parameters for RL training -max_frames = 40000 -max_steps = 20 if ENV == 'Reacher' else 150 # Pendulum needs 150 steps per episode to learn well, cannot handle 20 -frame_idx = 0 -batch_size = 256 -explore_steps = 200 # for random action sampling in the beginning of training -update_itr = 1 -AUTO_ENTROPY=True -DETERMINISTIC=False -hidden_dim = 512 -rewards = [] -predict_qs = [] - - -sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range ) +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +# explore_noise_scale = 1.0 # range of action noise for exploration +# eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. # value range of reward + +AUTO_ENTROPY=True # automatically udpating variable alpha for entropy +DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic + + +sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ +soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) + +#set train mode +sac_trainer.soft_q_net1.train() +sac_trainer.soft_q_net2.train() +sac_trainer.target_soft_q_net1.train() +sac_trainer.target_soft_q_net2.train() +sac_trainer.policy_net.train() + # training loop -while frame_idx < max_frames: - if ENV == 'Reacher': - state = env.reset(SCREEN_SHOT) - elif ENV == 'Pendulum': +if args.train: + frame_idx = 0 + rewards = [] + while frame_idx < max_frames: state = env.reset() - episode_reward = 0 - predict_q = 0 - - - for step in range(max_steps): - if frame_idx > explore_steps: + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) + + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) + sac_trainer.save_weights() + +if args.test: + frame_idx = 0 + rewards = [] + sac_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call to make inside functions 
be able to use forward + + + for step in range(max_steps): action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) - else: - action = sac_trainer.policy_net.sample_action() - if ENV == 'Reacher': - next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT) - elif ENV == 'Pendulum': - next_state, reward, done, _ = env.step(action) - env.render() + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 - replay_buffer.push(state, action, reward, next_state, done) - - state = next_state - episode_reward += reward - frame_idx += 1 - - - if len(replay_buffer) > batch_size: - for i in range(update_itr): - predict_q=sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) - - if frame_idx % batch_size == 0: - plot(frame_idx, rewards, predict_qs) - - if done: - break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) - rewards.append(episode_reward) - predict_qs.append(predict_q) + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_td3.py b/examples/reinforcement_learning/tutorial_td3.py index 52ae022ff..63c5dbf0c 100644 --- a/examples/reinforcement_learning/tutorial_td3.py +++ b/examples/reinforcement_learning/tutorial_td3.py @@ -12,6 +12,9 @@ && pip install box2d box2d-kengz --user + +To run: +python tutorial_td3.py --train/test ''' import argparse import math @@ -252,7 +255,7 @@ def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft new_action = self.policy_net.evaluate(state, eval_noise_scale=0.0) # no noise, deterministic policy gradients new_q_input = tf.concat([state, new_action], 1) ''' implementation 1 ''' - # predicted_new_q_value = torch.min(self.q_net1(new_q_input),self.q_net2(new_q_input)) + # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input)) ''' implementation 2 ''' predicted_new_q_value = self.q_net1(new_q_input) policy_loss = - tf.reduce_mean(predicted_new_q_value) @@ -309,12 +312,12 @@ def plot(frame_idx, rewards): test_frames = 300 # total number of steps for testing max_steps = 150 # maximum number of steps for one episode batch_size = 64 # udpate batchsize -explore_steps = 0 # 500 for random action sampling in the beginning of training -update_itr = 3 # delayed steps for updating the policy network and target networks +explore_steps = 500 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step hidden_dim = 32 # size of hidden layers for networks q_lr = 3e-4 # q_net learning rate policy_lr = 3e-4 # policy_net learning rate -policy_target_update_interval = 3 # delayed update for the policy network and target networks +policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks explore_noise_scale = 1.0 # range of action noise for exploration eval_noise_scale = 0.5 # range of action noise for evaluation of action value reward_scale = 1. 
# value range of reward @@ -339,7 +342,7 @@ def plot(frame_idx, rewards): episode_reward = 0 if frame_idx <1 : print('intialize') - _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + _=td3_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward _=td3_trainer.target_policy_net([state]) From 149f0b9cd4f596b4965f4d1a4133c4964958ebb0 Mon Sep 17 00:00:00 2001 From: Officium Date: Sun, 19 May 2019 07:05:21 +0800 Subject: [PATCH 04/57] add tutorial_wrappers --- .../tutorial_wrappers.py | 541 ++++++++++++++++++ 1 file changed, 541 insertions(+) create mode 100644 examples/reinforcement_learning/tutorial_wrappers.py diff --git a/examples/reinforcement_learning/tutorial_wrappers.py b/examples/reinforcement_learning/tutorial_wrappers.py new file mode 100644 index 000000000..eec4b7433 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_wrappers.py @@ -0,0 +1,541 @@ +"""Env wrappers +Note that this file is adapted from `https://pypi.org/project/gym-vec-env` and +`https://github.com/openai/baselines/blob/master/baselines/common/*wrappers.py` +""" +from collections import deque +from functools import partial +from multiprocessing import cpu_count, Process, Pipe +from sys import platform + +import cv2 +import gym +import numpy as np +from gym import spaces + + +__all__ = ( + 'build_env', # build env + 'TimeLimit', # Time limit wrapper + 'NoopResetEnv', # Run random number of no-ops on reset + 'FireResetEnv', # Reset wrapper for envs with fire action + 'EpisodicLifeEnv', # end-of-life == end-of-episode wrapper + 'MaxAndSkipEnv', # skip frame wrapper + 'ClipRewardEnv', # clip reward wrapper + 'WarpFrame', # warp observation wrapper + 'FrameStack', # stack frame wrapper + 'LazyFrames', # lazy store wrapper + 'RewardScaler', # reward scale + 'SubprocVecEnv', # vectorized env wrapper + 'VecFrameStack', # stack frames in vectorized env + 'Monitor', # Episode reward and length monitor +) +cv2.ocl.setUseOpenCL(False) +# env_id -> env_type +id2type = dict() +for _env in gym.envs.registry.all(): + id2type[_env.id] = _env._entry_point.split(':')[0].rsplit('.', 1)[1] + + +def build_env(env_id, vectorized=False, seed=0, reward_scale=1.0, nenv=0): + """Build env based on options""" + env_type = id2type[env_id] + nenv = nenv or cpu_count() // (1 + (platform == 'darwin')) + stack = env_type == 'atari' + if not vectorized: + env = _make_env(env_id, env_type, seed, reward_scale, stack) + else: + env = _make_vec_env(env_id, env_type, nenv, seed, reward_scale, stack) + + return env + + +def _make_env(env_id, env_type, seed, reward_scale, frame_stack=True): + """Make single env""" + if env_type == 'atari': + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + env = Monitor(env) + # deepmind wrap + env = EpisodicLifeEnv(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env) + env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) + elif env_type == 'classic_control': + env = Monitor(gym.make(env_id)) + else: + raise NotImplementedError + if reward_scale != 1: + env = RewardScaler(env, reward_scale) + env.seed(seed) + return env + + +def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): + """Make vectorized env""" + env = SubprocVecEnv([ + partial(_make_env, env_id, env_type, seed + i, reward_scale, False) + for i in range(nenv) + ]) + if 
frame_stack: + env = VecFrameStack(env, 4) + return env + + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. + """ + super(NoopResetEnv, self).__init__(env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + super(EpisodicLifeEnv, self).__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + # for Qbert sometimes we stay in lives == 0 condition for a few + # frames so it's important to keep lives > 0, so that we only reset + # once the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
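+        A full `env.reset()` is issued only when the real episode has ended;
+        otherwise a no-op step (action 0) advances past the lost-life state.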
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + shape = (2, ) + env.observation_space.shape + self._obs_buffer = np.zeros(shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = info = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + super(ClipRewardEnv, self).__init__(env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, width=84, height=84, grayscale=True): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + super(WarpFrame, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + shape = (self.height, self.width, 1 if self.grayscale else 3) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=np.uint8 + ) + + def observation(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + size = (self.width, self.height) + frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) + if self.grayscale: + frame = np.expand_dims(frame, -1) + return frame + + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. + See Also `LazyFrames` + """ + super(FrameStack, self).__init__(env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=env.observation_space.dtype + ) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class LazyFrames(object): + def __init__(self, frames): + """This object ensures that common frames between the observations are + only stored once. It exists purely to optimize memory usage which can be + huge for DQN's 1M frames replay buffers. + + This object should only be converted to numpy array before being passed + to the model. You'd not believe how complex the previous solution was. 
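+        Conversion is lazy: calling `np.asarray(lazy_frames)` triggers a one-time
+        concatenation of the stored frames along the last axis.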
+ """ + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + +class RewardScaler(gym.RewardWrapper): + """Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance drastically. + """ + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + + +class VecFrameStack(object): + def __init__(self, env, k): + self.env = env + self.k = k + self.action_space = env.action_space + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=env.observation_space.dtype + ) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +def _worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send(ob) + elif cmd == 'reset_task': + ob = env._reset_task() + remote.send(ob) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents + """ + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class SubprocVecEnv(object): + def __init__(self, env_fns): + """ + envs: list of gym environments to run in subprocesses + """ + self.num_envs = len(env_fns) + + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.nenvs = nenvs + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + zipped_args = zip(self.work_remotes, self.remotes, env_fns) + self.ps = [ + Process(target=_worker, + args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zipped_args + ] + + for p in self.ps: + # if the main process crashes, we should not cause things to hang + p.daemon = True + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, action_space = self.remotes[0].recv() + self.observation_space = observation_space + self.action_space = action_space + + def _step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + You should not call this if a step_async run is + already pending. 
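+        Each action is sent to its worker process over a pipe; the public
+        `step()` method simply calls this and then `_step_wait()`.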
+ """ + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def _step_wait(self): + """ + Wait for the step taken with step_async(). + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a tuple of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + """ + Reset all the environments and return an array of + observations, or a tuple of observation arrays. + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. + """ + for remote in self.remotes: + remote.send(('reset', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def _reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def __len__(self): + return self.nenvs + + def step(self, actions): + self._step_async(actions) + return self._step_wait() + + +class Monitor(gym.Wrapper): + def __init__(self, env): + super(Monitor, self).__init__(env) + self._monitor_rewards = None + + def reset(self, **kwargs): + self._monitor_rewards = [] + return self.env.reset(**kwargs) + + def step(self, action): + o_, r, done, info = self.env.step(action) + self._monitor_rewards.append(r) + if done: + info['episode'] = { + 'r': sum(self._monitor_rewards), + 'l': len(self._monitor_rewards)} + return o_, r, done, info + + +def unit_test(): + env_id = 'CartPole-v0' + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, False) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + done = False + while not done: + a = unwrapped_env.action_space.sample() + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + env_id = 'PongNoFrameskip-v4' + nenv = 2 + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, True, nenv=nenv) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + for _ in range(1000): + a = [unwrapped_env.action_space.sample() for _ in range(nenv)] + a = np.asarray(a, 'int64') + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + +if __name__ == '__main__': + unit_test() From d23bae6eaff2824d36189fc98f4fda39eee221e2 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 19 May 2019 16:08:44 +0100 Subject: [PATCH 05/57] add readme --- examples/reinforcement_learning/README.md | 111 +++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index d99b5fec3..ad5aeb479 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -1,2 +1,111 @@ +# Reinforcement Learning Tutorial with Tensorlayer -### More examples can be found in [example 
List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) +This folder contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0. + +## Prerequisites: + +* python 3.5 + +* tensorflow >= 2.0.0 + +* tensorlayer >= 2.0.0 + +* tensorflow-probability + +* tf-nightly-2.0-preview + +## To Use: + +`python ***.py` + +or `python ***.py --train` for training and `python ***.py --test` for testing. + +## Table of Contents: + +* Q-learning + + Code: `./tutorial_frozenlake_q_table.py` + + + +* Deep Q-Network (DQN) + + Code: `./tutorial_frozenlake_dqn.py` + + + +* Double DQN / Dueling DQN / Noisy DQN + + To do. + + + +* Distributed DQN + + To do. + + + +* Actor-Critic (AC) + + Code:`./tutorial_cartpole_ac.py` + + + +* Asynchronous Advantage Actor-Critic (A3C) + + Code: `./tutorial_bipedalwalker_a3c_continuous_action.py` + + + +* Soft Actor-Critic (SAC) + + Code: `./tutorial_sac.py` + + Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) + + + +* Deep Deterministic Policy Gradient (DDPG) + + To do. + + + +* Twin Delayed DDPG (TD3) + + Code: `./tutorial_td3.py` + + Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) + + + +* Hindsight Experience Replay (HER) + + To do. + + + +* Trust Region Policy Optimization (TRPO) + + To do. + + + +* Proximal Policy Optimization (PPO) + + To do. + + + +* etc + +## Environment: + +[Openai Gym](https://gym.openai.com/) + +Our env wrapper: `./tutorial_wrappers.py` + + + +### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) \ No newline at end of file From 6a3950f0369ffa7ef3dadf31b26022c95e650613 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Mon, 20 May 2019 14:00:29 +0100 Subject: [PATCH 06/57] readme --- examples/reinforcement_learning/README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index ad5aeb479..486b52967 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -22,6 +22,25 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. ## Table of Contents: +| Algorithms | Observation Space | Action Space | +| ------------ | ----------------- | ------------ | +| Q-learning | Discrete | Discrete | +| DQN | Discrete | Discrete | +| Actor-Critic | Continuous | Discrete | +| A3C | Continuous | Continuous | +| SAC | Continuous | Continuous | +| DDPG | Continuous | Continuous | +| TD3 | Continuous | C | +| HER | | | +| TRPO | | | +| PPO | | | +| | | | +| | | | +| | | | +| | | | + + + * Q-learning Code: `./tutorial_frozenlake_q_table.py` From 782b71fd27cd12eaf76945d521bee0976a115cfe Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Mon, 20 May 2019 14:01:55 +0100 Subject: [PATCH 07/57] readme --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 486b52967..5bb8d945f 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -30,7 +30,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
| A3C | Continuous | Continuous | | SAC | Continuous | Continuous | | DDPG | Continuous | Continuous | -| TD3 | Continuous | C | +| TD3 | Continuous | Continuous | | HER | | | | TRPO | | | | PPO | | | From 54a7f16a43f23a99c988e7a572cfb43d02a674f5 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 21 May 2019 11:14:23 +0100 Subject: [PATCH 08/57] bug --- examples/reinforcement_learning/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 5bb8d945f..56ed726c3 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -5,15 +5,15 @@ This folder contains implementation of most popular reinforcement learning algor ## Prerequisites: * python 3.5 - * tensorflow >= 2.0.0 - * tensorlayer >= 2.0.0 - * tensorflow-probability - * tf-nightly-2.0-preview +*** If you meet problem `AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after install tensorflow-probability, try: + +`pip install --upgrade tf-nightly-2.0-preview tfp-nightly` + ## To Use: `python ***.py` From 28565b0b2a5673f2d4f262c779df5347e3c365cd Mon Sep 17 00:00:00 2001 From: Zihan Ding <1402434478@qq.com> Date: Tue, 21 May 2019 11:16:39 +0100 Subject: [PATCH 09/57] Update README.md --- examples/reinforcement_learning/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 56ed726c3..dce82b8c8 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -6,7 +6,7 @@ This folder contains implementation of most popular reinforcement learning algor * python 3.5 * tensorflow >= 2.0.0 -* tensorlayer >= 2.0.0 +* tensorlayer >= 2.0.1 * tensorflow-probability * tf-nightly-2.0-preview @@ -127,4 +127,4 @@ Our env wrapper: `./tutorial_wrappers.py` -### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) \ No newline at end of file +### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From 120ddef4f9729dbf6baf8c728146b0f1679727a4 Mon Sep 17 00:00:00 2001 From: Officium Date: Tue, 21 May 2019 19:25:43 +0800 Subject: [PATCH 10/57] add double, dueling, noisy dqn --- examples/reinforcement_learning/README.md | 4 +- .../tutorial_double_dueling_noisy_dqn.py | 291 ++++++++++++++++++ 2 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index dce82b8c8..96392e0df 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -55,7 +55,9 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. * Double DQN / Dueling DQN / Noisy DQN - To do. + Code: `./tutorial_double_dueling_noisy_dqn.py` + + Experiment Environments: Pong and Cartpole diff --git a/examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py b/examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py new file mode 100644 index 000000000..4cc6d3bf5 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py @@ -0,0 +1,291 @@ +"""Implement following enhanced deep q-learning algorithms +1. 
Double DQN + Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double + q-learning[C]//Thirtieth AAAI Conference on Artificial Intelligence. 2016. + +2. Dueling DQN + Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep + reinforcement learning[J]. arXiv preprint arXiv:1511.06581, 2015. + +3. Noisy DQN + Plappert M, Houthooft R, Dhariwal P, et al. Parameter space noise for + exploration[J]. arXiv preprint arXiv:1706.01905, 2017. + +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import random +import time + +import numpy as np +import tensorflow as tf +import tensorlayer as tl + +from tutorial_wrappers import build_env + + +seed = 0 +env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +env = build_env(env_id, seed=seed) +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +noise_update_freq = 50 # how frequency param noise net update + + +class MLP(tl.models.Model): + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', + W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=64, name='s', + W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 0 + + def forward(self, ni): + feature = self.h1(ni) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(feature) + svalue = self.svalue(feature) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + out = svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + return out + + +class CNN(tl.models.Model): + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', + in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform()) + self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', + in_channels=32, name='conv2d_2', + 
W_init=tf.initializers.GlorotUniform()) + self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', + in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform()) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_q', + W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', + W_init=tf.initializers.GlorotUniform()) + self.pres = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_s', + W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=256, name='state', + W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 0 + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(self.preq(feature)) + svalue = self.svalue(self.pres(feature)) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + + +class ReplayBuffer(object): + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +def log_softmax(x, dim): + temp = x - np.max(x, dim, keepdims=True) + return temp - np.log(np.exp(temp).sum(dim, keepdims=True)) + + +def softmax(x, dim): + temp = np.exp(x - np.max(x, dim, keepdims=True)) + return temp / temp.sum(dim, keepdims=True) + + +qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') +qnet.train() +trainabel_weights = qnet.trainable_weights +targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') +targetqnet.infer() +sync(qnet, targetqnet) +optimizer = tf.optimizers.Adam(learning_rate=lr) +buffer = ReplayBuffer(buffer_size) + +o = env.reset() +nepisode = 0 +t = time.time() +noise_scale = 1e-2 +for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() 
* out_dim) + else: + # noise schedule is based on KL divergence between perturbed and + # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + if i < explore_timesteps: + qnet.noise_scale = noise_scale + q_ptb = qnet(obv).numpy() + qnet.noise_scale = 0 + if i % noise_update_freq == 0: + q = qnet(obv).numpy() + kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) + kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() + kl_explore = -np.log(1 - eps + eps / out_dim) + if kl_ptb < kl_explore: + noise_scale *= 1.01 + else: + noise_scale /= 1.01 + a = q_ptb.argmax(1)[0] + else: + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # double q estimation + b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) + b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean(huber_loss(b_q - (b_r + reward_gamma * b_q_))) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() From 63a1f933e5a83fb43de4a4c93b33333786d5b2ec Mon Sep 17 00:00:00 2001 From: Zihan Ding <1402434478@qq.com> Date: Tue, 21 May 2019 14:05:39 +0100 Subject: [PATCH 11/57] Update README.md --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 96392e0df..b645db34f 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -10,7 +10,7 @@ This folder contains implementation of most popular reinforcement learning algor * tensorflow-probability * tf-nightly-2.0-preview -*** If you meet problem `AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after install tensorflow-probability, try: +*** If you meet the error`AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after installing tensorflow-probability, try: `pip install --upgrade tf-nightly-2.0-preview tfp-nightly` From 38470ac69cf96d93053576a4dc23697338207286 Mon Sep 17 00:00:00 2001 From: Zihan Ding <1402434478@qq.com> Date: Tue, 21 May 2019 14:08:27 +0100 Subject: [PATCH 12/57] Update tutorial_sac.py --- examples/reinforcement_learning/tutorial_sac.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_sac.py b/examples/reinforcement_learning/tutorial_sac.py index 22c31a1d5..508d36520 100644 --- a/examples/reinforcement_learning/tutorial_sac.py +++ b/examples/reinforcement_learning/tutorial_sac.py @@ -1,7 +1,8 @@ ''' -Soft 
Actor-Critic version 2 +Soft Actor-Critic using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net -add alpha loss compared with version 1 +adding alpha loss + paper: https://arxiv.org/pdf/1812.05905.pdf Actor policy is stochastic. From 8d6afdb7fbd5fafc3bc5d3af4ce15b645baa3b5c Mon Sep 17 00:00:00 2001 From: huangyanhua Date: Wed, 22 May 2019 17:02:38 +0800 Subject: [PATCH 13/57] add prioritized replay --- examples/reinforcement_learning/README.md | 6 + .../tutorial_prioritized_replay.py | 478 ++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 examples/reinforcement_learning/tutorial_prioritized_replay.py diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index b645db34f..f2732f071 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -59,6 +59,12 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Experiment Environments: Pong and Cartpole + +* Prioritized replay + + Code: `./tutorial_prioritized_replay.py` + + Experiment Environments: Pong and Cartpole * Distributed DQN diff --git a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py new file mode 100644 index 000000000..3fd72160f --- /dev/null +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -0,0 +1,478 @@ +"""Implement prioritized replay +Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv +preprint arXiv:1511.05952, 2015. + +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import operator +import random +import time + +import numpy as np +import tensorflow as tf +import tensorlayer as tl + +from tutorial_wrappers import build_env + + +seed = 0 +env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +env = build_env(env_id, seed=seed) +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +prioritized_replay_alpha = 0.6 +prioritized_replay_beta0 = 0.4 + + +class MLP(tl.models.Model): + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', + W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=64, name='s', + W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 
0 + + def forward(self, ni): + feature = self.h1(ni) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(feature) + svalue = self.svalue(feature) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + out = svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + return out + + +class CNN(tl.models.Model): + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', + in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform()) + self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', + in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform()) + self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', + in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform()) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_q', + W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', + W_init=tf.initializers.GlorotUniform()) + self.pres = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_s', + W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=256, name='state', + W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 0 + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(self.preq(feature)) + svalue = self.svalue(self.pres(feature)) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + + +class SegmentTree(object): + def __init__(self, capacity, operation, neutral_element): + """Build a Segment Tree data structure. + + https://en.wikipedia.org/wiki/Segment_tree + + Can be used as regular array, but with two + important differences: + + a) setting item's value is slightly slower. + It is O(lg capacity) instead of O(1). + b) user has access to an efficient ( O(log segment size) ) + `reduce` operation which reduces `operation` over + a contiguous subsequence of items in the array. + + Paramters + --------- + capacity: int + Total size of the array - must be a power of two. + operation: lambda obj, obj -> obj + and operation for combining elements (eg. sum, max) + must form a mathematical group together with the set of + possible values for array elements (i.e. be associative) + neutral_element: obj + neutral element for the operation above. eg. float('-inf') + for max and 0 for sum. 
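+
+        A minimal usage sketch (illustrative only, not from the paper), using
+        the sum operation:
+
+            tree = SegmentTree(4, operator.add, 0.0)
+            tree[0], tree[1], tree[2] = 1.0, 2.0, 7.0
+            tree.reduce(0, 2)  # 3.0, i.e. operator.add over arr[0] and arr[1]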
+ """ + assert capacity > 0 and capacity & (capacity - 1) == 0, \ + "capacity must be positive and a power of 2." + self._capacity = capacity + self._value = [neutral_element for _ in range(2 * capacity)] + self._operation = operation + + def _reduce_helper(self, start, end, node, node_start, node_end): + if start == node_start and end == node_end: + return self._value[node] + mid = (node_start + node_end) // 2 + if end <= mid: + return self._reduce_helper(start, end, 2 * node, node_start, mid) + else: + if mid + 1 <= start: + return self._reduce_helper(start, end, + 2 * node + 1, mid + 1, node_end) + else: + return self._operation( + self._reduce_helper(start, mid, + 2 * node, node_start, mid), + self._reduce_helper(mid + 1, end, + 2 * node + 1, mid + 1, node_end) + ) + + def reduce(self, start=0, end=None): + """Returns result of applying `self.operation` + to a contiguous subsequence of the array. + + Parameters + ---------- + start: int + beginning of the subsequence + end: int + end of the subsequences + + Returns + ------- + reduced: obj + result of reducing self.operation over the specified range of array. + """ + if end is None: + end = self._capacity + if end < 0: + end += self._capacity + end -= 1 + return self._reduce_helper(start, end, 1, 0, self._capacity - 1) + + def __setitem__(self, idx, val): + # index of the leaf + idx += self._capacity + self._value[idx] = val + idx //= 2 + while idx >= 1: + self._value[idx] = self._operation( + self._value[2 * idx], + self._value[2 * idx + 1] + ) + idx //= 2 + + def __getitem__(self, idx): + assert 0 <= idx < self._capacity + return self._value[self._capacity + idx] + + +class SumSegmentTree(SegmentTree): + def __init__(self, capacity): + super(SumSegmentTree, self).__init__( + capacity=capacity, + operation=operator.add, + neutral_element=0.0 + ) + + def sum(self, start=0, end=None): + """Returns arr[start] + ... + arr[end]""" + return super(SumSegmentTree, self).reduce(start, end) + + def find_prefixsum_idx(self, prefixsum): + """Find the highest index `i` in the array such that + sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum + + if array values are probabilities, this function + allows to sample indexes according to the discrete + probability efficiently. 
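+
+        Illustrative example (values chosen only for demonstration); drawing
+        `prefixsum` uniformly from [0, sum()) samples index i with probability
+        proportional to arr[i], which is how `_sample_proportional` below
+        draws prioritized samples:
+
+            tree = SumSegmentTree(4)
+            tree[0], tree[1], tree[2] = 1.0, 2.0, 7.0
+            tree.find_prefixsum_idx(0.5)   # 0
+            tree.find_prefixsum_idx(2.9)   # 1
+            tree.find_prefixsum_idx(9.5)   # 2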
+ + Parameters + ---------- + perfixsum: float + upperbound on the sum of array prefix + + Returns + ------- + idx: int + highest index satisfying the prefixsum constraint + """ + assert 0 <= prefixsum <= self.sum() + 1e-5 + idx = 1 + while idx < self._capacity: # while non-leaf + if self._value[2 * idx] > prefixsum: + idx = 2 * idx + else: + prefixsum -= self._value[2 * idx] + idx = 2 * idx + 1 + return idx - self._capacity + + +class MinSegmentTree(SegmentTree): + def __init__(self, capacity): + super(MinSegmentTree, self).__init__( + capacity=capacity, + operation=min, + neutral_element=float('inf') + ) + + def min(self, start=0, end=None): + """Returns min(arr[start], ..., arr[end])""" + + return super(MinSegmentTree, self).reduce(start, end) + + +class ReplayBuffer(object): + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +class PrioritizedReplayBuffer(ReplayBuffer): + def __init__(self, size, alpha, beta): + """Create Prioritized Replay buffer. + + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. 
+ alpha: float + how much prioritization is used + (0 - no prioritization, 1 - full prioritization) + + See Also + -------- + ReplayBuffer.__init__ + """ + super(PrioritizedReplayBuffer, self).__init__(size) + assert alpha >= 0 + self._alpha = alpha + + it_capacity = 1 + while it_capacity < size: + it_capacity *= 2 + + self._it_sum = SumSegmentTree(it_capacity) + self._it_min = MinSegmentTree(it_capacity) + self._max_priority = 1.0 + self.beta = beta + + def add(self, *args): + """See ReplayBuffer.store_effect""" + idx = self._next_idx + super().add(*args) + self._it_sum[idx] = self._max_priority ** self._alpha + self._it_min[idx] = self._max_priority ** self._alpha + + def _sample_proportional(self, batch_size): + res = [] + p_total = self._it_sum.sum(0, len(self._storage) - 1) + every_range_len = p_total / batch_size + for i in range(batch_size): + mass = random.random() * every_range_len + i * every_range_len + idx = self._it_sum.find_prefixsum_idx(mass) + res.append(idx) + return res + + def sample(self, batch_size): + """Sample a batch of experiences""" + idxes = self._sample_proportional(batch_size) + + it_sum = self._it_sum.sum() + p_min = self._it_min.min() / it_sum + max_weight = (p_min * len(self._storage)) ** (-self.beta) + + p_samples = np.asarray([self._it_sum[idx] for idx in idxes]) / it_sum + weights = (p_samples * len(self._storage)) ** (-self.beta) / max_weight + encoded_sample = self._encode_sample(idxes) + return encoded_sample + (weights, idxes) + + def update_priorities(self, idxes, priorities): + """Update priorities of sampled transitions""" + assert len(idxes) == len(priorities) + for idx, priority in zip(idxes, priorities): + assert priority > 0 + assert 0 <= idx < len(self._storage) + self._it_sum[idx] = priority ** self._alpha + self._it_min[idx] = priority ** self._alpha + + self._max_priority = max(self._max_priority, priority) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') +qnet.train() +trainabel_weights = qnet.trainable_weights +targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') +targetqnet.infer() +sync(qnet, targetqnet) +optimizer = tf.optimizers.Adam(learning_rate=lr) +buffer = PrioritizedReplayBuffer( + buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) + +o = env.reset() +nepisode = 0 +t = time.time() +for i in range(1, number_timesteps + 1): + eps = epsilon(i) + buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, weights, idxs = buffer.sample(batch_size) + + # q estimation + b_q_ = (1 - b_d) * tf.reduce_max(targetqnet(b_o_), 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + abs_td_error = tf.abs(b_q - (b_r + 
reward_gamma * b_q_)) + priorities = np.clip(abs_td_error.numpy(), 1e-6, None) + buffer.update_priorities(idxs, priorities) + loss = tf.reduce_mean(weights * huber_loss(abs_td_error)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() From 974ff6110b9784f2ff54530d382305cebc27cdec Mon Sep 17 00:00:00 2001 From: huangyanhua Date: Thu, 23 May 2019 15:36:21 +0800 Subject: [PATCH 14/57] add c51 --- examples/reinforcement_learning/README.md | 4 +- .../reinforcement_learning/tutorial_c51.py | 225 ++++++++++++++++++ .../tutorial_prioritized_replay.py | 57 +---- 3 files changed, 230 insertions(+), 56 deletions(-) create mode 100644 examples/reinforcement_learning/tutorial_c51.py diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index f2732f071..9146325a4 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -69,7 +69,9 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. * Distributed DQN - To do. + Code: `./tutorial_c51.py` + + Experiment Environments: Pong and Cartpole diff --git a/examples/reinforcement_learning/tutorial_c51.py b/examples/reinforcement_learning/tutorial_c51.py new file mode 100644 index 000000000..b649f9b46 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_c51.py @@ -0,0 +1,225 @@ +"""Implement prioritized replay +Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv +preprint arXiv:1511.05952, 2015. 
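+
+Sketch of the idea (see Algorithm 1 in the C51 paper, implemented in the
+training loop below): instead of a scalar Q value, the network predicts a
+categorical distribution over `atom_num` fixed support points z_j spread
+uniformly over [min_value, max_value]. The target is obtained by shifting the
+support to Tz_j = r + reward_gamma * (1 - done) * z_j, clipping it to
+[min_value, max_value], and splitting the probability of each Tz_j between its
+two neighbouring atoms (the `b_l` / `b_u` / `b_m` computation); the loss is the
+cross-entropy between this projected target and the predicted log-probabilities.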
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import random +import time + +import numpy as np +import tensorflow as tf +import tensorlayer as tl + +from tutorial_wrappers import build_env + + +seed = 0 +env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +env = build_env(env_id, seed=seed) +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +atom_num = 51 +min_value = -10 +max_value = 10 +vrange = np.linspace(min_value, max_value, atom_num) +deltaz = float(max_value - min_value) / (atom_num - 1) + + +class MLP(tl.models.Model): + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0], + W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim * atom_num, + in_channels=64, name='q', + W_init=tf.initializers.GlorotUniform()) + self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) + + def forward(self, ni): + qvalues = self.qvalue(self.h1(ni)) + return tf.nn.log_softmax(self.reshape(qvalues), 2) + + +class CNN(tl.models.Model): + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', + in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform()) + self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', + in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform()) + self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', + in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform()) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_q', + W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim * atom_num, + in_channels=256, name='q', + W_init=tf.initializers.GlorotUniform()) + self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + qvalues = self.qvalue(self.preq(feature)) + return tf.nn.log_softmax(self.reshape(qvalues), 2) + + +class ReplayBuffer(object): + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, 
*args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') +qnet.train() +trainabel_weights = qnet.trainable_weights +targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') +targetqnet.infer() +sync(qnet, targetqnet) +optimizer = tf.optimizers.Adam(learning_rate=lr) +buffer = ReplayBuffer(buffer_size) + +o = env.reset() +nepisode = 0 +t = time.time() +for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qdist = np.exp(qnet(obv).numpy()) + qvalues = (qdist * vrange).sum(-1) + a = qvalues.argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # q estimation, see Algorithm 1 in paper for detail + b_dist_ = np.exp(targetqnet(b_o_).numpy()) + b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) + b_tzj = np.clip(reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + + b_r[:, None], min_value, max_value) + b_i = (b_tzj - min_value) / deltaz + b_l = np.floor(b_i).astype('int64') + b_u = np.ceil(b_i).astype('int64') + templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) + tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) + b_m = np.zeros((batch_size, atom_num)) + # TODO: aggregate value by index and batch update (scatter_add) + for j in range(batch_size): + for k in range(atom_num): + b_m[j][b_l[j][k]] += templ[j][k] + b_m[j][b_u[j][k]] += tempu[j][k] + b_m = tf.convert_to_tensor(b_m, dtype='float32') + + # calculate loss + with tf.GradientTape() as q_tape: + b_index = np.stack([range(batch_size), b_a], 1) + b_index = tf.convert_to_tensor(b_index, 'int64') + b_dist_a = tf.gather_nd(qnet(b_o), b_index) + loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, 
length, fps)) + t = time.time() diff --git a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py index 3fd72160f..8d625b5b9 100644 --- a/examples/reinforcement_learning/tutorial_prioritized_replay.py +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -58,35 +58,9 @@ def __init__(self, name): self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) - self.svalue = tl.layers.Dense(1, in_channels=64, name='s', - W_init=tf.initializers.GlorotUniform()) - self.noise_scale = 0 def forward(self, ni): - feature = self.h1(ni) - - # apply noise to all linear layer - if self.noise_scale != 0: - noises = [] - for layer in [self.qvalue, self.svalue]: - for var in layer.trainable_weights: - noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) - noises.append(noise) - var.assign_add(noise) - - qvalue = self.qvalue(feature) - svalue = self.svalue(feature) - - if self.noise_scale != 0: - idx = 0 - for layer in [self.qvalue, self.svalue]: - for var in layer.trainable_weights: - var.assign_sub(noises[idx]) - idx += 1 - - # dueling network - out = svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) - return out + return self.qvalue(self.h1(ni)) class CNN(tl.models.Model): @@ -109,37 +83,10 @@ def __init__(self, name): W_init=tf.initializers.GlorotUniform()) self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) - self.pres = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_s', - W_init=tf.initializers.GlorotUniform()) - self.svalue = tl.layers.Dense(1, in_channels=256, name='state', - W_init=tf.initializers.GlorotUniform()) - self.noise_scale = 0 def forward(self, ni): feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) - - # apply noise to all linear layer - if self.noise_scale != 0: - noises = [] - for layer in [self.preq, self.qvalue, self.pres, self.svalue]: - for var in layer.trainable_weights: - noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) - noises.append(noise) - var.assign_add(noise) - - qvalue = self.qvalue(self.preq(feature)) - svalue = self.svalue(self.pres(feature)) - - if self.noise_scale != 0: - idx = 0 - for layer in [self.preq, self.qvalue, self.pres, self.svalue]: - for var in layer.trainable_weights: - var.assign_sub(noises[idx]) - idx += 1 - - # dueling network - return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + return self.qvalue(self.preq(feature)) class SegmentTree(object): From eb816eaa58e41b8067669150dfd7092ac7777f14 Mon Sep 17 00:00:00 2001 From: Officium Date: Fri, 24 May 2019 11:42:16 +0800 Subject: [PATCH 15/57] add retrace tutorial and update doc in preplay --- examples/reinforcement_learning/README.md | 8 +- .../reinforcement_learning/tutorial_c51.py | 7 +- .../tutorial_retrace.py | 205 ++++++++++++++++++ 3 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 examples/reinforcement_learning/tutorial_retrace.py diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 9146325a4..1f062e913 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -73,7 +73,13 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
Experiment Environments: Pong and Cartpole - + +* Retrace(lambda) DQN + + Code: `./tutorial_retrace.py` + + Experiment Environments: Pong and Cartpole + * Actor-Critic (AC) diff --git a/examples/reinforcement_learning/tutorial_c51.py b/examples/reinforcement_learning/tutorial_c51.py index b649f9b46..25cf0251c 100644 --- a/examples/reinforcement_learning/tutorial_c51.py +++ b/examples/reinforcement_learning/tutorial_c51.py @@ -1,6 +1,7 @@ -"""Implement prioritized replay -Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv -preprint arXiv:1511.05952, 2015. +"""Implement C51 algorithm +Bellemare M G, Dabney W, Munos R. A distributional perspective on reinforcement +learning[C]//Proceedings of the 34th International Conference on Machine +Learning-Volume 70. JMLR. org, 2017: 449-458. # Requirements tensorflow==2.0.0a0 diff --git a/examples/reinforcement_learning/tutorial_retrace.py b/examples/reinforcement_learning/tutorial_retrace.py new file mode 100644 index 000000000..24b260168 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_retrace.py @@ -0,0 +1,205 @@ +"""Implement retrace(\lambda) algorithm +Munos R, Stepleton T, Harutyunyan A, et al. Safe and efficient off-policy +reinforcement learning[C]//Advances in Neural Information Processing Systems. +2016: 1054-1062. + +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import random +import time + +import numpy as np +import tensorflow as tf +import tensorlayer as tl + +from tutorial_wrappers import build_env + + +seed = 0 +env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +env = build_env(env_id, seed=seed) +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +retrace_lambda = 1.0 + + +class MLP(tl.models.Model): + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', + W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + feature = self.h1(ni) + qvalue = self.qvalue(feature) + return qvalue, tf.nn.softmax(qvalue, 1) + + +class CNN(tl.models.Model): + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', + in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform()) + self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', + in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform()) + self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', + in_channels=64, name='conv2d_3', + 
W_init=tf.initializers.GlorotUniform()) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense(256, tf.nn.relu, + in_channels=dense_in_channels, name='pre_q', + W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', + W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + qvalue = self.qvalue(self.preq(feature)) + return qvalue, tf.nn.softmax(qvalue, 1) + + +class ReplayBuffer(object): + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d, b_pi = [], [], [], [], [], [] + for i in idxes: + o, a, r, o_, d, pi = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + b_pi.append(pi) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + np.stack(b_pi).astype('float32') + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') +qnet.train() +trainabel_weights = qnet.trainable_weights +targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') +targetqnet.infer() +sync(qnet, targetqnet) +optimizer = tf.optimizers.Adam(learning_rate=lr) +buffer = ReplayBuffer(buffer_size) + +o = env.reset() +nepisode = 0 +t = time.time() +for i in range(1, number_timesteps + 1): + # select action based on boltzmann exploration + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qs, pi = qnet(obv) + a = np.random.multinomial(1, pi.numpy()[0]).argmax() + pi = pi.numpy()[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done, pi) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, b_old_pi = buffer.sample(batch_size) + + # q estimation based on 1 step retrace(\lambda) + b_q_, b_pi_ = targetqnet(b_o_) + b_v_ = (b_q_ * b_pi_).numpy().sum(1) + b_q, b_pi = targetqnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1).numpy() + c = np.clip(b_pi.numpy() / (b_old_pi + 1e-8), None, 1) + c = c[range(batch_size), b_a] + td = b_r + reward_gamma * (1 - b_d) * b_v_ - b_q + q_target = c * td + b_q + + # calculate loss + with tf.GradientTape() as q_tape: + b_q, _ = qnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean(huber_loss(b_q - q_target)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, 
trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() From 54a6c599297880956293e43aaa45fded596cdcce Mon Sep 17 00:00:00 2001 From: initial-h <18811472492@163.com> Date: Tue, 28 May 2019 00:23:50 +0800 Subject: [PATCH 16/57] Update README.md --- examples/reinforcement_learning/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 1f062e913..c9874fd91 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -32,8 +32,9 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. | DDPG | Continuous | Continuous | | TD3 | Continuous | Continuous | | HER | | | -| TRPO | | | -| PPO | | | +| PG | Continuous | Continuous | +| TRPO | Continuous | Continuous | +| PPO | Continuous | Continuous | | | | | | | | | | | | | From f0f344602203ef14f266014b3423eb9628acb91b Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Tue, 28 May 2019 00:25:22 +0800 Subject: [PATCH 17/57] Update PG, DDPG, PPO, DPPO, TRPO Update PG, DDPG, PPO, DPPO, TRPO codes --- .../reinforcement_learning/tutorial_DDPG.py | 281 +++++++ .../reinforcement_learning/tutorial_DPPO.py | 328 +++++++++ .../reinforcement_learning/tutorial_PG.py | 235 ++++++ .../reinforcement_learning/tutorial_PPO.py | 289 ++++++++ .../reinforcement_learning/tutorial_TRPO.py | 687 ++++++++++++++++++ 5 files changed, 1820 insertions(+) create mode 100644 examples/reinforcement_learning/tutorial_DDPG.py create mode 100644 examples/reinforcement_learning/tutorial_DPPO.py create mode 100644 examples/reinforcement_learning/tutorial_PG.py create mode 100644 examples/reinforcement_learning/tutorial_PPO.py create mode 100644 examples/reinforcement_learning/tutorial_TRPO.py diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py new file mode 100644 index 000000000..dafd7f59b --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -0,0 +1,281 @@ +""" +Deep Deterministic Policy Gradient (DDPG) +----------------------------------------- +An algorithm concurrently learns a Q-function and a policy. +It uses off-policy data and the Bellman equation to learn the Q-function, +and uses the Q-function to learn the policy. + +Reference +--------- +Deterministic Policy Gradient Algorithms, Silver et al. 2014 +Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 
2016 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ + +Env +--- +Openai Gym Pendulum-v0, continual action space + +To run +------ +python *.py + +""" + +import tensorflow as tf +import tensorlayer as tl +import numpy as np + +##################### hyper parameters #################### + +LR_A = 0.001 # learning rate for actor +LR_C = 0.002 # learning rate for critic +GAMMA = 0.9 # reward discount +TAU = 0.01 # soft replacement +MEMORY_CAPACITY = 10000 +BATCH_SIZE = 32 + +############################### DDPG #################################### + + +class DDPG(object): + ''' + DDPG class + ''' + def __init__(self, a_dim, s_dim, a_bound, ): + self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) + self.pointer = 0 + self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound + + W_init = tf.random_normal_initializer(mean=0, stddev=0.3) + b_init = tf.constant_initializer(0.1) + + def get_actor(input_state_shape, name=''): + ''' + Build actor network + :param input_state_shape: state + :return: act + ''' + inputs = tl.layers.Input(input_state_shape, name='A_input') + x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs) + x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x) + # x = tl.layers.Lambda(lambda x: np.array(a_bound)*x)(x) + # x = tf.multiply(x, a_bound, name='A_scaled_a') + return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name) + + def get_critic(input_state_shape, input_action_shape, name=''): + ''' + Build critic network + :param input_state_shape: state + :param input_action_shape: act + :return: Q value Q(s,a) + ''' + s = tl.layers.Input(input_state_shape, name='C_s_input') + a = tl.layers.Input(input_action_shape, name='C_a_input') + x = tl.layers.Concat(1)([s, a]) + x = tl.layers.Dense(n_units=60, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(x) + x = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(x) + return tl.models.Model(inputs=[s, a], outputs=x, name='Critic' + name) + + self.actor = get_actor([None, s_dim]) + self.critic = get_critic([None, s_dim], [None, a_dim]) + self.actor.train() + self.critic.train() + + def copy_para(from_model, to_model): + ''' + Copy parameters for soft updating + :param from_model: latest model + :param to_model: target model + :return: None + ''' + for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): + j.assign(i) + + self.actor_target = get_actor([None, s_dim], name='_target') + copy_para(self.actor, self.actor_target) + self.actor_target.eval() + + self.critic_target = get_critic([None, s_dim], [None, a_dim], name='_target') + copy_para(self.critic, self.critic_target) + self.critic_target.eval() + + self.R = tl.layers.Input([None, 1], tf.float32, 'r') + + self.ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement + + self.actor_opt = tf.optimizers.Adam(LR_A) + self.critic_opt = tf.optimizers.Adam(LR_C) + + def ema_update(self): + ''' + Soft updating by exponential smoothing + :return: None + ''' + paras = self.actor.trainable_weights + self.critic.trainable_weights + self.ema.apply(paras) + for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): + i.assign(self.ema.average(j)) + + def choose_action(self, s): + ''' + Choose action + :param s: state + :return: act + ''' + return self.actor(np.array([s], dtype=np.float32))[0] + + def learn(self): + ''' + Update parameters 
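+        (a sketch of what the code below does: sample a batch of
+        (s, a, r, s_) transitions from the memory, move the critic towards
+        the target y = r + GAMMA * Q'(s_, A'(s_)) computed with the two
+        target networks, update the actor by maximising Q(s, A(s)), and
+        finally soft-update both target networks via `ema_update`)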
+ :return: None + ''' + indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) + bt = self.memory[indices, :] + bs = bt[:, :self.s_dim] + ba = bt[:, self.s_dim: self.s_dim + self.a_dim] + br = bt[:, -self.s_dim - 1: -self.s_dim] + bs_ = bt[:, -self.s_dim:] + + with tf.GradientTape() as tape: + a_ = self.actor_target(bs_) + q_ = self.critic_target([bs_, a_]) + y = br + GAMMA * q_ + q = self.critic([bs, ba]) + td_error = tf.losses.mean_squared_error(y, q) + c_grads = tape.gradient(td_error, self.critic.trainable_weights) + self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) + + with tf.GradientTape() as tape: + a = self.actor(bs) + q = self.critic([bs, a]) + a_loss = - tf.reduce_mean(q) # maximize the q + a_grads = tape.gradient(a_loss, self.actor.trainable_weights) + self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) + + self.ema_update() + + def store_transition(self, s, a, r, s_): + ''' + Store data in data buffer + :param s: state + :param a: act + :param r: reward + :param s_: next state + :return: None + ''' + s = s.astype(np.float32) + s_ = s_.astype(np.float32) + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory + self.memory[index, :] = transition + self.pointer += 1 + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + tl.files.save_npz(self.actor.trainable_weights, name='model/actor.npz') + tl.files.save_npz(self.actor_target.trainable_weights, name='model/actor_target.npz') + tl.files.save_npz(self.critic.trainable_weights, name='model/critic.npz') + tl.files.save_npz(self.critic_target.trainable_weights, name='model/critic_target.npz') + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_and_assign_npz(name='model/actor.npz', network=self.actor) + tl.files.load_and_assign_npz(name='model/actor_target.npz', network=self.actor_target) + tl.files.load_and_assign_npz(name='model/critic.npz', network=self.critic) + tl.files.load_and_assign_npz(name='model/critic_target.npz', network=self.critic_target) + + +if __name__ == '__main__': + import gym + import time + import matplotlib.pyplot as plt + + MAX_EPISODES = 200 + MAX_EP_STEPS = 200 + TEST_PER_EPISODES = 10 + ENV_NAME = 'Pendulum-v0' + + ############################### training #################################### + + env = gym.make(ENV_NAME) + env = env.unwrapped + env.seed(1) + + s_dim = env.observation_space.shape[0] + a_dim = env.action_space.shape[0] + a_bound = env.action_space.high + + ddpg = DDPG(a_dim, s_dim, a_bound) + + var = 3 # control exploration + reward_buffer = [] + t0 = time.time() + for i in range(MAX_EPISODES): + t1 = time.time() + s = env.reset() + ep_reward = 0 + for j in range(MAX_EP_STEPS): + # Add exploration noise + a = ddpg.choose_action(s) + a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration + s_, r, done, info = env.step(a) + + ddpg.store_transition(s, a, r / 10, s_) + + if ddpg.pointer > MEMORY_CAPACITY: + # var *= .9995 # decay the action randomness + ddpg.learn() + + s = s_ + ep_reward += r + if j == MAX_EP_STEPS - 1: + print("\rEpisode [%d/%d] \tReward: %i \tExplore: %.2f \ttook: %.5fs " % + (i, MAX_EPISODES, ep_reward, var, time.time() - t1), end='') + + # test + if i and not i % TEST_PER_EPISODES: + t1 = time.time() + s = env.reset() + ep_reward = 0 + for j in range(MAX_EP_STEPS): + + a = ddpg.choose_action(s) # without exploration noise + s_, r, done, info = 
env.step(a) + + s = s_ + ep_reward += r + if j == MAX_EP_STEPS - 1: + print("\rEpisode [%d/%d] \tReward: %i \tExplore: %.2f \ttook: %.5fs " % + (i, MAX_EPISODES, ep_reward, var, time.time() - t1)) + + reward_buffer.append(ep_reward) + + if reward_buffer: + plt.ion() + plt.title('DDPG') + plt.plot(np.array(range(len(reward_buffer)))*TEST_PER_EPISODES, reward_buffer) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + plt.cla() + plt.ioff() + + print('\nRunning time: ', time.time() - t0) + s = env.reset() + while True: + s = env.reset() + for i in range(MAX_EP_STEPS): + env.render() + a = ddpg.choose_action(s) + s_, r, done, info = env.step(a) + if done: + break + s = s_ diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py new file mode 100644 index 000000000..564895198 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -0,0 +1,328 @@ +""" +Distributed Proximal Policy Optimization (DPPO) +---------------------------- +A distributing version of OpenAI's Proximal Policy Optimization (PPO). +Workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data. +Restart workers once PPO is updated. + +Reference +--------- +Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 2017 +Proximal Policy Optimization Algorithms, Schulman et al. 2017 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 2016 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials + +Env +--- +Openai Gym Pendulum-v0, continual action space + +To run +------ +python *.py + + +""" + +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as plt +import gym, threading, queue + +import tensorlayer as tl +import tensorflow_probability as tfp + +EP_MAX = 1000 +EP_LEN = 200 +GAMMA = 0.9 +A_LR = 0.0001 +C_LR = 0.0002 +BATCH = 32 +A_UPDATE_STEPS = 10 +C_UPDATE_STEPS = 10 +S_DIM, A_DIM = 3, 1 +EPS = 1e-8 +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization + + +class PPO(object): + ''' + PPO class + ''' + + def __init__(self): + + # critic + tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state') + l1 = tl.layers.Dense(100, tf.nn.relu)(tfs) + v = tl.layers.Dense(1)(l1) + self.critic = tl.models.Model(tfs, v) + self.critic.train() + + # actor + self.actor = self._build_anet('pi', trainable=True) + self.actor_old = self._build_anet('oldpi', trainable=False) + + def a_train(self, tfs, tfa, tfadv): + ''' + Update policy network + :param tfs: state + :param tfa: act + :param tfadv: advantage + :return: + ''' + tfs = np.array(tfs, np.float32) + tfa = np.array(tfa, np.float32) + tfadv = np.array(tfadv, np.float32) + with tf.GradientTape() as tape: + mu, sigma = self.actor(tfs) + pi = tfp.distributions.Normal(mu, sigma) + + mu_old, sigma_old = self.actor_old(tfs) + oldpi = tfp.distributions.Normal(mu_old, sigma_old) + + # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) + ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS) + surr = ratio * tfadv + if METHOD['name'] == 'kl_pen': + tflam = METHOD['lam'] + kl = tfp.distributions.kl_divergence(oldpi, pi) + kl_mean = tf.reduce_mean(kl) + aloss = -(tf.reduce_mean(surr - tflam * kl)) + else: # clipping method, find this is better + aloss = 
-tf.reduce_mean(tf.minimum( + surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)) + a_gard = tape.gradient(aloss, self.actor.trainable_weights) + + tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) + + if METHOD['name'] == 'kl_pen': + return kl_mean + + def update_old_pi(self): + ''' + Update old policy parameter + :return: None + ''' + for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights): + oldp.assign(p) + + def c_train(self, tfdc_r, s): + ''' + Update actor network + :param tfdc_r: cumulative reward + :param s: state + :return: None + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + with tf.GradientTape() as tape: + advantage = tfdc_r - self.critic(s) + closs = tf.reduce_mean(tf.square(advantage)) + grad = tape.gradient(closs, self.critic.trainable_weights) + tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights)) + + def cal_adv(self, tfs, tfdc_r): + ''' + Calculate advantage + :param tfs: state + :param tfdc_r: cumulative reward + :return: advantage + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + advantage = tfdc_r - self.critic(tfs) + return advantage.numpy() + + def update(self): + ''' + Update parameter with the constraint of KL divergent + :return: None + ''' + global GLOBAL_UPDATE_COUNTER + while not COORD.should_stop(): + if GLOBAL_EP < EP_MAX: + UPDATE_EVENT.wait() # wait until get batch of data + self.update_old_pi() # copy pi to old pi + data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers + data = np.vstack(data) + + s, a, r = data[:, :S_DIM].astype(np.float32), \ + data[:, S_DIM: S_DIM + A_DIM].astype(np.float32), \ + data[:, -1:].astype(np.float32) + + adv = self.cal_adv(s, r) + # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful + + # update actor + if METHOD['name'] == 'kl_pen': + for _ in range(A_UPDATE_STEPS): + kl = self.a_train(s, a, adv) + if kl > 4 * METHOD['kl_target']: # this in in google's paper + break + if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper + METHOD['lam'] /= 2 + elif kl > METHOD['kl_target'] * 1.5: + METHOD['lam'] *= 2 + METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution + else: # clipping method, find this is better (OpenAI's paper) + for _ in range(A_UPDATE_STEPS): + self.a_train(s, a, adv) + + # update critic + for _ in range(C_UPDATE_STEPS): + self.c_train(r, s) + + UPDATE_EVENT.clear() # updating finished + GLOBAL_UPDATE_COUNTER = 0 # reset counter + ROLLING_EVENT.set() # set roll-out available + + def _build_anet(self, name, trainable): + ''' + Build policy network + :param name: name + :param trainable: trainable flag + :return: policy network + ''' + tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state') + l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs) + a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1) + mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a) + sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1) + model = tl.models.Model(tfs, [mu, sigma], name) + + if trainable: + model.train() + else: + model.eval() + return model + + def choose_action(self, s): + ''' + Choose action + :param s: state + :return: clipped act + ''' + s = s[np.newaxis, :].astype(np.float32) + mu, sigma = self.actor(s) + pi = tfp.distributions.Normal(mu, sigma) + a = tf.squeeze(pi.sample(1), axis=0)[0] # choosing action + return 
np.clip(a, -2, 2) + + def get_v(self, s): + ''' + Compute value + :param s: state + :return: value + ''' + s = s.astype(np.float32) + if s.ndim < 2: s = s[np.newaxis, :] + return self.critic(s)[0, 0] + + +'''--------------------------------------------------------------''' + +N_WORKER = 4 # parallel workers +MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO +UPDATE_STEP = 10 # loop update operation n-steps + +GAME = 'Pendulum-v0' + + +class Worker(object): + ''' + Worker class for distributional running + ''' + + def __init__(self, wid): + self.wid = wid + self.env = gym.make(GAME).unwrapped + self.ppo = GLOBAL_PPO + + def work(self): + ''' + Define a worker + :return: None + ''' + global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER + while not COORD.should_stop(): + s = self.env.reset() + ep_r = 0 + buffer_s, buffer_a, buffer_r = [], [], [] + for t in range(EP_LEN): + if not ROLLING_EVENT.is_set(): # while global PPO is updating + ROLLING_EVENT.wait() # wait until PPO is updated + buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data + a = self.ppo.choose_action(s) + s_, r, done, _ = self.env.step(a) + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append((r + 8) / 8) # normalize reward, find to be useful + s = s_ + ep_r += r + + GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers + if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: + v_s_ = self.ppo.get_v(s_) + discounted_r = [] # compute discounted reward + for r in buffer_r[::-1]: + v_s_ = r + GAMMA * v_s_ + discounted_r.append(v_s_) + discounted_r.reverse() + + bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] + buffer_s, buffer_a, buffer_r = [], [], [] + QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue + if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: + ROLLING_EVENT.clear() # stop collecting data + UPDATE_EVENT.set() # globalPPO update + + if GLOBAL_EP >= EP_MAX: # stop training + COORD.request_stop() + break + + # record reward changes, plot later + if len(GLOBAL_RUNNING_R) == 0: + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1) + GLOBAL_EP += 1 + print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r, ) + + +if __name__ == '__main__': + GLOBAL_PPO = PPO() + UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event() + UPDATE_EVENT.clear() # not update now + ROLLING_EVENT.set() # start to roll out + workers = [Worker(wid=i) for i in range(N_WORKER)] + + GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 + GLOBAL_RUNNING_R = [] + COORD = tf.train.Coordinator() + QUEUE = queue.Queue() # workers putting data in this queue + threads = [] + for worker in workers: # worker threads + t = threading.Thread(target=worker.work, args=()) + t.start() # training + threads.append(t) + # add a PPO updating thread + threads.append(threading.Thread(target=GLOBAL_PPO.update, )) + threads[-1].start() + COORD.join(threads) + + # plot reward change and test + plt.title('DPPO') + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('Episode') + plt.ylabel('Moving reward') + plt.ylim(-2000, 0) + plt.show() + + env = gym.make('Pendulum-v0') + while True: + s = env.reset() + for t in range(300): + env.render() + s = env.step(GLOBAL_PPO.choose_action(s))[0] diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py new file mode 100644 index 
000000000..56d15986e
--- /dev/null
+++ b/examples/reinforcement_learning/tutorial_PG.py
@@ -0,0 +1,235 @@
+"""
+Vanilla Policy Gradient (VPG or REINFORCE)
+------------------------------------------
+The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance.
+It's an on-policy algorithm that can be used for environments with either discrete or continuous action spaces.
+Here is an example on the discrete action space game CartPole-v0.
+To apply it to a continuous action space, you need to change the last softmax layer and the choose_action function.
+
+Reference
+---------
+Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998.
+MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/
+
+Env
+---
+Openai Gym CartPole-v0, discrete action space
+
+To run
+------
+python *.py
+
+"""
+
+import tensorflow as tf
+import tensorlayer as tl
+import numpy as np
+
+tl.logging.set_verbosity(tl.logging.DEBUG)
+
+# reproducible
+np.random.seed(1)
+tf.random.set_seed(1)
+
+
+class PolicyGradient:
+    """
+    PG class
+    """
+    def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95):
+        self.n_actions = n_actions
+        self.n_features = n_features
+        self.lr = learning_rate
+        self.gamma = reward_decay
+
+        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
+
+        def get_model(inputs_shape):
+            """
+            Build a neural network model.
+            :param inputs_shape: state_shape
+            :return: model
+            """
+            with tf.name_scope('inputs'):
+                self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations")
+                self.tf_acts = tl.layers.Input([None, ], tf.int32, name="actions_num")
+                self.tf_vt = tl.layers.Input([None, ], tf.float32, name="actions_value")
+            # fc1
+            layer = tl.layers.Dense(n_units=30, act=tf.nn.tanh,
+                                    W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+                                    b_init=tf.constant_initializer(0.1), name='fc1')(self.tf_obs)
+            # fc2
+            all_act = tl.layers.Dense(n_units=self.n_actions, act=None,
+                                      W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+                                      b_init=tf.constant_initializer(0.1), name='all_act')(layer)
+            return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model')
+
+        self.model = get_model([None, n_features])
+        self.model.train()
+        self.optimizer = tf.optimizers.Adam(self.lr)
+
+    def choose_action(self, s):
+        """
+        choose action with probabilities.
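+        The action is sampled from the softmax distribution over the policy logits,
+        so exploration is provided by the stochastic policy itself.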
+ :param s: state + :return: act + """ + _logits = self.model(np.array([s], np.float32)) + _probs = tf.nn.softmax(_logits).numpy() + return tl.rein.choice_action_by_probs(_probs.ravel()) + + def choose_action_greedy(self, s): + """ + choose action with greedy policy + :param s: state + :return: act + """ + _probs = tf.nn.softmax(self.model(np.array([s], np.float32))).numpy() + return np.argmax(_probs.ravel()) + + def store_transition(self, s, a, r): + """ + store data in memory buffer + :param s: state + :param a: act + :param r: reward + :return: + """ + self.ep_obs.append(np.array([s], np.float32)) + self.ep_as.append(a) + self.ep_rs.append(r) + + def learn(self): + """ + update policy parameters via stochastic gradient ascent + :return: None + """ + # discount and normalize episode reward + discounted_ep_rs_norm = self._discount_and_norm_rewards() + + with tf.GradientTape() as tape: + + _logits = self.model(np.vstack(self.ep_obs)) + # to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss) + neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=_logits, labels=np.array(self.ep_as)) + # this is negative log of chosen action + + # or in this way: + # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1) + + loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss + + grad = tape.gradient(loss, self.model.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data + return discounted_ep_rs_norm + + def _discount_and_norm_rewards(self): + """ + compute discount_and_norm_rewards + :return: discount_and_norm_rewards + """ + # discount episode rewards + discounted_ep_rs = np.zeros_like(self.ep_rs) + running_add = 0 + for t in reversed(range(0, len(self.ep_rs))): + running_add = running_add * self.gamma + self.ep_rs[t] + discounted_ep_rs[t] = running_add + + # normalize episode rewards + discounted_ep_rs -= np.mean(discounted_ep_rs) + discounted_ep_rs /= np.std(discounted_ep_rs) + return discounted_ep_rs + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + tl.files.save_npz(self.model.trainable_weights, name='model.npz') + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_and_assign_npz(name='model.npz', network=self.model) + + +if __name__ == '__main__': + + import gym + import matplotlib.pyplot as plt + import time + + DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold + RENDER = False # rendering wastes time + num_episodes = 3000 + + env = gym.make('CartPole-v0') + env.seed(1) # reproducible, general Policy gradient has high variance + env = env.unwrapped + + print(env.action_space) + print(env.observation_space) + print(env.observation_space.high) + print(env.observation_space.low) + + RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.99, + # output_graph=True, + ) + reward_buffer = [] + + + for i_episode in range(num_episodes): + + episode_time = time.time() + observation = env.reset() + + while True: + if RENDER: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + RL.store_transition(observation, action, reward) + + if done: + ep_rs_sum = sum(RL.ep_rs) + + if 'running_reward' not 
in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + + if running_reward > DISPLAY_REWARD_THRESHOLD: + RENDER = True # rendering + + # print("episode:", i_episode, " reward:", int(running_reward)) + + print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % + (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)) + reward_buffer.append(running_reward) + + vt = RL.learn() + + plt.ion() + plt.title('PG') + plt.plot(reward_buffer, ) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + plt.pause(0.1) + plt.cla() + plt.ioff() + + break + + observation = observation_ diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py new file mode 100644 index 000000000..727704e40 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -0,0 +1,289 @@ +""" +Proximal Policy Optimization (PPO) +---------------------------- +A simple version of Proximal Policy Optimization (PPO) using single thread. +PPO is a family of first-order methods that use a few other tricks to keep new policies close to old. +PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO. + +Reference +--------- +Proximal Policy Optimization Algorithms, Schulman et al. 2017 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 2016 +Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 2017 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials + +Env +--- +Openai Gym Pendulum-v0, continual action space + +To run +------ +python *.py + +""" + +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as plt +import gym +import tensorlayer as tl +import tensorflow_probability as tfp + +EP_MAX = 1000 +EP_LEN = 200 +GAMMA = 0.9 +A_LR = 0.0001 +C_LR = 0.0002 +BATCH = 32 +A_UPDATE_STEPS = 10 +C_UPDATE_STEPS = 10 +S_DIM, A_DIM = 3, 1 +EPS = 1e-8 +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization + + +class PPO(object): + ''' + PPO class + ''' + + def __init__(self): + + # critic + tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state') + l1 = tl.layers.Dense(100, tf.nn.relu)(tfs) + v = tl.layers.Dense(1)(l1) + self.critic = tl.models.Model(tfs, v) + self.critic.train() + + # actor + self.actor = self._build_anet('pi', trainable=True) + self.actor_old = self._build_anet('oldpi', trainable=False) + + def a_train(self, tfs, tfa, tfadv): + ''' + Update policy network + :param tfs: state + :param tfa: act + :param tfadv: advantage + :return: + ''' + tfs = np.array(tfs, np.float32) + tfa = np.array(tfa, np.float32) + tfadv = np.array(tfadv, np.float32) + with tf.GradientTape() as tape: + mu, sigma = self.actor(tfs) + pi = tfp.distributions.Normal(mu, sigma) + + mu_old, sigma_old = self.actor_old(tfs) + oldpi = tfp.distributions.Normal(mu_old, sigma_old) + + # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) + ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS) + surr = ratio * tfadv + if METHOD['name'] == 'kl_pen': + tflam = METHOD['lam'] + kl = tfp.distributions.kl_divergence(oldpi, pi) + kl_mean = tf.reduce_mean(kl) + aloss = -(tf.reduce_mean(surr - tflam * kl)) + else: # clipping method, find this is better + aloss = 
-tf.reduce_mean(tf.minimum( + surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)) + a_gard = tape.gradient(aloss, self.actor.trainable_weights) + + tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) + + if METHOD['name'] == 'kl_pen': + return kl_mean + + def update_old_pi(self): + ''' + Update old policy parameter + :return: None + ''' + for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights): + oldp.assign(p) + + def c_train(self, tfdc_r, s): + ''' + Update actor network + :param tfdc_r: cumulative reward + :param s: state + :return: None + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + with tf.GradientTape() as tape: + advantage = tfdc_r - self.critic(s) + closs = tf.reduce_mean(tf.square(advantage)) + grad = tape.gradient(closs, self.critic.trainable_weights) + tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights)) + + def cal_adv(self, tfs, tfdc_r): + ''' + Calculate advantage + :param tfs: state + :param tfdc_r: cumulative reward + :return: advantage + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + advantage = tfdc_r - self.critic(tfs) + return advantage.numpy() + + def update(self, s, a, r): + ''' + Update parameter with the constraint of KL divergent + :param s: state + :param a: act + :param r: reward + :return: None + ''' + s, a, r = s.astype(np.float32), a.astype(np.float32), r.astype(np.float32) + + self.update_old_pi() + adv = self.cal_adv(s, r) + # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful + + # update actor + if METHOD['name'] == 'kl_pen': + for _ in range(A_UPDATE_STEPS): + kl = self.a_train(s, a, adv) + if kl > 4 * METHOD['kl_target']: # this in in google's paper + break + if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper + METHOD['lam'] /= 2 + elif kl > METHOD['kl_target'] * 1.5: + METHOD['lam'] *= 2 + METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution + else: # clipping method, find this is better (OpenAI's paper) + for _ in range(A_UPDATE_STEPS): + self.a_train(s, a, adv) + + # update critic + for _ in range(C_UPDATE_STEPS): + self.c_train(r, s) + + def _build_anet(self, name, trainable): + ''' + Build policy network + :param name: name + :param trainable: trainable flag + :return: policy network + ''' + tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state') + l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs) + a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1) + mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a) + sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1) + model = tl.models.Model(tfs, [mu, sigma], name) + + if trainable: + model.train() + else: + model.eval() + return model + + def choose_action(self, s): + ''' + Choose action + :param s: state + :return: clipped act + ''' + s = s[np.newaxis, :].astype(np.float32) + mu, sigma = self.actor(s) + pi = tfp.distributions.Normal(mu, sigma) + a = tf.squeeze(pi.sample(1), axis=0)[0] # choosing action + return np.clip(a, -2, 2) + + def get_v(self, s): + ''' + Compute value + :param s: state + :return: value + ''' + s = s.astype(np.float32) + if s.ndim < 2: s = s[np.newaxis, :] + return self.critic(s)[0, 0] + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + tl.files.save_npz(self.actor.trainable_weights, name='model/actor.npz') + tl.files.save_npz(self.actor_old.trainable_weights, 
name='model/actor_old.npz') + tl.files.save_npz(self.critic.trainable_weights, name='model/critic.npz') + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_and_assign_npz(name='model/actor.npz', network=self.actor) + tl.files.load_and_assign_npz(name='model/actor_old.npz', network=self.actor_old) + tl.files.load_and_assign_npz(name='model/critic.npz', network=self.critic) + + +env = gym.make('Pendulum-v0').unwrapped +ppo = PPO() +all_ep_r = [] + +for ep in range(EP_MAX): + s = env.reset() + buffer_s, buffer_a, buffer_r = [], [], [] + ep_r = 0 + for t in range(EP_LEN): # in one episode + # env.render() + a = ppo.choose_action(s) + s_, r, done, _ = env.step(a) + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append((r + 8) / 8) # normalize reward, find to be useful + s = s_ + ep_r += r + + # update ppo + if (t + 1) % BATCH == 0 or t == EP_LEN - 1: + v_s_ = ppo.get_v(s_) + discounted_r = [] + for r in buffer_r[::-1]: + v_s_ = r + GAMMA * v_s_ + discounted_r.append(v_s_) + discounted_r.reverse() + + bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] + buffer_s, buffer_a, buffer_r = [], [], [] + ppo.update(bs, ba, br) + if ep == 0: + all_ep_r.append(ep_r) + else: + all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1) + print( + 'Ep: %i' % ep, + "|Ep_r: %i" % ep_r, + ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '', + ) + + plt.ion() + plt.cla() + plt.title('PPO') + plt.plot(np.arange(len(all_ep_r)), all_ep_r) + plt.ylim(-2000, 0) + plt.xlabel('Episode') + plt.ylabel('Moving averaged episode reward') + plt.show() + plt.pause(0.1) + plt.ioff() + +while True: + s = env.reset() + for i in range(EP_LEN): + env.render() + a = ppo.choose_action(s) + s_, r, done, _ = env.step(a) + if done: + break + s = s_ diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py new file mode 100644 index 000000000..fedb60b7b --- /dev/null +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -0,0 +1,687 @@ +""" +Trust Region Policy Optimization (TRPO) +--------------------------------------- +PG method with a large step can collapse the policy performance, +even with a small step can lead a large differences in policy. +TRPO constraint the step in policy space using KL divergence (rather than in parameter space), +which can monotonically improve performance and avoid a collapsed update. + +Reference +--------- +Trust Region Policy Optimization, Schulman et al. 2015 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 
2016 +Approximately Optimal Approximate Reinforcement Learning, Kakade and Langford 2002 +openai/spinningup : http://spinningup.openai.com/en/latest/algorithms/trpo.html + +Env +--- +Openai Gym Pendulum-v0, continual action space + +To run +------ +python *.py + + +""" +import numpy as np + +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +import gym +from gym.spaces import Box, Discrete +import time +from matplotlib import pyplot as plt +from scipy import signal +import copy + +EPS = 1e-8 + + +def assign_params_from_flat(x, params): + flat_size = lambda p: int(np.prod(p.shape.as_list())) # the 'int' is important for scalars + splits = tf.split(x, [flat_size(p) for p in params]) + new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] + return tf.group([p.assign(p_new) for p, p_new in zip(params, new_params)]) + + +def hessian_vector_product(func, inputs, params, x): + # for H = grad**2 f, compute Hx + with tf.GradientTape() as tape0: + with tf.GradientTape() as tape1: + pi, logp, logp_pi, info, info_phs, d_kl, v = func(*inputs) + grad1 = tape1.gradient(d_kl, params) + g = flat_concat(grad1) + assert g.shape == x.shape + a = tf.reduce_sum(g * x) + grad0 = tape0.gradient(a, params) + g0 = flat_concat(grad0) + return x, g0 + + +def flat_concat(xs): + return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0) + + +def flat_grad(f, params): + return flat_concat(tf.gradients(xs=params, ys=f)) + + +def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): + """ + tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, + where distributions are specified by means and log stds. + (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) + """ + var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) + pre_sum = 0.5 * (((mu1 - mu0) ** 2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0 + all_kls = tf.reduce_sum(pre_sum, axis=1) + return tf.reduce_mean(all_kls) + + +def gaussian_likelihood(x, mu, log_std): + pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) + return tf.reduce_sum(pre_sum, axis=1) + + +def categorical_kl(logp0, logp1): + """ + tf symbol for mean KL divergence between two batches of categorical probability distributions, + where the distributions are input as log probs. 
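+    Concretely, per sample it computes KL(P1 || P0) = sum_a exp(logp1_a) * (logp1_a - logp0_a),
+    then averages over the batch (matching the reduce_sum / reduce_mean in the body below).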
+ """ + all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) + return tf.reduce_mean(all_kls) + + +def mlp(input_space, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, name=None): + inputs = input_layers_from_space(input_space) + x = inputs + for i, h in enumerate(hidden_sizes[:-1]): + # x = tf.layers.dense(x, units=h, activation=activation) + if name: + n = name + '_layer_' + str(i) + else: + n = None + x = tl.layers.Dense(h, activation, name=n)(x) + if name: + n = name + '_output' + else: + n = None + outputs = tl.layers.Dense(hidden_sizes[-1], output_activation, name=n)(x) + return tl.models.Model(inputs, outputs, name) + + +def mlp_categorical_policy(obs_space, act_space, hidden_sizes, activation, output_activation): + act_dim = act_space.n + actor = mlp(obs_space, list(hidden_sizes) + [act_dim], activation, None, name='actor') + + critic = mlp(obs_space, list(hidden_sizes) + [1], activation, None, name='critic') + + def cal(s, a=None, old_logp_all=None): + s = s.astype(np.float32) + logits = actor(s) + logp_all = tf.nn.log_softmax(logits) + pi = tf.squeeze(tfp.distributions.Multinomial(logits, 1), axis=1) + logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) + v = tf.squeeze(critic(s), axis=1) + info = {'logp_all': logp_all} + + if a is None and old_logp_all is None: + info = values_as_sorted_list(info) + return [pi, v, logp_pi] + info + else: + a = a.astype(np.float32) + logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) + + # check_shape(old_logp_all, act_dim) + d_kl = categorical_kl(logp_all, old_logp_all) + + info_phs = {'logp_all': old_logp_all} + return pi, logp, logp_pi, info, info_phs, d_kl, v + + return actor, cal + + +def check_shape(array, shape): + try: + assert np.shape(array) == shape + except Exception as e: + print(np.shape(array), '!=', shape) + raise e + + +def check_shapes(array_list, shape_list): + for arr, shp in zip(array_list, shape_list): + check_shape(arr, shp) + + +def mlp_gaussian_policy(obs_space, act_space, hidden_sizes, activation, output_activation): + act_dim = act_space.shape[0] + actor = mlp(obs_space, list(hidden_sizes) + [act_dim], activation, output_activation, name='actor') + + critic = mlp(obs_space, list(hidden_sizes) + [1], activation, None, name='critic') + + def cal(s, a=None, old_mu_ph=None, old_log_std_ph=None): + s = s.astype(np.float32) + mu = actor(s) + log_std = -0.5 * np.ones(act_dim, dtype=np.float32) + std = tf.exp(log_std) + pi = mu + tf.random.normal(tf.shape(mu)) * std + logp_pi = gaussian_likelihood(pi, mu, log_std) + v = tf.squeeze(critic(s), axis=1) + info = {'mu': mu, 'log_std': log_std} + + if a is None and old_mu_ph is None and old_log_std_ph is None: + info = values_as_sorted_list(info) + return [pi, v, logp_pi] + info + elif a is not None and old_mu_ph is not None and old_log_std_ph is not None: + a = a.astype(np.float32) + logp = gaussian_likelihood(a, mu, log_std) + # check_shapes((old_mu_ph, old_log_std_ph), (act_dim, act_dim)) + d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) + + info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} + + return pi, logp, logp_pi, info, info_phs, d_kl, v + else: + print(a, old_mu_ph, old_log_std_ph) + raise Exception + + return actor, critic, cal + + +def mlp_actor_critic(obs_space, act_space, hidden_sizes=(64, 64), activation=tf.tanh, + output_activation=None, policy=None): + # default policy builder depends on action space + if policy is None and isinstance(act_space, Box): + policy = 
mlp_gaussian_policy + name = 'mlp_gaussian_policy' + elif policy is None and isinstance(act_space, Discrete): + policy = mlp_categorical_policy + name = 'mlp_categorical_policy' + else: + raise Exception + + actor, critic, cal_func = policy(obs_space, act_space, hidden_sizes, activation, output_activation) + + return actor, critic, cal_func, name + + +def combined_shape(length, shape=None): + if shape is None: + return length, + return (length, shape) if np.isscalar(shape) else (length, *shape) + + +def keys_as_sorted_list(dict): + return sorted(list(dict.keys())) + + +def values_as_sorted_list(dict): + return [dict[k] for k in keys_as_sorted_list(dict)] + + +def discount_cumsum(x, discount): + """ + magic from rllab for computing discounted cumulative sums of vectors. + + input: + vector x, + [x0, + x1, + x2] + + output: + [x0 + discount * x1 + discount^2 * x2, + x1 + discount * x2, + x2] + """ + return signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] + + +def input_layer(dim=None): + return tl.layers.Input(combined_shape(None, dim)) + + +def input_layers(*args): + return [input_layer(dim) for dim in args] + + +def input_layers_from_space(space): + if isinstance(space, Box): + return input_layer(space.shape) + elif isinstance(space, Discrete): + return tl.layers.Input((None,)) + raise NotImplementedError + + +def input_layers_from_spaces(*args): + return [input_layers_from_space(space) for space in args] + + +class GAEBuffer: + """ + A buffer for storing trajectories experienced by a TRPO agent interacting + with the environment, and using Generalized Advantage Estimation (GAE-Lambda) + for calculating the advantages of state-action pairs. + """ + + def __init__(self, obs_dim, act_dim, size, info_shapes, gamma=0.99, lam=0.95): + self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32) + self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32) + self.adv_buf = np.zeros(size, dtype=np.float32) + self.rew_buf = np.zeros(size, dtype=np.float32) + self.ret_buf = np.zeros(size, dtype=np.float32) + self.val_buf = np.zeros(size, dtype=np.float32) + self.logp_buf = np.zeros(size, dtype=np.float32) + self.info_bufs = {k: np.zeros([size] + list(v), dtype=np.float32) for k, v in info_shapes.items()} + self.sorted_info_keys = keys_as_sorted_list(self.info_bufs) + self.gamma, self.lam = gamma, lam + self.ptr, self.path_start_idx, self.max_size = 0, 0, size + + def store(self, obs, act, rew, val, logp, info): + """ + Append one timestep of agent-environment interaction to the buffer. + """ + assert self.ptr < self.max_size # buffer has to have room so you can store + self.obs_buf[self.ptr] = obs + self.act_buf[self.ptr] = act + self.rew_buf[self.ptr] = rew + self.val_buf[self.ptr] = val + self.logp_buf[self.ptr] = logp + for i, k in enumerate(self.sorted_info_keys): + self.info_bufs[k][self.ptr] = info[i] + self.ptr += 1 + + def finish_path(self, last_val=0): + """ + Call this at the end of a trajectory, or when one gets cut off + by an epoch ending. This looks back in the buffer to where the + trajectory started, and uses rewards and value estimates from + the whole trajectory to compute advantage estimates with GAE-Lambda, + as well as compute the rewards-to-go for each state, to use as + the targets for the value function. + + The "last_val" argument should be 0 if the trajectory ended + because the agent reached a terminal state (died), and otherwise + should be V(s_T), the value function estimated for the last state. 
+ This allows us to bootstrap the reward-to-go calculation to account + for timesteps beyond the arbitrary episode horizon (or epoch cutoff). + """ + + path_slice = slice(self.path_start_idx, self.ptr) + rews = np.append(self.rew_buf[path_slice], last_val) + vals = np.append(self.val_buf[path_slice], last_val) + + # GAE-Lambda advantage calculation + deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] + self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam) + + # the next line computes rewards-to-go, to be targets for the value function + self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1] + + self.path_start_idx = self.ptr + + def get(self): + """ + Call this at the end of an epoch to get all of the data from + the buffer, with advantages appropriately normalized (shifted to have + mean zero and std one). Also, resets some pointers in the buffer. + """ + assert self.ptr == self.max_size # buffer has to be full before you can get + self.ptr, self.path_start_idx = 0, 0 + # The advantage normalization trick + _sum, _n = np.sum(self.adv_buf), len(self.adv_buf) + mean = _sum / _n + + _sum_sq = np.sum((self.adv_buf - mean) ** 2) + std = np.sqrt(_sum_sq / _n) # compute global std + + self.adv_buf = (self.adv_buf - mean) / std + return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, + self.logp_buf] + values_as_sorted_list(self.info_bufs) + + +""" + +Trust Region Policy Optimization + +(with support for Natural Policy Gradient) + +""" + + +def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=0, + steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, + train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, + backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, save_freq=10, algo='trpo'): + """ + + Args: + env_fn : A function which creates a copy of the environment. + The environment must satisfy the OpenAI Gym API. + + actor_critic: A function which takes in placeholder symbols + for state, ``x_ph``, and action, ``a_ph``, and returns the main + outputs from the agent's Tensorflow computation graph: + + ============ ================ ======================================== + Symbol Shape Description + ============ ================ ======================================== + ``pi`` (batch, act_dim) | Samples actions from policy given + | states. + ``logp`` (batch,) | Gives log probability, according to + | the policy, of taking actions ``a_ph`` + | in states ``x_ph``. + ``logp_pi`` (batch,) | Gives log probability, according to + | the policy, of the action sampled by + | ``pi``. + ``info`` N/A | A dict of any intermediate quantities + | (from calculating the policy or log + | probabilities) which are needed for + | analytically computing KL divergence. + | (eg sufficient statistics of the + | distributions) + ``info_phs`` N/A | A dict of placeholders for old values + | of the entries in ``info``. + ``d_kl`` () | A symbol for computing the mean KL + | divergence between the current policy + | (``pi``) and the old policy (as + | specified by the inputs to + | ``info_phs``) over the batch of + | states given in ``x_ph``. + ``v`` (batch,) | Gives the value estimate for states + | in ``x_ph``. (Critical: make sure + | to flatten this!) + ============ ================ ======================================== + + ac_kwargs (dict): Any kwargs appropriate for the actor_critic + function you provided to TRPO. + + seed (int): Seed for random number generators. 
+ + steps_per_epoch (int): Number of steps of interaction (state-action pairs) + for the agent and the environment in each epoch. + + epochs (int): Number of epochs of interaction (equivalent to + number of policy updates) to perform. + + gamma (float): Discount factor. (Always between 0 and 1.) + + delta (float): KL-divergence limit for TRPO / NPG update. + (Should be small for stability. Values like 0.01, 0.05.) + + vf_lr (float): Learning rate for value function optimizer. + + train_v_iters (int): Number of gradient descent steps to take on + value function per epoch. + + damping_coeff (float): Artifact for numerical stability, should be + smallish. Adjusts Hessian-vector product calculation: + + .. math:: Hv \\rightarrow (\\alpha I + H)v + + where :math:`\\alpha` is the damping coefficient. + Probably don't play with this hyperparameter. + + cg_iters (int): Number of iterations of conjugate gradient to perform. + Increasing this will lead to a more accurate approximation + to :math:`H^{-1} g`, and possibly slightly-improved performance, + but at the cost of slowing things down. + + Also probably don't play with this hyperparameter. + + backtrack_iters (int): Maximum number of steps allowed in the + backtracking line search. Since the line search usually doesn't + backtrack, and usually only steps back once when it does, this + hyperparameter doesn't often matter. + + backtrack_coeff (float): How far back to step during backtracking line + search. (Always between 0 and 1, usually above 0.5.) + + lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, + close to 1.) + + max_ep_len (int): Maximum length of trajectory / episode / rollout. + + save_freq (int): How often (in terms of gap between epochs) to save + the current policy and value function. + + algo: Either 'trpo' or 'npg': this code supports both, since they are + almost the same. 
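+
+        A typical invocation (mirroring the __main__ block at the bottom of this file) is:
+
+            trpo(lambda: gym.make('Pendulum-v0'), actor_critic=mlp_actor_critic,
+                 ac_kwargs=dict(hidden_sizes=[64, 64]), gamma=0.99, seed=0,
+                 steps_per_epoch=4000, epochs=50)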
+ + """ + tf.random.set_seed(seed) + np.random.seed(seed) + + env = env_fn() + obs_dim = env.observation_space.shape + act_dim = env.action_space.shape + + actor, critic, cal_func, name = actor_critic(env.observation_space, env.action_space, **ac_kwargs) + actor.train() + critic.train() + + # Experience buffer + local_steps_per_epoch = int(steps_per_epoch) + + if name == 'mlp_categorical_policy': + info_shapes = {'logp_all': env.action_space.n} + else: + info_shapes = {'mu': env.action_space.shape, 'log_std': env.action_space.shape} + + buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) + + # TRPO losses + def cal_pi_loss(inputs): + obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_phs = inputs + pi, logp, logp_pi, info, info_phs, d_kl, v = cal_func(obs_buf, act_buf, *info_bufs) + + ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) + pi_loss = -tf.reduce_mean(ratio * adv_ph) + return pi_loss + + def cal_v_loss(inputs): + obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_phs = inputs + + s = obs_buf.astype(np.float32) + v = tf.squeeze(critic(s), axis=1) + + v_loss = tf.reduce_mean((ret_ph - v) ** 2) + return v_loss + + # Train value function + def train_vf(inputs): + with tf.GradientTape() as tape: + v_loss = cal_v_loss(inputs) + grad = tape.gradient(v_loss, critic.trainable_weights) + tf.optimizers.Adam(vf_lr).apply_gradients(zip(grad, critic.trainable_weights)) + + # Symbols needed for CG solver + def pi_params(): + return actor.trainable_weights + + def gradient(inputs): + with tf.GradientTape() as tape: + pi_loss = cal_pi_loss(inputs) + grad = tape.gradient(pi_loss, pi_params()) + return flat_concat(grad) + + def get_hvp(inputs, v_ph): + obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs + v_ph, hvp = hessian_vector_product(cal_func, (obs_buf, act_buf, *info_bufs), pi_params(), v_ph) + if damping_coeff > 0: + hvp += damping_coeff * v_ph + return hvp + + # Symbols for getting and setting params + def get_pi_params(): + return flat_concat(actor.trainable_weights) + + def set_pi_params(v_ph): + assign_params_from_flat(v_ph, pi_params()) + + def save_ckpt(): + """ + save trained weights + :return: None + """ + tl.files.save_npz(actor.trainable_weights, name='model/actor.npz') + tl.files.save_npz(critic.trainable_weights, name='model/critic.npz') + + def load_ckpt(): + """ + load trained weights + :return: None + """ + tl.files.load_and_assign_npz(name='model/actor.npz', network=actor) + tl.files.load_and_assign_npz(name='model/critic.npz', network=critic) + + def cg(Ax, b): + """ + Conjugate gradient algorithm + (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) + """ + x = np.zeros_like(b) + r = copy.deepcopy(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
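+        # Each iteration only needs the Hessian-vector product Ax(p), so the KL Hessian is
+        # never formed explicitly; r is the residual and p the search direction, both
+        # initialised from b because x starts at zero.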
+ p = copy.deepcopy(r) + r_dot_old = np.dot(r, r) + for _ in range(cg_iters): + z = Ax(p) + alpha = r_dot_old / (np.dot(p, z) + EPS) + x += alpha * p + r -= alpha * z + r_dot_new = np.dot(r, r) + p = r + (r_dot_new / r_dot_old) * p + r_dot_old = r_dot_new + return x + + def update(): + # Prepare hessian func, gradient eval + inputs = buf.get() + + Hx = lambda x: get_hvp(inputs, x) + + g = gradient(inputs) + pi_l_old = cal_pi_loss(inputs) + v_l_old = cal_v_loss(inputs) + + # Core calculations for TRPO or NPG + x = cg(Hx, g) + alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) + old_params = get_pi_params() + + def set_and_eval(step): + set_pi_params(old_params - alpha * x * step) + obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs + pi, logp, logp_pi, info, info_phs, d_kl, v = cal_func(obs_buf, act_buf, *info_bufs) + pi_loss = cal_pi_loss(inputs) + return d_kl, pi_loss + + if algo == 'npg': + # npg has no backtracking or hard kl constraint enforcement + kl, pi_l_new = set_and_eval(step=1.) + + elif algo == 'trpo': + # trpo augments npg with backtracking line search, hard kl + for j in range(backtrack_iters): + kl, pi_l_new = set_and_eval(step=backtrack_coeff ** j) + if kl <= delta and pi_l_new <= pi_l_old: + break + + if j == backtrack_iters - 1: + kl, pi_l_new = set_and_eval(step=0.) + + # Value function updates + for _ in range(train_v_iters): + train_vf(inputs) + + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + sum_reward_list = [] + + # Main loop: collect experience in env and update/log each epoch + for epoch in range(epochs): + t0 = time.time() + for t in range(local_steps_per_epoch): + agent_outs = cal_func(o.reshape(1, -1)) + + a, v_t, logp_t, info_t = np.array(agent_outs[0][0], np.float32), \ + np.array(agent_outs[1], np.float32), \ + np.array(agent_outs[2], np.float32), \ + np.array(agent_outs[3:], np.float32) + + buf.store(o, a, r, v_t, logp_t, info_t) + + o, r, d, _ = env.step(a) + + ep_ret += r + ep_len += 1 + + terminal = d or (ep_len == max_ep_len) + + if terminal or (t == local_steps_per_epoch - 1): + if not (terminal): + print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) + # if trajectory didn't reach terminal state, bootstrap value target + last_val = r if d else critic(o.reshape(1, -1)) + buf.finish_path(last_val) + + if terminal: + # trajectory finished + + sum_reward_list.append(ep_ret) + print("Episode [%d/%d]\ttrajectory counter: %d \tsum reward: %d \ttook: %.5fs " % + (epoch, epochs, len(sum_reward_list), ep_ret, time.time() - t0)) + t0 = time.time() + + plt.ion() + plt.cla() + plt.title('TRPO') + plt.plot(np.arange(len(sum_reward_list)), sum_reward_list) + plt.ylim(-2000, 0) + plt.xlabel('Episode') + plt.ylabel('Moving averaged episode reward') + plt.show() + plt.pause(0.1) + plt.ioff() + + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + # Save model + if (epoch % save_freq == 0) or (epoch == epochs - 1): + pass + + # Perform TRPO or NPG update! 
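+        # update() consumes the whole on-policy batch from the buffer: it estimates the
+        # policy gradient, solves Hx = g with conjugate gradient, scales the step to respect
+        # the KL limit delta (with a backtracking line search in the 'trpo' case), and then
+        # fits the value function for train_v_iters gradient steps.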
+ update() + + plt.savefig('plt.jpg') + while True: + o = env.reset() + for i in range(200): + env.render() + agent_outs = cal_func(o.reshape(1, -1)) + a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:] + o, r, d, _ = env.step(a) + if d: + break + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--env', type=str, default='Pendulum-v0') + parser.add_argument('--hid', type=int, default=64) + parser.add_argument('--l', type=int, default=2) + parser.add_argument('--gamma', type=float, default=0.99) + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--steps', type=int, default=4000) + parser.add_argument('--epochs', type=int, default=50) + args = parser.parse_args() + + trpo(lambda: gym.make(args.env), actor_critic=mlp_actor_critic, + ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), gamma=args.gamma, + seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,) From 7e332c40e668c99ed6368b2f3cf37a82c1faab41 Mon Sep 17 00:00:00 2001 From: initial-h <18811472492@163.com> Date: Tue, 28 May 2019 00:43:33 +0800 Subject: [PATCH 18/57] Update README.md --- examples/reinforcement_learning/README.md | 35 ++++++++++++++--------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c9874fd91..341bc1764 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -32,7 +32,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. | DDPG | Continuous | Continuous | | TD3 | Continuous | Continuous | | HER | | | -| PG | Continuous | Continuous | +| PG | Continuous | Discrete | | TRPO | Continuous | Continuous | | PPO | Continuous | Continuous | | | | | @@ -47,13 +47,11 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Code: `./tutorial_frozenlake_q_table.py` - * Deep Q-Network (DQN) Code: `./tutorial_frozenlake_dqn.py` - * Double DQN / Dueling DQN / Noisy DQN Code: `./tutorial_double_dueling_noisy_dqn.py` @@ -87,26 +85,31 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Code:`./tutorial_cartpole_ac.py` - * Asynchronous Advantage Actor-Critic (A3C) Code: `./tutorial_bipedalwalker_a3c_continuous_action.py` - * Soft Actor-Critic (SAC) Code: `./tutorial_sac.py` Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) - -* Deep Deterministic Policy Gradient (DDPG) +* Policy Gradient (PG/REINFORCE) - To do. + Code: `./tutorial_PG.py` + + Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) + + +* Deep Deterministic Policy Gradient (DDPG) + Code: `./tutorial_DDPG.py` + Paper: [CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING](https://arxiv.org/pdf/1509.02971.pdf) + * Twin Delayed DDPG (TD3) @@ -115,25 +118,31 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) - * Hindsight Experience Replay (HER) To do. - * Trust Region Policy Optimization (TRPO) - To do. - + Code: `./tutorial_TRPO.py` + Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) + * Proximal Policy Optimization (PPO) - To do. 
+ Code: `./tutorial_PPO.py` + + Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) +* Distributed Proximal Policy Optimization (PPO) + + Code: `./tutorial_DPPO.py` + Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) + * etc ## Environment: From 51681407b4d1955d7e66cb67650644a5483ddd87 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Thu, 30 May 2019 15:50:48 +0100 Subject: [PATCH 19/57] readme --- examples/reinforcement_learning/README.md | 76 ++++++++++++------- .../tutorial_cartpole_ac.py | 13 +++- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 341bc1764..da4514422 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -1,6 +1,6 @@ # Reinforcement Learning Tutorial with Tensorlayer -This folder contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0. +This repository contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting Tensorflow 2.0. We aim to make the reinforcement learning tutorial for each algorithm simple and straight-forward to use, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly. ## Prerequisites: @@ -14,31 +14,31 @@ This folder contains implementation of most popular reinforcement learning algor `pip install --upgrade tf-nightly-2.0-preview tfp-nightly` +## Status: Beta + +We are currently open to any suggestions or pull requests from you to make the reinforcement learning tutorial with TensorLayer2.0 a better code repository for both new learners and senior researchers. Some of the algorithms mentioned in the this markdown may be not yet available, since we are still trying to implement more RL algorithms and optimize their performances. However, those algorithms listed above will come out in a few weeks, and the repository will keep updating more advanced RL algorithms in the future. + ## To Use: +For each tutorial, open a terminal and run: + `python ***.py` or `python ***.py --train` for training and `python ***.py --test` for testing. ## Table of Contents: -| Algorithms | Observation Space | Action Space | -| ------------ | ----------------- | ------------ | -| Q-learning | Discrete | Discrete | -| DQN | Discrete | Discrete | -| Actor-Critic | Continuous | Discrete | -| A3C | Continuous | Continuous | -| SAC | Continuous | Continuous | -| DDPG | Continuous | Continuous | -| TD3 | Continuous | Continuous | -| HER | | | -| PG | Continuous | Discrete | -| TRPO | Continuous | Continuous | -| PPO | Continuous | Continuous | -| | | | -| | | | -| | | | -| | | | +| Algorithms | Observation Space | Action Space | Tutorial Env | +| ---------------- | ----------------- | ------------ | -------------- | +| Q-learning | Discrete | Discrete | FrozenLake | +| DQN and variants | Discrete | Discrete | Pong, CartPole | +| Actor-Critic | Continuous | Discrete | CartPole | +| A3C | Continuous | Continuous | BipedalWalker | +| SAC | Continuous | Continuous | Pendulum | +| PG | Continuous | Discrete | CartPole | +| DDPG | Continuous | Continuous | Pendulum | +| TD3 | Continuous | Continuous | Pendulum | +| C51 | | | CartPole | @@ -47,23 +47,28 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
Code: `./tutorial_frozenlake_q_table.py` + * Deep Q-Network (DQN) Code: `./tutorial_frozenlake_dqn.py` + * Double DQN / Dueling DQN / Noisy DQN Code: `./tutorial_double_dueling_noisy_dqn.py` Experiment Environments: Pong and Cartpole + + * Prioritized replay Code: `./tutorial_prioritized_replay.py` Experiment Environments: Pong and Cartpole + * Distributed DQN @@ -72,6 +77,8 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Experiment Environments: Pong and Cartpole + + * Retrace(lambda) DQN @@ -79,37 +86,46 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Experiment Environments: Pong and Cartpole + + * Actor-Critic (AC) Code:`./tutorial_cartpole_ac.py` + * Asynchronous Advantage Actor-Critic (A3C) Code: `./tutorial_bipedalwalker_a3c_continuous_action.py` + * Soft Actor-Critic (SAC) Code: `./tutorial_sac.py` Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) + + * Policy Gradient (PG/REINFORCE) Code: `./tutorial_PG.py` - + Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) + - + * Deep Deterministic Policy Gradient (DDPG) Code: `./tutorial_DDPG.py` - + Paper: [CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING](https://arxiv.org/pdf/1509.02971.pdf) + + * Twin Delayed DDPG (TD3) @@ -118,31 +134,35 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) -* Hindsight Experience Replay (HER) - - To do. - * Trust Region Policy Optimization (TRPO) Code: `./tutorial_TRPO.py` - + Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) + * Proximal Policy Optimization (PPO) Code: `./tutorial_PPO.py` - + Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) + + * Distributed Proximal Policy Optimization (PPO) Code: `./tutorial_DPPO.py` - + Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) + +* Hindsight Experience Replay (HER) + + To do. + * etc ## Environment: diff --git a/examples/reinforcement_learning/tutorial_cartpole_ac.py b/examples/reinforcement_learning/tutorial_cartpole_ac.py index 119ad7eb7..8a6427ac2 100644 --- a/examples/reinforcement_learning/tutorial_cartpole_ac.py +++ b/examples/reinforcement_learning/tutorial_cartpole_ac.py @@ -54,11 +54,22 @@ LR_A = 0.001 # learning rate for actor LR_C = 0.01 # learning rate for critic +''' +choose environment +1. Openai gym: +env = gym.make() +2. 
DeepMind Control Suite: +env = dm_control2gym.make() +''' + env = gym.make('CartPole-v0') +# dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), +# depth=False, scene_option=None) +# env = dm_control2gym.make(domain_name="cartpole", task_name="balance") env.seed(2) # reproducible # env = env.unwrapped - N_F = env.observation_space.shape[0] +# N_A = env.action_space.shape[0] N_A = env.action_space.n print("observation dimension: %d" % N_F) # 4 From a4f3e4f6386901a6c527185f0f26b626b55c276e Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Fri, 31 May 2019 15:29:49 +0100 Subject: [PATCH 20/57] readme --- examples/reinforcement_learning/README.md | 6 +++--- examples/reinforcement_learning/tutorial_TRPO.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index da4514422..a6ad60579 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -1,6 +1,6 @@ # Reinforcement Learning Tutorial with Tensorlayer -This repository contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting Tensorflow 2.0. We aim to make the reinforcement learning tutorial for each algorithm simple and straight-forward to use, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly. +This repository contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting [Tensorflow 2.0](https://www.tensorflow.org/alpha/guide/effective_tf2). We aim to make the reinforcement learning tutorial for each algorithm simple and straight-forward to use, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly. ## Prerequisites: @@ -40,7 +40,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. | TD3 | Continuous | Continuous | Pendulum | | C51 | | | CartPole | - +## Examples of RL Algorithms: * Q-learning @@ -167,7 +167,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. ## Environment: -[Openai Gym](https://gym.openai.com/) +We typically apply game environments in [Openai Gym](https://gym.openai.com/) for our tutorials. For other environment sources like [DeepMind Control Suite](https://github.com/deepmind/dm_control) and [Marathon-Envs in Unity](https://github.com/Unity-Technologies/marathon-envs), they all have wrappers to convert into format of Gym environments, see [here](https://github.com/martinseilair/dm_control2gym) and [here](https://github.com/Unity-Technologies/marathon-envs/tree/master/gym-unity). 
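+
+For example, a DeepMind Control Suite task can be used through the Gym interface roughly as follows (a minimal sketch assuming the `dm_control2gym` wrapper is installed; it mirrors the commented-out usage in `tutorial_cartpole_ac.py`):
+
+```python
+import dm_control2gym
+
+# wrap a dm_control domain/task pair as a Gym-style environment
+env = dm_control2gym.make(domain_name="cartpole", task_name="balance")
+obs = env.reset()
+obs, reward, done, info = env.step(env.action_space.sample())
+```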
Our env wrapper: `./tutorial_wrappers.py` diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index fedb60b7b..7e344ddc5 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -497,6 +497,7 @@ def train_vf(inputs): with tf.GradientTape() as tape: v_loss = cal_v_loss(inputs) grad = tape.gradient(v_loss, critic.trainable_weights) + print(grad) tf.optimizers.Adam(vf_lr).apply_gradients(zip(grad, critic.trainable_weights)) # Symbols needed for CG solver From 3e4f01c870033ebb65f6f687479dfd1eb7df5b30 Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Tue, 4 Jun 2019 17:53:08 +0800 Subject: [PATCH 21/57] Add files via upload --- .../reinforcement_learning/tutorial_DDPG.py | 20 +- .../reinforcement_learning/tutorial_DPPO.py | 22 + .../reinforcement_learning/tutorial_PG.py | 7 +- .../reinforcement_learning/tutorial_PPO.py | 15 +- .../reinforcement_learning/tutorial_TRPO.py | 488 +++++++++--------- 5 files changed, 289 insertions(+), 263 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py index dafd7f59b..71ac9bf06 100644 --- a/examples/reinforcement_learning/tutorial_DDPG.py +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -24,6 +24,7 @@ import tensorflow as tf import tensorlayer as tl import numpy as np +import os ##################### hyper parameters #################### @@ -175,20 +176,23 @@ def save_ckpt(self): save trained weights :return: None """ - tl.files.save_npz(self.actor.trainable_weights, name='model/actor.npz') - tl.files.save_npz(self.actor_target.trainable_weights, name='model/actor_target.npz') - tl.files.save_npz(self.critic.trainable_weights, name='model/critic.npz') - tl.files.save_npz(self.critic_target.trainable_weights, name='model/critic_target.npz') + if not os.path.exists('model'): + os.makedirs('model') + + tl.files.save_weights_to_hdf5('model/ddpg_actor.hdf5', self.actor) + tl.files.save_weights_to_hdf5('model/ddpg_actor_target.hdf5', self.actor_target) + tl.files.save_weights_to_hdf5('model/ddpg_critic.hdf5', self.critic) + tl.files.save_weights_to_hdf5('model/ddpg_critic_target.hdf5', self.critic_target) def load_ckpt(self): """ load trained weights :return: None """ - tl.files.load_and_assign_npz(name='model/actor.npz', network=self.actor) - tl.files.load_and_assign_npz(name='model/actor_target.npz', network=self.actor_target) - tl.files.load_and_assign_npz(name='model/critic.npz', network=self.critic) - tl.files.load_and_assign_npz(name='model/critic_target.npz', network=self.critic_target) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor_target.hdf5', self.actor_target) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic.hdf5', self.critic) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic_target.hdf5', self.critic_target) if __name__ == '__main__': diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py index 564895198..9bcad026e 100644 --- a/examples/reinforcement_learning/tutorial_DPPO.py +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -30,6 +30,7 @@ import tensorlayer as tl import tensorflow_probability as tfp +import os EP_MAX = 1000 EP_LEN = 200 @@ -220,6 +221,27 @@ def get_v(self, s): if s.ndim < 2: s = s[np.newaxis, :] return 
self.critic(s)[0, 0] + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/dppo_actor.hdf5', self.actor) + tl.files.save_weights_to_hdf5('model/dppo_actor_old.hdf5', self.actor_old) + tl.files.save_weights_to_hdf5('model/dppo_critic.hdf5', self.critic) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/dppo_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/dppo_actor_old.hdf5', self.actor_old) + tl.files.load_hdf5_to_weights_in_order('model/dppo_critic.hdf5', self.critic) + + '''--------------------------------------------------------------''' diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py index 56d15986e..8e31817cb 100644 --- a/examples/reinforcement_learning/tutorial_PG.py +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -24,6 +24,7 @@ import tensorflow as tf import tensorlayer as tl import numpy as np +import os tl.logging.set_verbosity(tl.logging.DEBUG) @@ -147,14 +148,16 @@ def save_ckpt(self): save trained weights :return: None """ - tl.files.save_npz(self.model.trainable_weights, name='model.npz') + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/pg_policy.hdf5', self.model) def load_ckpt(self): """ load trained weights :return: None """ - tl.files.load_and_assign_npz(name='model.npz', network=self.model) + tl.files.load_hdf5_to_weights_in_order('model/pg_policy.hdf5', self.model) if __name__ == '__main__': diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py index 727704e40..fddf39406 100644 --- a/examples/reinforcement_learning/tutorial_PPO.py +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -28,6 +28,7 @@ import gym import tensorlayer as tl import tensorflow_probability as tfp +import os EP_MAX = 1000 EP_LEN = 200 @@ -213,18 +214,20 @@ def save_ckpt(self): save trained weights :return: None """ - tl.files.save_npz(self.actor.trainable_weights, name='model/actor.npz') - tl.files.save_npz(self.actor_old.trainable_weights, name='model/actor_old.npz') - tl.files.save_npz(self.critic.trainable_weights, name='model/critic.npz') + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/ppo_actor.hdf5', self.actor) + tl.files.save_weights_to_hdf5('model/ppo_actor_old.hdf5', self.actor_old) + tl.files.save_weights_to_hdf5('model/ppo_critic.hdf5', self.critic) def load_ckpt(self): """ load trained weights :return: None """ - tl.files.load_and_assign_npz(name='model/actor.npz', network=self.actor) - tl.files.load_and_assign_npz(name='model/actor_old.npz', network=self.actor_old) - tl.files.load_and_assign_npz(name='model/critic.npz', network=self.critic) + tl.files.load_hdf5_to_weights_in_order('model/ppo_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/ppo_actor_old.hdf5', self.actor_old) + tl.files.load_hdf5_to_weights_in_order('model/ppo_critic.hdf5', self.critic) env = gym.make('Pendulum-v0').unwrapped diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index 7e344ddc5..67c11b0f5 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -24,47 +24,73 @@ """ import numpy as np - import tensorflow as tf import 
tensorflow_probability as tfp import tensorlayer as tl import gym -from gym.spaces import Box, Discrete import time -from matplotlib import pyplot as plt -from scipy import signal +import os + +import matplotlib.pyplot as plt +import scipy.signal import copy +from gym.spaces import Box, Discrete EPS = 1e-8 -def assign_params_from_flat(x, params): - flat_size = lambda p: int(np.prod(p.shape.as_list())) # the 'int' is important for scalars - splits = tf.split(x, [flat_size(p) for p in params]) - new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] - return tf.group([p.assign(p_new) for p, p_new in zip(params, new_params)]) +def combined_shape(length, shape=None): + if shape is None: + return length, + return (length, shape) if np.isscalar(shape) else (length, *shape) -def hessian_vector_product(func, inputs, params, x): - # for H = grad**2 f, compute Hx - with tf.GradientTape() as tape0: - with tf.GradientTape() as tape1: - pi, logp, logp_pi, info, info_phs, d_kl, v = func(*inputs) - grad1 = tape1.gradient(d_kl, params) - g = flat_concat(grad1) - assert g.shape == x.shape - a = tf.reduce_sum(g * x) - grad0 = tape0.gradient(a, params) - g0 = flat_concat(grad0) - return x, g0 +def keys_as_sorted_list(dict): + return sorted(list(dict.keys())) -def flat_concat(xs): - return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0) +def values_as_sorted_list(dict): + return [dict[k] for k in keys_as_sorted_list(dict)] -def flat_grad(f, params): - return flat_concat(tf.gradients(xs=params, ys=f)) +def input_layer(dim=None): + return tl.layers.Input(dtype=tf.float32, shape=combined_shape(None, dim)) + + +def input_layers(*args): + return [input_layer(dim) for dim in args] + + +def input_layer_from_space(space): + if isinstance(space, Box): + return input_layer(space.shape) + elif isinstance(space, Discrete): + return tl.layers.Input(dtype=tf.int32, shape=(None,)) + raise NotImplementedError + + +def input_layers_from_spaces(*args): + return [input_layer_from_space(space) for space in args] + + +def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): + for h in hidden_sizes[:-1]: + x = tl.layers.Dense(n_units=h, act=activation)(x) + return tl.layers.Dense(n_units=hidden_sizes[-1], act=output_activation)(x) + + +def get_vars(model: tl.models.Model): + return model.trainable_weights + + +def count_vars(model: tl.models.Model): + v = get_vars(model) + return sum([np.prod(var.shape.as_list()) for var in v]) + + +def gaussian_likelihood(x, mu, log_std): + pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) + return tf.reduce_sum(pre_sum, axis=1) def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): @@ -79,11 +105,6 @@ def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): return tf.reduce_mean(all_kls) -def gaussian_likelihood(x, mu, log_std): - pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) - return tf.reduce_sum(pre_sum, axis=1) - - def categorical_kl(logp0, logp1): """ tf symbol for mean KL divergence between two batches of categorical probability distributions, @@ -93,170 +114,135 @@ def categorical_kl(logp0, logp1): return tf.reduce_mean(all_kls) -def mlp(input_space, hidden_sizes=(32,), activation=tf.tanh, output_activation=None, name=None): - inputs = input_layers_from_space(input_space) - x = inputs - for i, h in enumerate(hidden_sizes[:-1]): - # x = tf.layers.dense(x, units=h, activation=activation) - if name: - n = name + '_layer_' + str(i) - else: - n = None - 
x = tl.layers.Dense(h, activation, name=n)(x) - if name: - n = name + '_output' - else: - n = None - outputs = tl.layers.Dense(hidden_sizes[-1], output_activation, name=n)(x) - return tl.models.Model(inputs, outputs, name) - - -def mlp_categorical_policy(obs_space, act_space, hidden_sizes, activation, output_activation): - act_dim = act_space.n - actor = mlp(obs_space, list(hidden_sizes) + [act_dim], activation, None, name='actor') - - critic = mlp(obs_space, list(hidden_sizes) + [1], activation, None, name='critic') +def flat_concat(xs): + return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0) - def cal(s, a=None, old_logp_all=None): - s = s.astype(np.float32) - logits = actor(s) - logp_all = tf.nn.log_softmax(logits) - pi = tf.squeeze(tfp.distributions.Multinomial(logits, 1), axis=1) - logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) - v = tf.squeeze(critic(s), axis=1) - info = {'logp_all': logp_all} - if a is None and old_logp_all is None: - info = values_as_sorted_list(info) - return [pi, v, logp_pi] + info - else: - a = a.astype(np.float32) - logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) +def flat_grad(f, params): + return flat_concat(tf.gradients(xs=params, ys=f)) - # check_shape(old_logp_all, act_dim) - d_kl = categorical_kl(logp_all, old_logp_all) - info_phs = {'logp_all': old_logp_all} - return pi, logp, logp_pi, info, info_phs, d_kl, v +def hessian_vector_product(f, params, x): + # for H = grad**2 f, compute Hx + g = flat_grad(f, params) + return flat_grad(tf.reduce_sum(g * x), params) - return actor, cal +def assign_params_from_flat(x, params): + flat_size = lambda p: int(np.prod(p.shape.as_list())) # the 'int' is important for scalars + splits = tf.split(x, [flat_size(p) for p in params]) + new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] + return tf.group([p.assign(p_new) for p, p_new in zip(params, new_params)]) -def check_shape(array, shape): - try: - assert np.shape(array) == shape - except Exception as e: - print(np.shape(array), '!=', shape) - raise e +def discount_cumsum(x, discount): + """ + magic from rllab for computing discounted cumulative sums of vectors. 
-def check_shapes(array_list, shape_list): - for arr, shp in zip(array_list, shape_list): - check_shape(arr, shp) + input: + vector x, + [x0, + x1, + x2] + output: + [x0 + discount * x1 + discount^2 * x2, + x1 + discount * x2, + x2] + """ + return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] -def mlp_gaussian_policy(obs_space, act_space, hidden_sizes, activation, output_activation): - act_dim = act_space.shape[0] - actor = mlp(obs_space, list(hidden_sizes) + [act_dim], activation, output_activation, name='actor') - critic = mlp(obs_space, list(hidden_sizes) + [1], activation, None, name='critic') +""" +Policies +""" - def cal(s, a=None, old_mu_ph=None, old_log_std_ph=None): - s = s.astype(np.float32) - mu = actor(s) - log_std = -0.5 * np.ones(act_dim, dtype=np.float32) - std = tf.exp(log_std) - pi = mu + tf.random.normal(tf.shape(mu)) * std - logp_pi = gaussian_likelihood(pi, mu, log_std) - v = tf.squeeze(critic(s), axis=1) - info = {'mu': mu, 'log_std': log_std} - if a is None and old_mu_ph is None and old_log_std_ph is None: - info = values_as_sorted_list(info) - return [pi, v, logp_pi] + info - elif a is not None and old_mu_ph is not None and old_log_std_ph is not None: - a = a.astype(np.float32) - logp = gaussian_likelihood(a, mu, log_std) - # check_shapes((old_mu_ph, old_log_std_ph), (act_dim, act_dim)) - d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) +def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation): + act_dim = a.n - info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} + x = input_layer_from_space(x) + logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None) + actor = tl.models.Model(x, logits) - return pi, logp, logp_pi, info, info_phs, d_kl, v - else: - print(a, old_mu_ph, old_log_std_ph) - raise Exception + def cal_outputs_0(states): + states = states.astype(np.float32) + logits = actor(states) + logp_all = tf.nn.log_softmax(logits) + pi = tf.squeeze(tfp.distributions.Multinomial(1, logits), axis=1) + logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) + info = {'logp_all': logp_all} + return pi, logp_pi, info, logp_all - return actor, critic, cal + def cal_outputs_1(states, actions, old_logp_all): + pi, logp_pi, info, logp_all = cal_outputs_0(states) + logp = tf.reduce_sum(tf.one_hot(actions, depth=act_dim) * logp_all, axis=1) + d_kl = categorical_kl(logp_all, old_logp_all) + info_phs = {'logp_all': old_logp_all} -def mlp_actor_critic(obs_space, act_space, hidden_sizes=(64, 64), activation=tf.tanh, - output_activation=None, policy=None): - # default policy builder depends on action space - if policy is None and isinstance(act_space, Box): - policy = mlp_gaussian_policy - name = 'mlp_gaussian_policy' - elif policy is None and isinstance(act_space, Discrete): - policy = mlp_categorical_policy - name = 'mlp_categorical_policy' - else: - raise Exception + return pi, logp, logp_pi, info, info_phs, d_kl - actor, critic, cal_func = policy(obs_space, act_space, hidden_sizes, activation, output_activation) + return actor, cal_outputs_0, cal_outputs_1 - return actor, critic, cal_func, name +def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): + act_dim = a.shape[0] -def combined_shape(length, shape=None): - if shape is None: - return length, - return (length, shape) if np.isscalar(shape) else (length, *shape) + x = input_layer_from_space(x) + mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation) + actor = tl.models.Model(x, mu) + def 
cal_outputs_0(states): + states = states.astype(np.float32) + mu = actor(states) + log_std = -0.5 * np.ones(act_dim, dtype=np.float32) + std = tf.exp(log_std) + pi = mu + tf.random.normal(tf.shape(mu)) * std + logp_pi = gaussian_likelihood(pi, mu, log_std) -def keys_as_sorted_list(dict): - return sorted(list(dict.keys())) + info = {'mu': mu, 'log_std': log_std} + return pi, logp_pi, info, mu, log_std -def values_as_sorted_list(dict): - return [dict[k] for k in keys_as_sorted_list(dict)] + def cal_outputs_1(states, actions, old_log_std_ph, old_mu_ph): + pi, logp_pi, info, mu, log_std = cal_outputs_0(states) + logp = gaussian_likelihood(actions, mu, log_std) + d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) + info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} -def discount_cumsum(x, discount): - """ - magic from rllab for computing discounted cumulative sums of vectors. + return pi, logp, logp_pi, info, info_phs, d_kl - input: - vector x, - [x0, - x1, - x2] + return actor, cal_outputs_0, cal_outputs_1 - output: - [x0 + discount * x1 + discount^2 * x2, - x1 + discount * x2, - x2] - """ - return signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] +""" +Actor-Critics +""" -def input_layer(dim=None): - return tl.layers.Input(combined_shape(None, dim)) +def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh, + output_activation=None, policy=None): + # default policy builder depends on action space + if policy is None and isinstance(a, Box): + policy = mlp_gaussian_policy + elif policy is None and isinstance(a, Discrete): + policy = mlp_categorical_policy -def input_layers(*args): - return [input_layer(dim) for dim in args] + actor, actor_cal_func_0, actor_cal_func_1 = policy(x, a, hidden_sizes, activation, output_activation) + x = input_layer_from_space(x) + critic = tl.models.Model(x, mlp(x, list(hidden_sizes) + [1], activation, None)) -def input_layers_from_space(space): - if isinstance(space, Box): - return input_layer(space.shape) - elif isinstance(space, Discrete): - return tl.layers.Input((None,)) - raise NotImplementedError + actor.train() + critic.train() + def critic_cal_func(states): + states = states.astype(np.float32) + return tf.squeeze(critic(states), axis=1) -def input_layers_from_spaces(*args): - return [input_layers_from_space(space) for space in args] + return actor, actor_cal_func_0, actor_cal_func_1, critic, critic_cal_func class GAEBuffer: @@ -313,7 +299,7 @@ def finish_path(self, last_val=0): rews = np.append(self.rew_buf[path_slice], last_val) vals = np.append(self.val_buf[path_slice], last_val) - # GAE-Lambda advantage calculation + # the next two lines implement GAE-Lambda advantage calculation deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam) @@ -330,14 +316,10 @@ def get(self): """ assert self.ptr == self.max_size # buffer has to be full before you can get self.ptr, self.path_start_idx = 0, 0 - # The advantage normalization trick - _sum, _n = np.sum(self.adv_buf), len(self.adv_buf) - mean = _sum / _n - - _sum_sq = np.sum((self.adv_buf - mean) ** 2) - std = np.sqrt(_sum_sq / _n) # compute global std - self.adv_buf = (self.adv_buf - mean) / std + # the next two lines implement the advantage normalization trick + adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf) + self.adv_buf = (self.adv_buf - adv_mean) / adv_std return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, 
self.logp_buf] + values_as_sorted_list(self.info_bufs) @@ -351,7 +333,7 @@ def get(self): """ -def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=0, +def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=1, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, save_freq=10, algo='trpo'): @@ -418,7 +400,7 @@ def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=0, damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: - + .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. @@ -458,87 +440,109 @@ def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=0, obs_dim = env.observation_space.shape act_dim = env.action_space.shape - actor, critic, cal_func, name = actor_critic(env.observation_space, env.action_space, **ac_kwargs) - actor.train() - critic.train() + # Share information about action space with policy architecture + ac_kwargs['action_space'] = env.action_space + + # Main models and functions + actor, actor_cal_func_0, actor_cal_func_1, critic, critic_cal_func = \ + actor_critic(env.observation_space, env.action_space) + + # Every step, get: action, value, logprob, & info for pdist (for computing kl div) + def get_action_ops(states): + pi, logp_pi, info, *_ = actor_cal_func_0(states) + v = critic_cal_func(states) + return [pi, v, logp_pi] + values_as_sorted_list(info) # Experience buffer - local_steps_per_epoch = int(steps_per_epoch) + local_steps_per_epoch = steps_per_epoch + + if isinstance(env.action_space, Box): + act_dim = env.action_space.shape[0] + info_shapes = {'mu': [act_dim], 'log_std': [act_dim]} - if name == 'mlp_categorical_policy': - info_shapes = {'logp_all': env.action_space.n} + elif isinstance(env.action_space, Discrete): + act_dim = env.action_space.n + info_shapes = {'logp_all': [act_dim]} else: - info_shapes = {'mu': env.action_space.shape, 'log_std': env.action_space.shape} + raise Exception('info_shape error') buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # TRPO losses - def cal_pi_loss(inputs): - obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs - x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_phs = inputs - pi, logp, logp_pi, info, info_phs, d_kl, v = cal_func(obs_buf, act_buf, *info_bufs) + def pi_loss(inputs): + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) return pi_loss - def cal_v_loss(inputs): - obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs - x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_phs = inputs - - s = obs_buf.astype(np.float32) - v = tf.squeeze(critic(s), axis=1) - + def v_loss(inputs): + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + v = critic_cal_func(x_ph) v_loss = tf.reduce_mean((ret_ph - v) ** 2) return v_loss - # Train value function + # Optimizer for value function + critic_optimizer = tf.optimizers.Adam(learning_rate=vf_lr) + def train_vf(inputs): with tf.GradientTape() as tape: - v_loss = cal_v_loss(inputs) - grad = tape.gradient(v_loss, critic.trainable_weights) - print(grad) - tf.optimizers.Adam(vf_lr).apply_gradients(zip(grad, 
critic.trainable_weights)) + loss = v_loss(inputs) + grad = tape.gradient(loss, critic.trainable_weights) + critic_optimizer.apply_gradients(zip(grad, critic.trainable_weights)) # Symbols needed for CG solver - def pi_params(): - return actor.trainable_weights - def gradient(inputs): + pi_params = actor.trainable_weights with tf.GradientTape() as tape: - pi_loss = cal_pi_loss(inputs) - grad = tape.gradient(pi_loss, pi_params()) - return flat_concat(grad) + loss = pi_loss(inputs) + grad = tape.gradient(loss, pi_params) + gradient = flat_concat(grad) + return gradient + + def hvp(inputs, v_ph): + pi_params = actor.trainable_weights + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + + with tf.GradientTape() as tape1: + with tf.GradientTape() as tape0: + pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) + g = flat_concat(tape0.gradient(d_kl, pi_params)) + l = tf.reduce_sum(g * v_ph) + hvp = flat_concat(tape1.gradient(l, pi_params)) - def get_hvp(inputs, v_ph): - obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs - v_ph, hvp = hessian_vector_product(cal_func, (obs_buf, act_buf, *info_bufs), pi_params(), v_ph) if damping_coeff > 0: hvp += damping_coeff * v_ph return hvp # Symbols for getting and setting params def get_pi_params(): - return flat_concat(actor.trainable_weights) + pi_params = actor.trainable_weights + return flat_concat(pi_params) def set_pi_params(v_ph): - assign_params_from_flat(v_ph, pi_params()) + pi_params = actor.trainable_weights + assign_params_from_flat(v_ph, pi_params) def save_ckpt(): """ save trained weights :return: None """ - tl.files.save_npz(actor.trainable_weights, name='model/actor.npz') - tl.files.save_npz(critic.trainable_weights, name='model/critic.npz') + if not os.path.exists('model'): + os.makedirs('model') + + tl.files.save_weights_to_hdf5('model/trpo_actor.hdf5', actor) + tl.files.save_weights_to_hdf5('model/trpo_critic.hdf5', critic) def load_ckpt(): """ load trained weights :return: None """ - tl.files.load_and_assign_npz(name='model/actor.npz', network=actor) - tl.files.load_and_assign_npz(name='model/critic.npz', network=critic) + tl.files.load_hdf5_to_weights_in_order('model/trpo_actor.hdf5', actor) + tl.files.load_hdf5_to_weights_in_order('model/trpo_critic.hdf5', critic) def cg(Ax, b): """ @@ -546,7 +550,7 @@ def cg(Ax, b): (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) - r = copy.deepcopy(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. + r = copy.deepcopy(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
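    # cg() approximately solves the linear system A x = b, where A is available
    # only through the matrix-vector product Ax(.) (in update() below this is the
    # damped KL Hessian-vector product) and b is the flattened policy gradient.
    # With x initialised to zeros the residual r starts out equal to b; the first
    # search direction p is a copy of r, and r_dot_old caches the inner product
    # r.r used by the conjugate-gradient updates over the cg_iters iterations below.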
p = copy.deepcopy(r) r_dot_old = np.dot(r, r) for _ in range(cg_iters): @@ -562,12 +566,9 @@ def cg(Ax, b): def update(): # Prepare hessian func, gradient eval inputs = buf.get() - - Hx = lambda x: get_hvp(inputs, x) - - g = gradient(inputs) - pi_l_old = cal_pi_loss(inputs) - v_l_old = cal_v_loss(inputs) + ''''all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + values_as_sorted_list(info_phs)''' + Hx = lambda x: hvp(inputs, x) + g, pi_l_old, v_l_old = gradient(inputs), pi_loss(inputs), v_loss(inputs) # Core calculations for TRPO or NPG x = cg(Hx, g) @@ -576,10 +577,10 @@ def update(): def set_and_eval(step): set_pi_params(old_params - alpha * x * step) - obs_buf, act_buf, adv_buf, ret_buf, logp_buf, *info_bufs = inputs - pi, logp, logp_pi, info, info_phs, d_kl, v = cal_func(obs_buf, act_buf, *info_bufs) - pi_loss = cal_pi_loss(inputs) - return d_kl, pi_loss + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) + loss = pi_loss(inputs) + return [d_kl, loss] if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement @@ -590,80 +591,73 @@ def set_and_eval(step): for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff ** j) if kl <= delta and pi_l_new <= pi_l_old: + # Accepting new params at step of line search break if j == backtrack_iters - 1: + # Line search failed! Keeping old params. kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): train_vf(inputs) + start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 - sum_reward_list = [] - + reward_list = [] # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): t0 = time.time() + rew = 0 for t in range(local_steps_per_epoch): - agent_outs = cal_func(o.reshape(1, -1)) - + agent_outs = get_action_ops(o.reshape(1, -1)) a, v_t, logp_t, info_t = np.array(agent_outs[0][0], np.float32), \ np.array(agent_outs[1], np.float32), \ np.array(agent_outs[2], np.float32), \ np.array(agent_outs[3:], np.float32) + # store buf.store(o, a, r, v_t, logp_t, info_t) o, r, d, _ = env.step(a) - ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) - if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target - last_val = r if d else critic(o.reshape(1, -1)) + last_val = r if d else critic_cal_func(o.reshape(1, -1)) buf.finish_path(last_val) - - if terminal: - # trajectory finished - - sum_reward_list.append(ep_ret) - print("Episode [%d/%d]\ttrajectory counter: %d \tsum reward: %d \ttook: %.5fs " % - (epoch, epochs, len(sum_reward_list), ep_ret, time.time() - t0)) - t0 = time.time() - - plt.ion() - plt.cla() - plt.title('TRPO') - plt.plot(np.arange(len(sum_reward_list)), sum_reward_list) - plt.ylim(-2000, 0) - plt.xlabel('Episode') - plt.ylabel('Moving averaged episode reward') - plt.show() - plt.pause(0.1) - plt.ioff() - + rew = ep_ret o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): - pass + save_ckpt() # Perform TRPO or NPG update! 
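    # update() pulls the epoch of experience out of the GAE buffer, computes the
    # flattened policy gradient g and the damped KL Hessian-vector product Hx,
    # solves Hx = g with the conjugate-gradient routine above, and then either
    # takes a single natural-gradient step (algo == 'npg') or runs a backtracking
    # line search that only accepts steps satisfying the KL constraint delta
    # (algo == 'trpo'); finally the value function is refitted for train_v_iters
    # gradient steps.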
update() - - plt.savefig('plt.jpg') + print('epoch [{}/{}] ep_ret: {} time: {}'.format(epoch, epochs, rew, time.time() - t0)) + + reward_list.append(rew) + plt.clf() + plt.ion() + plt.plot(reward_list) + plt.title('TRPO' + str(delta)) + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + + plt.ioff() + plt.show() while True: o = env.reset() for i in range(200): env.render() - agent_outs = cal_func(o.reshape(1, -1)) + agent_outs = get_action_ops(o.reshape(1, -1)) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:] o, r, d, _ = env.step(a) if d: @@ -680,9 +674,9 @@ def set_and_eval(step): parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--seed', '-s', type=int, default=0) parser.add_argument('--steps', type=int, default=4000) - parser.add_argument('--epochs', type=int, default=50) + parser.add_argument('--epochs', type=int, default=500) args = parser.parse_args() trpo(lambda: gym.make(args.env), actor_critic=mlp_actor_critic, ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), gamma=args.gamma, - seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,) + seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs) From 7b29928a1e051a29c7335b3fdae6fdd5130e19d8 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 4 Jun 2019 13:41:45 +0100 Subject: [PATCH 22/57] change readme --- examples/reinforcement_learning/README.md | 23 +++++++++--------- .../model/trpo_actor.hdf5 | Bin 0 -> 28840 bytes .../model/trpo_critic.hdf5 | Bin 0 -> 28840 bytes 3 files changed, 12 insertions(+), 11 deletions(-) create mode 100644 examples/reinforcement_learning/model/trpo_actor.hdf5 create mode 100644 examples/reinforcement_learning/model/trpo_critic.hdf5 diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index a6ad60579..16e0186d9 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -28,17 +28,18 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
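The tutorial scripts can also be driven from Python directly. As a rough sketch (the module name, environment, and hyper-parameters below are only illustrative, not fixed by the tutorials), the TRPO example above exposes a `trpo()` entry point that takes an environment constructor:

```python
# Illustrative only: assumes the TRPO tutorial above is saved as tutorial_TRPO.py.
import gym
from tutorial_TRPO import trpo, mlp_actor_critic

trpo(
    lambda: gym.make('Pendulum-v0'),   # env_fn: any continuous-control Gym env
    actor_critic=mlp_actor_critic,
    ac_kwargs=dict(hidden_sizes=[64, 64]),
    gamma=0.99,
    steps_per_epoch=4000,
    epochs=50,
)
```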
## Table of Contents: -| Algorithms | Observation Space | Action Space | Tutorial Env | -| ---------------- | ----------------- | ------------ | -------------- | -| Q-learning | Discrete | Discrete | FrozenLake | -| DQN and variants | Discrete | Discrete | Pong, CartPole | -| Actor-Critic | Continuous | Discrete | CartPole | -| A3C | Continuous | Continuous | BipedalWalker | -| SAC | Continuous | Continuous | Pendulum | -| PG | Continuous | Discrete | CartPole | -| DDPG | Continuous | Continuous | Pendulum | -| TD3 | Continuous | Continuous | Pendulum | -| C51 | | | CartPole | +| Algorithms | Observation Space | Action Space | Tutorial Env | +| --------------- | ----------------- | ------------ | -------------- | +| Q-learning | Discrete | Discrete | FrozenLake | +| DQN | Discrete | Discrete | FrozenLake | +| Variants of DQN | Continuous | Discrete | Pong, CartPole | +| Actor-Critic | Continuous | Discrete | CartPole | +| A3C | Continuous | Continuous | BipedalWalker | +| SAC | Continuous | Continuous | Pendulum | +| PG | Continuous | Discrete | CartPole | +| DDPG | Continuous | Continuous | Pendulum | +| TD3 | Continuous | Continuous | Pendulum | +| C51 | Continuous | Discrete | CartPole | ## Examples of RL Algorithms: diff --git a/examples/reinforcement_learning/model/trpo_actor.hdf5 b/examples/reinforcement_learning/model/trpo_actor.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..1df8c0aea0bda0fba534ae4fcef88927ba352db8 GIT binary patch literal 28840 zcmeFZc|6wNw>WCdJQIqHDMQEnL^WZ+8Z=8G2=lss^yzaU8p5O0v&-eYuv-Ys|w4OEXwfFlS=j-L8B;QM3?5{^w zR!mAv^&iRZpU$o)fAOD^?tFKBa92XS3&wWAe2G5^aZ!(d)#-GVt9JEoDEX%#zpFf8 zjJLO#2(!EYZrD{V7Te=bs(*LEKjD8X3ix_^`2Dru30_ZH8Z=N;7#g;4S(qsG2ikQ(`+xA{Uv2&)ep2h=*T3_%qS#^)XaB{qiej=N?*234 z!s!dbmi^VOn1+bymHw(1|05ZH;r{h>!+$l;?n(XwqkE?RmyG>C(p^9QT0XmL{yjNs zkokiV-i7!_&cyy*IqUB4pYXpY1^zB)vR!1b?b5&9RO#0D{}9UV!v6#M)hOHs_XOJ6C8X^QmKs<1h;W-kx=7lox}=+;GhcQFSAF^A4> z`giHCf4l7edwNJ!;ZF}~T^;>L_I|a>AH=S|^WP5t&yxRNQ{eARsC6uQs9MK2H%7`>$1y;-cC9 zHFmch=>FdxDR#}WOMm=FdZwz2-2c=wU3B|z2L7&R`gg70u#11X$?-38+g0#)vHc^3 zVq*WN9Cx?;AHzTXj`+X$J7ryb+uhIK`S^d_%Rfv1&{5)F^nh%y|2a*%cbj+b#lksc@*eS56OVI=UrrWvWxt$e+*mLYqB?d_Y1eF5lFfc>8Wb=cYN- zcdusU1mkCzbypE?9N)#hmA)qxmYT36Xb_j0^bJR>v2c?dDMpt{J7M|vSRU72fRPCg zh{^diDD&|gMm2eX!@O5~p^F@p$94#1mu=v$Pec*J5IbsVy zR3`1eD2Z2FThQhxbLv(R%apF3;7d29V&%C5v@!Q6J^tbcliVy*x$=7;KV8-YZYJq; z&d$MZ=7H7tJ@E-0@i3FvWo;wb>nDJJS+js!U8>PT4yxQr9pm)5iM=4f6(SdxcHeV59+wT$%Om)%0aP_UEc-X2p~HMyB4nl5^~$N@%+K8OKe)n0JxhP#ZOn}^XxERYGQeZsY&#tzl$Eh3imot z_F03$Xo@|K-{9}&*6>+g4sOmjGP`q$E$p}Uw#CaZn=zt`$Zt@e+QmFlB1djHGCuaR2I}AVjhmKvZXEO%BWq|bgb`?A+@9PFmhWG+zD{y z0S?n~!`7*IRq`8LJH4oK@4-X3xG9~!-(f~FQlCQHzUz!uohJ=iDX5@S3@c}U;Dr^= zv~Ytud()X9+?1?_#Xi^ApmpIS*Xk!x{j#30z}{S|EgV$5jOfHc0Rrng<$~?^tVpp( zUwkw(4_fc0(1alqF!<7D=ApZde+*AU!^xXjR9Og?EB17Adll`vc515d^K1z|(Jh2b z87z)oM*sq1M&q4>`IV2?Tk;u`=g{&E7JRl!GHnS0`tuMs)TVJx0gOkfAe7iGfSok_T5Lv-OJ1IQ!ldd%V z;f49?eFQ09E;wa*v;g$YS=p2g@ci2;+&9CBr1p&Dqb@p$oS7+&!OfBmZ@J?{l+JzYY+jFDwWhUDVFS;~;!zb8-fwZ*L$Rp51B84RerEGXJ| zjI<1N$Gp@77;W7Sua6j`Oa2YPz1CdTXXhk3%KK-fBsl#qJw9~0~wlS5t^prOoT}gx2ZbR4I)-eC7irc1MW2n^bP4qCl z1n#a+r&aWzy(D|#heD}J{j1aYwx6c7F1&#YT!*9L!Y}BcX27{jH7%KD@8-XL5+DEd zq42SB1)K1zceybX(~{`5JnvB>-7RKEkJT}r8zA9UzQEFLT;xI?>Z$2=&q2)1r+*p! z?_=%$xzfK=;NR{q|Lm-~_p|O@4Z5ME$m9CUSN%`yC$+k|>UMSik^Q&($<?XLJw z!tQv;-{ai>@5Hg}{vOA&v-$gmxa-{P@6-RcdgjlLyE~5M-qqmWga3KG(_s3yuK$t! zw|ZxG7utUf`1j&i`38T+uQU1s|06vUYw`#0pL*uM=70O&ZIr+3nLkn`Cf3o#Ki%{F z|2mEp-o>}wdhG9f+&zzf!hcPHzsIo}LjQIB-*eq^zPQidYyNxZ>4Q#ad^L=0Qyzy! 
[... base85 binary payload omitted ...]

literal 0
HcmV?d00001

diff --git a/examples/reinforcement_learning/model/trpo_critic.hdf5 b/examples/reinforcement_learning/model/trpo_critic.hdf5
new file mode 100644
index 0000000000000000000000000000000000000000..952653da8c5f597f7a6a2d7f6b563ce858d12705
GIT binary patch
literal 28840
zXGe{)NMO)>HtWM73qFhAbwpZP z%i@2sySXF(ZRUiVSbatq(|caW$KQO!5ra+|OgUEC6h(}37vqflMr?Hp zgqIOkq;v4Y(+7+@AaC6{RK08lG;Rd*9(M;bbq*2Bu)%DmMmof(bshkzLJ4Tk&?8A)9A|#& zC4QD&2jR7d-Q#pvbB8*yF}(nxoPgLU<|64y!R1RPK)S9hmOzq*HF+usf($rXF@byF z9>Z0PFhsM?iLCt9ZP@wDhfz^F2!}nxA?6o3N6A5azTFM?T%16Ps0kb1v5IL%0lVJ( z19TtB0r$u--0zlz+vXXOgKaUGWZj9`UwdNq+6iE_L7RK}{ZVR=C}syN_P~#J_H5&h z^XMQ|B39c{n6Y~?w`Q*%Gjxf^XyHWSQPqXz@f{FQ;KmwehA~GB0gVw}#5Fbx=gx8` z$yEn&O?e}RNyTKHmjmY@uqNzciDlw3Tc-Q39BBOHF5cQ)jP{;aAc=2-lG#ecT~tIP zpWQ&c@?fT3dKJq=8*r81nA46ez7Tgvjj8O_#fZW<(xG*oHr#Fp-`5$;?2$VuEP9{4 z$(MO+94pO!(t;dy@r(K1%P-K+gTzKon>W!rB@ z@)qgiSp9mximg214@x=5YT!K#e-6jWWFW`N|<$(@6!zy5Olk8`Y)M-9m87xhiQ zGoWMoUwT{OKXI&9`}KV>A)rtG9>+?&s_(hK{mk3yslRr<1O1Fae{qZY?MGhvx6QG# z?YCR`cpYfR@^btwf7=2BIaUXPew#i}laF&#-+?Lbw*GtTSh-!w<^M3B^evG4Zv~OJ M`kV8~|92hxZ*PpRVgLXD literal 0 HcmV?d00001 From 2a89c43206cdc0c69d074781438ddca8edd50e87 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:35:02 +0800 Subject: [PATCH 23/57] Update README.md --- examples/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/README.md b/examples/README.md index 82fc62055..04c0aefa5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1 +1,11 @@ +
+ + +
+ +
+
+ +
+ # [Click Me](https://github.com/tensorlayer/awesome-tensorlayer) From fde26fb4ae9cfa2200026a8b6e9420e0e38870bd Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:35:53 +0800 Subject: [PATCH 24/57] Update README.md --- examples/reinforcement_learning/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 16e0186d9..2596ee8fb 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -1,5 +1,15 @@ # Reinforcement Learning Tutorial with Tensorlayer +
+ + +
+ +
+
+ +
+ This repository contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting [Tensorflow 2.0](https://www.tensorflow.org/alpha/guide/effective_tf2). We aim to make the reinforcement learning tutorial for each algorithm simple and straight-forward to use, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly. ## Prerequisites: From 4622397a8e9fc44280276d774dc35c51855d990e Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:43:05 +0800 Subject: [PATCH 25/57] Update README.md --- examples/reinforcement_learning/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 2596ee8fb..d4e9f0781 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -182,6 +182,9 @@ We typically apply game environments in [Openai Gym](https://gym.openai.com/) fo Our env wrapper: `./tutorial_wrappers.py` - +## Authors +- @xxxx XXXXX +- @xxxx XXXXX +- @xxxx XXXXX ### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From 9b1109c06ac7742637f41699ea6fb1c3d7cccd80 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:43:42 +0800 Subject: [PATCH 26/57] Update README.md --- examples/reinforcement_learning/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index d4e9f0781..1839bc85b 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -183,8 +183,8 @@ We typically apply game environments in [Openai Gym](https://gym.openai.com/) fo Our env wrapper: `./tutorial_wrappers.py` ## Authors -- @xxxx XXXXX -- @xxxx XXXXX +- @xxxx XXXXX : AC, A3C +- @xxxx XXXXX : TPRO - @xxxx XXXXX -### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) +### More examples can be found in [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From 14732899d44508cd872df28b139c8a34686b27df Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:49:44 +0800 Subject: [PATCH 27/57] Update README.md --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 1839bc85b..2de4bd427 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -133,7 +133,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
Code: `./tutorial_DDPG.py` - Paper: [CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING](https://arxiv.org/pdf/1509.02971.pdf) + Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) From bc260930735aa2ef59fa58f696db54b2c3101c90 Mon Sep 17 00:00:00 2001 From: Hao Date: Tue, 4 Jun 2019 21:50:19 +0800 Subject: [PATCH 28/57] Update README.md --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 2de4bd427..32480f4d1 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -187,4 +187,4 @@ Our env wrapper: `./tutorial_wrappers.py` - @xxxx XXXXX : TPRO - @xxxx XXXXX -### More examples can be found in [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) +### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From c392f977b798c403e6765b2495ad809db457b8df Mon Sep 17 00:00:00 2001 From: initial-h <18811472492@163.com> Date: Tue, 4 Jun 2019 21:59:34 +0800 Subject: [PATCH 29/57] Update README.md --- examples/reinforcement_learning/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 32480f4d1..c75c18676 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -41,15 +41,19 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. | Algorithms | Observation Space | Action Space | Tutorial Env | | --------------- | ----------------- | ------------ | -------------- | | Q-learning | Discrete | Discrete | FrozenLake | +| C51 | Continuous | Discrete | CartPole | | DQN | Discrete | Discrete | FrozenLake | | Variants of DQN | Continuous | Discrete | Pong, CartPole | | Actor-Critic | Continuous | Discrete | CartPole | | A3C | Continuous | Continuous | BipedalWalker | -| SAC | Continuous | Continuous | Pendulum | -| PG | Continuous | Discrete | CartPole | | DDPG | Continuous | Continuous | Pendulum | | TD3 | Continuous | Continuous | Pendulum | -| C51 | Continuous | Discrete | CartPole | +| SAC | Continuous | Continuous | Pendulum | +| PG | Continuous | Discrete | CartPole | +| TRPO | Continuous | Continuous | Pendulum | +| PPO | Continuous | Continuous | Pendulum | +| DPPO | Continuous | Continuous | Pendulum | + ## Examples of RL Algorithms: @@ -162,7 +166,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
-* Distributed Proximal Policy Optimization (PPO) +* Distributed Proximal Policy Optimization (DPPO) Code: `./tutorial_DPPO.py` From e6c453264a85ad66356832ab3e0a12de2bf87f64 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 4 Jun 2019 15:09:09 +0100 Subject: [PATCH 30/57] remove model --- .../model/trpo_actor.hdf5 | Bin 28840 -> 0 bytes .../model/trpo_critic.hdf5 | Bin 28840 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/reinforcement_learning/model/trpo_actor.hdf5 delete mode 100644 examples/reinforcement_learning/model/trpo_critic.hdf5 diff --git a/examples/reinforcement_learning/model/trpo_actor.hdf5 b/examples/reinforcement_learning/model/trpo_actor.hdf5 deleted file mode 100644 index 1df8c0aea0bda0fba534ae4fcef88927ba352db8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28840 zcmeFZc|6wNw>WCdJQIqHDMQEnL^WZ+8Z=8G2=lss^yzaU8p5O0v&-eYuv-Ys|w4OEXwfFlS=j-L8B;QM3?5{^w zR!mAv^&iRZpU$o)fAOD^?tFKBa92XS3&wWAe2G5^aZ!(d)#-GVt9JEoDEX%#zpFf8 zjJLO#2(!EYZrD{V7Te=bs(*LEKjD8X3ix_^`2Dru30_ZH8Z=N;7#g;4S(qsG2ikQ(`+xA{Uv2&)ep2h=*T3_%qS#^)XaB{qiej=N?*234 z!s!dbmi^VOn1+bymHw(1|05ZH;r{h>!+$l;?n(XwqkE?RmyG>C(p^9QT0XmL{yjNs zkokiV-i7!_&cyy*IqUB4pYXpY1^zB)vR!1b?b5&9RO#0D{}9UV!v6#M)hOHs_XOJ6C8X^QmKs<1h;W-kx=7lox}=+;GhcQFSAF^A4> z`giHCf4l7edwNJ!;ZF}~T^;>L_I|a>AH=S|^WP5t&yxRNQ{eARsC6uQs9MK2H%7`>$1y;-cC9 zHFmch=>FdxDR#}WOMm=FdZwz2-2c=wU3B|z2L7&R`gg70u#11X$?-38+g0#)vHc^3 zVq*WN9Cx?;AHzTXj`+X$J7ryb+uhIK`S^d_%Rfv1&{5)F^nh%y|2a*%cbj+b#lksc@*eS56OVI=UrrWvWxt$e+*mLYqB?d_Y1eF5lFfc>8Wb=cYN- zcdusU1mkCzbypE?9N)#hmA)qxmYT36Xb_j0^bJR>v2c?dDMpt{J7M|vSRU72fRPCg zh{^diDD&|gMm2eX!@O5~p^F@p$94#1mu=v$Pec*J5IbsVy zR3`1eD2Z2FThQhxbLv(R%apF3;7d29V&%C5v@!Q6J^tbcliVy*x$=7;KV8-YZYJq; z&d$MZ=7H7tJ@E-0@i3FvWo;wb>nDJJS+js!U8>PT4yxQr9pm)5iM=4f6(SdxcHeV59+wT$%Om)%0aP_UEc-X2p~HMyB4nl5^~$N@%+K8OKe)n0JxhP#ZOn}^XxERYGQeZsY&#tzl$Eh3imot z_F03$Xo@|K-{9}&*6>+g4sOmjGP`q$E$p}Uw#CaZn=zt`$Zt@e+QmFlB1djHGCuaR2I}AVjhmKvZXEO%BWq|bgb`?A+@9PFmhWG+zD{y z0S?n~!`7*IRq`8LJH4oK@4-X3xG9~!-(f~FQlCQHzUz!uohJ=iDX5@S3@c}U;Dr^= zv~Ytud()X9+?1?_#Xi^ApmpIS*Xk!x{j#30z}{S|EgV$5jOfHc0Rrng<$~?^tVpp( zUwkw(4_fc0(1alqF!<7D=ApZde+*AU!^xXjR9Og?EB17Adll`vc515d^K1z|(Jh2b z87z)oM*sq1M&q4>`IV2?Tk;u`=g{&E7JRl!GHnS0`tuMs)TVJx0gOkfAe7iGfSok_T5Lv-OJ1IQ!ldd%V z;f49?eFQ09E;wa*v;g$YS=p2g@ci2;+&9CBr1p&Dqb@p$oS7+&!OfBmZ@J?{l+JzYY+jFDwWhUDVFS;~;!zb8-fwZ*L$Rp51B84RerEGXJ| zjI<1N$Gp@77;W7Sua6j`Oa2YPz1CdTXXhk3%KK-fBsl#qJw9~0~wlS5t^prOoT}gx2ZbR4I)-eC7irc1MW2n^bP4qCl z1n#a+r&aWzy(D|#heD}J{j1aYwx6c7F1&#YT!*9L!Y}BcX27{jH7%KD@8-XL5+DEd zq42SB1)K1zceybX(~{`5JnvB>-7RKEkJT}r8zA9UzQEFLT;xI?>Z$2=&q2)1r+*p! z?_=%$xzfK=;NR{q|Lm-~_p|O@4Z5ME$m9CUSN%`yC$+k|>UMSik^Q&($<?XLJw z!tQv;-{ai>@5Hg}{vOA&v-$gmxa-{P@6-RcdgjlLyE~5M-qqmWga3KG(_s3yuK$t! zw|ZxG7utUf`1j&i`38T+uQU1s|06vUYw`#0pL*uM=70O&ZIr+3nLkn`Cf3o#Ki%{F z|2mEp-o>}wdhG9f+&zzf!hcPHzsIo}LjQIB-*eq^zPQidYyNxZ>4Q#ad^L=0Qyzy! zJzfZS_9Zxgse&1G4^by4nSIVY#8eD-fRg0}P{~xl14(MUv#}9IKGh@F#G_%=09DBK zHRImm(rykT`h#HH9hSb$nKqaC!>X(ESpKNfcws>`=t`Tx6XQ!f{?S92Uthxv2a3_3 z?<0YibdX=;>fph*Vw8?g#N^#ehCONcO8I)0{x!U4?RY|gPW(7`N+$9=+U7Ino<_H`_wdSx%eCI6EQe{^9S6x zxf14g4(8VFk4e~0J+8X^5t_$m!QTEp`07;$aFxgGXa8_&^yC2i%8}zaGfuE`s&}BY zS_?bIMv>#DFWBv=+u?rURCL&5&X->cW+-$gsSryhx{krz!1Z|hgFM&X>Q1xua>*UX zJkaE>{OETHP7W5~uY&nt+bGVPaG%h`eLXsM41R0bj$>cE%(Eklr8u)&k0PQX5;OT87@o49%pMt+=V4G#z1P08h1F5fD83+ z!cWBnP>adIfc9-nE?$dIu1UrX)0VM?*`;{%%xC8H@*MP4Gyu($nSz|TIXKSUip~j? 
[remaining GIT binary patch data for the deleted examples/reinforcement_learning/model/trpo_actor.hdf5 omitted]

diff --git a/examples/reinforcement_learning/model/trpo_critic.hdf5 b/examples/reinforcement_learning/model/trpo_critic.hdf5
deleted file mode 100644
index 952653da8c5f597f7a6a2d7f6b563ce858d12705..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 28840
[GIT binary patch data for the deleted examples/reinforcement_learning/model/trpo_critic.hdf5 omitted]
zXGe{)NMO)>HtWM73qFhAbwpZP z%i@2sySXF(ZRUiVSbatq(|caW$KQO!5ra+|OgUEC6h(}37vqflMr?Hp zgqIOkq;v4Y(+7+@AaC6{RK08lG;Rd*9(M;bbq*2Bu)%DmMmof(bshkzLJ4Tk&?8A)9A|#& zC4QD&2jR7d-Q#pvbB8*yF}(nxoPgLU<|64y!R1RPK)S9hmOzq*HF+usf($rXF@byF z9>Z0PFhsM?iLCt9ZP@wDhfz^F2!}nxA?6o3N6A5azTFM?T%16Ps0kb1v5IL%0lVJ( z19TtB0r$u--0zlz+vXXOgKaUGWZj9`UwdNq+6iE_L7RK}{ZVR=C}syN_P~#J_H5&h z^XMQ|B39c{n6Y~?w`Q*%Gjxf^XyHWSQPqXz@f{FQ;KmwehA~GB0gVw}#5Fbx=gx8` z$yEn&O?e}RNyTKHmjmY@uqNzciDlw3Tc-Q39BBOHF5cQ)jP{;aAc=2-lG#ecT~tIP zpWQ&c@?fT3dKJq=8*r81nA46ez7Tgvjj8O_#fZW<(xG*oHr#Fp-`5$;?2$VuEP9{4 z$(MO+94pO!(t;dy@r(K1%P-K+gTzKon>W!rB@ z@)qgiSp9mximg214@x=5YT!K#e-6jWWFW`N|<$(@6!zy5Olk8`Y)M-9m87xhiQ zGoWMoUwT{OKXI&9`}KV>A)rtG9>+?&s_(hK{mk3yslRr<1O1Fae{qZY?MGhvx6QG# z?YCR`cpYfR@^btwf7=2BIaUXPew#i}laF&#-+?Lbw*GtTSh-!w<^M3B^evG4Zv~OJ M`kV8~|92hxZ*PpRVgLXD From bb9f919ea053d05ec3c3a9a03dc2eedbf41d335b Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 4 Jun 2019 17:25:05 +0100 Subject: [PATCH 31/57] add baselines --- examples/reinforcement_learning/README.md | 38 +- .../reinforcement_learning/baselines/SAC.py | 394 ++++++++++++ .../reinforcement_learning/baselines/utils.py | 97 +++ .../baselines/wrappers.py | 561 ++++++++++++++++++ ...tutorial_cartpole_ac.py => tutorial_AC.py} | 0 .../{tutorial_c51.py => tutorial_C51.py} | 0 ...rial_frozenlake_dqn.py => tutorial_DQN.py} | 0 ..._noisy_dqn.py => tutorial_DQN_variants.py} | 0 ...nlake_q_table.py => tutorial_Qlearning.py} | 0 ...utorial_retrace.py => tutorial_Retrace.py} | 0 .../{tutorial_sac.py => tutorial_SAC.py} | 97 +-- .../{tutorial_td3.py => tutorial_TD3.py} | 98 +-- .../tutorial_wrappers.py | 20 + 13 files changed, 1204 insertions(+), 101 deletions(-) create mode 100644 examples/reinforcement_learning/baselines/SAC.py create mode 100644 examples/reinforcement_learning/baselines/utils.py create mode 100644 examples/reinforcement_learning/baselines/wrappers.py rename examples/reinforcement_learning/{tutorial_cartpole_ac.py => tutorial_AC.py} (100%) rename examples/reinforcement_learning/{tutorial_c51.py => tutorial_C51.py} (100%) rename examples/reinforcement_learning/{tutorial_frozenlake_dqn.py => tutorial_DQN.py} (100%) rename examples/reinforcement_learning/{tutorial_double_dueling_noisy_dqn.py => tutorial_DQN_variants.py} (100%) rename examples/reinforcement_learning/{tutorial_frozenlake_q_table.py => tutorial_Qlearning.py} (100%) rename examples/reinforcement_learning/{tutorial_retrace.py => tutorial_Retrace.py} (100%) rename examples/reinforcement_learning/{tutorial_sac.py => tutorial_SAC.py} (93%) rename examples/reinforcement_learning/{tutorial_td3.py => tutorial_TD3.py} (92%) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c75c18676..4f77bc498 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -59,66 +59,74 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
* Q-learning - Code: `./tutorial_frozenlake_q_table.py` + Code: `./tutorial_Qlearning.py` + + Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf) * Deep Q-Network (DQN) - Code: `./tutorial_frozenlake_dqn.py` + Code: `./tutorial_DQN.py` + + Paper: [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) + + [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) * Double DQN / Dueling DQN / Noisy DQN - Code: `./tutorial_double_dueling_noisy_dqn.py` + Code: `./tutorial_DQN_variants.py` - Experiment Environments: Pong and Cartpole + Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) -* Prioritized replay +* Prioritized Experience Replay Code: `./tutorial_prioritized_replay.py` - Experiment Environments: Pong and Cartpole + Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) * Distributed DQN - Code: `./tutorial_c51.py` + Code: `./tutorial_C51.py` - Experiment Environments: Pong and Cartpole + Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) * Retrace(lambda) DQN - Code: `./tutorial_retrace.py` + Code: `./tutorial_Retrace.py` - Experiment Environments: Pong and Cartpole + Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) * Actor-Critic (AC) - Code:`./tutorial_cartpole_ac.py` + Code:`./tutorial_AC.py` + + Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) * Asynchronous Advantage Actor-Critic (A3C) - Code: `./tutorial_bipedalwalker_a3c_continuous_action.py` + Code: `./tutorial_A3C.py` * Soft Actor-Critic (SAC) - Code: `./tutorial_sac.py` + Code: `./tutorial_SAC.py` Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) @@ -144,7 +152,7 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. * Twin Delayed DDPG (TD3) - Code: `./tutorial_td3.py` + Code: `./tutorial_TD3.py` Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) @@ -189,6 +197,6 @@ Our env wrapper: `./tutorial_wrappers.py` ## Authors - @xxxx XXXXX : AC, A3C - @xxxx XXXXX : TPRO -- @xxxx XXXXX +- @quantumiracle Zihan Ding: SAC, TD3. ### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) diff --git a/examples/reinforcement_learning/baselines/SAC.py b/examples/reinforcement_learning/baselines/SAC.py new file mode 100644 index 000000000..df017edbf --- /dev/null +++ b/examples/reinforcement_learning/baselines/SAC.py @@ -0,0 +1,394 @@ +''' +Soft Actor-Critic +using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net +adding alpha loss + +paper: https://arxiv.org/pdf/1812.05905.pdf +Actor policy is stochastic. 
+ +Env: Openai Gym Pendulum-v0, continuous action space + +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 + +&& +pip install box2d box2d-kengz --user + +To run: +python tutorial_sac.py --train/test +''' + +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model +from utils import * +from wrappers import NormalizedActions + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + + + +class SoftQNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(SoftQNetwork, self).__init__() + input_dim = num_inputs + num_actions + w_init = tf.keras.initializers.glorot_normal(seed=None) # glorot initialization is better than uniform in practice + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + + +class PolicyNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): + super(PolicyNetwork, self).__init__() + + self.log_std_min = log_std_min + self.log_std_max = log_std_max + + w_init = tf.keras.initializers.glorot_normal(seed=None) + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.mean_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_mean') + self.log_std_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_logstd') + + self.action_range = action_range + self.num_actions = num_actions + + + def forward(self, state): + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + mean = self.mean_linear(x) + log_std = self.log_std_linear(x) + log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) + + return mean, log_std + + def evaluate(self, state, epsilon=1e-6): + ''' generate action with state for calculating gradients ''' + state = state.astype(np.float32) + mean, log_std = self.forward(state) + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + + normal = Normal(0, 1) + z = normal.sample() + 
action_0 = tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range*action_0 + # according to original paper, with an extra last term for normalizing different action range + log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. - action_0**2 + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + + return action, log_prob, z, mean, log_std + + + def get_action(self, state, deterministic): + ''' generate action with state for interaction with envronment ''' + mean, log_std = self.forward([state]) + std = tf.math.exp(log_std) + + normal = Normal(0, 1) + z = normal.sample() + action = self.action_range * tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick + + action = self.action_range*mean if deterministic else action + return action.numpy()[0] + + + def sample_action(self,): + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + + return self.action_range*a.numpy() + + +class SAC_Trainer(): + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, policy_lr = 3e-4, alpha_lr = 3e-4): + self.replay_buffer = replay_buffer + + # initialize all networks + self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') + self.alpha = tf.math.exp(self.log_alpha) + print('Soft Q Network (1,2): ', self.soft_q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) + self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) + + self.soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + self.soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) + # self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2): + ''' update all networks in SAC ''' + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, 
np.newaxis] # expand dim + done = done[:, np.newaxis] + + reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std + + + # Training Q Function + new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum(self.target_soft_q_net1(target_q_input),self.target_soft_q_net2(target_q_input)) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_input = tf.concat([state, action], 1) # the dim 0 is number of samples + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.soft_q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) + self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.soft_q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) + self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) + + # Training Policy Function + with tf.GradientTape() as p_tape: + new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) + new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples + ''' implementation 1 ''' + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input),self.soft_q_net2(new_q_input)) + ''' implementation 2 ''' + # predicted_new_q_value = self.soft_q_net1(new_q_input) + policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + with tf.GradientTape() as alpha_tape: + alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) + alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) + self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) + self.alpha = tf.math.exp(self.log_alpha) + else: # fixed alpha + self.alpha = 1. 
+ alpha_loss = 0 + + # Soft update the target value nets + self.target_soft_q_net1=self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2=self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + + def save_weights(self): # save trained weights + save_model(self.soft_q_net1, 'model_q_net1', 'SAC') + save_model(self.soft_q_net2, 'model_q_net2', 'SAC') + save_model(self.target_soft_q_net1, 'model_target_q_net1', 'SAC') + save_model(self.target_soft_q_net2, 'model_target_q_net2', 'SAC') + save_model(self.policy_net, 'model_policy_net', 'SAC') + + # tl.files.save_npz(self.soft_q_net1.trainable_weights, name='model_q_net1.npz') + # tl.files.save_npz(self.soft_q_net2.trainable_weights, name='model_q_net2.npz') + # tl.files.save_npz(self.target_soft_q_net1.trainable_weights, name='model_target_q_net1.npz') + # tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') + # tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + + def load_weights(self): # load trained weights + # tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) + # tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) + # tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) + # tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_soft_q_net2) + # tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + load_model(self.soft_q_net1, 'model_q_net1', 'SAC') + load_model(self.soft_q_net2, 'model_q_net2', 'SAC') + load_model(self.target_soft_q_net1, 'model_target_q_net1', 'SAC') + load_model(self.target_soft_q_net2, 'model_target_q_net2', 'SAC') + load_model(self.policy_net, 'model_policy_net', 'SAC') + +# def plot(frame_idx, rewards): +# clear_output(True) +# plt.figure(figsize=(20,5)) +# plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) +# plt.plot(rewards) +# plt.xlabel('Episode') +# plt.ylabel('Episode Reward') +# plt.savefig('sac.png') + # plt.show() + + +# choose env +ENV = 'Pendulum-v0' +env = NormalizedActions(gym.make(ENV)) +action_dim = env.action_space.shape[0] +state_dim = env.observation_space.shape[0] +action_range=1. + +replay_buffer_size = 5e5 +replay_buffer = ReplayBuffer(replay_buffer_size) + + +# hyper-parameters for RL training +max_frames = 30000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +# explore_noise_scale = 1.0 # range of action noise for exploration +# eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. 
# value range of reward + +AUTO_ENTROPY=True # automatically udpating variable alpha for entropy +DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic + + +sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ +soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) + +#set train mode +sac_trainer.soft_q_net1.train() +sac_trainer.soft_q_net2.train() +sac_trainer.target_soft_q_net1.train() +sac_trainer.target_soft_q_net2.train() +sac_trainer.policy_net.train() + +# training loop +if args.train: + frame_idx = 0 + rewards = [] + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) + + if frame_idx % 500 == 0: + plot(rewards, Algorithm_name = 'SAC', Env_name = ENV) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) + sac_trainer.save_weights() + +if args.test: + frame_idx = 0 + rewards = [] + sac_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + + + for step in range(max_steps): + action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/baselines/utils.py b/examples/reinforcement_learning/baselines/utils.py new file mode 100644 index 000000000..f8b537a0c --- /dev/null +++ b/examples/reinforcement_learning/baselines/utils.py @@ -0,0 +1,97 @@ +""" +Functions for utilization. 
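Before moving on to the utility functions, a reading aid for `SAC.py` above: its `PolicyNetwork.evaluate()` and `SAC_Trainer.update()` methods implement the standard soft actor-critic objectives with a tanh-squashed Gaussian policy. In the notation of that file (two soft Q-networks Q_theta1, Q_theta2 with Polyak-averaged target copies, policy pi_phi, learned temperature alpha = exp(log alpha), target entropy H_0 = -action_dim, squashing scale a_max = `action_range`), the code corresponds roughly to the summary below; this is a sketch of what the script computes, not an exact transcription of it.

```latex
% Squashed-Gaussian action and log-probability (PolicyNetwork.evaluate):
%   a = a_max * tanh(mu + sigma * z),   z ~ N(0, I)
\log \pi_\phi(a \mid s) \;=\; \sum_j \Big[ \log \mathcal{N}\big(\mu_j + \sigma_j z_j;\ \mu_j, \sigma_j\big)
    \;-\; \log\!\big(1 - \tanh^2(\mu_j + \sigma_j z_j) + \epsilon\big) \;-\; \log a_{\max} \Big]

% Q-target and the three losses minimized in SAC_Trainer.update():
y \;=\; r + \gamma (1 - d)\Big[\min_{i=1,2} \bar{Q}_{\theta_i}(s', a') - \alpha \log \pi_\phi(a' \mid s')\Big],
\qquad a' \sim \pi_\phi(\cdot \mid s')

J_Q(\theta_i) = \mathbb{E}\big[(Q_{\theta_i}(s,a) - y)^2\big], \quad
J_\pi(\phi)   = \mathbb{E}\big[\alpha \log \pi_\phi(a \mid s) - \min_{i=1,2} Q_{\theta_i}(s,a)\big], \quad
J(\log\alpha) = \mathbb{E}\big[-\log\alpha \,\big(\log \pi_\phi(a \mid s) + \mathcal{H}_0\big)\big]

% Polyak (soft) update of the target Q-networks (target_soft_update):
\bar{\theta}_i \leftarrow (1 - \tau)\,\bar{\theta}_i + \tau\,\theta_i
```

The batch reward normalization `reward_scale * (reward - mean) / std` applied at the top of `update()` sits on top of this and only rescales r inside the target y.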
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import random +import time + +import matplotlib.pyplot as plt +import tensorlayer as tl +import numpy as np +import os + + + +def plot(episode_rewards, Algorithm_name, Env_name): + ''' + plot the learning curve, saved as ./img/Algorithm_name.png + :episode_rewards: array of floats + :Algorithm_name: string + :Env_name: string + ''' + plt.figure(figsize=(10,5)) + plt.title(Algorithm_name + '-' + Env_name ) + plt.plot(np.arange(len(episode_rewards)), episode_rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + if not os.path.exists('img'): + os.makedirs('img') + plt.savefig( './img/' + Algorithm_name + '.png') + + +def save_model(model, Model_name, Algorithm_name): + ''' + save trained neural network model + :model: tensorlayer.models.Model + :Model_name: string, e.g. 'model_sac_q1' + :Algorithm_name: string, e.g. 'SAC' + ''' + if not os.path.exists('model/'+Algorithm_name): + os.makedirs('model/'+Algorithm_name) + tl.files.save_npz(model.trainable_weights, './model/' + Algorithm_name + '/'+Model_name) + +def load_model(model, Model_name, Algorithm_name): + ''' + load saved neural network model + :model: tensorlayer.models.Model + :Model_name: string, e.g. 'model_sac_q1' + :Algorithm_name: string, e.g. 'SAC' + ''' + try: + tl.files.load_and_assign_npz('./model/' + Algorithm_name + '/'+Model_name + '.npz', model) + except: + print('Load Model Fails!') + + +class ReplayBuffer: + ''' + a ring buffer for storing transitions and sampling for training + :state: (state_dim,) + :action: (action_dim,) + :reward: (,), scalar + :next_state: (state_dim,) + :done: (,), scalar (0 and 1) or bool (True and False) + ''' + def __init__(self, capacity): + self.capacity = capacity # mamimum number of samples + self.buffer = [] + self.position = 0 # pointer + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) + + + + + diff --git a/examples/reinforcement_learning/baselines/wrappers.py b/examples/reinforcement_learning/baselines/wrappers.py new file mode 100644 index 000000000..231a9880b --- /dev/null +++ b/examples/reinforcement_learning/baselines/wrappers.py @@ -0,0 +1,561 @@ +"""Env wrappers +Note that this file is adapted from `https://pypi.org/project/gym-vec-env` and +`https://github.com/openai/baselines/blob/master/baselines/common/*wrappers.py` +""" +from collections import deque +from functools import partial +from multiprocessing import cpu_count, Process, Pipe +from sys import platform + +import cv2 +import gym +import numpy as np +from gym import spaces + + +__all__ = ( + 'build_env', # build env + 'TimeLimit', # Time limit wrapper + 'NoopResetEnv', # Run random number of no-ops on reset + 'FireResetEnv', # Reset wrapper for envs with fire action + 'EpisodicLifeEnv', # 
end-of-life == end-of-episode wrapper + 'MaxAndSkipEnv', # skip frame wrapper + 'ClipRewardEnv', # clip reward wrapper + 'WarpFrame', # warp observation wrapper + 'FrameStack', # stack frame wrapper + 'LazyFrames', # lazy store wrapper + 'RewardScaler', # reward scale + 'SubprocVecEnv', # vectorized env wrapper + 'VecFrameStack', # stack frames in vectorized env + 'Monitor', # Episode reward and length monitor +) +cv2.ocl.setUseOpenCL(False) +# env_id -> env_type +id2type = dict() +for _env in gym.envs.registry.all(): + id2type[_env.id] = _env._entry_point.split(':')[0].rsplit('.', 1)[1] + + +def build_env(env_id, vectorized=False, seed=0, reward_scale=1.0, nenv=0): + """Build env based on options""" + env_type = id2type[env_id] + nenv = nenv or cpu_count() // (1 + (platform == 'darwin')) + stack = env_type == 'atari' + if not vectorized: + env = _make_env(env_id, env_type, seed, reward_scale, stack) + else: + env = _make_vec_env(env_id, env_type, nenv, seed, reward_scale, stack) + + return env + + +def _make_env(env_id, env_type, seed, reward_scale, frame_stack=True): + """Make single env""" + if env_type == 'atari': + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + env = Monitor(env) + # deepmind wrap + env = EpisodicLifeEnv(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env) + env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) + elif env_type == 'classic_control': + env = Monitor(gym.make(env_id)) + else: + raise NotImplementedError + if reward_scale != 1: + env = RewardScaler(env, reward_scale) + env.seed(seed) + return env + + +def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): + """Make vectorized env""" + env = SubprocVecEnv([ + partial(_make_env, env_id, env_type, seed + i, reward_scale, False) + for i in range(nenv) + ]) + if frame_stack: + env = VecFrameStack(env, 4) + return env + + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. 
+ """ + super(NoopResetEnv, self).__init__(env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + super(EpisodicLifeEnv, self).__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + # for Qbert sometimes we stay in lives == 0 condition for a few + # frames so it's important to keep lives > 0, so that we only reset + # once the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + shape = (2, ) + env.observation_space.shape + self._obs_buffer = np.zeros(shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = info = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + super(ClipRewardEnv, self).__init__(env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, width=84, height=84, grayscale=True): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + super(WarpFrame, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + shape = (self.height, self.width, 1 if self.grayscale else 3) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=np.uint8 + ) + + def observation(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + size = (self.width, self.height) + frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) + if self.grayscale: + frame = np.expand_dims(frame, -1) + return frame + + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. + See Also `LazyFrames` + """ + super(FrameStack, self).__init__(env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=env.observation_space.dtype + ) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class LazyFrames(object): + def __init__(self, frames): + """This object ensures that common frames between the observations are + only stored once. It exists purely to optimize memory usage which can be + huge for DQN's 1M frames replay buffers. + + This object should only be converted to numpy array before being passed + to the model. You'd not believe how complex the previous solution was. 
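+        A minimal usage sketch, assuming four 84x84 grayscale frames such as
+        those produced by `WarpFrame` followed by `FrameStack(env, 4)`:
+
+            frames = [np.zeros((84, 84, 1), np.uint8) for _ in range(4)]
+            lazy = LazyFrames(frames)
+            arr = np.asarray(lazy)  # concatenated on the last axis -> shape (84, 84, 4)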
+ """ + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + +class RewardScaler(gym.RewardWrapper): + """Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance drastically. + """ + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + + +class VecFrameStack(object): + def __init__(self, env, k): + self.env = env + self.k = k + self.action_space = env.action_space + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=env.observation_space.dtype + ) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +def _worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send(ob) + elif cmd == 'reset_task': + ob = env._reset_task() + remote.send(ob) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents + """ + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class SubprocVecEnv(object): + def __init__(self, env_fns): + """ + envs: list of gym environments to run in subprocesses + """ + self.num_envs = len(env_fns) + + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.nenvs = nenvs + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + zipped_args = zip(self.work_remotes, self.remotes, env_fns) + self.ps = [ + Process(target=_worker, + args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zipped_args + ] + + for p in self.ps: + # if the main process crashes, we should not cause things to hang + p.daemon = True + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, action_space = self.remotes[0].recv() + self.observation_space = observation_space + self.action_space = action_space + + def _step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + You should not call this if a step_async run is + already pending. 
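+        In normal use call step(actions) instead, which chains _step_async()
+        and _step_wait() for you.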
+ """ + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def _step_wait(self): + """ + Wait for the step taken with step_async(). + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a tuple of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + """ + Reset all the environments and return an array of + observations, or a tuple of observation arrays. + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. + """ + for remote in self.remotes: + remote.send(('reset', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def _reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def __len__(self): + return self.nenvs + + def step(self, actions): + self._step_async(actions) + return self._step_wait() + + +class Monitor(gym.Wrapper): + def __init__(self, env): + super(Monitor, self).__init__(env) + self._monitor_rewards = None + + def reset(self, **kwargs): + self._monitor_rewards = [] + return self.env.reset(**kwargs) + + def step(self, action): + o_, r, done, info = self.env.step(action) + self._monitor_rewards.append(r) + if done: + info['episode'] = { + 'r': sum(self._monitor_rewards), + 'l': len(self._monitor_rewards)} + return o_, r, done, info + + +class NormalizedActions(gym.ActionWrapper): + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +def unit_test(): + env_id = 'CartPole-v0' + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, False) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + done = False + while not done: + a = unwrapped_env.action_space.sample() + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + env_id = 'PongNoFrameskip-v4' + nenv = 2 + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, True, nenv=nenv) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + for _ in range(1000): + a = [unwrapped_env.action_space.sample() for _ in range(nenv)] + a = np.asarray(a, 'int64') + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + +if __name__ == '__main__': + unit_test() diff --git a/examples/reinforcement_learning/tutorial_cartpole_ac.py b/examples/reinforcement_learning/tutorial_AC.py similarity index 100% rename from examples/reinforcement_learning/tutorial_cartpole_ac.py rename to 
examples/reinforcement_learning/tutorial_AC.py diff --git a/examples/reinforcement_learning/tutorial_c51.py b/examples/reinforcement_learning/tutorial_C51.py similarity index 100% rename from examples/reinforcement_learning/tutorial_c51.py rename to examples/reinforcement_learning/tutorial_C51.py diff --git a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py b/examples/reinforcement_learning/tutorial_DQN.py similarity index 100% rename from examples/reinforcement_learning/tutorial_frozenlake_dqn.py rename to examples/reinforcement_learning/tutorial_DQN.py diff --git a/examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py b/examples/reinforcement_learning/tutorial_DQN_variants.py similarity index 100% rename from examples/reinforcement_learning/tutorial_double_dueling_noisy_dqn.py rename to examples/reinforcement_learning/tutorial_DQN_variants.py diff --git a/examples/reinforcement_learning/tutorial_frozenlake_q_table.py b/examples/reinforcement_learning/tutorial_Qlearning.py similarity index 100% rename from examples/reinforcement_learning/tutorial_frozenlake_q_table.py rename to examples/reinforcement_learning/tutorial_Qlearning.py diff --git a/examples/reinforcement_learning/tutorial_retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py similarity index 100% rename from examples/reinforcement_learning/tutorial_retrace.py rename to examples/reinforcement_learning/tutorial_Retrace.py diff --git a/examples/reinforcement_learning/tutorial_sac.py b/examples/reinforcement_learning/tutorial_SAC.py similarity index 93% rename from examples/reinforcement_learning/tutorial_sac.py rename to examples/reinforcement_learning/tutorial_SAC.py index 508d36520..e1836d8e9 100644 --- a/examples/reinforcement_learning/tutorial_sac.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -1,21 +1,29 @@ ''' Soft Actor-Critic -using target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net -adding alpha loss +------------------ +It uses target Q instead of V net: 2 Q net, 2 target Q net, 1 policy net +It uses alpha loss. +Actor policy is stochastic, with off-policy training. +Reference +--------- paper: https://arxiv.org/pdf/1812.05905.pdf -Actor policy is stochastic. -Env: Openai Gym Pendulum-v0, continuous action space +Env +--- +Openai Gym Pendulum-v0, continuous action space -tensorflow 2.0.0a0 +Prerequisites +-------------- +tensorflow >=2.0.0a0 tensorflow-probability 0.6.0 -tensorlayer 2.0.0 +tensorlayer >=2.0.0 && pip install box2d box2d-kengz --user -To run: +To run +------ python tutorial_sac.py --train/test ''' @@ -43,19 +51,35 @@ np.random.seed(2) tf.random.set_seed(2) # reproducible -# GPU = True -# device_idx = 0 -# if GPU: -# device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") -# else: -# device = torch.device("cpu") -# print(device) - parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=False) parser.add_argument('--test', dest='test', action='store_true', default=True) args = parser.parse_args() +##################### hyper parameters #################### +# choose env +ENV = 'Pendulum-v0' +action_range=1. 
# scale action, [-action_range, action_range] + +# RL training +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +reward_scale = 1. # value range of reward +replay_buffer_size = 5e5 + +AUTO_ENTROPY=True # automatically udpating variable alpha for entropy +DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic + +############################### SAC #################################### class ReplayBuffer: def __init__(self, capacity): @@ -316,41 +340,15 @@ def plot(frame_idx, rewards): plt.savefig('sac.png') # plt.show() - -# choose env -ENV = 'Pendulum-v0' +# initialization of env env = NormalizedActions(gym.make(ENV)) action_dim = env.action_space.shape[0] state_dim = env.observation_space.shape[0] -action_range=1. - -replay_buffer_size = 5e5 +# initialization of buffer replay_buffer = ReplayBuffer(replay_buffer_size) - - -# hyper-parameters for RL training -max_frames = 40000 # total number of steps for training -test_frames = 300 # total number of steps for testing -max_steps = 150 # maximum number of steps for one episode -batch_size = 64 # udpate batchsize -explore_steps = 100 # 500 for random action sampling in the beginning of training -update_itr = 3 # repeated updates for single step -hidden_dim = 32 # size of hidden layers for networks -soft_q_lr = 3e-4 # q_net learning rate -policy_lr = 3e-4 # policy_net learning rate -alpha_lr = 3e-4 # alpha learning rate -policy_target_update_interval = 3 # delayed update for the policy network and target networks -# explore_noise_scale = 1.0 # range of action noise for exploration -# eval_noise_scale = 0.5 # range of action noise for evaluation of action value -reward_scale = 1. 
# value range of reward - -AUTO_ENTROPY=True # automatically udpating variable alpha for entropy -DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic - - +# initialization of trainer sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) - #set train mode sac_trainer.soft_q_net1.train() sac_trainer.soft_q_net2.train() @@ -362,6 +360,7 @@ def plot(frame_idx, rewards): if args.train: frame_idx = 0 rewards = [] + t0 = time.time() while frame_idx < max_frames: state = env.reset() state = state.astype(np.float32) @@ -380,6 +379,7 @@ def plot(frame_idx, rewards): next_state = next_state.astype(np.float32) env.render() done = 1 if done == True else 0 + print('s:', state, action, reward, next_state, done) replay_buffer.push(state, action, reward, next_state, done) @@ -396,13 +396,16 @@ def plot(frame_idx, rewards): if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + episode = int(frame_idx/max_steps) # current episode + all_episodes = int(max_frames/max_steps) # total episodes + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) rewards.append(episode_reward) sac_trainer.save_weights() if args.test: frame_idx = 0 rewards = [] + t0 = time.time() sac_trainer.load_weights() while frame_idx < test_frames: @@ -430,5 +433,7 @@ def plot(frame_idx, rewards): if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + episode = int(frame_idx/max_steps) + all_episodes = int(test_frames/max_steps) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_td3.py b/examples/reinforcement_learning/tutorial_TD3.py similarity index 92% rename from examples/reinforcement_learning/tutorial_td3.py rename to examples/reinforcement_learning/tutorial_TD3.py index 63c5dbf0c..a4e2d377a 100644 --- a/examples/reinforcement_learning/tutorial_td3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -1,21 +1,34 @@ ''' -Twin Delayed DDPG (TD3), if no twin no delayed then it's DDPG. -using networks including: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net -original paper: https://arxiv.org/pdf/1802.09477.pdf +Twin Delayed DDPG (TD3) +------------------------ +If no twin no delayed then it's DDPG. +It uses networks including: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net Actor policy is deterministic, with Gaussian exploration noise. 
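+It uses clipped double-Q learning, delayed policy updates and target policy smoothing (noise added to target actions).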
-Env: Openai Gym Pendulum-v0, continuous action space +Reference +--------- +original paper: https://arxiv.org/pdf/1802.09477.pdf + -tensorflow 2.0.0a0 +Env +--- +Openai Gym Pendulum-v0, continuous action space + +Prerequisites +--- +tensorflow >=2.0.0a0 tensorflow-probability 0.6.0 -tensorlayer 2.0.0 +tensorlayer >=2.0.0 && pip install box2d box2d-kengz --user -To run: +To run +------- python tutorial_td3.py --train/test + ''' + import argparse import math import random @@ -40,20 +53,35 @@ np.random.seed(2) tf.random.set_seed(2) # reproducible - -# GPU = True -# device_idx = 0 -# if GPU: -# device = torch.device("cuda:" + str(device_idx) if torch.cuda.is_available() else "cpu") -# else: -# device = torch.device("cpu") -# print(device) - parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=False) parser.add_argument('--test', dest='test', action='store_true', default=True) args = parser.parse_args() +##################### hyper parameters #################### +# choose env +ENV = 'Pendulum-v0' +action_range=1. # scale action, [-action_range, action_range] + +# RL training +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 500 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks +explore_noise_scale = 1.0 # range of action noise for exploration +eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. # value range of reward +replay_buffer_size = 5e5 # size of replay buffer + + +############################### TD3 #################################### + class ReplayBuffer: def __init__(self, capacity): self.capacity = capacity @@ -296,32 +324,13 @@ def plot(frame_idx, rewards): # plt.show() -# choose env -ENV = 'Pendulum-v0' +# initialization of env env = NormalizedActions(gym.make(ENV)) action_dim = env.action_space.shape[0] state_dim = env.observation_space.shape[0] -action_range=1. - -replay_buffer_size = 5e5 +# initialization of buffer replay_buffer = ReplayBuffer(replay_buffer_size) - - -# hyper-parameters for RL training -max_frames = 40000 # total number of steps for training -test_frames = 300 # total number of steps for testing -max_steps = 150 # maximum number of steps for one episode -batch_size = 64 # udpate batchsize -explore_steps = 500 # 500 for random action sampling in the beginning of training -update_itr = 3 # repeated updates for single step -hidden_dim = 32 # size of hidden layers for networks -q_lr = 3e-4 # q_net learning rate -policy_lr = 3e-4 # policy_net learning rate -policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks -explore_noise_scale = 1.0 # range of action noise for exploration -eval_noise_scale = 0.5 # range of action noise for evaluation of action value -reward_scale = 1. 
# value range of reward - +# initialization of trainer td3_trainer=TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval, \ action_range=action_range, q_lr=q_lr, policy_lr=policy_lr ) # set train mode @@ -336,6 +345,7 @@ def plot(frame_idx, rewards): if args.train: frame_idx = 0 rewards = [] + t0 = time.time() while frame_idx < max_frames: state = env.reset() state = state.astype(np.float32) @@ -372,13 +382,18 @@ def plot(frame_idx, rewards): if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + episode = int(frame_idx/max_steps) # current episode + all_episodes = int(max_frames/max_steps) # total episodes + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 )) rewards.append(episode_reward) td3_trainer.save_weights() if args.test: frame_idx = 0 rewards = [] + t0 = time.time() + td3_trainer.load_weights() while frame_idx < test_frames: @@ -407,5 +422,8 @@ def plot(frame_idx, rewards): if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + episode = int(frame_idx/max_steps) + all_episodes = int(test_frames/max_steps) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_wrappers.py b/examples/reinforcement_learning/tutorial_wrappers.py index eec4b7433..231a9880b 100644 --- a/examples/reinforcement_learning/tutorial_wrappers.py +++ b/examples/reinforcement_learning/tutorial_wrappers.py @@ -512,6 +512,26 @@ def step(self, action): return o_, r, done, info +class NormalizedActions(gym.ActionWrapper): + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + def unit_test(): env_id = 'CartPole-v0' unwrapped_env = gym.make(env_id) From 6a3b21c20ab9db77e800007db2914cfc357cd2b2 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 4 Jun 2019 17:45:22 +0100 Subject: [PATCH 32/57] readme --- examples/reinforcement_learning/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 4f77bc498..a5a219c21 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -15,7 +15,7 @@ This repository contains implementation of most popular reinforcement learning a ## Prerequisites: * python 3.5 -* tensorflow >= 2.0.0 +* tensorflow >= 2.0.0 or tensorflow-gpu >= 2.0.0a0 * tensorlayer >= 2.0.1 * tensorflow-probability * tf-nightly-2.0-preview @@ -32,9 +32,7 @@ We are currently open to any suggestions or pull requests from you to make the r For each tutorial, open a terminal and run: -`python ***.py` - -or `python ***.py --train` for training and `python ***.py --test` for testing. + `python ***.py --train` for training and `python ***.py --test` for testing. ## Table of Contents: @@ -122,6 +120,8 @@ or `python ***.py --train` for training and `python ***.py --test` for testing. 
Code: `./tutorial_A3C.py` + Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) + * Soft Actor-Critic (SAC) From 4335b0dba17ffd4e7594d4011c9fe89724bba1f8 Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Thu, 6 Jun 2019 15:33:20 +0800 Subject: [PATCH 33/57] Update tutorial_TRPO.py --- examples/reinforcement_learning/tutorial_TRPO.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index 67c11b0f5..1f1b19aad 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -192,11 +192,12 @@ def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): x = input_layer_from_space(x) mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation) actor = tl.models.Model(x, mu) + log_std = tf.Variable(-0.5 * np.ones(act_dim, dtype=np.float32)) + actor.trainable_weights.append(log_std) def cal_outputs_0(states): states = states.astype(np.float32) mu = actor(states) - log_std = -0.5 * np.ones(act_dim, dtype=np.float32) std = tf.exp(log_std) pi = mu + tf.random.normal(tf.shape(mu)) * std logp_pi = gaussian_likelihood(pi, mu, log_std) From e52e1e38fd43ff712fa3ac2543b9dc063d2337db Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:09:42 +0100 Subject: [PATCH 34/57] chang format --- ...c_continuous_action.py => tutorial_A3C.py} | 0 .../reinforcement_learning/tutorial_AC.py | 24 ++++++++++++++++++- .../reinforcement_learning/tutorial_SAC.py | 3 ++- .../reinforcement_learning/tutorial_TD3.py | 3 ++- 4 files changed, 27 insertions(+), 3 deletions(-) rename examples/reinforcement_learning/{tutorial_bipedalwalker_a3c_continuous_action.py => tutorial_A3C.py} (100%) diff --git a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py b/examples/reinforcement_learning/tutorial_A3C.py similarity index 100% rename from examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py rename to examples/reinforcement_learning/tutorial_A3C.py diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index 8a6427ac2..67e268db4 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -1,4 +1,7 @@ -"""Actor-Critic using TD-error as the Advantage, Reinforcement Learning. +""" +Actor-Critic +------------- +It uses TD-error as the Advantage. Actor Critic History ---------------------- @@ -15,6 +18,7 @@ Reference ---------- +paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ Environment @@ -30,6 +34,16 @@ The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center. 
+ +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_sac.py --train/test + """ import time @@ -38,12 +52,20 @@ import gym import tensorflow as tf import tensorlayer as tl +import argparse + tl.logging.set_verbosity(tl.logging.DEBUG) np.random.seed(2) tf.random.set_seed(2) # reproducible +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + # hyper-parameters OUTPUT_GRAPH = False MAX_EPISODE = 3000 diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index e1836d8e9..0006630d0 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -9,7 +9,7 @@ --------- paper: https://arxiv.org/pdf/1812.05905.pdf -Env +Environment --- Openai Gym Pendulum-v0, continuous action space @@ -51,6 +51,7 @@ np.random.seed(2) tf.random.set_seed(2) # reproducible +# add arguments in command --train/test parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=False) parser.add_argument('--test', dest='test', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index a4e2d377a..63ed83758 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -10,7 +10,7 @@ original paper: https://arxiv.org/pdf/1802.09477.pdf -Env +Environment --- Openai Gym Pendulum-v0, continuous action space @@ -53,6 +53,7 @@ np.random.seed(2) tf.random.set_seed(2) # reproducible +# add arguments in command --train/test parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=False) parser.add_argument('--test', dest='test', action='store_true', default=True) From f6011db684a9b4196be8089e8acd9016497947a3 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:37:52 +0100 Subject: [PATCH 35/57] tutorial format --- .../reinforcement_learning/tutorial_AC.py | 192 ++++++++-------- .../reinforcement_learning/tutorial_SAC.py | 193 ++++++++-------- .../reinforcement_learning/tutorial_TD3.py | 207 +++++++++--------- .../reinforcement_learning/tutorial_format.py | 92 ++++++++ 4 files changed, 393 insertions(+), 291 deletions(-) create mode 100644 examples/reinforcement_learning/tutorial_format.py diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index 67e268db4..a90a3a59b 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -66,9 +66,10 @@ parser.add_argument('--test', dest='test', action='store_true', default=True) args = parser.parse_args() -# hyper-parameters +##################### hyper parameters #################### + OUTPUT_GRAPH = False -MAX_EPISODE = 3000 +MAX_EPISODE = 3000 # number of overall episodes for training DISPLAY_REWARD_THRESHOLD = 100 # renders environment if running reward is greater then this threshold MAX_EP_STEPS = 1000 # maximum time step in one episode RENDER = False # 
rendering wastes time @@ -76,28 +77,8 @@ LR_A = 0.001 # learning rate for actor LR_C = 0.01 # learning rate for critic -''' -choose environment -1. Openai gym: -env = gym.make() -2. DeepMind Control Suite: -env = dm_control2gym.make() -''' - -env = gym.make('CartPole-v0') -# dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), -# depth=False, scene_option=None) -# env = dm_control2gym.make(domain_name="cartpole", task_name="balance") -env.seed(2) # reproducible -# env = env.unwrapped -N_F = env.observation_space.shape[0] -# N_A = env.action_space.shape[0] -N_A = env.action_space.n - -print("observation dimension: %d" % N_F) # 4 -print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] -print("observation low : %s" % env.observation_space.low) # [-2.4 , -inf , -0.41887902 , -inf] -print("num of actions: %d" % N_A) # 2 : left or right + +############################### Actor-Critic #################################### class Actor(object): @@ -174,73 +155,100 @@ def save_ckpt(self): # save trained weights def load_ckpt(self): # load trained weights tl.files.load_and_assign_npz(name='model_critic.npz', network=self.model) -actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) -# we need a good teacher, so the teacher should learn faster than the actor -critic = Critic(n_features=N_F, lr=LR_C) - - -for i_episode in range(MAX_EPISODE): - episode_time = time.time() - s = env.reset().astype(np.float32) - t = 0 # number of step in this episode - all_r = [] # rewards of all steps - while True: - - if RENDER: env.render() - - a = actor.choose_action(s) - - s_new, r, done, info = env.step(a) - s_new = s_new.astype(np.float32) - - if done: r = -20 - # these may helpful in some tasks - # if abs(s_new[0]) >= env.observation_space.high[0]: - # # cart moves more than 2.4 units from the center - # r = -20 - # reward for the distance between cart to the center - # r -= abs(s_new[0]) * .1 - - all_r.append(r) - - td_error = critic.learn(s, r, s_new) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] - try: - actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] - except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() - actor.save_ckpt() - critic.save_ckpt() - # logging - - s = s_new - t += 1 - - if done or t >= MAX_EP_STEPS: - ep_rs_sum = sum(all_r) - - if 'running_reward' not in globals(): - running_reward = ep_rs_sum - else: - running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 - # start rending if running_reward greater than a threshold - # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True - print("Episode: %d reward: %f running_reward %f took: %.5f" % \ - (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) - - # Early Stopping for quick check - if t >= MAX_EP_STEPS: - print("Early Stopping") - s = env.reset().astype(np.float32) - rall = 0 - while True: - env.render() - # a = actor.choose_action(s) - a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task - s_new, r, done, info = env.step(a) - s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) - rall += r - s = s_new - if done: - print("reward", rall) +if __name__ == '__main__': + + ''' + choose environment + 1. Openai gym: + env = gym.make() + 2. 
DeepMind Control Suite: + env = dm_control2gym.make() + ''' + + env = gym.make('CartPole-v0') + # dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), + # depth=False, scene_option=None) + # env = dm_control2gym.make(domain_name="cartpole", task_name="balance") + env.seed(2) # reproducible + # env = env.unwrapped + N_F = env.observation_space.shape[0] + # N_A = env.action_space.shape[0] + N_A = env.action_space.n + + print("observation dimension: %d" % N_F) # 4 + print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] + print("observation low : %s" % env.observation_space.low) # [-2.4 , -inf , -0.41887902 , -inf] + print("num of actions: %d" % N_A) # 2 : left or right + + actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) + # we need a good teacher, so the teacher should learn faster than the actor + critic = Critic(n_features=N_F, lr=LR_C) + + if args.train(): + for i_episode in range(MAX_EPISODE): + episode_time = time.time() + s = env.reset().astype(np.float32) + t = 0 # number of step in this episode + all_r = [] # rewards of all steps + while True: + + if RENDER: env.render() + + a = actor.choose_action(s) + + s_new, r, done, info = env.step(a) + s_new = s_new.astype(np.float32) + + if done: r = -20 + # these may helpful in some tasks + # if abs(s_new[0]) >= env.observation_space.high[0]: + # # cart moves more than 2.4 units from the center + # r = -20 + # reward for the distance between cart to the center + # r -= abs(s_new[0]) * .1 + + all_r.append(r) + + td_error = critic.learn(s, r, s_new) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] + try: + actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + actor.save_ckpt() + critic.save_ckpt() + # logging + + s = s_new + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(all_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + # start rending if running_reward greater than a threshold + # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True + print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + + # Early Stopping for quick check + if t >= MAX_EP_STEPS: + print("Early Stopping") s = env.reset().astype(np.float32) rall = 0 - break + while True: + env.render() + # a = actor.choose_action(s) + a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task + s_new, r, done, info = env.step(a) + s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) + rall += r + s = s_new + if done: + print("reward", rall) + s = env.reset().astype(np.float32) + rall = 0 + break + + if args.test(): \ No newline at end of file diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index 0006630d0..3902fcbac 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -341,100 +341,101 @@ def plot(frame_idx, rewards): plt.savefig('sac.png') # plt.show() -# initialization of env -env = NormalizedActions(gym.make(ENV)) -action_dim = env.action_space.shape[0] -state_dim = env.observation_space.shape[0] -# initialization of buffer -replay_buffer = 
ReplayBuffer(replay_buffer_size) -# initialization of trainer -sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ -soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) -#set train mode -sac_trainer.soft_q_net1.train() -sac_trainer.soft_q_net2.train() -sac_trainer.target_soft_q_net1.train() -sac_trainer.target_soft_q_net2.train() -sac_trainer.policy_net.train() - -# training loop -if args.train: - frame_idx = 0 - rewards = [] - t0 = time.time() - while frame_idx < max_frames: - state = env.reset() - state = state.astype(np.float32) - episode_reward = 0 - if frame_idx <1 : - print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward - - for step in range(max_steps): - if frame_idx > explore_steps: +if __name__ == '__main__': + # initialization of env + env = NormalizedActions(gym.make(ENV)) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + # initialization of buffer + replay_buffer = ReplayBuffer(replay_buffer_size) + # initialization of trainer + sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ + soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) + #set train mode + sac_trainer.soft_q_net1.train() + sac_trainer.soft_q_net2.train() + sac_trainer.target_soft_q_net1.train() + sac_trainer.target_soft_q_net2.train() + sac_trainer.policy_net.train() + + # training loop + if args.train: + frame_idx = 0 + rewards = [] + t0 = time.time() + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + # print('s:', state, action, reward, next_state, done) + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) + + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx/max_steps) # current episode + all_episodes = int(max_frames/max_steps) # total episodes + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + rewards.append(episode_reward) + sac_trainer.save_weights() + + if args.test: + frame_idx = 0 + rewards = [] + t0 = time.time() + sac_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + + + for step in range(max_steps): action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) - else: - action = sac_trainer.policy_net.sample_action() - - next_state, reward, done, _ = env.step(action) - next_state = 
next_state.astype(np.float32) - env.render() - done = 1 if done == True else 0 - print('s:', state, action, reward, next_state, done) - - replay_buffer.push(state, action, reward, next_state, done) - - state = next_state - episode_reward += reward - frame_idx += 1 - - if len(replay_buffer) > batch_size: - for i in range(update_itr): - sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) - - if frame_idx % 500 == 0: - plot(frame_idx, rewards) - - if done: - break - episode = int(frame_idx/max_steps) # current episode - all_episodes = int(max_frames/max_steps) # total episodes - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) - rewards.append(episode_reward) - sac_trainer.save_weights() - -if args.test: - frame_idx = 0 - rewards = [] - t0 = time.time() - sac_trainer.load_weights() - - while frame_idx < test_frames: - state = env.reset() - state = state.astype(np.float32) - episode_reward = 0 - if frame_idx <1 : - print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward - - - for step in range(max_steps): - action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) - next_state, reward, done, _ = env.step(action) - next_state = next_state.astype(np.float32) - env.render() - done = 1 if done == True else 0 - - state = next_state - episode_reward += reward - frame_idx += 1 - - # if frame_idx % 50 == 0: - # plot(frame_idx, rewards) - - if done: - break - episode = int(frame_idx/max_steps) - all_episodes = int(test_frames/max_steps) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) - rewards.append(episode_reward) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx/max_steps) + all_episodes = int(test_frames/max_steps) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index 63ed83758..e51f202ad 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -324,107 +324,108 @@ def plot(frame_idx, rewards): plt.savefig('td3.png') # plt.show() - -# initialization of env -env = NormalizedActions(gym.make(ENV)) -action_dim = env.action_space.shape[0] -state_dim = env.observation_space.shape[0] -# initialization of buffer -replay_buffer = ReplayBuffer(replay_buffer_size) -# initialization of trainer -td3_trainer=TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval, \ -action_range=action_range, q_lr=q_lr, policy_lr=policy_lr ) -# set train mode -td3_trainer.q_net1.train() -td3_trainer.q_net2.train() -td3_trainer.target_q_net1.train() -td3_trainer.target_q_net2.train() -td3_trainer.policy_net.train() -td3_trainer.target_policy_net.train() - -# training loop -if args.train: - frame_idx = 0 - rewards = [] - t0 = time.time() - while frame_idx < max_frames: - state = env.reset() - state = 
state.astype(np.float32) - episode_reward = 0 - if frame_idx <1 : - print('intialize') - _=td3_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward - _=td3_trainer.target_policy_net([state]) - - - for step in range(max_steps): - if frame_idx > explore_steps: +if __name__ == '__main__': + + # initialization of env + env = NormalizedActions(gym.make(ENV)) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + # initialization of buffer + replay_buffer = ReplayBuffer(replay_buffer_size) + # initialization of trainer + td3_trainer=TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval, \ + action_range=action_range, q_lr=q_lr, policy_lr=policy_lr ) + # set train mode + td3_trainer.q_net1.train() + td3_trainer.q_net2.train() + td3_trainer.target_q_net1.train() + td3_trainer.target_q_net2.train() + td3_trainer.policy_net.train() + td3_trainer.target_policy_net.train() + + # training loop + if args.train: + frame_idx = 0 + rewards = [] + t0 = time.time() + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=td3_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + _=td3_trainer.target_policy_net([state]) + + + for step in range(max_steps): + if frame_idx > explore_steps: + action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) + else: + action = td3_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.) + + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx/max_steps) # current episode + all_episodes = int(max_frames/max_steps) # total episodes + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 )) + rewards.append(episode_reward) + td3_trainer.save_weights() + + if args.test: + frame_idx = 0 + rewards = [] + t0 = time.time() + + td3_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx <1 : + print('intialize') + _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + _=td3_trainer.target_policy_net([state]) + + + for step in range(max_steps): action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) - else: - action = td3_trainer.policy_net.sample_action() - - next_state, reward, done, _ = env.step(action) - next_state = next_state.astype(np.float32) - env.render() - done = 1 if done == True else 0 - - replay_buffer.push(state, action, reward, next_state, done) - - state = next_state - episode_reward += reward - frame_idx += 1 - - if len(replay_buffer) > batch_size: - for i in range(update_itr): - td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.) 
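+                        # eval_noise_scale sets the noise added to target actions when forming the TD target (target policy smoothing)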
- - if frame_idx % 500 == 0: - plot(frame_idx, rewards) - - if done: - break - episode = int(frame_idx/max_steps) # current episode - all_episodes = int(max_frames/max_steps) # total episodes - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ - .format(episode, all_episodes, episode_reward, time.time()-t0 )) - rewards.append(episode_reward) - td3_trainer.save_weights() - -if args.test: - frame_idx = 0 - rewards = [] - t0 = time.time() - - td3_trainer.load_weights() - - while frame_idx < test_frames: - state = env.reset() - state = state.astype(np.float32) - episode_reward = 0 - if frame_idx <1 : - print('intialize') - _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward - _=td3_trainer.target_policy_net([state]) - - - for step in range(max_steps): - action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) - next_state, reward, done, _ = env.step(action) - next_state = next_state.astype(np.float32) - env.render() - done = 1 if done == True else 0 - - state = next_state - episode_reward += reward - frame_idx += 1 - - # if frame_idx % 50 == 0: - # plot(frame_idx, rewards) - - if done: - break - episode = int(frame_idx/max_steps) - all_episodes = int(test_frames/max_steps) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ - .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) - rewards.append(episode_reward) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done == True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx/max_steps) + all_episodes = int(test_frames/max_steps) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py new file mode 100644 index 000000000..6737439c5 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_format.py @@ -0,0 +1,92 @@ +# the format of turorial algorithm # +# please heavily annotate the code # + +''' +Algorithm Name +------------------------ +Briefly describe the algorithms, add some details. + +Reference +--------- +original paper: https://arxiv.org/pdf/1802.09477.pdf +website: ... + + +Environment +--- +Openai Gym Pendulum-v0, continuous action space + +Prerequisites +--- +tensorflow >=2.0.0a0 +tensorlayer >=2.0.0 +... 
+ +To run +------- +python tutorial_***.py --train/test + +''' + +import 'package_name' + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### +A=a # description +B=b # description + +############################### Algorithm Name #################################### + +class C(): # algorithm-specific classes + +def D(): # some common functions, could be extracted into utils afterwards + + +if __name__ == '__main__': + + '''initialization of env, buffer, networks in algorithms''' + + + # training loop + if args.train: + t0 = time.time() + while: # loop of episodes + while: # loop of steps in episode + ''' step ''' + + ''' train ''' + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 )) + + ''' plot , following the format of ./baselines/utils/plot''' + plot(rewards, Algorithm_name = 'SAC', Env_name = ENV) + + ''' save weights ''' + model.save_weights() + + + # testing loop + if args.test: + t0 = time.time() + ''' load_weights ''' + model.load_weights() + + while: # loop of episodes + while: # loop of steps in episode + ''' step ''' + + ''' train ''' + + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + From 485c8e22c4fa1709906bb18354ca7cdc08431578 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:39:27 +0100 Subject: [PATCH 36/57] tutorial format --- examples/reinforcement_learning/tutorial_format.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index 6737439c5..20485064b 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -84,9 +84,6 @@ def D(): # some common functions, could be extracted into utils afterwards while: # loop of steps in episode ''' step ''' - ''' train ''' - - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) From cacb4a3aff046d3ccfc6d5b00fe386c9f0120e9f Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:47:02 +0100 Subject: [PATCH 37/57] tutorial format 1 --- examples/reinforcement_learning/README.md | 2 ++ .../reinforcement_learning/tutorial_format.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index a5a219c21..5d7365cd0 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -34,6 +34,8 @@ For each tutorial, open a terminal and run: `python ***.py --train` for training and `python ***.py --test` for testing. 
+The tutorial algorithms follow the same basic structure, as shown in file: `./tutorial_format.py` + ## Table of Contents: | Algorithms | Observation Space | Action Space | Tutorial Env | diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index 20485064b..df7c4b1c3 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -8,13 +8,13 @@ Reference --------- -original paper: https://arxiv.org/pdf/1802.09477.pdf +original paper: e.g. https://arxiv.org/pdf/1802.09477.pdf website: ... Environment --- -Openai Gym Pendulum-v0, continuous action space +e.g. Openai Gym Pendulum-v0, continuous action space Prerequisites --- @@ -40,19 +40,27 @@ args = parser.parse_args() ##################### hyper parameters #################### -A=a # description -B=b # description +A=a # description of hyper parameter +B=b # description of hyper parameter ############################### Algorithm Name #################################### class C(): # algorithm-specific classes + ''' description of class ''' + def C1(): + ''' description of function''' def D(): # some common functions, could be extracted into utils afterwards + ''' description of function ''' if __name__ == '__main__': '''initialization of env, buffer, networks in algorithms''' + env=... + buffer=... + network1=... + network2=... # training loop From 81c22db8cc2b6eb6c70cc0a6fdf20a39ad3df9d0 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:52:31 +0100 Subject: [PATCH 38/57] readme --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 5d7365cd0..af42ac6c3 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -34,7 +34,7 @@ For each tutorial, open a terminal and run: `python ***.py --train` for training and `python ***.py --test` for testing. -The tutorial algorithms follow the same basic structure, as shown in file: `./tutorial_format.py` +The tutorial algorithms follow the same basic structure, as shown in file: `[./tutorial_format.py](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py)` ## Table of Contents: From 980990c98a7fee749ea7bdb5cc337fb99c0b8bd7 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:54:22 +0100 Subject: [PATCH 39/57] readme --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index af42ac6c3..6a9e55cb2 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -34,7 +34,7 @@ For each tutorial, open a terminal and run: `python ***.py --train` for training and `python ***.py --test` for testing. 
-The tutorial algorithms follow the same basic structure, as shown in file: `[./tutorial_format.py](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py)` +The tutorial algorithms follow the same basic structure, as shown in file: [`./tutorial_format.py`](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py) ## Table of Contents: From 7b8fc632909bb1aad6d468f8f83c5b4c9028aa41 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 14:57:38 +0100 Subject: [PATCH 40/57] format 2 --- examples/reinforcement_learning/tutorial_format.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index df7c4b1c3..e77b46ce2 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -13,11 +13,11 @@ Environment ---- +----------- e.g. Openai Gym Pendulum-v0, continuous action space Prerequisites ---- +--------------- tensorflow >=2.0.0a0 tensorlayer >=2.0.0 ... @@ -28,7 +28,12 @@ ''' -import 'package_name' +import time +import argparse +import numpy as np +import tensorflow as tf +import 'other package name' + np.random.seed(2) tf.random.set_seed(2) # reproducible From 0e24c30c526835cd84d4e01dfd25f5bf3cd3de05 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 15:10:42 +0100 Subject: [PATCH 41/57] format 2 --- .../reinforcement_learning/tutorial_AC.py | 65 ++++++++++++++++++- .../reinforcement_learning/tutorial_format.py | 8 +-- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index a90a3a59b..1128b1da5 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -42,7 +42,7 @@ To run ------ -python tutorial_sac.py --train/test +python tutorial_AC.py --train/test """ import time @@ -250,5 +250,66 @@ def load_ckpt(self): # load trained weights s = env.reset().astype(np.float32) rall = 0 break + actor.save_ckpt() + critic.save_ckpt() - if args.test(): \ No newline at end of file + if args.test(): + for i_episode in range(MAX_EPISODE): + episode_time = time.time() + s = env.reset().astype(np.float32) + t = 0 # number of step in this episode + all_r = [] # rewards of all steps + while True: + if RENDER: env.render() + a = actor.choose_action(s) + s_new, r, done, info = env.step(a) + s_new = s_new.astype(np.float32) + if done: r = -20 + # these may helpful in some tasks + # if abs(s_new[0]) >= env.observation_space.high[0]: + # # cart moves more than 2.4 units from the center + # r = -20 + # reward for the distance between cart to the center + # r -= abs(s_new[0]) * .1 + + all_r.append(r) + try: + actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + actor.save_ckpt() + critic.save_ckpt() + # logging + + s = s_new + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(all_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + # start rending if running_reward greater than a threshold + # if running_reward > 
DISPLAY_REWARD_THRESHOLD: RENDER = True + print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + + # Early Stopping for quick check + if t >= MAX_EP_STEPS: + print("Early Stopping") + s = env.reset().astype(np.float32) + rall = 0 + while True: + env.render() + # a = actor.choose_action(s) + a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task + s_new, r, done, info = env.step(a) + s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) + rall += r + s = s_new + if done: + print("reward", rall) + s = env.reset().astype(np.float32) + rall = 0 + break \ No newline at end of file diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index e77b46ce2..645f5742b 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -80,17 +80,17 @@ def D(): # some common functions, could be extracted into utils afterwards print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 )) - ''' plot , following the format of ./baselines/utils/plot''' - plot(rewards, Algorithm_name = 'SAC', Env_name = ENV) + ''' plot , following the format of ./baselines/utils/plot()''' + plot(rewards, Algorithm_name = 'SAC', Env_name = 'Pendulum-v0') - ''' save weights ''' + ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/save_model() ''' model.save_weights() # testing loop if args.test: t0 = time.time() - ''' load_weights ''' + ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/load_model() ''' model.load_weights() while: # loop of episodes From 8e9400a6e594ea33eae8bb4f32b2bae4ce56654e Mon Sep 17 00:00:00 2001 From: Officium Date: Sun, 9 Jun 2019 22:57:29 +0800 Subject: [PATCH 42/57] manual set random.seed in SAC and TD3, change format of tutorial C51 --- .../baselines/wrappers.py | 1 + .../reinforcement_learning/tutorial_C51.py | 211 +++++++++++------- .../reinforcement_learning/tutorial_SAC.py | 1 + .../reinforcement_learning/tutorial_TD3.py | 1 + 4 files changed, 135 insertions(+), 79 deletions(-) diff --git a/examples/reinforcement_learning/baselines/wrappers.py b/examples/reinforcement_learning/baselines/wrappers.py index 231a9880b..963849598 100644 --- a/examples/reinforcement_learning/baselines/wrappers.py +++ b/examples/reinforcement_learning/baselines/wrappers.py @@ -28,6 +28,7 @@ 'SubprocVecEnv', # vectorized env wrapper 'VecFrameStack', # stack frames in vectorized env 'Monitor', # Episode reward and length monitor + 'NormalizedActions', # normalized action to actual space ) cv2.ocl.setUseOpenCL(False) # env_id -> env_type diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py index 25cf0251c..25ed688b2 100644 --- a/examples/reinforcement_learning/tutorial_C51.py +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -8,6 +8,8 @@ tensorlayer==2.0.1 """ +import argparse +import os import random import time @@ -18,8 +20,25 @@ from tutorial_wrappers import build_env -seed = 0 -env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument('--save_path', default='c51', + help='folder to save if mode == train else model path,' + 
'qnet will be saved once target net update') +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', + help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() +print(args) + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -42,12 +61,11 @@ target_q_update_freq = 200 # how frequency target q net update ob_scale = 1.0 / 255 # scale observations -env = build_env(env_id, seed=seed) in_dim = env.observation_space.shape out_dim = env.action_space.n reward_gamma = 0.99 # reward discount batch_size = 32 # batch size for sampling from replay buffer -warm_start = buffer_size / 10 # sample times befor learning +warm_start = buffer_size / 10 # sample times before learning atom_num = 51 min_value = -10 max_value = 10 @@ -116,6 +134,7 @@ def add(self, *args): self._next_idx = (self._next_idx + 1) % self._maxsize def _encode_sample(self, idxes): + # encode sample to numpy.array with right dtype b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] for i in idxes: o, a, r, o_, d = self._storage[i] @@ -134,6 +153,7 @@ def _encode_sample(self, idxes): def sample(self, batch_size): indexes = range(len(self._storage)) + # allow sampling with replacement idxes = [random.choice(indexes) for _ in range(batch_size)] return self._encode_sample(idxes) @@ -144,83 +164,116 @@ def sync(net, net_tar): var_tar.assign(var) -qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') -qnet.train() -trainabel_weights = qnet.trainable_weights -targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') -targetqnet.infer() -sync(qnet, targetqnet) -optimizer = tf.optimizers.Adam(learning_rate=lr) -buffer = ReplayBuffer(buffer_size) - -o = env.reset() -nepisode = 0 -t = time.time() -for i in range(1, number_timesteps + 1): - eps = epsilon(i) - - # select action - if random.random() < eps: - a = int(random.random() * out_dim) - else: +if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qdist = np.exp(qnet(obv).numpy()) + qvalues = (qdist * vrange).sum(-1) + a = qvalues.argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # q estimation, see Algorithm 1 in paper for detail + b_dist_ = np.exp(targetqnet(b_o_).numpy()) + b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) + b_tzj = 
np.clip(reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + + b_r[:, None], min_value, max_value) + b_i = (b_tzj - min_value) / deltaz + b_l = np.floor(b_i).astype('int64') + b_u = np.ceil(b_i).astype('int64') + templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) + tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) + b_m = np.zeros((batch_size, atom_num)) + # TODO: aggregate value by index and batch update (scatter_add) + for j in range(batch_size): + for k in range(atom_num): + b_m[j][b_l[j][k]] += templ[j][k] + b_m[j][b_u[j][k]] += tempu[j][k] + b_m = tf.convert_to_tensor(b_m, dtype='float32') + + # calculate loss + with tf.GradientTape() as q_tape: + b_index = np.stack([range(batch_size), b_a], 1) + b_index = tf.convert_to_tensor(b_index, 'int64') + b_dist_a = tf.gather_nd(qnet(b_o), b_index) + loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() +else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): obv = np.expand_dims(o, 0).astype('float32') * ob_scale qdist = np.exp(qnet(obv).numpy()) qvalues = (qdist * vrange).sum(-1) a = qvalues.argmax(1)[0] - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) - - # q estimation, see Algorithm 1 in paper for detail - b_dist_ = np.exp(targetqnet(b_o_).numpy()) - b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) - b_tzj = np.clip(reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + - b_r[:, None], min_value, max_value) - b_i = (b_tzj - min_value) / deltaz - b_l = np.floor(b_i).astype('int64') - b_u = np.ceil(b_i).astype('int64') - templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) - tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) - b_m = np.zeros((batch_size, atom_num)) - # TODO: aggregate value by index and batch update (scatter_add) - for j in range(batch_size): - for k in range(atom_num): - b_m[j][b_l[j][k]] += templ[j][k] - b_m[j][b_u[j][k]] += tempu[j][k] - b_m = tf.convert_to_tensor(b_m, dtype='float32') - - # calculate loss - with tf.GradientTape() as q_tape: - b_index = np.stack([range(batch_size), b_a], 1) - b_index = tf.convert_to_tensor(b_index, 'int64') - b_dist_a = tf.gather_nd(qnet(b_o), b_index) - loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) - - # backward gradients - q_grad = q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = 
int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) - t = time.time() + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) + diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index 3902fcbac..f2049df8e 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -48,6 +48,7 @@ tl.logging.set_verbosity(tl.logging.DEBUG) +random.seed(2) np.random.seed(2) tf.random.set_seed(2) # reproducible diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index e51f202ad..48d1f109c 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -50,6 +50,7 @@ tl.logging.set_verbosity(tl.logging.DEBUG) +random.seed(2) np.random.seed(2) tf.random.set_seed(2) # reproducible From a0d0c18fcde09f06be9ca1556052cf8064df4ac6 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Sun, 9 Jun 2019 16:29:58 +0100 Subject: [PATCH 43/57] format ac a3c q-learning dqn sac td3 --- examples/reinforcement_learning/a3c.png | Bin 0 -> 22295 bytes .../reinforcement_learning/tutorial_A3C.py | 176 +++++++++------- .../reinforcement_learning/tutorial_AC.py | 34 ++-- .../reinforcement_learning/tutorial_DQN.py | 192 ++++++++++++------ .../reinforcement_learning/tutorial_PG.py | 1 + .../reinforcement_learning/tutorial_SAC.py | 3 +- .../reinforcement_learning/tutorial_TD3.py | 3 +- 7 files changed, 252 insertions(+), 157 deletions(-) create mode 100644 examples/reinforcement_learning/a3c.png diff --git a/examples/reinforcement_learning/a3c.png b/examples/reinforcement_learning/a3c.png new file mode 100644 index 0000000000000000000000000000000000000000..918b5f2d305bbbd47dc37ec1fe69319f8a4ef97b GIT binary patch literal 22295 zcmeFZWmr~S_dU7^i!=a{G{7KLx}-z|q@CUarQL) zgrjR@3jTv@CnBME7XG-Mef}2ye$M)_njL}=>!SZ)r3JTtPJg)bZrd~ z3tf9_b1Qpu6TPdB2DWx4R+d-USnjg0Fel!f-Eollk6^OqIO2-;k#N%9pa4%625t54+*UM_FY$hM61i5?Y$j9BQUhs z4PPOc3C7B|7%o*WF%2CwfFDGB{XBz(ZZa_z9&FX-c#0jaHQ>M3#Jk4tQV%U zdrEYVhezSN%;~Zop9;YpyesxtXvun~lT z7%QJ*D8fZ?C2zoPY<#0oZF=$-i&b>A8^0u%6>(Yj4*$rM-MTv~G1uRLk!h16hM5~;ruhPSQD7^a?cADHP7tnDbKA2)mAR8P$` zo18(sB4^)aB?jJe*z%;8mhmkc_l|K@^V%2aJv=HZ%dsy~XEeh7IcLw%FRw^;x@et> zH)clePjh^ct*me3C&8J^Bv}dR4^6~BFfPrm+0l%DZ^&UH@!fO1^X-`RTGzN#B34}H z!lb-O-6aIs70ppUmy&t!(~FGBQ;nZ>@8-Gbw6!qG8ODi5R(u}5wtsH=vf@E-Ni-%& zNAN@8NAkV2kL0%Z>3n0Tbf-MT5FxQHPd4ngk`qh6Qh!uF@XKDX`{}+xE%sS=bw@73 z^|!2GxbL@dR`JW>+zy?~ox)X%@V#EwJF>(@4)7qhcNP@rB5nMNixxZ5nUz{Eo{H4)nVd3-t#^m`tgDJKZNa z9pcmQR*02G6$>e6$G)0LA04gq>!lVR@n$f|i>AXEnY2JE#@;~h3dsi%St;)|=;KFV z6(n13yUX&*ZO0<4PK~~2(*HIK4P5`K-Tw7pdc}b4=rRtHKrum6k6$lpi;h zU8+Gx?QXG|ieasvl#FLfKk@p}*_XL;q9q0*DLd3p_p81`)T^GCI76X%fLC!6CM*uMU2_D%||{@A_i zN$baN1YNQdMvQfzR%#9t|2dq#Z|l6}UEO->Ys7i&M%?`j4{EE5d+vlVLVes-?A6N; zm$v&`e^lMy2ni0&8;$(i!lU(~X~nr*4U-@DLGP72DfRDXdsg}JFxg5S{oZHfkoxDF z2q5{43eqqSErqjg%xbX07T-Yxx#VVxFGJK}*2Z 
zP#^&NdQ2t_ac9ifx%XLLAN2AH5o_W)zr^(Pm5|cZ?s=p`SoEJ2b>UOK!fE-R$yNrV zE*B$edhFOS=_xS4lptxpwM~m-c>{syi2o7@F;EqLn99#zCv~2CMC&XgU>BZ zjE|Fketw)xOBidXket*G@r12a)Ki{|ze3kmM#?H(8kj~#t^}19FUiNR2Oedv$a2|) z6G=rdH9fr=aKIWlVo!D(*8k0!+&ql&0Kz|faex_GaUUl%AfvlajY4_Oegh|tL?YQ> zWK_a4UuWXQ1ugPJb*H4N+5(!@|Ngx^@wlzc0p6;wJ|X&_fevaO3Ozs+(2BITPl#_Q zFK6Aa$w2@O$QaEzYiMK?0ku}cPL?D~rA5TP{pZhH6B}?=-;bHw@iHhdaPFY&F;wQz z)044+#`_wiT^s=#uiIprC%lLQ&0mU3)q=zt0cH-ur!lI(T zH{C8O;=OvMjY1Wqx*1R|WkU4b$pnF|ehB8`Oxn(1Dk4h0-+rsCrj{wNlgDH5rH_$; zYDHt)$xHCG+ZVqOZeuU5fNXm?J6j>kMLs$@ns^93JRLEE#@1G4u%r|YN0!g$kCwuq zmI`}HKa0Jj4d4XzaJz1IK?`ww1;W^)S_!Tw9FPQJTFk}OKBG*zDpsGJaClfB0H}%Vmlb=tep$kdS1PQl z^8oTFp-uufq8_E|Q=y1O^ftRXG8*JG^#Ts1a?(WWi_RKA$ zx5^Pcm@%dtBZ=x!>i%hf_Js<9i~~(LI%){INWvv!!}KJ{WZ>NPd8!N9oh_v#-wis$ z3I8+Gdk@)7EHi@tD!cw{k8L!ITag10bUt-XUfQ1fKdahhMY~SzfSrkrB!Smn$NglP IgU|Q>1!)Iz= MAX_GLOBAL_EP -env = gym.make(GAME) - -N_S = env.observation_space.shape[0] -N_A = env.action_space.shape[0] - -A_BOUND = [env.action_space.low, env.action_space.high] -A_BOUND[0] = A_BOUND[0].reshape(1, N_A) -A_BOUND[1] = A_BOUND[1].reshape(1, N_A) -# print(A_BOUND) +################### Asynchronous Advantage Actor Critic (A3C) #################################### class ACNet(object): @@ -228,67 +233,82 @@ def work(self, globalAC): GLOBAL_RUNNING_R.append(ep_r) else: # moving average GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) - print( - self.name, - "episode:", - GLOBAL_EP, - # "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move - '| reward: %.1f' % ep_r, - "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], - # '| sigma:', test, # debug - # 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', - ) + # print( + # self.name, + # "Episode: ", + # GLOBAL_EP, + # # "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move + # '| reward: %.1f' % ep_r, + # "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], + # # '| sigma:', test, # debug + # # 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', + # ) + print('{}, Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(self.name, GLOBAL_EP, MAX_GLOBAL_EP, ep_r, time.time()-t0 )) GLOBAL_EP += 1 break if __name__ == "__main__": - # ============================= TRAINING =============================== - with tf.device("/cpu:0"): - - OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') - OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') - - GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params - workers = [] - # Create worker - for i in range(N_WORKERS): - i_name = 'Worker_%i' % i # worker name - workers.append(Worker(i_name, GLOBAL_AC)) - - COORD = tf.train.Coordinator() - - # start TF threading - worker_threads = [] - for worker in workers: - # t = threading.Thread(target=worker.work) - job = lambda: worker.work(GLOBAL_AC) - t = threading.Thread(target=job) - t.start() - worker_threads.append(t) - COORD.join(worker_threads) - import matplotlib.pyplot as plt - plt.plot(GLOBAL_RUNNING_R) - plt.xlabel('episode') - plt.ylabel('global running reward') - plt.savefig('a3c.png') - plt.show() - - GLOBAL_AC.save_ckpt() - - # ============================= EVALUATION ============================= - # env = gym.make(GAME) - # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) - GLOBAL_AC.load_ckpt() - while True: - s = env.reset() - rall = 0 + + env = gym.make(GAME) + + N_S = env.observation_space.shape[0] + N_A = env.action_space.shape[0] + + A_BOUND = [env.action_space.low, env.action_space.high] + A_BOUND[0] = A_BOUND[0].reshape(1, 
N_A) + A_BOUND[1] = A_BOUND[1].reshape(1, N_A) + # print(A_BOUND) + if args.train: + # ============================= TRAINING =============================== + t0 = time.time() + with tf.device("/cpu:0"): + + OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') + OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') + + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'Worker_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + + # start TF threading + worker_threads = [] + for worker in workers: + # t = threading.Thread(target=worker.work) + job = lambda: worker.work(GLOBAL_AC) + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + import matplotlib.pyplot as plt + plt.plot(GLOBAL_RUNNING_R) + plt.xlabel('episode') + plt.ylabel('global running reward') + plt.savefig('a3c.png') + plt.show() + + GLOBAL_AC.save_ckpt() + + if args.test: + # ============================= EVALUATION ============================= + # env = gym.make(GAME) + # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) + GLOBAL_AC.load_ckpt() while True: - env.render() - s = s.astype('float32') # double to float - a = GLOBAL_AC.choose_action(s) - s, r, d, _ = env.step(a) - rall += r - if d: - print("reward", rall) - break + s = env.reset() + rall = 0 + while True: + env.render() + s = s.astype('float32') # double to float + a = GLOBAL_AC.choose_action(s) + s, r, d, _ = env.step(a) + rall += r + if d: + print("reward", rall) + break diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index 1128b1da5..f91d07f8c 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -157,14 +157,13 @@ def load_ckpt(self): # load trained weights if __name__ == '__main__': - ''' + ''' choose environment 1. Openai gym: env = gym.make() 2. 
DeepMind Control Suite: env = dm_control2gym.make() ''' - env = gym.make('CartPole-v0') # dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), # depth=False, scene_option=None) @@ -184,9 +183,10 @@ def load_ckpt(self): # load trained weights # we need a good teacher, so the teacher should learn faster than the actor critic = Critic(n_features=N_F, lr=LR_C) - if args.train(): + if args.train: + t0 = time.time() for i_episode in range(MAX_EPISODE): - episode_time = time.time() + # episode_time = time.time() s = env.reset().astype(np.float32) t = 0 # number of step in this episode all_r = [] # rewards of all steps @@ -229,8 +229,11 @@ def load_ckpt(self): # load trained weights running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 # start rending if running_reward greater than a threshold # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True - print("Episode: %d reward: %f running_reward %f took: %.5f" % \ - (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) + # Early Stopping for quick check if t >= MAX_EP_STEPS: @@ -253,7 +256,11 @@ def load_ckpt(self): # load trained weights actor.save_ckpt() critic.save_ckpt() - if args.test(): + if args.test: + actor.load_ckpt() + critic.load_ckpt() + t0 = time.time() + for i_episode in range(MAX_EPISODE): episode_time = time.time() s = env.reset().astype(np.float32) @@ -273,13 +280,6 @@ def load_ckpt(self): # load trained weights # r -= abs(s_new[0]) * .1 all_r.append(r) - try: - actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] - except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() - actor.save_ckpt() - critic.save_ckpt() - # logging - s = s_new t += 1 @@ -292,8 +292,10 @@ def load_ckpt(self): # load trained weights running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 # start rending if running_reward greater than a threshold # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True - print("Episode: %d reward: %f running_reward %f took: %.5f" % \ - (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) # Early Stopping for quick check if t >= MAX_EP_STEPS: diff --git a/examples/reinforcement_learning/tutorial_DQN.py b/examples/reinforcement_learning/tutorial_DQN.py index 935e3e04b..65e11d193 100644 --- a/examples/reinforcement_learning/tutorial_DQN.py +++ b/examples/reinforcement_learning/tutorial_DQN.py @@ -1,15 +1,23 @@ -"""Q-Network Q(a, s) - TD Learning, Off-Policy, e-Greedy Exploration (GLIE). +""" +Deep Q-Network Q(a, s) +----------------------- +TD Learning, Off-Policy, e-Greedy Exploration (GLIE). Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) delta_w = R + lambda * Q(newS, newA) See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. 
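As a concrete companion to the update rule quoted above, the following tiny tabular sketch applies the same TD target on FrozenLake; it is separate from the tutorial code (which replaces the table with a small network), and the exploration noise and step size are illustrative choices.

import gym
import numpy as np

env = gym.make('FrozenLake-v0')
Q = np.zeros((env.observation_space.n, env.action_space.n))  # tabular Q(S, A)
alpha, lambd = 0.85, 0.99                                    # step size and decay factor (assumed values)

for episode in range(1000):
    s = env.reset()
    done = False
    while not done:
        # greedy action plus decaying random noise, a simple exploration heuristic
        a = int(np.argmax(Q[s, :] + np.random.randn(env.action_space.n) / (episode + 1)))
        s1, r, done, _ = env.step(a)
        # Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_a Q(S1, a) - Q(S, A))
        Q[s, a] += alpha * (r + lambd * np.max(Q[s1, :]) - Q[s, a])
        s = s1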
+Reference +---------- +original paper: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw CN: https://zhuanlan.zhihu.com/p/25710327 Note: Policy Network has been proved to be better than Q-Learning, see tutorial_atari_pong.py +Environment +----------- # The FrozenLake v0 environment https://gym.openai.com/envs/FrozenLake-v0 The agent controls the movement of a character in a grid world. Some tiles of @@ -24,30 +32,51 @@ The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise. +Prerequisites +-------------- +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + +To run +------- +python tutorial_DQN.py --train/test -tensorflow==2.0.0a0 -tensorlayer==2.0.0 """ import time - import numpy as np +import argparse import gym import tensorflow as tf import tensorlayer as tl + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + tl.logging.set_verbosity(tl.logging.DEBUG) -env = gym.make('FrozenLake-v0') +##################### hyper parameters #################### +lambd = .99 # decay factor +e = 0.1 # e-Greedy Exploration, the larger the more random +num_episodes = 10000 +render = False # display the game environment +running_reward = None + + +##################### DQN ########################## + def to_one_hot(i, n_classes=None): a = np.zeros(n_classes, 'uint8') a[i] = 1 return a -render = False # display the game environment -running_reward = None + ## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. # encoding for state: 4x4 grid can be represented by one-hot vector with 16 integers. @@ -55,59 +84,100 @@ def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, name='observation') nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s')(ni) return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") -qnetwork = get_model([None, 16]) -qnetwork.train() -train_weights = qnetwork.trainable_weights - -optimizer = tf.optimizers.SGD(learning_rate=0.1) -## Set learning parameters -lambd = .99 # decay factor -e = 0.1 # e-Greedy Exploration, the larger the more random -num_episodes = 10000 -for i in range(num_episodes): - ## Reset environment and get first new observation - episode_time = time.time() - s = env.reset() # observation is state, integer 0 ~ 15 - rAll = 0 - for j in range(99): # step index, maximum step is 99 - if render: env.render() - ## Choose an action by greedily (with e chance of random action) from the Q-network - allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() - a = np.argmax(allQ, 1) - - ## e-Greedy Exploration !!! sample random action - if np.random.rand(1) < e: - a[0] = env.action_space.sample() - ## Get new state and reward from environment - s1, r, d, _ = env.step(a[0]) - ## Obtain the Q' values by feeding the new state through our network - Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy() - - ## Obtain maxQ' and set our target value for chosen action. 
- maxQ1 = np.max(Q1) # in Q-Learning, policy is greedy, so we use "max" to select the next action. - targetQ = allQ - targetQ[0, a[0]] = r + lambd * maxQ1 - ## Train network using target and predicted Q values - # it is not real target Q value, it is just an estimation, - # but check the Q-Learning update formula: - # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) - # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equals to force Q'(s,a) ≈ Q(s,a) - with tf.GradientTape() as tape: - _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)) - _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) - grad = tape.gradient(_loss, train_weights) - optimizer.apply_gradients(zip(grad, train_weights)) - - rAll += r - s = s1 - ## Reduce chance of random action if an episode is done. - if d ==True: - e = 1. / ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration - break - - ## Note that, the rewards here with random action - running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 - print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ - (i, num_episodes, rAll, running_reward, time.time() - episode_time)) +def save_ckpt(model): # save trained weights + tl.files.save_npz(model.trainable_weights, name='dqn_model.npz') + +def load_ckpt(model): # load trained weights + tl.files.load_and_assign_npz(name='dqn_model.npz', network=model) + +if __name__ == '__main__': + + qnetwork = get_model([None, 16]) + qnetwork.train() + train_weights = qnetwork.trainable_weights + + optimizer = tf.optimizers.SGD(learning_rate=0.1) + env = gym.make('FrozenLake-v0') + + if args.train: + t0 = time.time() + for i in range(num_episodes): + ## Reset environment and get first new observation + # episode_time = time.time() + s = env.reset() # observation is state, integer 0 ~ 15 + rAll = 0 + for j in range(99): # step index, maximum step is 99 + if render: env.render() + ## Choose an action by greedily (with e chance of random action) from the Q-network + allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() + a = np.argmax(allQ, 1) + + ## e-Greedy Exploration !!! sample random action + if np.random.rand(1) < e: + a[0] = env.action_space.sample() + ## Get new state and reward from environment + s1, r, d, _ = env.step(a[0]) + ## Obtain the Q' values by feeding the new state through our network + Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy() + + ## Obtain maxQ' and set our target value for chosen action. + maxQ1 = np.max(Q1) # in Q-Learning, policy is greedy, so we use "max" to select the next action. + targetQ = allQ + targetQ[0, a[0]] = r + lambd * maxQ1 + ## Train network using target and predicted Q values + # it is not real target Q value, it is just an estimation, + # but check the Q-Learning update formula: + # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) + # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equals to force Q'(s,a) ≈ Q(s,a) + with tf.GradientTape() as tape: + _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)) + _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) + grad = tape.gradient(_loss, train_weights) + optimizer.apply_gradients(zip(grad, train_weights)) + + rAll += r + s = s1 + ## Reduce chance of random action if an episode is done. + if d ==True: + e = 1. 
/ ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration + break + + ## Note that, the rewards here with random action + running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 + # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) + save_ckpt(qnetwork) # save model + + if args.test: + t0 = time.time() + load_ckpt(qnetwork) # load model + for i in range(num_episodes): + ## Reset environment and get first new observation + episode_time = time.time() + s = env.reset() # observation is state, integer 0 ~ 15 + rAll = 0 + for j in range(99): # step index, maximum step is 99 + if render: env.render() + ## Choose an action by greedily (with e chance of random action) from the Q-network + allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() + a = np.argmax(allQ, 1) # no epsilon, only greedy for testing + + ## Get new state and reward from environment + s1, r, d, _ = env.step(a[0]) + rAll += r + s = s1 + ## Reduce chance of random action if an episode is done. + if d ==True: + e = 1. / ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration + break + + ## Note that, the rewards here with random action + running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 + # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) \ No newline at end of file diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py index 8e31817cb..014bb573d 100644 --- a/examples/reinforcement_learning/tutorial_PG.py +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -14,6 +14,7 @@ Env --- Openai Gym CartPole-v0, discrete action space +https://gym.openai.com/envs/CartPole-v0 To run ------ diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index 3902fcbac..7f8f50e49 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -12,6 +12,7 @@ Environment --- Openai Gym Pendulum-v0, continuous action space +https://gym.openai.com/envs/Pendulum-v0/ Prerequisites -------------- @@ -24,7 +25,7 @@ To run ------ -python tutorial_sac.py --train/test +python tutorial_SAC.py --train/test ''' import argparse diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index e51f202ad..0aaca77b7 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -13,6 +13,7 @@ Environment --- Openai Gym Pendulum-v0, continuous action space +https://gym.openai.com/envs/Pendulum-v0/ Prerequisites --- @@ -25,7 +26,7 @@ To run ------- -python tutorial_td3.py --train/test +python tutorial_TD3.py --train/test ''' From 662412eb43d5b07f7a073706cb0d9b2a3fe36e31 Mon Sep 17 00:00:00 2001 From: Officium Date: Mon, 10 Jun 2019 20:48:31 +0800 Subject: [PATCH 44/57] change format of DQN_variants --- .../tutorial_DQN_variants.py | 
211 +++++++++++------- 1 file changed, 130 insertions(+), 81 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index 4cc6d3bf5..e2fe42e4e 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -16,6 +16,8 @@ tensorlayer==2.0.1 """ +import argparse +import os import random import time @@ -26,8 +28,25 @@ from tutorial_wrappers import build_env -seed = 0 -env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument('--save_path', default='dqn_variants', + help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update') +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', + help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() +print(args) + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -50,7 +69,6 @@ target_q_update_freq = 200 # how frequency target q net update ob_scale = 1.0 / 255 # scale observations -env = build_env(env_id, seed=seed) in_dim = env.observation_space.shape out_dim = env.action_space.n reward_gamma = 0.99 # reward discount @@ -209,83 +227,114 @@ def softmax(x, dim): return temp / temp.sum(dim, keepdims=True) -qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') -qnet.train() -trainabel_weights = qnet.trainable_weights -targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') -targetqnet.infer() -sync(qnet, targetqnet) -optimizer = tf.optimizers.Adam(learning_rate=lr) -buffer = ReplayBuffer(buffer_size) - -o = env.reset() -nepisode = 0 -t = time.time() -noise_scale = 1e-2 -for i in range(1, number_timesteps + 1): - eps = epsilon(i) - - # select action - if random.random() < eps: - a = int(random.random() * out_dim) - else: - # noise schedule is based on KL divergence between perturbed and - # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf +if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + noise_scale = 1e-2 + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + # noise schedule is based on KL divergence between perturbed and + # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + if i < explore_timesteps: + qnet.noise_scale = noise_scale + q_ptb = qnet(obv).numpy() + qnet.noise_scale = 0 + if i % noise_update_freq == 0: + q = qnet(obv).numpy() + kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) + kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() + kl_explore = -np.log(1 - eps + eps / out_dim) + if kl_ptb < kl_explore: + 
noise_scale *= 1.01 + else: + noise_scale /= 1.01 + a = q_ptb.argmax(1)[0] + else: + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # double q estimation + b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) + b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean( + huber_loss(b_q - (b_r + reward_gamma * b_q_))) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() +else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): obv = np.expand_dims(o, 0).astype('float32') * ob_scale - if i < explore_timesteps: - qnet.noise_scale = noise_scale - q_ptb = qnet(obv).numpy() - qnet.noise_scale = 0 - if i % noise_update_freq == 0: - q = qnet(obv).numpy() - kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) - kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() - kl_explore = -np.log(1 - eps + eps / out_dim) - if kl_ptb < kl_explore: - noise_scale *= 1.01 - else: - noise_scale /= 1.01 - a = q_ptb.argmax(1)[0] + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() else: - a = qnet(obv).numpy().argmax(1)[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) - - # double q estimation - b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) - b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) - - # calculate loss - with tf.GradientTape() as q_tape: - b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) - loss = tf.reduce_mean(huber_loss(b_q - (b_r + reward_gamma * b_q_))) - - # backward gradients - q_grad = q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, 
' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) - t = time.time() + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) From f19b19d82e7487db1d649f592666806d22b2228d Mon Sep 17 00:00:00 2001 From: Officium Date: Mon, 10 Jun 2019 22:05:34 +0800 Subject: [PATCH 45/57] change format of C51 and DQN_variants to follow the tutorial_format --- .../reinforcement_learning/tutorial_C51.py | 248 +++++++++-------- .../tutorial_DQN_variants.py | 262 ++++++++++-------- 2 files changed, 283 insertions(+), 227 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py index 25ed688b2..8bc8c7fd0 100644 --- a/examples/reinforcement_learning/tutorial_C51.py +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -1,12 +1,33 @@ -"""Implement C51 algorithm +""" +C51 Algorithm +------------------------ +Categorical 51 distributional RL algorithm, 51 means the number of atoms. In +this algorithm, instead of estimating actual expected value, value distribution +over a series of continuous sub-intervals (atoms) is considered. + + +Reference: +------------------------ Bellemare M G, Dabney W, Munos R. A distributional perspective on reinforcement learning[C]//Proceedings of the 34th International Conference on Machine Learning-Volume 70. JMLR. org, 2017: 449-458. -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_C51.py --mode=train +python tutorial_C51.py --mode=test --save_path=c51/8000.npz """ import argparse import os @@ -39,6 +60,7 @@ env_id = args.env_id env = build_env(env_id, seed=args.seed) +# #################### hyper parameters #################### if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -73,6 +95,7 @@ deltaz = float(max_value - min_value) / (atom_num - 1) +# ############################## C51 #################################### class MLP(tl.models.Model): def __init__(self, name): super(MLP, self).__init__(name=name) @@ -164,116 +187,117 @@ def sync(net, net_tar): var_tar.assign(var) -if args.mode == 'train': - qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') - qnet.train() - trainabel_weights = qnet.trainable_weights - targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') - targetqnet.infer() - sync(qnet, targetqnet) - optimizer = tf.optimizers.Adam(learning_rate=lr) - buffer = ReplayBuffer(buffer_size) - - o = env.reset() - nepisode = 0 - t = time.time() - for i in range(1, number_timesteps + 1): - eps = epsilon(i) - - # select action - if random.random() < eps: - a = int(random.random() * out_dim) - else: +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() 
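Inside the training loop that follows, the b_tzj / b_l / b_u / b_m block performs the categorical projection of Algorithm 1 in the C51 paper: each shifted atom r + gamma * (1 - done) * z_k is clipped to [min_value, max_value] and its probability mass is split between the two neighbouring atoms of the fixed support. As a rough, self-contained illustration only (NumPy, single transition; the helper name project_distribution is hypothetical and not part of this patch), the projection can be sketched as:

import numpy as np

def project_distribution(next_dist, reward, done, vrange, gamma=0.99):
    """Project r + gamma * (1 - done) * z onto the fixed support `vrange`.

    next_dist: (atom_num,) probabilities of the greedy action at the next state.
    vrange:    (atom_num,) evenly spaced atoms from min_value to max_value.
    Returns the target distribution over the same atoms (sums to 1).
    """
    atom_num = len(vrange)
    min_value, max_value = vrange[0], vrange[-1]
    deltaz = (max_value - min_value) / (atom_num - 1)

    tz = np.clip(reward + gamma * (1.0 - done) * vrange, min_value, max_value)
    b = (tz - min_value) / deltaz                  # fractional index of each shifted atom
    l, u = np.floor(b).astype(int), np.ceil(b).astype(int)

    m = np.zeros(atom_num)
    for k in range(atom_num):
        if l[k] == u[k]:                           # shifted atom lands exactly on a grid point
            m[l[k]] += next_dist[k]
        else:                                      # split mass between the two neighbours
            m[l[k]] += next_dist[k] * (u[k] - b[k])
            m[u[k]] += next_dist[k] * (b[k] - l[k])
    return m

# usage: uniform next-state distribution over 51 atoms on [-10, 10]
atoms = np.linspace(-10.0, 10.0, 51)
target = project_distribution(np.full(51, 1.0 / 51), reward=1.0, done=0.0, vrange=atoms)
assert abs(target.sum() - 1.0) < 1e-6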
+ for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qdist = np.exp(qnet(obv).numpy()) + qvalues = (qdist * vrange).sum(-1) + a = qvalues.argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # q estimation, see Algorithm 1 in paper for detail + b_dist_ = np.exp(targetqnet(b_o_).numpy()) + b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) + b_tzj = np.clip( + reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + + b_r[:, None], min_value, max_value) + b_i = (b_tzj - min_value) / deltaz + b_l = np.floor(b_i).astype('int64') + b_u = np.ceil(b_i).astype('int64') + templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) + tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) + b_m = np.zeros((batch_size, atom_num)) + # TODO: aggregate value by index and batch update (scatter_add) + for j in range(batch_size): + for k in range(atom_num): + b_m[j][b_l[j][k]] += templ[j][k] + b_m[j][b_u[j][k]] += tempu[j][k] + b_m = tf.convert_to_tensor(b_m, dtype='float32') + + # calculate loss + with tf.GradientTape() as q_tape: + b_index = np.stack([range(batch_size), b_a], 1) + b_index = tf.convert_to_tensor(b_index, 'int64') + b_dist_a = tf.gather_nd(qnet(b_o), b_index) + loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): obv = np.expand_dims(o, 0).astype('float32') * ob_scale qdist = np.exp(qnet(obv).numpy()) qvalues = (qdist * vrange).sum(-1) a = qvalues.argmax(1)[0] - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - path = os.path.join(args.save_path, '{}.npz'.format(i)) - tl.files.save_npz(qnet.trainable_weights, name=path) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) - - # q estimation, see Algorithm 1 in paper for detail - b_dist_ = np.exp(targetqnet(b_o_).numpy()) - b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) - b_tzj = np.clip(reward_gamma * (1 - b_d[:, None]) * vrange[None, :] - + b_r[:, None], min_value, max_value) - b_i = (b_tzj - min_value) / deltaz - b_l = 
np.floor(b_i).astype('int64') - b_u = np.ceil(b_i).astype('int64') - templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) - tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) - b_m = np.zeros((batch_size, atom_num)) - # TODO: aggregate value by index and batch update (scatter_add) - for j in range(batch_size): - for k in range(atom_num): - b_m[j][b_l[j][k]] += templ[j][k] - b_m[j][b_u[j][k]] += tempu[j][k] - b_m = tf.convert_to_tensor(b_m, dtype='float32') - - # calculate loss - with tf.GradientTape() as q_tape: - b_index = np.stack([range(batch_size), b_a], 1) - b_index = tf.convert_to_tensor(b_index, 'int64') - b_dist_a = tf.gather_nd(qnet(b_o), b_index) - loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) - - # backward gradients - q_grad = q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) - t = time.time() -else: - qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') - tl.files.load_and_assign_npz(name=args.save_path, network=qnet) - qnet.eval() - - nepisode = 0 - o = env.reset() - for i in range(1, number_timesteps + 1): - obv = np.expand_dims(o, 0).astype('float32') * ob_scale - qdist = np.exp(qnet(obv).numpy()) - qvalues = (qdist * vrange).sum(-1) - a = qvalues.argmax(1)[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) - + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index e2fe42e4e..2c390dcd5 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -1,20 +1,49 @@ -"""Implement following enhanced deep q-learning algorithms +""" +DQN and its variants +------------------------ +We implement Double DQN, Dueling DQN and Noisy DQN here. + +The max operator in standard DQN uses the same values both to select and to +evaluate an action by +Q(s_t, a_t) = R_{t+1} + \gamma * max_{a}Q_{tar}(s_{t+1}, a). +Double DQN propose to use following evaluation to address overestimation problem +of max operator: +Q(s_t, a_t) = R_{t+1} + \gamma * Q_{tar}(s_{t+1}, max_{a}Q(s_{t+1}, a)). + +Dueling DQN uses dueling architecture where the value of state and the advantage +of each action is estimated separately. 
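For concreteness, that dueling recomposition is commonly written as Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'), and can be sketched as below (a hypothetical NumPy helper for illustration only, not the network code introduced by this patch):

import numpy as np

def dueling_q(value, advantage):
    """Recombine a state value and per-action advantages into Q-values.

    value:     (batch, 1) estimate of V(s)
    advantage: (batch, n_actions) estimate of A(s, a)
    The mean advantage is subtracted so that V and A are identifiable.
    """
    return value + advantage - advantage.mean(axis=1, keepdims=True)

# usage: one state, three actions -> [[3.0, 2.0, 1.0]]
q = dueling_q(np.array([[2.0]]), np.array([[1.0, 0.0, -1.0]]))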
+ +Noisy DQN propose to explore by adding parameter noises. + + +Reference: +------------------------ 1. Double DQN Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double q-learning[C]//Thirtieth AAAI Conference on Artificial Intelligence. 2016. - 2. Dueling DQN Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep reinforcement learning[J]. arXiv preprint arXiv:1511.06581, 2015. - 3. Noisy DQN Plappert M, Houthooft R, Dhariwal P, et al. Parameter space noise for exploration[J]. arXiv preprint arXiv:1706.01905, 2017. -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_DQN_variantes.py --mode=train +python tutorial_DQN_variantes.py --mode=test --save_path=c51/8000.npz """ import argparse import os @@ -47,6 +76,7 @@ env_id = args.env_id env = build_env(env_id, seed=args.seed) +# #################### hyper parameters #################### if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -77,6 +107,7 @@ noise_update_freq = 50 # how frequency param noise net update +# ############################## DQN #################################### class MLP(tl.models.Model): def __init__(self, name): super(MLP, self).__init__(name=name) @@ -227,114 +258,115 @@ def softmax(x, dim): return temp / temp.sum(dim, keepdims=True) -if args.mode == 'train': - qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') - qnet.train() - trainabel_weights = qnet.trainable_weights - targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') - targetqnet.infer() - sync(qnet, targetqnet) - optimizer = tf.optimizers.Adam(learning_rate=lr) - buffer = ReplayBuffer(buffer_size) - - o = env.reset() - nepisode = 0 - t = time.time() - noise_scale = 1e-2 - for i in range(1, number_timesteps + 1): - eps = epsilon(i) - - # select action - if random.random() < eps: - a = int(random.random() * out_dim) - else: - # noise schedule is based on KL divergence between perturbed and - # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + noise_scale = 1e-2 + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + # noise schedule is based on KL divergence between perturbed and + # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + if i < explore_timesteps: + qnet.noise_scale = noise_scale + q_ptb = qnet(obv).numpy() + qnet.noise_scale = 0 + if i % noise_update_freq == 0: + q = qnet(obv).numpy() + kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) + kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() + kl_explore = -np.log(1 - eps + eps / out_dim) + if kl_ptb < kl_explore: + noise_scale *= 1.01 + else: + noise_scale /= 1.01 + a = q_ptb.argmax(1)[0] + else: + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and 
feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # double q estimation + b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) + b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean( + huber_loss(b_q - (b_r + reward_gamma * b_q_))) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): obv = np.expand_dims(o, 0).astype('float32') * ob_scale - if i < explore_timesteps: - qnet.noise_scale = noise_scale - q_ptb = qnet(obv).numpy() - qnet.noise_scale = 0 - if i % noise_update_freq == 0: - q = qnet(obv).numpy() - kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) - kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() - kl_explore = -np.log(1 - eps + eps / out_dim) - if kl_ptb < kl_explore: - noise_scale *= 1.01 - else: - noise_scale /= 1.01 - a = q_ptb.argmax(1)[0] + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() else: - a = qnet(obv).numpy().argmax(1)[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - path = os.path.join(args.save_path, '{}.npz'.format(i)) - tl.files.save_npz(qnet.trainable_weights, name=path) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) - - # double q estimation - b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) - b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) - - # calculate loss - with tf.GradientTape() as q_tape: - b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) - loss = tf.reduce_mean( - huber_loss(b_q - (b_r + reward_gamma * b_q_))) - - # backward gradients - q_grad = q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: 
{:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) - t = time.time() -else: - qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') - tl.files.load_and_assign_npz(name=args.save_path, network=qnet) - qnet.eval() - - nepisode = 0 - o = env.reset() - for i in range(1, number_timesteps + 1): - obv = np.expand_dims(o, 0).astype('float32') * ob_scale - a = qnet(obv).numpy().argmax(1)[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - - if done: - o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) From 8b68349d43621619fe799ab4afc21da7f1fb2515 Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Tue, 11 Jun 2019 11:46:18 +0800 Subject: [PATCH 46/57] format pg ddpo ppo dppo trpo --- .../reinforcement_learning/tutorial_DDPG.py | 193 ++--- .../reinforcement_learning/tutorial_DPPO.py | 147 ++-- .../reinforcement_learning/tutorial_PG.py | 132 ++-- .../reinforcement_learning/tutorial_PPO.py | 184 +++-- .../reinforcement_learning/tutorial_TRPO.py | 697 ++++++++++-------- 5 files changed, 765 insertions(+), 588 deletions(-) diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py index 71ac9bf06..0bd9cadd0 100644 --- a/examples/reinforcement_learning/tutorial_DDPG.py +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -11,38 +11,63 @@ Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 
2016 MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ -Env ---- +Environment +----------- Openai Gym Pendulum-v0, continual action space +Prerequisites +------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + To run ------ -python *.py +python tutorial_DDPG.py --train/test """ import tensorflow as tf import tensorlayer as tl import numpy as np +import gym +import time +import matplotlib.pyplot as plt import os +import argparse + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() ##################### hyper parameters #################### +ENV_NAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + LR_A = 0.001 # learning rate for actor LR_C = 0.002 # learning rate for critic GAMMA = 0.9 # reward discount TAU = 0.01 # soft replacement -MEMORY_CAPACITY = 10000 -BATCH_SIZE = 32 +MEMORY_CAPACITY = 10000 # size of replay buffer +BATCH_SIZE = 32 # update batchsize + +MAX_EPISODES = 200 # total number of episodes for training +MAX_EP_STEPS = 200 # total number of steps for each episode +TEST_PER_EPISODES = 10 # test the model per episodes +VAR = 3 # control exploration + ############################### DDPG #################################### class DDPG(object): - ''' + """ DDPG class - ''' - def __init__(self, a_dim, s_dim, a_bound, ): + """ + + def __init__(self, a_dim, s_dim, a_bound): self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) self.pointer = 0 self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound @@ -51,25 +76,26 @@ def __init__(self, a_dim, s_dim, a_bound, ): b_init = tf.constant_initializer(0.1) def get_actor(input_state_shape, name=''): - ''' + """ Build actor network :param input_state_shape: state + :param name: name :return: act - ''' + """ inputs = tl.layers.Input(input_state_shape, name='A_input') x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs) x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x) - # x = tl.layers.Lambda(lambda x: np.array(a_bound)*x)(x) - # x = tf.multiply(x, a_bound, name='A_scaled_a') + x = tl.layers.Lambda(lambda x: np.array(a_bound) * x)(x) return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name) def get_critic(input_state_shape, input_action_shape, name=''): - ''' + """ Build critic network :param input_state_shape: state :param input_action_shape: act + :param name: name :return: Q value Q(s,a) - ''' + """ s = tl.layers.Input(input_state_shape, name='C_s_input') a = tl.layers.Input(input_action_shape, name='C_a_input') x = tl.layers.Concat(1)([s, a]) @@ -83,12 +109,12 @@ def get_critic(input_state_shape, input_action_shape, name=''): self.critic.train() def copy_para(from_model, to_model): - ''' + """ Copy parameters for soft updating :param from_model: latest model :param to_model: target model :return: None - ''' + """ for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): j.assign(i) @@ -108,28 +134,28 @@ def copy_para(from_model, to_model): self.critic_opt = tf.optimizers.Adam(LR_C) def ema_update(self): - ''' + """ Soft updating by exponential smoothing :return: None - ''' + """ paras = self.actor.trainable_weights + self.critic.trainable_weights self.ema.apply(paras) for i, j in 
zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): i.assign(self.ema.average(j)) def choose_action(self, s): - ''' + """ Choose action :param s: state :return: act - ''' + """ return self.actor(np.array([s], dtype=np.float32))[0] def learn(self): - ''' + """ Update parameters :return: None - ''' + """ indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) bt = self.memory[indices, :] bs = bt[:, :self.s_dim] @@ -156,14 +182,14 @@ def learn(self): self.ema_update() def store_transition(self, s, a, r, s_): - ''' + """ Store data in data buffer :param s: state :param a: act :param r: reward :param s_: next state :return: None - ''' + """ s = s.astype(np.float32) s_ = s_.astype(np.float32) transition = np.hstack((s, a, [r], s_)) @@ -196,20 +222,14 @@ def load_ckpt(self): if __name__ == '__main__': - import gym - import time - import matplotlib.pyplot as plt - - MAX_EPISODES = 200 - MAX_EP_STEPS = 200 - TEST_PER_EPISODES = 10 - ENV_NAME = 'Pendulum-v0' - - ############################### training #################################### env = gym.make(ENV_NAME) env = env.unwrapped - env.seed(1) + + # reproducible + env.seed(RANDOMSEED) + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] @@ -217,69 +237,70 @@ def load_ckpt(self): ddpg = DDPG(a_dim, s_dim, a_bound) - var = 3 # control exploration - reward_buffer = [] - t0 = time.time() - for i in range(MAX_EPISODES): - t1 = time.time() - s = env.reset() - ep_reward = 0 - for j in range(MAX_EP_STEPS): - # Add exploration noise - a = ddpg.choose_action(s) - a = np.clip(np.random.normal(a, var), -2, 2) # add randomness to action selection for exploration - s_, r, done, info = env.step(a) - - ddpg.store_transition(s, a, r / 10, s_) - - if ddpg.pointer > MEMORY_CAPACITY: - # var *= .9995 # decay the action randomness - ddpg.learn() - - s = s_ - ep_reward += r - if j == MAX_EP_STEPS - 1: - print("\rEpisode [%d/%d] \tReward: %i \tExplore: %.2f \ttook: %.5fs " % - (i, MAX_EPISODES, ep_reward, var, time.time() - t1), end='') - - # test - if i and not i % TEST_PER_EPISODES: + if args.train: # train + + reward_buffer = [] + t0 = time.time() + for i in range(MAX_EPISODES): t1 = time.time() s = env.reset() ep_reward = 0 for j in range(MAX_EP_STEPS): - - a = ddpg.choose_action(s) # without exploration noise + # Add exploration noise + a = ddpg.choose_action(s) + a = np.clip(np.random.normal(a, VAR), -2, 2) # add randomness to action selection for exploration s_, r, done, info = env.step(a) + ddpg.store_transition(s, a, r / 10, s_) + + if ddpg.pointer > MEMORY_CAPACITY: + ddpg.learn() + s = s_ ep_reward += r if j == MAX_EP_STEPS - 1: - print("\rEpisode [%d/%d] \tReward: %i \tExplore: %.2f \ttook: %.5fs " % - (i, MAX_EPISODES, ep_reward, var, time.time() - t1)) - - reward_buffer.append(ep_reward) - - if reward_buffer: - plt.ion() - plt.title('DDPG') - plt.plot(np.array(range(len(reward_buffer)))*TEST_PER_EPISODES, reward_buffer) # plot the episode vt - plt.xlabel('episode steps') - plt.ylabel('normalized state-action value') - plt.ylim(-2000, 0) - plt.show() - plt.pause(0.1) - plt.cla() - plt.ioff() - - print('\nRunning time: ', time.time() - t0) - s = env.reset() + print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' + .format(i, MAX_EPISODES, ep_reward, time.time() - t1), end='') + plt.show() + # test + if i and not i % TEST_PER_EPISODES: + t1 = time.time() + s = env.reset() + ep_reward = 0 + for j in range(MAX_EP_STEPS): 
+ + a = ddpg.choose_action(s) # without exploration noise + s_, r, done, info = env.step(a) + + s = s_ + ep_reward += r + if j == MAX_EP_STEPS - 1: + print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' + .format(i, MAX_EPISODES, ep_reward, time.time() - t1)) + + reward_buffer.append(ep_reward) + + if reward_buffer: + plt.ion() + plt.cla() + plt.title('DDPG') + plt.plot(np.array(range(len(reward_buffer))) * TEST_PER_EPISODES, reward_buffer) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + plt.ioff() + plt.show() + print('\nRunning time: ', time.time() - t0) + ddpg.save_ckpt() + + # test + ddpg.load_ckpt() while True: s = env.reset() for i in range(MAX_EP_STEPS): env.render() - a = ddpg.choose_action(s) - s_, r, done, info = env.step(a) + s, r, done, info = env.step(ddpg.choose_action(s)) if done: break - s = s_ diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py index 9bcad026e..6bebc64a5 100644 --- a/examples/reinforcement_learning/tutorial_DPPO.py +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -12,13 +12,19 @@ High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 2016 MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials -Env ---- +Environment +----------- Openai Gym Pendulum-v0, continual action space +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + To run ------ -python *.py +python tutorial_DPPO.py --train/test """ @@ -27,26 +33,43 @@ import numpy as np import matplotlib.pyplot as plt import gym, threading, queue +import time import tensorlayer as tl import tensorflow_probability as tfp import os +import argparse + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +GAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + +EP_MAX = 1000 # total number of episodes for training +EP_LEN = 200 # total number of steps for each episode +GAMMA = 0.9 # reward discount +A_LR = 0.0001 # learning rate for actor +C_LR = 0.0002 # learning rate for critic +BATCH = 32 # update batchsize +A_UPDATE_STEPS = 10 # actor update steps +C_UPDATE_STEPS = 10 # critic update steps +S_DIM, A_DIM = 3, 1 # state dimension, action dimension +EPS = 1e-8 # epsilon +METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better + ][1] # choose the method for optimization + +N_WORKER = 4 # parallel workers +MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO +UPDATE_STEP = 10 # loop update operation n-steps -EP_MAX = 1000 -EP_LEN = 200 -GAMMA = 0.9 -A_LR = 0.0001 -C_LR = 0.0002 -BATCH = 32 -A_UPDATE_STEPS = 10 -C_UPDATE_STEPS = 10 -S_DIM, A_DIM = 3, 1 -EPS = 1e-8 -METHOD = [ - dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty - dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better -][1] # choose the method for optimization +############################### DPPO #################################### class PPO(object): ''' @@ -166,7 +189,10 @@ def update(self): METHOD['lam'] /= 2 elif kl > 
METHOD['kl_target'] * 1.5: METHOD['lam'] *= 2 - METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution + + # sometimes explode, this clipping is MorvanZhou's solution + METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) + else: # clipping method, find this is better (OpenAI's paper) for _ in range(A_UPDATE_STEPS): self.a_train(s, a, adv) @@ -242,15 +268,8 @@ def load_ckpt(self): tl.files.load_hdf5_to_weights_in_order('model/dppo_critic.hdf5', self.critic) - '''--------------------------------------------------------------''' -N_WORKER = 4 # parallel workers -MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO -UPDATE_STEP = 10 # loop update operation n-steps - -GAME = 'Pendulum-v0' - class Worker(object): ''' @@ -260,6 +279,7 @@ class Worker(object): def __init__(self, wid): self.wid = wid self.env = gym.make(GAME).unwrapped + self.env.seed(wid*100 + RANDOMSEED) self.ppo = GLOBAL_PPO def work(self): @@ -272,6 +292,7 @@ def work(self): s = self.env.reset() ep_r = 0 buffer_s, buffer_a, buffer_r = [], [], [] + t0 = time.time() for t in range(EP_LEN): if not ROLLING_EVENT.is_set(): # while global PPO is updating ROLLING_EVENT.wait() # wait until PPO is updated @@ -310,41 +331,55 @@ def work(self): else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1) GLOBAL_EP += 1 - print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r, ) + + print('Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}' + .format(GLOBAL_EP, EP_MAX, self.wid, ep_r, time.time() - t0)) if __name__ == '__main__': + + # reproducible + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + GLOBAL_PPO = PPO() - UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event() - UPDATE_EVENT.clear() # not update now - ROLLING_EVENT.set() # start to roll out - workers = [Worker(wid=i) for i in range(N_WORKER)] - - GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 - GLOBAL_RUNNING_R = [] - COORD = tf.train.Coordinator() - QUEUE = queue.Queue() # workers putting data in this queue - threads = [] - for worker in workers: # worker threads - t = threading.Thread(target=worker.work, args=()) - t.start() # training - threads.append(t) - # add a PPO updating thread - threads.append(threading.Thread(target=GLOBAL_PPO.update, )) - threads[-1].start() - COORD.join(threads) - - # plot reward change and test - plt.title('DPPO') - plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) - plt.xlabel('Episode') - plt.ylabel('Moving reward') - plt.ylim(-2000, 0) - plt.show() - - env = gym.make('Pendulum-v0') + if args.train: # train + UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event() + UPDATE_EVENT.clear() # not update now + ROLLING_EVENT.set() # start to roll out + workers = [Worker(wid=i) for i in range(N_WORKER)] + + GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 + GLOBAL_RUNNING_R = [] + COORD = tf.train.Coordinator() + QUEUE = queue.Queue() # workers putting data in this queue + threads = [] + for worker in workers: # worker threads + t = threading.Thread(target=worker.work, args=()) + t.start() # training + threads.append(t) + # add a PPO updating thread + threads.append(threading.Thread(target=GLOBAL_PPO.update, )) + threads[-1].start() + COORD.join(threads) + + GLOBAL_PPO.save_ckpt() + + # plot reward change and test + plt.title('DPPO') + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('Episode') + plt.ylabel('Moving reward') + plt.ylim(-2000, 0) + plt.show() + + # test + 
GLOBAL_PPO.load_ckpt() + env = gym.make(GAME) while True: s = env.reset() - for t in range(300): + for t in range(EP_LEN): env.render() - s = env.step(GLOBAL_PPO.choose_action(s))[0] + s, r, done, info = env.step(GLOBAL_PPO.choose_action(s)) + if done: + break diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py index 014bb573d..7adb76d2d 100644 --- a/examples/reinforcement_learning/tutorial_PG.py +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -11,28 +11,49 @@ Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ -Env ---- +Environment +----------- Openai Gym CartPole-v0, discrete action space -https://gym.openai.com/envs/CartPole-v0 + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 To run ------ -python *.py +python tutorial_PG.py --train/test """ import tensorflow as tf import tensorlayer as tl import numpy as np + +import gym +import matplotlib.pyplot as plt +import time import os +import argparse + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + -tl.logging.set_verbosity(tl.logging.DEBUG) +##################### hyper parameters #################### -# reproducible -np.random.seed(1) -tf.random.set_seed(1) +ENV_NAME = 'CartPole-v0' # environment name +RANDOMSEED = 1 # random seed +DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time +num_episodes = 3000 + + +############################### PG #################################### class PolicyGradient: """ @@ -163,16 +184,14 @@ def load_ckpt(self): if __name__ == '__main__': - import gym - import matplotlib.pyplot as plt - import time + # reproducible + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) - DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold - RENDER = False # rendering wastes time - num_episodes = 3000 + tl.logging.set_verbosity(tl.logging.DEBUG) - env = gym.make('CartPole-v0') - env.seed(1) # reproducible, general Policy gradient has high variance + env = gym.make(ENV_NAME) + env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) @@ -187,53 +206,66 @@ def load_ckpt(self): reward_decay=0.99, # output_graph=True, ) - reward_buffer = [] + if args.train: + reward_buffer = [] + + for i_episode in range(num_episodes): - for i_episode in range(num_episodes): + episode_time = time.time() + observation = env.reset() - episode_time = time.time() - observation = env.reset() + while True: + if RENDER: + env.render() - while True: - if RENDER: - env.render() + action = RL.choose_action(observation) - action = RL.choose_action(observation) + observation_, reward, done, info = env.step(action) - observation_, reward, done, info = env.step(action) + RL.store_transition(observation, action, reward) - RL.store_transition(observation, action, reward) + if done: + ep_rs_sum = sum(RL.ep_rs) - if done: - ep_rs_sum = sum(RL.ep_rs) + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 - if 'running_reward' 
not in globals(): - running_reward = ep_rs_sum - else: - running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + if running_reward > DISPLAY_REWARD_THRESHOLD: + RENDER = True # rendering - if running_reward > DISPLAY_REWARD_THRESHOLD: - RENDER = True # rendering + # print("episode:", i_episode, " reward:", int(running_reward)) - # print("episode:", i_episode, " reward:", int(running_reward)) + print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % + (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)) + reward_buffer.append(running_reward) - print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % - (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)) - reward_buffer.append(running_reward) + vt = RL.learn() - vt = RL.learn() + plt.ion() + plt.cla() + plt.title('PG') + plt.plot(reward_buffer, ) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + plt.pause(0.1) - plt.ion() - plt.title('PG') - plt.plot(reward_buffer, ) # plot the episode vt - plt.xlabel('episode steps') - plt.ylabel('normalized state-action value') - plt.show() - plt.pause(0.1) - plt.cla() - plt.ioff() + break - break + observation = observation_ + RL.save_ckpt() + plt.ioff() + plt.show() - observation = observation_ + # test + RL.load_ckpt() + observation = env.reset() + while True: + env.render() + action = RL.choose_action(observation) + observation, reward, done, info = env.step(action) + if done: + observation = env.reset() diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py index fddf39406..72cbc8e1d 100644 --- a/examples/reinforcement_learning/tutorial_PPO.py +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -12,13 +12,19 @@ Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 
2017 MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials -Env ---- +Environment +----------- Openai Gym Pendulum-v0, continual action space +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + To run ------ -python *.py +python tutorial_PPO.py --train/test """ @@ -28,23 +34,36 @@ import gym import tensorlayer as tl import tensorflow_probability as tfp +import time import os +import argparse + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + +EP_MAX = 1000 # total number of episodes for training +EP_LEN = 200 # total number of steps for each episode +GAMMA = 0.9 # reward discount +A_LR = 0.0001 # learning rate for actor +C_LR = 0.0002 # learning rate for critic +BATCH = 32 # update batchsize +A_UPDATE_STEPS = 10 # actor update steps +C_UPDATE_STEPS = 10 # critic update steps +S_DIM, A_DIM = 3, 1 # state dimension, action dimension +EPS = 1e-8 # epsilon +METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better + ][1] # choose the method for optimization -EP_MAX = 1000 -EP_LEN = 200 -GAMMA = 0.9 -A_LR = 0.0001 -C_LR = 0.0002 -BATCH = 32 -A_UPDATE_STEPS = 10 -C_UPDATE_STEPS = 10 -S_DIM, A_DIM = 3, 1 -EPS = 1e-8 -METHOD = [ - dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty - dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better -][1] # choose the method for optimization +############################### PPO #################################### class PPO(object): ''' @@ -118,8 +137,10 @@ def c_train(self, tfdc_r, s): ''' tfdc_r = np.array(tfdc_r, dtype=np.float32) with tf.GradientTape() as tape: - advantage = tfdc_r - self.critic(s) + v = self.critic(s) + advantage = tfdc_r - v closs = tf.reduce_mean(tf.square(advantage)) + # print('tfdc_r value', tfdc_r) grad = tape.gradient(closs, self.critic.trainable_weights) tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights)) @@ -230,63 +251,72 @@ def load_ckpt(self): tl.files.load_hdf5_to_weights_in_order('model/ppo_critic.hdf5', self.critic) -env = gym.make('Pendulum-v0').unwrapped -ppo = PPO() -all_ep_r = [] - -for ep in range(EP_MAX): - s = env.reset() - buffer_s, buffer_a, buffer_r = [], [], [] - ep_r = 0 - for t in range(EP_LEN): # in one episode - # env.render() - a = ppo.choose_action(s) - s_, r, done, _ = env.step(a) - buffer_s.append(s) - buffer_a.append(a) - buffer_r.append((r + 8) / 8) # normalize reward, find to be useful - s = s_ - ep_r += r - - # update ppo - if (t + 1) % BATCH == 0 or t == EP_LEN - 1: - v_s_ = ppo.get_v(s_) - discounted_r = [] - for r in buffer_r[::-1]: - v_s_ = r + GAMMA * v_s_ - discounted_r.append(v_s_) - discounted_r.reverse() - - bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] +if __name__ == '__main__': + + env = gym.make(ENV_NAME).unwrapped + + # reproducible + env.seed(RANDOMSEED) + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + + ppo = PPO() + + if args.train: + all_ep_r = [] + for ep in range(EP_MAX): + s = env.reset() buffer_s, buffer_a, buffer_r = [], [], [] - 
ppo.update(bs, ba, br) - if ep == 0: - all_ep_r.append(ep_r) - else: - all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1) - print( - 'Ep: %i' % ep, - "|Ep_r: %i" % ep_r, - ("|Lam: %.4f" % METHOD['lam']) if METHOD['name'] == 'kl_pen' else '', - ) - - plt.ion() - plt.cla() - plt.title('PPO') - plt.plot(np.arange(len(all_ep_r)), all_ep_r) - plt.ylim(-2000, 0) - plt.xlabel('Episode') - plt.ylabel('Moving averaged episode reward') - plt.show() - plt.pause(0.1) - plt.ioff() - -while True: - s = env.reset() - for i in range(EP_LEN): - env.render() - a = ppo.choose_action(s) - s_, r, done, _ = env.step(a) - if done: - break - s = s_ + ep_r = 0 + t0 = time.time() + for t in range(EP_LEN): # in one episode + # env.render() + a = ppo.choose_action(s) + s_, r, done, _ = env.step(a) + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append((r + 8) / 8) # normalize reward, find to be useful + s = s_ + ep_r += r + + # update ppo + if (t + 1) % BATCH == 0 or t == EP_LEN - 1: + v_s_ = ppo.get_v(s_) + discounted_r = [] + for r in buffer_r[::-1]: + v_s_ = r + GAMMA * v_s_ + discounted_r.append(v_s_) + discounted_r.reverse() + + bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] + buffer_s, buffer_a, buffer_r = [], [], [] + ppo.update(bs, ba, br) + if ep == 0: + all_ep_r.append(ep_r) + else: + all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' + .format(ep, EP_MAX, ep_r, time.time() - t0)) + + plt.ion() + plt.cla() + plt.title('PPO') + plt.plot(np.arange(len(all_ep_r)), all_ep_r) + plt.ylim(-2000, 0) + plt.xlabel('Episode') + plt.ylabel('Moving averaged episode reward') + plt.show() + plt.pause(0.1) + ppo.save_ckpt() + plt.ioff() + plt.show() + + # test + ppo.load_ckpt() + while True: + s = env.reset() + for i in range(EP_LEN): + env.render() + s, r, done, _ = env.step(ppo.choose_action(s)) + if done: + break diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index 1f1b19aad..017ac086d 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -13,14 +13,19 @@ Approximately Optimal Approximate Reinforcement Learning, Kakade and Langford 2002 openai/spinningup : http://spinningup.openai.com/en/latest/algorithms/trpo.html -Env ---- +Environment +----------- Openai Gym Pendulum-v0, continual action space +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + To run ------ -python *.py - +python tutorial_TRPO.py --train/test """ import numpy as np @@ -29,39 +34,105 @@ import tensorlayer as tl import gym import time -import os import matplotlib.pyplot as plt import scipy.signal import copy from gym.spaces import Box, Discrete - -EPS = 1e-8 - +import os +import argparse + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') + +parser.add_argument('--env', type=str, default='Pendulum-v0') # environment name +parser.add_argument('--hid', type=int, default=64) # size of each hidden layer +parser.add_argument('--l', type=int, default=2) # hidden layer length +parser.add_argument('--gamma', type=float, default=0.99) # reward discount +parser.add_argument('--seed', '-s', type=int, default=1) # random seed +parser.add_argument('--steps', type=int, 
default=4000) # total number of steps for each episode +parser.add_argument('--epochs', type=int, default=500) # total number of episodes for training +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = args.env # environment name +HIDDEN_SIZES = [args.hid] * args.l # hidden layer size +SEED = args.seed # random seed +STEPS_PER_EPOCH = args.steps # total number of steps for each episode +EPOCHS = args.epochs # total number of episodes for training +GAMMA = args.gamma # reward discount + +DELTA = 0.01 # KL-divergence limit for TRPO update. +VF_LR = 1e-3 # Learning rate for value function optimizer +TRAIN_V_ITERS = 80 # Number of gradient descent steps to take on value function per epoch +DAMPING_COEFF = 0.1 # Artifact for numerical stability +CG_ITERS = 10 # Number of iterations of conjugate gradient to perform +BACKTRACK_ITERS = 10 # Maximum number of steps allowed in the backtracking line search +BACKTRACK_COEFF = 0.8 # How far back to step during backtracking line search +LAM = 0.97 # Lambda for GAE-Lambda +MAX_EP_LEN = 1000 # Maximum length of trajectory +SAVE_FREQ = 10 # How often (in terms of gap between epochs) to save the current policy and value function +EPS = 1e-8 # epsilon + + +##################### functions #################### def combined_shape(length, shape=None): + """ + combine length and shape based on shape type + :param length: int length + :param shape: shape, can be either scalar or array + :return: shape + """ if shape is None: return length, return (length, shape) if np.isscalar(shape) else (length, *shape) def keys_as_sorted_list(dict): + """ + sorted keys of the dict + :param dict: dict input + :return: sorted key list + """ return sorted(list(dict.keys())) def values_as_sorted_list(dict): + """ + sorted values of the dict + :param dict: dict input + :return: sorted value list + """ return [dict[k] for k in keys_as_sorted_list(dict)] def input_layer(dim=None): + """ + create tensorlayer input layer from dimension input + :param dim: dimension int + :return: tensorlayer input layer + """ return tl.layers.Input(dtype=tf.float32, shape=combined_shape(None, dim)) def input_layers(*args): + """ + create tensorlayer input layers from a list of dimensions + :param args: a list of dimensions + :return: list of input layers + """ return [input_layer(dim) for dim in args] def input_layer_from_space(space): + """ + create tensorlayer input layers from env.space input + :param space: env.space + :return: tensorlayer input layer + """ if isinstance(space, Box): return input_layer(space.shape) elif isinstance(space, Discrete): @@ -70,25 +141,55 @@ def input_layer_from_space(space): def input_layers_from_spaces(*args): + """ + create tensorlayer input layers from a list of env.space inputs + :param args: a list of env.space inputs + :return: tensorlayer input layer list + """ return [input_layer_from_space(space) for space in args] def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): + """ + create Multi-Layer Perception + :param x: tensorlayer input layer + :param hidden_sizes: hidden layer size + :param activation: hidden layer activation function + :param output_activation: activation function for the output layer + :return: output layer + """ for h in hidden_sizes[:-1]: x = tl.layers.Dense(n_units=h, act=activation)(x) return tl.layers.Dense(n_units=hidden_sizes[-1], act=output_activation)(x) def get_vars(model: tl.models.Model): + """ + get trainable parameters of the model + :param model: 
tensorlayer model + :return: a list of trainable parameters of the model + """ return model.trainable_weights def count_vars(model: tl.models.Model): + """ + count trainable parameters of the model + :param model: tensorlayer model + :return: counts + """ v = get_vars(model) return sum([np.prod(var.shape.as_list()) for var in v]) def gaussian_likelihood(x, mu, log_std): + """ + calculate gaussian likelihood + :param x: input distribution + :param mu: mu + :param log_std: log std + :return: gaussian likelihood + """ pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1) @@ -115,20 +216,21 @@ def categorical_kl(logp0, logp1): def flat_concat(xs): + """ + flat concat input + :param xs: a list of tensor + :return: flat tensor + """ return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0) -def flat_grad(f, params): - return flat_concat(tf.gradients(xs=params, ys=f)) - - -def hessian_vector_product(f, params, x): - # for H = grad**2 f, compute Hx - g = flat_grad(f, params) - return flat_grad(tf.reduce_sum(g * x), params) - - def assign_params_from_flat(x, params): + """ + assign params from flat input + :param x: + :param params: + :return: group + """ flat_size = lambda p: int(np.prod(p.shape.as_list())) # the 'int' is important for scalars splits = tf.split(x, [flat_size(p) for p in params]) new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] @@ -139,14 +241,14 @@ def discount_cumsum(x, discount): """ magic from rllab for computing discounted cumulative sums of vectors. - input: - vector x, - [x0, - x1, + input: + vector x, + [x0, + x1, x2] output: - [x0 + discount * x1 + discount^2 * x2, + [x0 + discount * x1 + discount^2 * x2, x1 + discount * x2, x2] """ @@ -158,56 +260,66 @@ def discount_cumsum(x, discount): """ -def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation): - act_dim = a.n +class MlpCategoricalPolicy: + """ + Categorical Policy for discrete input + """ - x = input_layer_from_space(x) - logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None) - actor = tl.models.Model(x, logits) + def __init__(self, x, a, hidden_sizes, activation, output_activation): + self.act_dim = a.n + x = input_layer_from_space(x) + logits = mlp(x, list(hidden_sizes) + [self.act_dim], activation, None) + self.model = tl.models.Model(x, logits) + self.model.train() - def cal_outputs_0(states): + def cal_outputs_0(self, states): states = states.astype(np.float32) - logits = actor(states) + logits = self.model(states) logp_all = tf.nn.log_softmax(logits) pi = tf.squeeze(tfp.distributions.Multinomial(1, logits), axis=1) - logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) + logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1) info = {'logp_all': logp_all} return pi, logp_pi, info, logp_all - def cal_outputs_1(states, actions, old_logp_all): - pi, logp_pi, info, logp_all = cal_outputs_0(states) - logp = tf.reduce_sum(tf.one_hot(actions, depth=act_dim) * logp_all, axis=1) + def cal_outputs_1(self, states, actions, old_logp_all): + pi, logp_pi, info, logp_all = self.cal_outputs_0(states) + logp = tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1) d_kl = categorical_kl(logp_all, old_logp_all) info_phs = {'logp_all': old_logp_all} return pi, logp, logp_pi, info, info_phs, d_kl - return actor, cal_outputs_0, cal_outputs_1 +class MlpGaussianPolicy: + """ + Gaussian Policy for continuous input + """ + + def 
__init__(self, x, a, hidden_sizes, activation, output_activation): + act_dim = a.shape[0] -def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): - act_dim = a.shape[0] + x = input_layer_from_space(x) + mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation) + self.model = tl.models.Model(x, mu) + self.model.train() - x = input_layer_from_space(x) - mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation) - actor = tl.models.Model(x, mu) - log_std = tf.Variable(-0.5 * np.ones(act_dim, dtype=np.float32)) - actor.trainable_weights.append(log_std) + self._log_std = tf.Variable(-0.5 * np.ones(act_dim, dtype=np.float32)) + self.model.trainable_weights.append(self._log_std) - def cal_outputs_0(states): + def cal_outputs_0(self, states): states = states.astype(np.float32) - mu = actor(states) - std = tf.exp(log_std) + mu = self.model(states) + std = tf.exp(self._log_std) pi = mu + tf.random.normal(tf.shape(mu)) * std - logp_pi = gaussian_likelihood(pi, mu, log_std) + logp_pi = gaussian_likelihood(pi, mu, self._log_std) - info = {'mu': mu, 'log_std': log_std} + info = {'mu': mu, 'log_std': self._log_std} - return pi, logp_pi, info, mu, log_std + return pi, logp_pi, info, mu, self._log_std - def cal_outputs_1(states, actions, old_log_std_ph, old_mu_ph): - pi, logp_pi, info, mu, log_std = cal_outputs_0(states) + def cal_outputs_1(self, states, actions, old_log_std_ph, old_mu_ph): + pi, logp_pi, info, mu, log_std = self.cal_outputs_0(states) logp = gaussian_likelihood(actions, mu, log_std) d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) @@ -215,8 +327,6 @@ def cal_outputs_1(states, actions, old_log_std_ph, old_mu_ph): return pi, logp, logp_pi, info, info_phs, d_kl - return actor, cal_outputs_0, cal_outputs_1 - """ Actor-Critics @@ -224,26 +334,37 @@ def cal_outputs_1(states, actions, old_log_std_ph, old_mu_ph): def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh, - output_activation=None, policy=None): + output_activation=None): + """ + create actor and critic + :param x: observation space + :param a: action space + :param hidden_sizes: hidden layer size + :param activation: hidden layer activation function + :param output_activation: activation function for the output layer + :return: acter class and critic class + """ # default policy builder depends on action space - if policy is None and isinstance(a, Box): - policy = mlp_gaussian_policy - elif policy is None and isinstance(a, Discrete): - policy = mlp_categorical_policy + if isinstance(a, Box): + actor = MlpGaussianPolicy(x, a, hidden_sizes, activation, output_activation) + elif isinstance(a, Discrete): + actor = MlpCategoricalPolicy(x, a, hidden_sizes, activation, output_activation) + else: + raise ValueError('action space type error') - actor, actor_cal_func_0, actor_cal_func_1 = policy(x, a, hidden_sizes, activation, output_activation) + class Critic: + def __init__(self, obs_space, hidden_layer_sizes, activation_funcs): + inputs = input_layer_from_space(obs_space) + self.model = tl.models.Model(inputs, mlp(inputs, list(hidden_layer_sizes) + [1], activation_funcs, None)) + self.model.train() - x = input_layer_from_space(x) - critic = tl.models.Model(x, mlp(x, list(hidden_sizes) + [1], activation, None)) + def critic_cal_func(self, states): + states = states.astype(np.float32) + return tf.squeeze(self.model(states), axis=1) - actor.train() - critic.train() + critic = Critic(x, hidden_sizes, activation) - 
def critic_cal_func(states): - states = states.astype(np.float32) - return tf.squeeze(critic(states), axis=1) - - return actor, actor_cal_func_0, actor_cal_func_1, critic, critic_cal_func + return actor, critic class GAEBuffer: @@ -325,6 +446,8 @@ def get(self): self.logp_buf] + values_as_sorted_list(self.info_bufs) +##################### TRPO #################### + """ Trust Region Policy Optimization @@ -334,199 +457,138 @@ def get(self): """ -def trpo(env_fn, actor_critic=mlp_actor_critic, ac_kwargs=dict(), seed=1, - steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, - train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, - backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, save_freq=10, algo='trpo'): +class TRPO: """ - - Args: - env_fn : A function which creates a copy of the environment. - The environment must satisfy the OpenAI Gym API. - - actor_critic: A function which takes in placeholder symbols - for state, ``x_ph``, and action, ``a_ph``, and returns the main - outputs from the agent's Tensorflow computation graph: - - ============ ================ ======================================== - Symbol Shape Description - ============ ================ ======================================== - ``pi`` (batch, act_dim) | Samples actions from policy given - | states. - ``logp`` (batch,) | Gives log probability, according to - | the policy, of taking actions ``a_ph`` - | in states ``x_ph``. - ``logp_pi`` (batch,) | Gives log probability, according to - | the policy, of the action sampled by - | ``pi``. - ``info`` N/A | A dict of any intermediate quantities - | (from calculating the policy or log - | probabilities) which are needed for - | analytically computing KL divergence. - | (eg sufficient statistics of the - | distributions) - ``info_phs`` N/A | A dict of placeholders for old values - | of the entries in ``info``. - ``d_kl`` () | A symbol for computing the mean KL - | divergence between the current policy - | (``pi``) and the old policy (as - | specified by the inputs to - | ``info_phs``) over the batch of - | states given in ``x_ph``. - ``v`` (batch,) | Gives the value estimate for states - | in ``x_ph``. (Critical: make sure - | to flatten this!) - ============ ================ ======================================== - - ac_kwargs (dict): Any kwargs appropriate for the actor_critic - function you provided to TRPO. - - seed (int): Seed for random number generators. - - steps_per_epoch (int): Number of steps of interaction (state-action pairs) - for the agent and the environment in each epoch. - - epochs (int): Number of epochs of interaction (equivalent to - number of policy updates) to perform. - - gamma (float): Discount factor. (Always between 0 and 1.) - - delta (float): KL-divergence limit for TRPO / NPG update. - (Should be small for stability. Values like 0.01, 0.05.) - - vf_lr (float): Learning rate for value function optimizer. - - train_v_iters (int): Number of gradient descent steps to take on - value function per epoch. - - damping_coeff (float): Artifact for numerical stability, should be - smallish. Adjusts Hessian-vector product calculation: - - .. math:: Hv \\rightarrow (\\alpha I + H)v - - where :math:`\\alpha` is the damping coefficient. - Probably don't play with this hyperparameter. - - cg_iters (int): Number of iterations of conjugate gradient to perform. - Increasing this will lead to a more accurate approximation - to :math:`H^{-1} g`, and possibly slightly-improved performance, - but at the cost of slowing things down. 
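One helper worth pinning down before the TRPO-specific code: `discount_cumsum`, whose docstring appears earlier in this file but whose body falls outside this hunk. Below is a minimal standalone sketch of how such a helper is commonly implemented (the `scipy.signal.lfilter` trick popularised by rllab), plus a plain loop for cross-checking. The `scipy` import is an assumption of this sketch, not a dependency added by the patch.

```python
# Hypothetical standalone sketch (not part of this patch): two equivalent ways
# to compute [x0 + d*x1 + d^2*x2, x1 + d*x2, x2] for a reward vector x.
import numpy as np
import scipy.signal  # assumption: scipy is available in the environment


def discount_cumsum_lfilter(x, discount):
    # rllab / Spinning Up trick: run an IIR filter over the reversed sequence
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def discount_cumsum_loop(x, discount):
    # straightforward right-to-left accumulation, same result
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


rewards = np.array([1.0, 0.0, 2.0])
print(discount_cumsum_lfilter(rewards, 0.99))  # ~[2.9602, 1.98, 2.0]
print(discount_cumsum_loop(rewards, 0.99))     # same values
```

The `GAEBuffer` above relies on exactly this kind of cumulative discounting, both for the rewards-to-go targets and for the GAE advantages.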
- - Also probably don't play with this hyperparameter. - - backtrack_iters (int): Maximum number of steps allowed in the - backtracking line search. Since the line search usually doesn't - backtrack, and usually only steps back once when it does, this - hyperparameter doesn't often matter. - - backtrack_coeff (float): How far back to step during backtracking line - search. (Always between 0 and 1, usually above 0.5.) - - lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, - close to 1.) - - max_ep_len (int): Maximum length of trajectory / episode / rollout. - - save_freq (int): How often (in terms of gap between epochs) to save - the current policy and value function. - - algo: Either 'trpo' or 'npg': this code supports both, since they are - almost the same. - + trpo class """ - tf.random.set_seed(seed) - np.random.seed(seed) + def __init__(self, obs_space, act_space): - env = env_fn() - obs_dim = env.observation_space.shape - act_dim = env.action_space.shape - - # Share information about action space with policy architecture - ac_kwargs['action_space'] = env.action_space + obs_dim = obs_space.shape + act_dim = act_space.shape - # Main models and functions - actor, actor_cal_func_0, actor_cal_func_1, critic, critic_cal_func = \ - actor_critic(env.observation_space, env.action_space) + # # Main models and functions + self.actor, self.critic = mlp_actor_critic(obs_space, act_space, HIDDEN_SIZES) - # Every step, get: action, value, logprob, & info for pdist (for computing kl div) - def get_action_ops(states): - pi, logp_pi, info, *_ = actor_cal_func_0(states) - v = critic_cal_func(states) - return [pi, v, logp_pi] + values_as_sorted_list(info) + if isinstance(act_space, Box): + act_dim = env.action_space.shape[0] + info_shapes = {'mu': [act_dim], 'log_std': [act_dim]} - # Experience buffer - local_steps_per_epoch = steps_per_epoch + elif isinstance(env.action_space, Discrete): + act_dim = env.action_space.n + info_shapes = {'logp_all': [act_dim]} + else: + raise Exception('info_shape error') - if isinstance(env.action_space, Box): - act_dim = env.action_space.shape[0] - info_shapes = {'mu': [act_dim], 'log_std': [act_dim]} + self.buf = GAEBuffer(obs_dim, act_dim, STEPS_PER_EPOCH, info_shapes, GAMMA, LAM) - elif isinstance(env.action_space, Discrete): - act_dim = env.action_space.n - info_shapes = {'logp_all': [act_dim]} - else: - raise Exception('info_shape error') + # Optimizer for value function + self.critic_optimizer = tf.optimizers.Adam(learning_rate=VF_LR) - buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) + # Every step, get: action, value, logprob, & info for pdist (for computing kl div) + def get_action_ops(self, states): + """ + get action + :param states: state input + :return: pi, v, logp_pi and other outputs + """ + pi, logp_pi, info, *_ = self.actor.cal_outputs_0(states) + v = self.critic.critic_cal_func(states) + res0 = [pi, v, logp_pi] + values_as_sorted_list(info) + res = [] + for i in res0: + res.append(i + 0) # transfer to tensor + return res # TRPO losses - def pi_loss(inputs): + def pi_loss(self, inputs): + """ + calculate pi loss + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: pi loss + """ x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs - pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / 
pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) return pi_loss - def v_loss(inputs): + def v_loss(self, inputs): + """ + calculate value loss + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: v loss + """ x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs - v = critic_cal_func(x_ph) + v = self.critic.critic_cal_func(x_ph) v_loss = tf.reduce_mean((ret_ph - v) ** 2) return v_loss - # Optimizer for value function - critic_optimizer = tf.optimizers.Adam(learning_rate=vf_lr) - - def train_vf(inputs): + def train_vf(self, inputs): + """ + train v function + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: None + """ with tf.GradientTape() as tape: - loss = v_loss(inputs) - grad = tape.gradient(loss, critic.trainable_weights) - critic_optimizer.apply_gradients(zip(grad, critic.trainable_weights)) + loss = self.v_loss(inputs) + grad = tape.gradient(loss, self.critic.model.trainable_weights) + self.critic_optimizer.apply_gradients(zip(grad, self.critic.model.trainable_weights)) # Symbols needed for CG solver - def gradient(inputs): - pi_params = actor.trainable_weights + def gradient(self, inputs): + """ + pi gradients + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: gradient + """ + pi_params = self.actor.model.trainable_weights with tf.GradientTape() as tape: - loss = pi_loss(inputs) + loss = self.pi_loss(inputs) grad = tape.gradient(loss, pi_params) gradient = flat_concat(grad) return gradient - def hvp(inputs, v_ph): - pi_params = actor.trainable_weights + def hvp(self, inputs, v_ph): + """ + calculate hvp + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :param v_ph: v input + :return: hvp + """ + pi_params = self.actor.model.trainable_weights x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs with tf.GradientTape() as tape1: with tf.GradientTape() as tape0: - pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) g = flat_concat(tape0.gradient(d_kl, pi_params)) l = tf.reduce_sum(g * v_ph) hvp = flat_concat(tape1.gradient(l, pi_params)) - if damping_coeff > 0: - hvp += damping_coeff * v_ph + if DAMPING_COEFF > 0: + hvp += DAMPING_COEFF * v_ph return hvp # Symbols for getting and setting params - def get_pi_params(): - pi_params = actor.trainable_weights + def get_pi_params(self): + """ + get actor trainable parameters + :return: flat actor trainable parameters + """ + pi_params = self.actor.model.trainable_weights return flat_concat(pi_params) - def set_pi_params(v_ph): - pi_params = actor.trainable_weights + def set_pi_params(self, v_ph): + """ + set actor trainable parameters + :param v_ph: inputs + :return: None + """ + pi_params = self.actor.model.trainable_weights assign_params_from_flat(v_ph, pi_params) - def save_ckpt(): + def save_ckpt(self): """ save trained weights :return: None @@ -534,18 +596,18 @@ def save_ckpt(): if not os.path.exists('model'): os.makedirs('model') - tl.files.save_weights_to_hdf5('model/trpo_actor.hdf5', actor) - tl.files.save_weights_to_hdf5('model/trpo_critic.hdf5', critic) + tl.files.save_weights_to_hdf5('model/trpo_actor.hdf5', self.actor.model) + tl.files.save_weights_to_hdf5('model/trpo_critic.hdf5', self.critic.model) - def load_ckpt(): + def load_ckpt(self): """ load trained weights :return: None """ - 
tl.files.load_hdf5_to_weights_in_order('model/trpo_actor.hdf5', actor) - tl.files.load_hdf5_to_weights_in_order('model/trpo_critic.hdf5', critic) + tl.files.load_hdf5_to_weights_in_order('model/trpo_actor.hdf5', self.actor.model) + tl.files.load_hdf5_to_weights_in_order('model/trpo_critic.hdf5', self.critic.model) - def cg(Ax, b): + def cg(self, Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) @@ -554,7 +616,7 @@ def cg(Ax, b): r = copy.deepcopy(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. p = copy.deepcopy(r) r_dot_old = np.dot(r, r) - for _ in range(cg_iters): + for _ in range(CG_ITERS): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p @@ -564,120 +626,117 @@ def cg(Ax, b): r_dot_old = r_dot_new return x - def update(): + def update(self): + """ + update trpo + :return: + """ # Prepare hessian func, gradient eval - inputs = buf.get() - ''''all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + values_as_sorted_list(info_phs)''' - Hx = lambda x: hvp(inputs, x) - g, pi_l_old, v_l_old = gradient(inputs), pi_loss(inputs), v_loss(inputs) + inputs = self.buf.get() + Hx = lambda x: self.hvp(inputs, x) + g, pi_l_old, v_l_old = self.gradient(inputs), self.pi_loss(inputs), self.v_loss(inputs) # Core calculations for TRPO or NPG - x = cg(Hx, g) - alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) - old_params = get_pi_params() + x = self.cg(Hx, g) + alpha = np.sqrt(2 * DELTA / (np.dot(x, Hx(x)) + EPS)) + old_params = self.get_pi_params() def set_and_eval(step): - set_pi_params(old_params - alpha * x * step) + aa = alpha * x * step + par = old_params - aa + self.set_pi_params(par) x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs - pi, logp, logp_pi, info, info_phs, d_kl = actor_cal_func_1(x_ph, a_ph, *info_values) - loss = pi_loss(inputs) + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) + loss = self.pi_loss(inputs) return [d_kl, loss] - if algo == 'npg': - # npg has no backtracking or hard kl constraint enforcement - kl, pi_l_new = set_and_eval(step=1.) - - elif algo == 'trpo': - # trpo augments npg with backtracking line search, hard kl - for j in range(backtrack_iters): - kl, pi_l_new = set_and_eval(step=backtrack_coeff ** j) - if kl <= delta and pi_l_new <= pi_l_old: - # Accepting new params at step of line search - break + # trpo augments npg with backtracking line search, hard kl + for j in range(BACKTRACK_ITERS): + kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF ** j) + if kl <= DELTA and pi_l_new <= pi_l_old: + # Accepting new params at step of line search + break - if j == backtrack_iters - 1: - # Line search failed! Keeping old params. - kl, pi_l_new = set_and_eval(step=0.) + if j == BACKTRACK_ITERS - 1: + # Line search failed! Keeping old params. + kl, pi_l_new = set_and_eval(step=0.) 
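The `cg` routine and the scaling that follows it are the numerical heart of the update: conjugate gradient approximately solves `H x = g` using only Hessian-vector products, and the step is then rescaled so the quadratic estimate of the KL change equals `DELTA`. As a standalone sanity check (hypothetical matrix, plain NumPy, constants mirroring the ones used in this file), the same loop recovers `H^{-1} g` on a small symmetric positive-definite system:

```python
# Hypothetical standalone check (not part of this patch): the CG loop used in
# update() should recover H^{-1} g when H is symmetric positive definite.
import numpy as np

EPS = 1e-8
CG_ITERS = 10


def cg(Ax, b):
    x = np.zeros_like(b)
    r = b.copy()              # residual; equals b - Ax(x) for x = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(CG_ITERS):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x


H = np.array([[4.0, 1.0], [1.0, 3.0]])        # SPD stand-in for the KL Hessian
g = np.array([1.0, 2.0])                      # stand-in for the policy gradient
x = cg(lambda v: H @ v, g)
print(np.allclose(x, np.linalg.solve(H, g)))  # True

# TRPO then scales the search direction so the quadratic KL estimate is DELTA:
DELTA = 0.01
step = np.sqrt(2 * DELTA / (np.dot(x, H @ x) + EPS)) * x
```

The backtracking loop above then shrinks this step by powers of `BACKTRACK_COEFF` until the measured KL stays within `DELTA` and the surrogate loss does not worsen, falling back to a zero step if the line search fails.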
# Value function updates - for _ in range(train_v_iters): - train_vf(inputs) - - start_time = time.time() - o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 - - reward_list = [] - # Main loop: collect experience in env and update/log each epoch - for epoch in range(epochs): - t0 = time.time() - rew = 0 - for t in range(local_steps_per_epoch): - agent_outs = get_action_ops(o.reshape(1, -1)) - a, v_t, logp_t, info_t = np.array(agent_outs[0][0], np.float32), \ - np.array(agent_outs[1], np.float32), \ - np.array(agent_outs[2], np.float32), \ - np.array(agent_outs[3:], np.float32) - - # store - buf.store(o, a, r, v_t, logp_t, info_t) + for _ in range(TRAIN_V_ITERS): + self.train_vf(inputs) - o, r, d, _ = env.step(a) - ep_ret += r - ep_len += 1 - - terminal = d or (ep_len == max_ep_len) - if terminal or (t == local_steps_per_epoch - 1): - if not (terminal): - print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) - # if trajectory didn't reach terminal state, bootstrap value target - last_val = r if d else critic_cal_func(o.reshape(1, -1)) - buf.finish_path(last_val) - rew = ep_ret - o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 - - # Save model - if (epoch % save_freq == 0) or (epoch == epochs - 1): - save_ckpt() - - # Perform TRPO or NPG update! - update() - print('epoch [{}/{}] ep_ret: {} time: {}'.format(epoch, epochs, rew, time.time() - t0)) - - reward_list.append(rew) - plt.clf() - plt.ion() - plt.plot(reward_list) - plt.title('TRPO' + str(delta)) - plt.ylim(-2000, 0) + +if __name__ == '__main__': + + tf.random.set_seed(SEED) + np.random.seed(SEED) + + env = gym.make(ENV_NAME) + env.seed(SEED) + + agent = TRPO(env.observation_space, env.action_space) + + if args.train: + start_time = time.time() + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + reward_list = [] + # Main loop: collect experience in env and update/log each epoch + for epoch in range(EPOCHS): + t0 = time.time() + rew = 0 + for t in range(STEPS_PER_EPOCH): + agent_outs = agent.get_action_ops(o.reshape(1, -1)) + a, v_t, logp_t, info_t = np.array(agent_outs[0][0], np.float32), \ + np.array(agent_outs[1], np.float32), \ + np.array(agent_outs[2], np.float32), \ + np.array(agent_outs[3:], np.float32) + + # save and log + agent.buf.store(o, a, r, v_t, logp_t, info_t) + + o, r, d, _ = env.step(a) + ep_ret += r + ep_len += 1 + + terminal = d or (ep_len == MAX_EP_LEN) + if terminal or (t == STEPS_PER_EPOCH - 1): + if not (terminal): + print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) + # if trajectory didn't reach terminal state, bootstrap value target + last_val = r if d else agent.critic.critic_cal_func(o.reshape(1, -1)) + agent.buf.finish_path(last_val) + rew = ep_ret + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + # Save model + if (epoch % SAVE_FREQ == 0) or (epoch == EPOCHS - 1): + agent.save_ckpt() + + # Perform TRPO or NPG update! 
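In the training loop above, `finish_path(last_val)` is called with a bootstrapped value whenever a trajectory is cut off by the epoch boundary instead of ending in a terminal state. The buffer's body is largely outside this hunk, so the following is a sketch of the standard GAE-Lambda recipe such buffers typically implement (hypothetical numbers, plain NumPy), not a verbatim copy of the tutorial's code:

```python
# Hypothetical sketch (not part of this patch) of what finish_path(last_val)
# typically computes: GAE-Lambda advantages and discounted rewards-to-go.
import numpy as np


def discount_cumsum(x, discount):
    out, running = np.zeros(len(x)), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


gamma, lam = 0.99, 0.97
rews = np.array([1.0, 0.0, 1.0])   # rewards collected along the path
vals = np.array([0.5, 0.4, 0.6])   # critic values V(s_t) for the same steps
last_val = 0.7                     # 0 if terminal, else V(s_T) as a bootstrap

rews_b = np.append(rews, last_val)
vals_b = np.append(vals, last_val)

# one-step TD residuals, GAE advantages, and the value-function targets
deltas = rews_b[:-1] + gamma * vals_b[1:] - vals_b[:-1]
adv = discount_cumsum(deltas, gamma * lam)
ret = discount_cumsum(rews_b, gamma)[:-1]
```

The value-function update above then simply runs `TRAIN_V_ITERS` Adam steps of mean-squared-error regression of `V(s)` towards these returns.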
+ agent.update() + print('epoch [{}/{}] ep_ret: {} time: {}'.format(epoch, EPOCHS, rew, time.time() - t0)) + + reward_list.append(rew) + plt.clf() + plt.ion() + plt.plot(reward_list) + plt.title('TRPO ' + str(DELTA)) + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + agent.save_ckpt() + plt.ioff() plt.show() - plt.pause(0.1) - plt.ioff() - plt.show() + # test + agent.load_ckpt() while True: o = env.reset() - for i in range(200): + for i in range(STEPS_PER_EPOCH): env.render() - agent_outs = get_action_ops(o.reshape(1, -1)) + agent_outs = agent.get_action_ops(o.reshape(1, -1)) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:] o, r, d, _ = env.step(a) if d: break - - -if __name__ == '__main__': - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument('--env', type=str, default='Pendulum-v0') - parser.add_argument('--hid', type=int, default=64) - parser.add_argument('--l', type=int, default=2) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--seed', '-s', type=int, default=0) - parser.add_argument('--steps', type=int, default=4000) - parser.add_argument('--epochs', type=int, default=500) - args = parser.parse_args() - - trpo(lambda: gym.make(args.env), actor_critic=mlp_actor_critic, - ac_kwargs=dict(hidden_sizes=[args.hid] * args.l), gamma=args.gamma, - seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs) From eff092883c4b0e8fa4b54db632a2067c803b593b Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Tue, 11 Jun 2019 11:55:43 +0800 Subject: [PATCH 47/57] update authors --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 6a9e55cb2..c76ecf479 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -198,7 +198,7 @@ Our env wrapper: `./tutorial_wrappers.py` ## Authors - @xxxx XXXXX : AC, A3C -- @xxxx XXXXX : TPRO +- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO - @quantumiracle Zihan Ding: SAC, TD3. ### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From a9ac6d656dfb205b1c7275cbf2a34a9334e11261 Mon Sep 17 00:00:00 2001 From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com> Date: Tue, 11 Jun 2019 12:02:05 +0800 Subject: [PATCH 48/57] update authors --- examples/reinforcement_learning/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c76ecf479..6461dfb62 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -198,7 +198,7 @@ Our env wrapper: `./tutorial_wrappers.py` ## Authors - @xxxx XXXXX : AC, A3C -- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO - @quantumiracle Zihan Ding: SAC, TD3. 
+- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO ### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) From 77d4a73868889feabea311aa275603a463b81b1b Mon Sep 17 00:00:00 2001 From: Officium Date: Tue, 11 Jun 2019 17:26:46 +0800 Subject: [PATCH 49/57] update formats of C51, DQN_variants, Retrace and PER, update README --- examples/reinforcement_learning/README.md | 9 +- .../reinforcement_learning/tutorial_C51.py | 3 +- .../tutorial_DQN_variants.py | 3 +- .../tutorial_Retrace.py | 214 +++++++++++------ .../tutorial_prioritized_replay.py | 221 ++++++++++++------ 5 files changed, 299 insertions(+), 151 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index 6461dfb62..ff6c4081c 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -41,9 +41,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t | Algorithms | Observation Space | Action Space | Tutorial Env | | --------------- | ----------------- | ------------ | -------------- | | Q-learning | Discrete | Discrete | FrozenLake | -| C51 | Continuous | Discrete | CartPole | +| C51 | Discrete | Discrete | Pong, CartPole | | DQN | Discrete | Discrete | FrozenLake | -| Variants of DQN | Continuous | Discrete | Pong, CartPole | +| Variants of DQN | Discrete | Discrete | Pong, CartPole | +| Retrace | Discrete | Discrete | Pong, CartPole | +| PER | Discrete | Discrete | Pong, CartPole | | Actor-Critic | Continuous | Discrete | CartPole | | A3C | Continuous | Continuous | BipedalWalker | | DDPG | Continuous | Continuous | Pendulum | @@ -84,7 +86,7 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t -* Prioritized Experience Replay +* PER (Prioritized Experience Replay) Code: `./tutorial_prioritized_replay.py` @@ -200,5 +202,6 @@ Our env wrapper: `./tutorial_wrappers.py` - @xxxx XXXXX : AC, A3C - @quantumiracle Zihan Ding: SAC, TD3. - @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO +- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers. 
### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html) diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py index 8bc8c7fd0..114daae67 100644 --- a/examples/reinforcement_learning/tutorial_C51.py +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -50,7 +50,6 @@ parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() -print(args) if args.mode == 'train': os.makedirs(args.save_path, exist_ok=True) @@ -285,7 +284,7 @@ def sync(net, net_tar): qvalues = (qdist * vrange).sum(-1) a = qvalues.argmax(1)[0] - # execute action and feed to replay buffer + # execute action # note that `_` tail in var name means next o_, r, done, info = env.step(a) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index 2c390dcd5..a1b281b9b 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -66,7 +66,6 @@ parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() -print(args) if args.mode == 'train': os.makedirs(args.save_path, exist_ok=True) @@ -354,7 +353,7 @@ def softmax(x, dim): obv = np.expand_dims(o, 0).astype('float32') * ob_scale a = qnet(obv).numpy().argmax(1)[0] - # execute action and feed to replay buffer + # execute action # note that `_` tail in var name means next o_, r, done, info = env.step(a) diff --git a/examples/reinforcement_learning/tutorial_Retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py index 24b260168..13535c3cc 100644 --- a/examples/reinforcement_learning/tutorial_Retrace.py +++ b/examples/reinforcement_learning/tutorial_Retrace.py @@ -1,13 +1,37 @@ -"""Implement retrace(\lambda) algorithm +""" +Retrace(\lambda) algorithm +------------------------ +Retrace(\lambda) is an off-policy algorithm that extend the idea of eligibility +trace. It apply an importance sampling ratio truncated at 1 to several behaviour +policies, which suffer from the variance explosion of standard IS and lead to +safe and efficient learning. + + +Reference: +------------------------ Munos R, Stepleton T, Harutyunyan A, et al. Safe and efficient off-policy reinforcement learning[C]//Advances in Neural Information Processing Systems. 2016: 1054-1062. 
-# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_Retrace.py --mode=train +python tutorial_Retrace.py --mode=test --save_path=retrace/8000.npz """ +import argparse +import os import random import time @@ -18,8 +42,25 @@ from tutorial_wrappers import build_env -seed = 0 -env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument('--save_path', default='retrace', + help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update') +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', + help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -36,7 +77,6 @@ target_q_update_freq = 200 # how frequency target q net update ob_scale = 1.0 / 255 # scale observations -env = build_env(env_id, seed=seed) in_dim = env.observation_space.shape out_dim = env.action_space.n reward_gamma = 0.99 # reward discount @@ -45,6 +85,7 @@ retrace_lambda = 1.0 +# ############################## Retrace #################################### class MLP(tl.models.Model): def __init__(self, name): super(MLP, self).__init__(name=name) @@ -137,69 +178,100 @@ def sync(net, net_tar): var_tar.assign(var) -qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') -qnet.train() -trainabel_weights = qnet.trainable_weights -targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') -targetqnet.infer() -sync(qnet, targetqnet) -optimizer = tf.optimizers.Adam(learning_rate=lr) -buffer = ReplayBuffer(buffer_size) - -o = env.reset() -nepisode = 0 -t = time.time() -for i in range(1, number_timesteps + 1): - # select action based on boltzmann exploration - obv = np.expand_dims(o, 0).astype('float32') * ob_scale - qs, pi = qnet(obv) - a = np.random.multinomial(1, pi.numpy()[0]).argmax() - pi = pi.numpy()[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done, pi) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d, b_old_pi = buffer.sample(batch_size) - - # q estimation based on 1 step retrace(\lambda) - b_q_, b_pi_ = targetqnet(b_o_) - b_v_ = (b_q_ * b_pi_).numpy().sum(1) - b_q, b_pi = targetqnet(b_o) - b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1).numpy() - c = np.clip(b_pi.numpy() / (b_old_pi + 1e-8), None, 1) - c = c[range(batch_size), b_a] - td = b_r + reward_gamma * (1 - b_d) * b_v_ - b_q - q_target = c * td + b_q - - # calculate loss - with tf.GradientTape() as q_tape: - b_q, _ = qnet(b_o) - b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1) - loss = tf.reduce_mean(huber_loss(b_q - q_target)) - - # backward gradients - q_grad 
= q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + nepisode = 0 t = time.time() + for i in range(1, number_timesteps + 1): + # select action based on boltzmann exploration + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qs, pi = qnet(obv) + a = np.random.multinomial(1, pi.numpy()[0]).argmax() + pi = pi.numpy()[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done, pi) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, b_old_pi = buffer.sample(batch_size) + + # q estimation based on 1 step retrace(\lambda) + b_q_, b_pi_ = targetqnet(b_o_) + b_v_ = (b_q_ * b_pi_).numpy().sum(1) + b_q, b_pi = targetqnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1).numpy() + c = np.clip(b_pi.numpy() / (b_old_pi + 1e-8), None, 1) + c = c[range(batch_size), b_a] + td = b_r + reward_gamma * (1 - b_d) * b_v_ - b_q + q_target = c * td + b_q + + # calculate loss + with tf.GradientTape() as q_tape: + b_q, _ = qnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean(huber_loss(b_q - q_target)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv)[0].numpy().argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) diff --git 
a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py index 8d625b5b9..52b63632d 100644 --- a/examples/reinforcement_learning/tutorial_prioritized_replay.py +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -1,13 +1,39 @@ -"""Implement prioritized replay +""" +Prioritized Experience Replay +------------------------ +Prioritized experience replay is an efficient replay method that replay +important transitions more frequently. Segment tree data structure is used to +speed up indexing. + + +Reference: +------------------------ Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv preprint arXiv:1511.05952, 2015. -# Requirements -tensorflow==2.0.0a0 -tensorlayer==2.0.1 +Dhariwal P, Hesse C, Klimov O, et al. Openai baselines (2017)[J]. URL +https://github. com/opfenai/baselines. + + +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_prioritized_replay.py --mode=train +python tutorial_prioritized_replay.py --mode=test --save_path=per/8000.npz """ +import argparse import operator +import os import random import time @@ -18,8 +44,25 @@ from tutorial_wrappers import build_env -seed = 0 -env_id = 'CartPole-v0' # CartPole-v0, PongNoFrameskip-v4 +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument('--save_path', default='per', + help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update') +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', + help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### if env_id == 'CartPole-v0': qnet_type = 'MLP' number_timesteps = 10000 # total number of time steps to train on @@ -42,16 +85,16 @@ target_q_update_freq = 200 # how frequency target q net update ob_scale = 1.0 / 255 # scale observations -env = build_env(env_id, seed=seed) in_dim = env.observation_space.shape out_dim = env.action_space.n reward_gamma = 0.99 # reward discount batch_size = 32 # batch size for sampling from replay buffer warm_start = buffer_size / 10 # sample times befor learning -prioritized_replay_alpha = 0.6 -prioritized_replay_beta0 = 0.4 +prioritized_replay_alpha = 0.6 # alpha in PER +prioritized_replay_beta0 = 0.4 # initial beta in PER +# ############################## PER #################################### class MLP(tl.models.Model): def __init__(self, name): super(MLP, self).__init__(name=name) @@ -357,69 +400,101 @@ def sync(net, net_tar): var_tar.assign(var) -qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') -qnet.train() -trainabel_weights = qnet.trainable_weights -targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') -targetqnet.infer() -sync(qnet, targetqnet) -optimizer = tf.optimizers.Adam(learning_rate=lr) -buffer = PrioritizedReplayBuffer( - buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) - -o = env.reset() -nepisode = 0 -t = time.time() -for i in range(1, 
number_timesteps + 1): - eps = epsilon(i) - buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps - - # select action - if random.random() < eps: - a = int(random.random() * out_dim) - else: - obv = np.expand_dims(o, 0).astype('float32') * ob_scale - a = qnet(obv).numpy().argmax(1)[0] - - # execute action and feed to replay buffer - # note that `_` tail in var name means next - o_, r, done, info = env.step(a) - buffer.add(o, a, r, o_, done) - - if i >= warm_start: - # sync q net and target q net - if i % target_q_update_freq == 0: - sync(qnet, targetqnet) - - # sample from replay buffer - b_o, b_a, b_r, b_o_, b_d, weights, idxs = buffer.sample(batch_size) - - # q estimation - b_q_ = (1 - b_d) * tf.reduce_max(targetqnet(b_o_), 1) - - # calculate loss - with tf.GradientTape() as q_tape: - b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) - abs_td_error = tf.abs(b_q - (b_r + reward_gamma * b_q_)) - priorities = np.clip(abs_td_error.numpy(), 1e-6, None) - buffer.update_priorities(idxs, priorities) - loss = tf.reduce_mean(weights * huber_loss(abs_td_error)) - - # backward gradients - q_grad = q_tape.gradient(loss, trainabel_weights) - optimizer.apply_gradients(zip(q_grad, trainabel_weights)) - - if done: +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = PrioritizedReplayBuffer( + buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) + o = env.reset() - else: - o = o_ - - # episode in info is real (unwrapped) message - if info.get('episode'): - nepisode += 1 - reward, length = info['episode']['r'], info['episode']['l'] - fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + nepisode = 0 t = time.time() + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, weights, idxs \ + = buffer.sample(batch_size) + + # q estimation + b_q_ = (1 - b_d) * tf.reduce_max(targetqnet(b_o_), 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + abs_td_error = tf.abs(b_q - (b_r + reward_gamma * b_q_)) + priorities = np.clip(abs_td_error.numpy(), 1e-6, None) + buffer.update_priorities(idxs, priorities) + loss = tf.reduce_mean(weights * huber_loss(abs_td_error)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) 
message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}' + .format(i, nepisode, reward, length, fps)) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print('Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}' + .format(i, nepisode, reward, length)) From c00af6d115b8965aa7fee3e09908a7fe46560aee Mon Sep 17 00:00:00 2001 From: Officium Date: Tue, 11 Jun 2019 17:34:46 +0800 Subject: [PATCH 50/57] make format --- .../tutorial_cifar10_cnn_static.py | 2 +- .../tutorial_mnist_mlp_dynamic.py | 2 +- .../tutorial_mnist_mlp_dynamic_2.py | 2 +- .../tutorial_mnist_mlp_static.py | 2 +- .../tutorial_mnist_mlp_static_2.py | 2 +- .../basic_tutorials/tutorial_mnist_siamese.py | 2 +- .../basic_tutorials/tutorial_mnist_simple.py | 2 +- examples/data_process/data/__init__.py | 1 - .../tutorial_fast_affine_transform.py | 4 ++-- .../data_process/tutorial_tf_dataset_voc.py | 2 +- examples/data_process/tutorial_tfrecord.py | 2 +- examples/data_process/tutorial_tfrecord2.py | 2 +- examples/data_process/tutorial_tfrecord3.py | 2 +- examples/database/dispatch_tasks.py | 1 + examples/database/task_script.py | 1 + ...tutorial_imagenet_inceptionV3_distributed.py | 8 ++++---- .../tutorial_mnist_distributed.py | 1 + .../tutorial_cifar10_distributed_trainer.py | 2 +- .../tutorial_mnist_distributed_trainer.py | 2 +- examples/keras_tfslim/tutorial_keras.py | 2 +- examples/pretrained_cnn/data/__init__.py | 1 - .../tutorial_models_mobilenetv1.py | 2 +- .../tutorial_models_squeezenetv1.py | 2 +- .../pretrained_cnn/tutorial_models_vgg16.py | 2 +- .../pretrained_cnn/tutorial_models_vgg19.py | 2 +- .../tutorial_models_vgg_static.py | 2 +- .../tutorial_binarynet_cifar10_tfrecord.py | 1 + .../tutorial_binarynet_mnist_cnn.py | 1 + .../tutorial_dorefanet_cifar10_tfrecord.py | 1 + .../tutorial_dorefanet_mnist_cnn.py | 1 + .../quantized_net/tutorial_quanconv_cifar10.py | 2 +- .../quantized_net/tutorial_quanconv_mnist.py | 1 + .../tutorial_ternaryweight_cifar10_tfrecord.py | 1 + .../tutorial_ternaryweight_mnist_cnn.py | 1 + .../reinforcement_learning/baselines/SAC.py | 4 ++-- .../reinforcement_learning/baselines/utils.py | 9 ++------- .../baselines/wrappers.py | 3 +-- examples/reinforcement_learning/tutorial_A3C.py | 7 ++++--- examples/reinforcement_learning/tutorial_AC.py | 9 ++++----- examples/reinforcement_learning/tutorial_C51.py | 3 +-- .../reinforcement_learning/tutorial_DDPG.py | 14 ++++++++------ .../reinforcement_learning/tutorial_DPPO.py | 17 ++++++++++------- examples/reinforcement_learning/tutorial_DQN.py | 8 ++++---- .../tutorial_DQN_variants.py | 3 +-- examples/reinforcement_learning/tutorial_PG.py | 13 +++++++------ examples/reinforcement_learning/tutorial_PPO.py | 16 +++++++++------- 
.../tutorial_Qlearning.py | 3 +-- .../reinforcement_learning/tutorial_Retrace.py | 3 +-- examples/reinforcement_learning/tutorial_SAC.py | 4 ++-- examples/reinforcement_learning/tutorial_TD3.py | 4 ++-- .../reinforcement_learning/tutorial_TRPO.py | 17 +++++++++-------- .../tutorial_atari_pong.py | 4 ++-- .../reinforcement_learning/tutorial_format.py | 8 +++++--- .../tutorial_prioritized_replay.py | 3 +-- .../reinforcement_learning/tutorial_wrappers.py | 3 +-- ...orial_spatial_transformer_network_dynamic.py | 2 +- ...torial_spatial_transformer_network_static.py | 2 +- .../tutorial_imdb_fasttext.py | 2 +- examples/text_generation/data/__init__.py | 1 + .../text_generation/tutorial_generate_text.py | 2 +- examples/text_ptb/tutorial_ptb_lstm.py | 2 +- .../tutorial_ptb_lstm_state_is_tuple.py | 2 +- .../tutorial_word2vec_basic.py | 2 +- examples/tutorial_work_with_onnx.py | 4 ++-- tensorlayer/__init__.py | 17 ++++------------- tensorlayer/activation.py | 1 + tensorlayer/cost.py | 1 + tensorlayer/db.py | 4 ++-- tensorlayer/decorators/__init__.py | 3 +-- tensorlayer/distributed.py | 1 + tensorlayer/files/__init__.py | 1 - .../files/dataset_loaders/voc_dataset.py | 1 + .../files/dataset_loaders/wmt_en_fr_dataset.py | 1 + tensorlayer/files/utils.py | 8 ++++---- tensorlayer/initializers.py | 1 - tensorlayer/layers/__init__.py | 5 +---- tensorlayer/layers/activation.py | 1 + tensorlayer/layers/convolution/__init__.py | 5 ++--- tensorlayer/layers/convolution/binary_conv.py | 1 + .../layers/convolution/deformable_conv.py | 1 + .../layers/convolution/depthwise_conv.py | 1 + tensorlayer/layers/convolution/dorefa_conv.py | 1 + tensorlayer/layers/convolution/expert_conv.py | 1 + tensorlayer/layers/convolution/expert_deconv.py | 1 + tensorlayer/layers/convolution/group_conv.py | 1 + tensorlayer/layers/convolution/quan_conv.py | 1 + tensorlayer/layers/convolution/quan_conv_bn.py | 1 + .../layers/convolution/separable_conv.py | 2 +- .../layers/convolution/simplified_conv.py | 1 + .../layers/convolution/simplified_deconv.py | 2 +- .../layers/convolution/super_resolution.py | 1 + tensorlayer/layers/convolution/ternary_conv.py | 1 + tensorlayer/layers/core.py | 1 + tensorlayer/layers/dense/__init__.py | 2 +- tensorlayer/layers/dense/base_dense.py | 2 +- tensorlayer/layers/dense/binary_dense.py | 1 + tensorlayer/layers/dense/dorefa_dense.py | 1 + tensorlayer/layers/dense/dropconnect.py | 1 + tensorlayer/layers/dense/quan_dense.py | 1 + tensorlayer/layers/dense/quan_dense_bn.py | 1 + tensorlayer/layers/dense/ternary_dense.py | 1 + tensorlayer/layers/dropout.py | 1 + tensorlayer/layers/embedding.py | 2 +- tensorlayer/layers/extend.py | 1 + tensorlayer/layers/image_resampling.py | 1 + tensorlayer/layers/inputs.py | 2 +- tensorlayer/layers/lambda_layers.py | 1 + tensorlayer/layers/merge.py | 1 + tensorlayer/layers/noise.py | 1 + tensorlayer/layers/normalization.py | 3 ++- tensorlayer/layers/padding.py | 1 + tensorlayer/layers/pooling.py | 1 + tensorlayer/layers/quantize.py | 1 + tensorlayer/layers/recurrent.py | 1 + tensorlayer/layers/scale.py | 1 + tensorlayer/layers/shape.py | 1 + tensorlayer/layers/spatial_transformer.py | 4 ++-- tensorlayer/layers/stack.py | 1 + tensorlayer/layers/utils.py | 4 ++-- tensorlayer/models/__init__.py | 2 +- tensorlayer/models/core.py | 3 ++- tensorlayer/models/mobilenetv1.py | 1 + tensorlayer/models/squeezenetv1.py | 1 + tensorlayer/models/vgg.py | 2 +- tensorlayer/nlp.py | 4 ++-- tensorlayer/rein.py | 3 +-- tensorlayer/utils.py | 2 +- 127 files changed, 197 insertions(+), 164 
deletions(-) diff --git a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py index 93794c414..c12c791a1 100644 --- a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py +++ b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py @@ -5,8 +5,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LocalResponseNorm, MaxPool2d) diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py index 13db1abae..1ffa7fbe0 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py @@ -1,8 +1,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py index 0d94b1dfa..b752012b0 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py @@ -1,8 +1,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input, LayerList from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static.py b/examples/basic_tutorials/tutorial_mnist_mlp_static.py index de811a8d8..c9c15f911 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static.py @@ -1,8 +1,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py index a9a2c7d48..f0836c528 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py @@ -1,8 +1,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_siamese.py b/examples/basic_tutorials/tutorial_mnist_siamese.py index fe4abdc52..db43f1163 100644 --- a/examples/basic_tutorials/tutorial_mnist_siamese.py +++ b/examples/basic_tutorials/tutorial_mnist_siamese.py @@ -14,8 +14,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Flatten, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_simple.py b/examples/basic_tutorials/tutorial_mnist_simple.py index ceaee0c48..b1ccd052b 100644 --- a/examples/basic_tutorials/tutorial_mnist_simple.py +++ b/examples/basic_tutorials/tutorial_mnist_simple.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/data/__init__.py b/examples/data_process/data/__init__.py index 8b31b202a..83d5401c3 100644 --- a/examples/data_process/data/__init__.py +++ b/examples/data_process/data/__init__.py @@ -1,4 +1,3 @@ from __future__ import 
absolute_import -# from . import imagenet_classes from . import * diff --git a/examples/data_process/tutorial_fast_affine_transform.py b/examples/data_process/tutorial_fast_affine_transform.py index 71890f5bd..52452ffd5 100644 --- a/examples/data_process/tutorial_fast_affine_transform.py +++ b/examples/data_process/tutorial_fast_affine_transform.py @@ -8,10 +8,10 @@ import multiprocessing import time -import numpy as np - import cv2 +import numpy as np import tensorflow as tf + import tensorlayer as tl # tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_tf_dataset_voc.py b/examples/data_process/tutorial_tf_dataset_voc.py index 9779b1f60..fab1612f7 100644 --- a/examples/data_process/tutorial_tf_dataset_voc.py +++ b/examples/data_process/tutorial_tf_dataset_voc.py @@ -13,8 +13,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl # tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/data_process/tutorial_tfrecord.py b/examples/data_process/tutorial_tfrecord.py index bcf3fe46a..c0b0181f8 100644 --- a/examples/data_process/tutorial_tfrecord.py +++ b/examples/data_process/tutorial_tfrecord.py @@ -22,9 +22,9 @@ import os import numpy as np +import tensorflow as tf from PIL import Image -import tensorflow as tf import tensorlayer as tl ## Save data ================================================================== diff --git a/examples/data_process/tutorial_tfrecord2.py b/examples/data_process/tutorial_tfrecord2.py index 22b3d7757..be41b697f 100755 --- a/examples/data_process/tutorial_tfrecord2.py +++ b/examples/data_process/tutorial_tfrecord2.py @@ -14,10 +14,10 @@ import os import numpy as np - # import matplotlib # matplotlib.use('GTK') import tensorflow as tf + import tensorlayer as tl # Download data, and convert to TFRecord format, see ```tutorial_tfrecord.py``` diff --git a/examples/data_process/tutorial_tfrecord3.py b/examples/data_process/tutorial_tfrecord3.py index bc8752f2a..9e5751a25 100644 --- a/examples/data_process/tutorial_tfrecord3.py +++ b/examples/data_process/tutorial_tfrecord3.py @@ -19,9 +19,9 @@ import os import numpy as np +import tensorflow as tf from PIL import Image -import tensorflow as tf import tensorlayer as tl diff --git a/examples/database/dispatch_tasks.py b/examples/database/dispatch_tasks.py index 260257e77..d1204bcd4 100644 --- a/examples/database/dispatch_tasks.py +++ b/examples/database/dispatch_tasks.py @@ -6,6 +6,7 @@ import time import tensorflow as tf + import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/database/task_script.py b/examples/database/task_script.py index 58ef60d1a..ad51dd3ed 100644 --- a/examples/database/task_script.py +++ b/examples/database/task_script.py @@ -1,6 +1,7 @@ """Sample task script.""" import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py index 15c0a3f3c..e54f565b5 100644 --- a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py +++ b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py @@ -19,18 +19,18 @@ from xml.etree import ElementTree import numpy as np - import tensorflow as tf -import tensorlayer as tl from tensorflow.contrib import slim -from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, - inception_v3_arg_scope) +from 
tensorflow.contrib.slim.python.slim.nets.inception_v3 import ( + inception_v3, inception_v3_arg_scope) from tensorflow.python.framework.errors_impl import OutOfRangeError from tensorflow.python.training import session_run_hook from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook from tensorflow.python.training.monitored_session import \ SingularMonitoredSession +import tensorlayer as tl + tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_mnist_distributed.py b/examples/deprecated_tutorials/tutorial_mnist_distributed.py index 18f7cdb92..29d291ba4 100644 --- a/examples/deprecated_tutorials/tutorial_mnist_distributed.py +++ b/examples/deprecated_tutorials/tutorial_mnist_distributed.py @@ -13,6 +13,7 @@ """ import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py index ce3aec007..1ddc2d937 100644 --- a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py +++ b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py @@ -15,8 +15,8 @@ import multiprocessing import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, FlattenLayer, InputLayer, MaxPool2d) diff --git a/examples/distributed_training/tutorial_mnist_distributed_trainer.py b/examples/distributed_training/tutorial_mnist_distributed_trainer.py index 0cf916370..0f1b8b6dd 100755 --- a/examples/distributed_training/tutorial_mnist_distributed_trainer.py +++ b/examples/distributed_training/tutorial_mnist_distributed_trainer.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/keras_tfslim/tutorial_keras.py b/examples/keras_tfslim/tutorial_keras.py index 33a9ca860..0622bc745 100644 --- a/examples/keras_tfslim/tutorial_keras.py +++ b/examples/keras_tfslim/tutorial_keras.py @@ -4,8 +4,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import Input, Lambda diff --git a/examples/pretrained_cnn/data/__init__.py b/examples/pretrained_cnn/data/__init__.py index 8b31b202a..83d5401c3 100644 --- a/examples/pretrained_cnn/data/__init__.py +++ b/examples/pretrained_cnn/data/__init__.py @@ -1,4 +1,3 @@ from __future__ import absolute_import -# from . import imagenet_classes from . 
import * diff --git a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py index 6b797a075..8d7b35a6b 100644 --- a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py @@ -10,8 +10,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py index 755d6c28b..9b6ee4e7f 100644 --- a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py @@ -5,8 +5,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg16.py b/examples/pretrained_cnn/tutorial_models_vgg16.py index b1bd3823f..e6bb1c22e 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg16.py +++ b/examples/pretrained_cnn/tutorial_models_vgg16.py @@ -5,8 +5,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg19.py b/examples/pretrained_cnn/tutorial_models_vgg19.py index 922c3bdf5..850412c38 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg19.py +++ b/examples/pretrained_cnn/tutorial_models_vgg19.py @@ -5,8 +5,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg_static.py b/examples/pretrained_cnn/tutorial_models_vgg_static.py index a0e056e4d..40a3ed865 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg_static.py +++ b/examples/pretrained_cnn/tutorial_models_vgg_static.py @@ -5,8 +5,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py index d3205045a..98532debb 100644 --- a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py @@ -43,6 +43,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py index 84fbf7fc9..248812e23 100644 --- a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py @@ -4,6 +4,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py index fe7666bab..9c8ab1239 100644 --- a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py @@ -43,6 +43,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py 
b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py index d8cab9bc8..90d7b0893 100644 --- a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py @@ -4,6 +4,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_quanconv_cifar10.py b/examples/quantized_net/tutorial_quanconv_cifar10.py index f93368467..6eb35ed67 100644 --- a/examples/quantized_net/tutorial_quanconv_cifar10.py +++ b/examples/quantized_net/tutorial_quanconv_cifar10.py @@ -41,8 +41,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl bitW = 8 diff --git a/examples/quantized_net/tutorial_quanconv_mnist.py b/examples/quantized_net/tutorial_quanconv_mnist.py index 66d52d13c..4060c6137 100644 --- a/examples/quantized_net/tutorial_quanconv_mnist.py +++ b/examples/quantized_net/tutorial_quanconv_mnist.py @@ -4,6 +4,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py index b695fa88a..f1ee7b4bb 100644 --- a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py @@ -42,6 +42,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py index 6850b9591..e1c305db6 100644 --- a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py +++ b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py @@ -4,6 +4,7 @@ import time import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/reinforcement_learning/baselines/SAC.py b/examples/reinforcement_learning/baselines/SAC.py index df017edbf..ac8109fad 100644 --- a/examples/reinforcement_learning/baselines/SAC.py +++ b/examples/reinforcement_learning/baselines/SAC.py @@ -24,12 +24,12 @@ import random import time +import gym import matplotlib.pyplot as plt import numpy as np +import tensorflow as tf from IPython.display import clear_output -import gym -import tensorflow as tf import tensorflow_probability as tfp import tensorlayer as tl from tensorlayer.layers import Dense diff --git a/examples/reinforcement_learning/baselines/utils.py b/examples/reinforcement_learning/baselines/utils.py index f8b537a0c..2a02ee1a1 100644 --- a/examples/reinforcement_learning/baselines/utils.py +++ b/examples/reinforcement_learning/baselines/utils.py @@ -6,14 +6,14 @@ tensorlayer==2.0.1 """ +import os import random import time import matplotlib.pyplot as plt -import tensorlayer as tl import numpy as np -import os +import tensorlayer as tl def plot(episode_rewards, Algorithm_name, Env_name): @@ -90,8 +90,3 @@ def sample(self, batch_size): def __len__(self): return len(self.buffer) - - - - - diff --git a/examples/reinforcement_learning/baselines/wrappers.py b/examples/reinforcement_learning/baselines/wrappers.py index 963849598..60b3a563a 100644 --- a/examples/reinforcement_learning/baselines/wrappers.py +++ b/examples/reinforcement_learning/baselines/wrappers.py @@ -4,7 +4,7 @@ """ from collections import deque from functools import partial -from multiprocessing import cpu_count, Process, 
Pipe +from multiprocessing import Pipe, Process, cpu_count from sys import platform import cv2 @@ -12,7 +12,6 @@ import numpy as np from gym import spaces - __all__ = ( 'build_env', # build env 'TimeLimit', # Time limit wrapper diff --git a/examples/reinforcement_learning/tutorial_A3C.py b/examples/reinforcement_learning/tutorial_A3C.py index fb2c94ce7..70dab7035 100644 --- a/examples/reinforcement_learning/tutorial_A3C.py +++ b/examples/reinforcement_learning/tutorial_A3C.py @@ -44,14 +44,15 @@ """ +import argparse import multiprocessing import threading - -import numpy as np -import argparse import time + import gym +import numpy as np import tensorflow as tf + import tensorflow_probability as tfp import tensorlayer as tl from tensorlayer.layers import DenseLayer, InputLayer diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index f91d07f8c..a9b4d6232 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -45,15 +45,14 @@ python tutorial_AC.py --train/test """ +import argparse import time -import numpy as np - import gym +import numpy as np import tensorflow as tf -import tensorlayer as tl -import argparse +import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) @@ -314,4 +313,4 @@ def load_ckpt(self): # load trained weights print("reward", rall) s = env.reset().astype(np.float32) rall = 0 - break \ No newline at end of file + break diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py index 114daae67..8e443d600 100644 --- a/examples/reinforcement_learning/tutorial_C51.py +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -36,11 +36,10 @@ import numpy as np import tensorflow as tf -import tensorlayer as tl +import tensorlayer as tl from tutorial_wrappers import build_env - parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') parser.add_argument('--save_path', default='c51', diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py index 0bd9cadd0..a24452375 100644 --- a/examples/reinforcement_learning/tutorial_DDPG.py +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -27,14 +27,16 @@ """ -import tensorflow as tf -import tensorlayer as tl -import numpy as np -import gym +import argparse +import os import time + +import gym import matplotlib.pyplot as plt -import os -import argparse +import numpy as np +import tensorflow as tf + +import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py index 6bebc64a5..15b013ec4 100644 --- a/examples/reinforcement_learning/tutorial_DPPO.py +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -29,16 +29,19 @@ """ -import tensorflow as tf -import numpy as np -import matplotlib.pyplot as plt -import gym, threading, queue +import argparse +import os +import queue +import threading import time -import tensorlayer as tl +import gym +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf + import tensorflow_probability as tfp -import os -import argparse +import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') 
parser.add_argument('--train', dest='train', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_DQN.py b/examples/reinforcement_learning/tutorial_DQN.py index 65e11d193..492a7fd20 100644 --- a/examples/reinforcement_learning/tutorial_DQN.py +++ b/examples/reinforcement_learning/tutorial_DQN.py @@ -43,14 +43,14 @@ """ -import time -import numpy as np import argparse +import time import gym +import numpy as np import tensorflow as tf -import tensorlayer as tl +import tensorlayer as tl # add arguments in command --train/test parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') @@ -180,4 +180,4 @@ def load_ckpt(model): # load trained weights # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ - .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) \ No newline at end of file + .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index a1b281b9b..d6292e987 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -52,11 +52,10 @@ import numpy as np import tensorflow as tf -import tensorlayer as tl +import tensorlayer as tl from tutorial_wrappers import build_env - parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') parser.add_argument('--save_path', default='dqn_variants', diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py index 7adb76d2d..773ef9a72 100644 --- a/examples/reinforcement_learning/tutorial_PG.py +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -27,15 +27,16 @@ """ -import tensorflow as tf -import tensorlayer as tl -import numpy as np +import argparse +import os +import time import gym import matplotlib.pyplot as plt -import time -import os -import argparse +import numpy as np +import tensorflow as tf + +import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py index 72cbc8e1d..3ad01f6b2 100644 --- a/examples/reinforcement_learning/tutorial_PPO.py +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -28,15 +28,17 @@ """ -import tensorflow as tf -import numpy as np -import matplotlib.pyplot as plt +import argparse +import os +import time + import gym -import tensorlayer as tl +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf + import tensorflow_probability as tfp -import time -import os -import argparse +import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_Qlearning.py b/examples/reinforcement_learning/tutorial_Qlearning.py index a8decb273..d11bade39 100644 --- a/examples/reinforcement_learning/tutorial_Qlearning.py +++ b/examples/reinforcement_learning/tutorial_Qlearning.py @@ -18,9 +18,8 @@ import 
time -import numpy as np - import gym +import numpy as np ## Load the environment env = gym.make('FrozenLake-v0') diff --git a/examples/reinforcement_learning/tutorial_Retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py index 13535c3cc..8dfe844ed 100644 --- a/examples/reinforcement_learning/tutorial_Retrace.py +++ b/examples/reinforcement_learning/tutorial_Retrace.py @@ -37,11 +37,10 @@ import numpy as np import tensorflow as tf -import tensorlayer as tl +import tensorlayer as tl from tutorial_wrappers import build_env - parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') parser.add_argument('--save_path', default='retrace', diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index f37b81e5b..cb15e838d 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -33,12 +33,12 @@ import random import time +import gym import matplotlib.pyplot as plt import numpy as np +import tensorflow as tf from IPython.display import clear_output -import gym -import tensorflow as tf import tensorflow_probability as tfp import tensorlayer as tl from tensorlayer.layers import Dense diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index 6ceca51ef..4b1b375d4 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -35,12 +35,12 @@ import random import time +import gym import matplotlib.pyplot as plt import numpy as np +import tensorflow as tf from IPython.display import clear_output -import gym -import tensorflow as tf import tensorflow_probability as tfp import tensorlayer as tl from tensorlayer.layers import Dense diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index 017ac086d..ec63d1b5b 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -28,19 +28,20 @@ python tutorial_TRPO.py --train/test """ -import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp -import tensorlayer as tl -import gym +import argparse +import copy +import os import time +import gym import matplotlib.pyplot as plt +import numpy as np import scipy.signal -import copy +import tensorflow as tf from gym.spaces import Box, Discrete -import os -import argparse + +import tensorflow_probability as tfp +import tensorlayer as tl parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') parser.add_argument('--train', dest='train', action='store_true', default=True) diff --git a/examples/reinforcement_learning/tutorial_atari_pong.py b/examples/reinforcement_learning/tutorial_atari_pong.py index 7e1b28822..2c90ed9d8 100644 --- a/examples/reinforcement_learning/tutorial_atari_pong.py +++ b/examples/reinforcement_learning/tutorial_atari_pong.py @@ -28,10 +28,10 @@ """ import time -import numpy as np - import gym +import numpy as np import tensorflow as tf + import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index 645f5742b..18e2c5c78 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -28,12 +28,15 @@ ''' -import time import argparse +import time + import numpy as 
np import tensorflow as tf -import 'other package name' +import 'other +import name' +import package np.random.seed(2) tf.random.set_seed(2) # reproducible @@ -99,4 +102,3 @@ def D(): # some common functions, could be extracted into utils afterwards print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) - diff --git a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py index 52b63632d..55b1b9ff7 100644 --- a/examples/reinforcement_learning/tutorial_prioritized_replay.py +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -39,11 +39,10 @@ import numpy as np import tensorflow as tf -import tensorlayer as tl +import tensorlayer as tl from tutorial_wrappers import build_env - parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') parser.add_argument('--save_path', default='per', diff --git a/examples/reinforcement_learning/tutorial_wrappers.py b/examples/reinforcement_learning/tutorial_wrappers.py index 231a9880b..451d0be27 100644 --- a/examples/reinforcement_learning/tutorial_wrappers.py +++ b/examples/reinforcement_learning/tutorial_wrappers.py @@ -4,7 +4,7 @@ """ from collections import deque from functools import partial -from multiprocessing import cpu_count, Process, Pipe +from multiprocessing import Pipe, Process, cpu_count from sys import platform import cv2 @@ -12,7 +12,6 @@ import numpy as np from gym import spaces - __all__ = ( 'build_env', # build env 'TimeLimit', # Time limit wrapper diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py index 3170585e4..e4b9b1dcf 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py @@ -3,8 +3,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import Model diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py index 5f09db68b..c09a3c46c 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py @@ -3,8 +3,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import Model diff --git a/examples/text_classification/tutorial_imdb_fasttext.py b/examples/text_classification/tutorial_imdb_fasttext.py index 731d2fce4..2c2c7aed0 100644 --- a/examples/text_classification/tutorial_imdb_fasttext.py +++ b/examples/text_classification/tutorial_imdb_fasttext.py @@ -31,8 +31,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import * diff --git a/examples/text_generation/data/__init__.py b/examples/text_generation/data/__init__.py index 7acccd1ee..5feb25700 100644 --- a/examples/text_generation/data/__init__.py +++ b/examples/text_generation/data/__init__.py @@ -1,4 +1,5 @@ from __future__ import absolute_import from . 
import imagenet_classes + # from . import diff --git a/examples/text_generation/tutorial_generate_text.py b/examples/text_generation/tutorial_generate_text.py index 4c42d0b12..22a17ea37 100644 --- a/examples/text_generation/tutorial_generate_text.py +++ b/examples/text_generation/tutorial_generate_text.py @@ -28,8 +28,8 @@ import nltk import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer.layers import * diff --git a/examples/text_ptb/tutorial_ptb_lstm.py b/examples/text_ptb/tutorial_ptb_lstm.py index 77c7c3425..de08399c9 100644 --- a/examples/text_ptb/tutorial_ptb_lstm.py +++ b/examples/text_ptb/tutorial_ptb_lstm.py @@ -104,8 +104,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py index 9fccca66a..0021a7bfc 100644 --- a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py +++ b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py @@ -105,8 +105,8 @@ import time import numpy as np - import tensorflow as tf + import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_word_embedding/tutorial_word2vec_basic.py b/examples/text_word_embedding/tutorial_word2vec_basic.py index 5a1dc842c..6310699ad 100644 --- a/examples/text_word_embedding/tutorial_word2vec_basic.py +++ b/examples/text_word_embedding/tutorial_word2vec_basic.py @@ -44,9 +44,9 @@ import time import numpy as np +import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin -import tensorflow as tf import tensorlayer as tl import wget diff --git a/examples/tutorial_work_with_onnx.py b/examples/tutorial_work_with_onnx.py index 46fd0cb42..522f2ad8c 100644 --- a/examples/tutorial_work_with_onnx.py +++ b/examples/tutorial_work_with_onnx.py @@ -117,13 +117,13 @@ import time import numpy as np +import tensorflow as tf +from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph import onnx -import tensorflow as tf import tensorlayer as tl from onnx_tf.backend import prepare from onnx_tf.frontend import tensorflow_graph_to_onnx_model -from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/tensorlayer/__init__.py b/tensorlayer/__init__.py index 835a4935f..f89eebfff 100644 --- a/tensorlayer/__init__.py +++ b/tensorlayer/__init__.py @@ -5,19 +5,10 @@ import os from distutils.version import LooseVersion -from tensorlayer.package_info import VERSION -from tensorlayer.package_info import __shortversion__ -from tensorlayer.package_info import __version__ - -from tensorlayer.package_info import __package_name__ -from tensorlayer.package_info import __contact_names__ -from tensorlayer.package_info import __contact_emails__ -from tensorlayer.package_info import __homepage__ -from tensorlayer.package_info import __repository_url__ -from tensorlayer.package_info import __download_url__ -from tensorlayer.package_info import __description__ -from tensorlayer.package_info import __license__ -from tensorlayer.package_info import __keywords__ +from tensorlayer.package_info import ( + VERSION, __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, + __license__, __package_name__, __repository_url__, __shortversion__, __version__ +) if 'TENSORLAYER_PACKAGE_BUILDING' not in 
os.environ: diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py index 4aef4a429..7c7b833c3 100644 --- a/tensorlayer/activation.py +++ b/tensorlayer/activation.py @@ -3,6 +3,7 @@ """A file containing various activation functions.""" import tensorflow as tf + from tensorlayer.decorators import deprecated __all__ = [ diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py index 2664d8d72..753d58041 100644 --- a/tensorlayer/cost.py +++ b/tensorlayer/cost.py @@ -6,6 +6,7 @@ import tensorflow as tf from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops, math_ops, nn_ops, standard_ops + from tensorlayer import logging __all__ = [ diff --git a/tensorlayer/db.py b/tensorlayer/db.py index 1de73bf6a..7e9561f2f 100644 --- a/tensorlayer/db.py +++ b/tensorlayer/db.py @@ -8,10 +8,10 @@ from datetime import datetime import numpy as np +import tensorflow as tf import gridfs import pymongo -import tensorflow as tf from tensorlayer import logging from tensorlayer.files import ( assign_weights, del_folder, exists_or_mkdir, load_hdf5_to_weights, net2static_graph, save_weights_to_hdf5, @@ -641,7 +641,7 @@ def run_top_task(self, task_name=None, sort=None, **kwargs): logging.info("[Database] Start Task: key: {} sort: {} push time: {}".format(task_name, sort, _datetime)) _script = _script.decode('utf-8') with tf.Graph().as_default(): # # as graph: # clear all TF graphs - exec (_script, globals()) + exec(_script, globals()) # set status to finished _ = self.db.Task.find_one_and_update({'_id': _id}, {'$set': {'status': 'finished'}}) diff --git a/tensorlayer/decorators/__init__.py b/tensorlayer/decorators/__init__.py index 9d4eeaa17..2a289862a 100644 --- a/tensorlayer/decorators/__init__.py +++ b/tensorlayer/decorators/__init__.py @@ -11,7 +11,6 @@ from .deprecated import deprecated from .deprecated_alias import deprecated_alias -from .method_decorator import private_method -from .method_decorator import protected_method +from .method_decorator import private_method, protected_method __all__ = ['deprecated', 'deprecated_alias', 'private_method', 'protected_method'] diff --git a/tensorlayer/distributed.py b/tensorlayer/distributed.py index d3fbdd38f..544aac87e 100644 --- a/tensorlayer/distributed.py +++ b/tensorlayer/distributed.py @@ -6,6 +6,7 @@ import tensorflow as tf from tensorflow.python.training import session_run_hook + from tensorlayer import logging from tensorlayer.decorators import deprecated from tensorlayer.lazy_imports import LazyImport diff --git a/tensorlayer/files/__init__.py b/tensorlayer/files/__init__.py index e96fc663e..4d88fa35d 100644 --- a/tensorlayer/files/__init__.py +++ b/tensorlayer/files/__init__.py @@ -25,7 +25,6 @@ from .dataset_loaders.ptb_dataset import * from .dataset_loaders.voc_dataset import * from .dataset_loaders.wmt_en_fr_dataset import * - from .utils import * __all__ = [ diff --git a/tensorlayer/files/dataset_loaders/voc_dataset.py b/tensorlayer/files/dataset_loaders/voc_dataset.py index c5ccadbcf..458d5eb66 100644 --- a/tensorlayer/files/dataset_loaders/voc_dataset.py +++ b/tensorlayer/files/dataset_loaders/voc_dataset.py @@ -4,6 +4,7 @@ import os import tensorflow as tf + from tensorlayer import logging, utils from tensorlayer.files.utils import (del_file, del_folder, folder_exists, load_file_list, diff --git a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py index 77c1f93f9..0261a8581 100644 --- a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py +++ 
b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py @@ -6,6 +6,7 @@ import tarfile from tensorflow.python.platform import gfile + from tensorlayer import logging from tensorlayer.files.utils import maybe_download_and_extract diff --git a/tensorlayer/files/utils.py b/tensorlayer/files/utils.py index e4b0f6f8e..d5c972dc6 100644 --- a/tensorlayer/files/utils.py +++ b/tensorlayer/files/utils.py @@ -18,16 +18,16 @@ import cloudpickle import h5py import numpy as np -import scipy.io as sio -from six.moves import cPickle - import progressbar +import scipy.io as sio import tensorflow as tf -import tensorlayer as tl +from six.moves import cPickle from tensorflow.python.keras.saving import model_config as model_config_lib from tensorflow.python.platform import gfile from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export + +import tensorlayer as tl from tensorlayer import logging, nlp, utils, visualize # from six.moves import zip diff --git a/tensorlayer/initializers.py b/tensorlayer/initializers.py index f68c05c1d..666777824 100644 --- a/tensorlayer/initializers.py +++ b/tensorlayer/initializers.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf __all__ = [ diff --git a/tensorlayer/layers/__init__.py b/tensorlayer/layers/__init__.py index 007a23128..febfc887c 100644 --- a/tensorlayer/layers/__init__.py +++ b/tensorlayer/layers/__init__.py @@ -13,11 +13,10 @@ from .convolution import * from .core import * from .dense import * -from .dropout import * from .deprecated import * +from .dropout import * from .embedding import * from .extend import * -# from .flow_control import * # remove for TF 2.0 from .image_resampling import * from .inputs import * from .lambda_layers import * @@ -27,11 +26,9 @@ from .padding import * from .pooling import * from .quantize import * -# from .reconstruction import * # remove for TF 2.0 from .recurrent import * from .scale import * from .shape import * from .spatial_transformer import * from .stack import * -# from .time_distribution import * # remove for TF 2.0 from .utils import * diff --git a/tensorlayer/layers/activation.py b/tensorlayer/layers/activation.py index 9abb19ce7..44fcc47a9 100644 --- a/tensorlayer/layers/activation.py +++ b/tensorlayer/layers/activation.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.activation import leaky_relu6, leaky_twice_relu6 from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/__init__.py b/tensorlayer/layers/convolution/__init__.py index ba68797f2..8cf4bd74c 100644 --- a/tensorlayer/layers/convolution/__init__.py +++ b/tensorlayer/layers/convolution/__init__.py @@ -9,7 +9,6 @@ More functions can be found in `TensorFlow API `__. 
""" -# from .atrous_conv import * # remove for TF 2.0 from .binary_conv import * from .deformable_conv import * from .depthwise_conv import * @@ -17,13 +16,13 @@ from .expert_conv import * from .expert_deconv import * from .group_conv import * +from .quan_conv import * +from .quan_conv_bn import * from .separable_conv import * from .simplified_conv import * from .simplified_deconv import * from .super_resolution import * from .ternary_conv import * -from .quan_conv import * -from .quan_conv_bn import * __all__ = [ diff --git a/tensorlayer/layers/convolution/binary_conv.py b/tensorlayer/layers/convolution/binary_conv.py index 14e5a8721..23448cf6f 100644 --- a/tensorlayer/layers/convolution/binary_conv.py +++ b/tensorlayer/layers/convolution/binary_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/deformable_conv.py b/tensorlayer/layers/convolution/deformable_conv.py index b9a8224db..5f75bbe15 100644 --- a/tensorlayer/layers/convolution/deformable_conv.py +++ b/tensorlayer/layers/convolution/deformable_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/depthwise_conv.py b/tensorlayer/layers/convolution/depthwise_conv.py index 4fe4dc34c..d6136ede3 100644 --- a/tensorlayer/layers/convolution/depthwise_conv.py +++ b/tensorlayer/layers/convolution/depthwise_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/dorefa_conv.py b/tensorlayer/layers/convolution/dorefa_conv.py index 1f8944382..ed9b32dd8 100644 --- a/tensorlayer/layers/convolution/dorefa_conv.py +++ b/tensorlayer/layers/convolution/dorefa_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_conv.py b/tensorlayer/layers/convolution/expert_conv.py index fb27b9df6..d7e59a0e8 100644 --- a/tensorlayer/layers/convolution/expert_conv.py +++ b/tensorlayer/layers/convolution/expert_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_deconv.py b/tensorlayer/layers/convolution/expert_deconv.py index a1571b2cb..cb5cd6773 100644 --- a/tensorlayer/layers/convolution/expert_deconv.py +++ b/tensorlayer/layers/convolution/expert_deconv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/group_conv.py b/tensorlayer/layers/convolution/group_conv.py index 2923a10ae..34d8c10e6 100644 --- a/tensorlayer/layers/convolution/group_conv.py +++ b/tensorlayer/layers/convolution/group_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git 
a/tensorlayer/layers/convolution/quan_conv.py b/tensorlayer/layers/convolution/quan_conv.py index 662df2661..432764b63 100644 --- a/tensorlayer/layers/convolution/quan_conv.py +++ b/tensorlayer/layers/convolution/quan_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/quan_conv_bn.py b/tensorlayer/layers/convolution/quan_conv_bn.py index 1c1593373..0ef5ac313 100644 --- a/tensorlayer/layers/convolution/quan_conv_bn.py +++ b/tensorlayer/layers/convolution/quan_conv_bn.py @@ -3,6 +3,7 @@ import tensorflow as tf from tensorflow.python.training import moving_averages + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/convolution/separable_conv.py b/tensorlayer/layers/convolution/separable_conv.py index ff67672ba..b6ae62446 100644 --- a/tensorlayer/layers/convolution/separable_conv.py +++ b/tensorlayer/layers/convolution/separable_conv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_conv.py b/tensorlayer/layers/convolution/simplified_conv.py index 8c8eebece..c00ff8fe7 100644 --- a/tensorlayer/layers/convolution/simplified_conv.py +++ b/tensorlayer/layers/convolution/simplified_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_deconv.py b/tensorlayer/layers/convolution/simplified_deconv.py index 569fe0810..847062859 100644 --- a/tensorlayer/layers/convolution/simplified_deconv.py +++ b/tensorlayer/layers/convolution/simplified_deconv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/super_resolution.py b/tensorlayer/layers/convolution/super_resolution.py index a3f51e2a8..35fee8722 100644 --- a/tensorlayer/layers/convolution/super_resolution.py +++ b/tensorlayer/layers/convolution/super_resolution.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/ternary_conv.py b/tensorlayer/layers/convolution/ternary_conv.py index 512350ba5..9a97c7bec 100644 --- a/tensorlayer/layers/convolution/ternary_conv.py +++ b/tensorlayer/layers/convolution/ternary_conv.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/core.py b/tensorlayer/layers/core.py index 8e13631b7..e5b2c27fb 100644 --- a/tensorlayer/layers/core.py +++ b/tensorlayer/layers/core.py @@ -5,6 +5,7 @@ from abc import abstractmethod import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import (deprecated_alias, private_method, protected_method) diff --git a/tensorlayer/layers/dense/__init__.py 
b/tensorlayer/layers/dense/__init__.py index 675559eaf..87b064f0c 100644 --- a/tensorlayer/layers/dense/__init__.py +++ b/tensorlayer/layers/dense/__init__.py @@ -13,9 +13,9 @@ from .binary_dense import * from .dorefa_dense import * from .dropconnect import * -from .ternary_dense import * from .quan_dense import * from .quan_dense_bn import * +from .ternary_dense import * __all__ = [ 'BinaryDense', diff --git a/tensorlayer/layers/dense/base_dense.py b/tensorlayer/layers/dense/base_dense.py index bec9d3f6f..a5b800f04 100644 --- a/tensorlayer/layers/dense/base_dense.py +++ b/tensorlayer/layers/dense/base_dense.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/binary_dense.py b/tensorlayer/layers/dense/binary_dense.py index 74d5208cd..0492a01d9 100644 --- a/tensorlayer/layers/dense/binary_dense.py +++ b/tensorlayer/layers/dense/binary_dense.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/dorefa_dense.py b/tensorlayer/layers/dense/dorefa_dense.py index 73069d478..1e80e2339 100644 --- a/tensorlayer/layers/dense/dorefa_dense.py +++ b/tensorlayer/layers/dense/dorefa_dense.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/dropconnect.py b/tensorlayer/layers/dense/dropconnect.py index 371ed2e6b..4b16fba5b 100644 --- a/tensorlayer/layers/dense/dropconnect.py +++ b/tensorlayer/layers/dense/dropconnect.py @@ -4,6 +4,7 @@ import numbers import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/quan_dense.py b/tensorlayer/layers/dense/quan_dense.py index 8d5c594c7..2e6296434 100644 --- a/tensorlayer/layers/dense/quan_dense.py +++ b/tensorlayer/layers/dense/quan_dense.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/quan_dense_bn.py b/tensorlayer/layers/dense/quan_dense_bn.py index bcbd70950..7b517b9ba 100644 --- a/tensorlayer/layers/dense/quan_dense_bn.py +++ b/tensorlayer/layers/dense/quan_dense_bn.py @@ -4,6 +4,7 @@ import tensorflow as tf # from tensorlayer.layers.core import LayersConfig from tensorflow.python.training import moving_averages + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/ternary_dense.py b/tensorlayer/layers/dense/ternary_dense.py index 28d84297e..dce6be9eb 100644 --- a/tensorlayer/layers/dense/ternary_dense.py +++ b/tensorlayer/layers/dense/ternary_dense.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dropout.py b/tensorlayer/layers/dropout.py index 25fe80a36..3724d8b43 100644 --- a/tensorlayer/layers/dropout.py +++ b/tensorlayer/layers/dropout.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from 
tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/embedding.py b/tensorlayer/layers/embedding.py index a82c1a93b..80c5cadfa 100644 --- a/tensorlayer/layers/embedding.py +++ b/tensorlayer/layers/embedding.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/extend.py b/tensorlayer/layers/extend.py index 09d5508db..42395a537 100644 --- a/tensorlayer/layers/extend.py +++ b/tensorlayer/layers/extend.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/image_resampling.py b/tensorlayer/layers/image_resampling.py index 4713200d3..3b2a2825a 100644 --- a/tensorlayer/layers/image_resampling.py +++ b/tensorlayer/layers/image_resampling.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/inputs.py b/tensorlayer/layers/inputs.py index 4f2544b06..0330347fe 100644 --- a/tensorlayer/layers/inputs.py +++ b/tensorlayer/layers/inputs.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer, LayerNode diff --git a/tensorlayer/layers/lambda_layers.py b/tensorlayer/layers/lambda_layers.py index 9b82ad603..17501a4e4 100644 --- a/tensorlayer/layers/lambda_layers.py +++ b/tensorlayer/layers/lambda_layers.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.files import utils diff --git a/tensorlayer/layers/merge.py b/tensorlayer/layers/merge.py index 2509d35a6..346a65962 100644 --- a/tensorlayer/layers/merge.py +++ b/tensorlayer/layers/merge.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/noise.py b/tensorlayer/layers/noise.py index c658f8e19..bd9c2df9c 100644 --- a/tensorlayer/layers/noise.py +++ b/tensorlayer/layers/noise.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/normalization.py b/tensorlayer/layers/normalization.py index 0de0e8ed1..d8cec274c 100644 --- a/tensorlayer/layers/normalization.py +++ b/tensorlayer/layers/normalization.py @@ -2,10 +2,11 @@ # -*- coding: utf-8 -*- import tensorflow as tf -import tensorlayer as tl from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import moving_averages + +import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/padding.py b/tensorlayer/layers/padding.py index edcb720a5..db1bbb304 100644 --- a/tensorlayer/layers/padding.py +++ b/tensorlayer/layers/padding.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from 
tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/pooling.py b/tensorlayer/layers/pooling.py index a22cea358..2046de6c5 100644 --- a/tensorlayer/layers/pooling.py +++ b/tensorlayer/layers/pooling.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/quantize.py b/tensorlayer/layers/quantize.py index 47ad2a088..3b5b19635 100644 --- a/tensorlayer/layers/quantize.py +++ b/tensorlayer/layers/quantize.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/recurrent.py b/tensorlayer/layers/recurrent.py index 2364c6a7d..16b7208d0 100644 --- a/tensorlayer/layers/recurrent.py +++ b/tensorlayer/layers/recurrent.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/scale.py b/tensorlayer/layers/scale.py index 6546d70af..ac1800529 100644 --- a/tensorlayer/layers/scale.py +++ b/tensorlayer/layers/scale.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.initializers import constant from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/shape.py b/tensorlayer/layers/shape.py index e308eb0c4..f8e7b47db 100644 --- a/tensorlayer/layers/shape.py +++ b/tensorlayer/layers/shape.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/spatial_transformer.py b/tensorlayer/layers/spatial_transformer.py index 262108a68..0d0f578d0 100644 --- a/tensorlayer/layers/spatial_transformer.py +++ b/tensorlayer/layers/spatial_transformer.py @@ -2,11 +2,11 @@ # -*- coding: utf-8 -*- import numpy as np +import tensorflow as tf from six.moves import xrange +from tensorflow.python.ops import array_ops -import tensorflow as tf import tensorlayer as tl -from tensorflow.python.ops import array_ops from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/stack.py b/tensorlayer/layers/stack.py index c35e3837f..c31327989 100644 --- a/tensorlayer/layers/stack.py +++ b/tensorlayer/layers/stack.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import tensorflow as tf + from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/utils.py b/tensorlayer/layers/utils.py index 6d411589f..e5dd154b1 100644 --- a/tensorlayer/layers/utils.py +++ b/tensorlayer/layers/utils.py @@ -2,10 +2,10 @@ # -*- coding: utf-8 -*- import numpy as np - import tensorflow as tf -import tensorlayer as tl from tensorflow.python.ops.rnn_cell import LSTMStateTuple + +import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated, deprecated_alias diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py index 5375efcdd..ec4b021d2 100644 --- a/tensorlayer/models/__init__.py +++ b/tensorlayer/models/__init__.py @@ -4,6 +4,6 @@ # """A collections of pre-defined well 
known models.""" from .core import * -from .squeezenetv1 import SqueezeNetV1 from .mobilenetv1 import MobileNetV1 +from .squeezenetv1 import SqueezeNetV1 from .vgg import * diff --git a/tensorlayer/models/core.py b/tensorlayer/models/core.py index cbcff4bf3..c811b9648 100644 --- a/tensorlayer/models/core.py +++ b/tensorlayer/models/core.py @@ -3,8 +3,9 @@ from queue import Queue import tensorflow as tf -import tensorlayer as tl from tensorflow.python.framework import ops as tf_ops + +import tensorlayer as tl from tensorlayer import logging from tensorlayer.files import utils from tensorlayer.layers import Layer, ModelLayer diff --git a/tensorlayer/models/mobilenetv1.py b/tensorlayer/models/mobilenetv1.py index 4908b3d89..8065eeef3 100644 --- a/tensorlayer/models/mobilenetv1.py +++ b/tensorlayer/models/mobilenetv1.py @@ -5,6 +5,7 @@ import os import tensorflow as tf + from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) from tensorlayer.layers import (BatchNorm, Conv2d, DepthwiseConv2d, Flatten, GlobalMeanPool2d, Input, Reshape) diff --git a/tensorlayer/models/squeezenetv1.py b/tensorlayer/models/squeezenetv1.py index a2d7e4304..b38d42dc8 100644 --- a/tensorlayer/models/squeezenetv1.py +++ b/tensorlayer/models/squeezenetv1.py @@ -5,6 +5,7 @@ import os import tensorflow as tf + from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, Lambda, MaxPool2d) diff --git a/tensorlayer/models/vgg.py b/tensorlayer/models/vgg.py index 06648cb53..b072841be 100644 --- a/tensorlayer/models/vgg.py +++ b/tensorlayer/models/vgg.py @@ -30,8 +30,8 @@ import os import numpy as np - import tensorflow as tf + import tensorlayer as tl from tensorlayer import logging from tensorlayer.files import assign_weights, maybe_download_and_extract diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py index ed1ce975d..d96a7acf1 100755 --- a/tensorlayer/nlp.py +++ b/tensorlayer/nlp.py @@ -11,11 +11,11 @@ from collections import Counter import numpy as np +import tensorflow as tf from six.moves import urllib, xrange +from tensorflow.python.platform import gfile -import tensorflow as tf import tensorlayer as tl -from tensorflow.python.platform import gfile from tensorlayer.lazy_imports import LazyImport nltk = LazyImport("nltk") diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py index 8ddce7316..e5cbe6bd4 100644 --- a/tensorlayer/rein.py +++ b/tensorlayer/rein.py @@ -2,9 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -from six.moves import xrange - import tensorflow as tf +from six.moves import xrange __all__ = [ 'discount_episode_rewards', diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py index 35e054afb..d6b8e6d78 100644 --- a/tensorlayer/utils.py +++ b/tensorlayer/utils.py @@ -11,9 +11,9 @@ from sys import platform as _platform import numpy as np +import tensorflow as tf from sklearn.metrics import accuracy_score, confusion_matrix, f1_score -import tensorflow as tf import tensorlayer as tl __all__ = [ From edaf61ec02866387450366ddd3ab09a5ca851cab Mon Sep 17 00:00:00 2001 From: Officium Date: Tue, 11 Jun 2019 17:41:39 +0800 Subject: [PATCH 51/57] update annocation --- examples/reinforcement_learning/tutorial_DQN_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index 
d6292e987..376eda23c 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -43,7 +43,7 @@ To run: ------------------------ python tutorial_DQN_variantes.py --mode=train -python tutorial_DQN_variantes.py --mode=test --save_path=c51/8000.npz +python tutorial_DQN_variantes.py --mode=test --save_path=dqn_variants/8000.npz """ import argparse import os From dc19fc552825d7bc8e00fec29de4cda831980a27 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 11 Jun 2019 13:59:48 +0100 Subject: [PATCH 52/57] modify readme --- examples/reinforcement_learning/README.md | 217 ++++++++++++++---- examples/reinforcement_learning/a3c.png | Bin 22295 -> 0 bytes .../reinforcement_learning/baselines/SAC.py | 4 +- .../baselines/wrappers.py | 3 +- .../reinforcement_learning/tutorial_A3C.py | 4 +- .../reinforcement_learning/tutorial_AC.py | 4 +- .../reinforcement_learning/tutorial_C51.py | 2 +- .../reinforcement_learning/tutorial_DDPG.py | 4 +- .../reinforcement_learning/tutorial_DPPO.py | 6 +- .../reinforcement_learning/tutorial_DQN.py | 4 +- .../tutorial_DQN_variants.py | 2 +- .../reinforcement_learning/tutorial_PG.py | 4 +- .../reinforcement_learning/tutorial_PPO.py | 4 +- .../tutorial_Qlearning.py | 3 +- .../tutorial_Retrace.py | 2 +- .../reinforcement_learning/tutorial_SAC.py | 32 ++- .../reinforcement_learning/tutorial_TD3.py | 38 ++- .../reinforcement_learning/tutorial_TRPO.py | 6 +- .../tutorial_atari_pong.py | 4 +- .../reinforcement_learning/tutorial_format.py | 3 +- .../tutorial_prioritized_replay.py | 2 +- .../tutorial_wrappers.py | 3 +- 22 files changed, 256 insertions(+), 95 deletions(-) delete mode 100644 examples/reinforcement_learning/a3c.png diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index ff6c4081c..c1e4977ad 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -59,138 +59,256 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t ## Examples of RL Algorithms: -* Q-learning +* **Q-learning** Code: `./tutorial_Qlearning.py` - Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf) + Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf) + + Description: + + Non deep learning method with TD Learning, Off-Policy, e-Greedy Exploration. + + Central formula: + + Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) + + See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. -* Deep Q-Network (DQN) +* **Deep Q-Network (DQN)** - Code: `./tutorial_DQN.py` + Code: `./tutorial_DQN.py` - Paper: [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) + Paper: [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) - + Description: + + Deep Q-Network (DQN) is a method of TD Learning, Off-Policy, e-Greedy Exploration (GLIE). + + Central formula: -* Double DQN / Dueling DQN / Noisy DQN + Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)), - Code: `./tutorial_DQN_variants.py` + delta_w = R + lambda * Q(newS, newA). 
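As a rough illustration of the central formula above (this sketch is editorial, not part of the patch or the tutorials), the tabular Q-learning update can be written in a few lines of NumPy; `q_table`, `alpha` and `lam` are hypothetical names for the Q-table, the learning rate and the discount factor written as lambda in the formula:

```python
import numpy as np

# Hypothetical sizes; in practice the table shape comes from the gym environment.
n_states, n_actions = 16, 4
q_table = np.zeros((n_states, n_actions))
alpha, lam = 0.85, 0.99  # learning rate (alpha) and discount factor (lambda above)

def q_update(s, a, r, s_new, done):
    """Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_a Q(newS, a) - Q(S, A))."""
    target = r if done else r + lam * np.max(q_table[s_new])
    q_table[s, a] += alpha * (target - q_table[s, a])
```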
- Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) + See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. +* **Double DQN / Dueling DQN / Noisy DQN** -* PER (Prioritized Experience Replay) + Code: `./tutorial_DQN_variants.py` - Code: `./tutorial_prioritized_replay.py` + Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) - Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) + Description: - + We implement Double DQN, Dueling DQN and Noisy DQN here. + + * The max operator in standard DQN uses the same values both to select and to evaluate an action by: + + ​ Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a). + + * Double DQN proposes to use following evaluation to address overestimation problem of max operator: + + ​ Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)). + + * Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. + + * Noisy DQN propose to explore by adding parameter noises. + + -* Distributed DQN - Code: `./tutorial_C51.py` +* **Prioritized Experience Replay** - Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) + Code: `./tutorial_prioritized_replay.py` + + Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) + + Description: + + Prioritized experience replay is an efficient replay method that replay important transitions more frequently. Segment tree data structure is used to speed up indexing. +* **Distributed DQN (C51)** + + Code: `./tutorial_C51.py` -* Retrace(lambda) DQN + Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) - Code: `./tutorial_Retrace.py` + Description: - Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) + Categorical 51 distributional RL algorithm is a distrbuted DQN, where 51 means the number of atoms. In this algorithm, instead of estimating actual expected value, value distribution over a series of continuous sub-intervals (atoms) is considered. -* Actor-Critic (AC) +* **Retrace(lambda) DQN** - Code:`./tutorial_AC.py` + Code: `./tutorial_Retrace.py` - Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) + Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) + + Description: + + Retrace (lambda) is an off-policy algorithm that extend the idea of eligibility trace. It apply an importance sampling ratio truncated at 1 to several behaviour policies, which suffer from the variance explosion of standard IS and lead to safe and efficient learning. -* Asynchronous Advantage Actor-Critic (A3C) - Code: `./tutorial_A3C.py` +* **Actor-Critic (AC)** + + Code:`./tutorial_AC.py` + + Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) - Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) + Description: + + The implementation of Advantage Actor-Critic, using TD-error as the advantage. 
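To make the "TD-error as the advantage" point concrete, here is a minimal single-step update sketch in TensorFlow 2. It is an illustration only: `actor`, `critic`, `gamma` and the optimizers are placeholder names for a small discrete-action task, not the tutorial's own code.

```python
import tensorflow as tf

# Placeholder networks and hyper-parameters (illustrative only).
actor = tf.keras.Sequential([tf.keras.layers.Dense(32, activation='relu'), tf.keras.layers.Dense(2)])
critic = tf.keras.Sequential([tf.keras.layers.Dense(32, activation='relu'), tf.keras.layers.Dense(1)])
actor_opt, critic_opt = tf.optimizers.Adam(1e-3), tf.optimizers.Adam(1e-2)
gamma = 0.9

def ac_train_step(s, a, r, s_next, done):
    """One actor-critic step: the TD error trains the critic and serves as the advantage for the actor."""
    s = tf.reshape(tf.cast(s, tf.float32), (1, -1))
    s_next = tf.reshape(tf.cast(s_next, tf.float32), (1, -1))
    with tf.GradientTape(persistent=True) as tape:
        v, v_next = critic(s), critic(s_next)
        td_error = r + gamma * v_next * (1.0 - float(done)) - v   # advantage estimate
        critic_loss = tf.square(td_error)
        log_prob = tf.nn.log_softmax(actor(s))[0, a]
        actor_loss = -log_prob * tf.stop_gradient(td_error)       # policy gradient weighted by the TD error
    critic_opt.apply_gradients(zip(tape.gradient(critic_loss, critic.trainable_variables), critic.trainable_variables))
    actor_opt.apply_gradients(zip(tape.gradient(actor_loss, actor.trainable_variables), actor.trainable_variables))
    del tape
```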
-* Soft Actor-Critic (SAC) +* **Asynchronous Advantage Actor-Critic (A3C)** + + Code: `./tutorial_A3C.py` - Code: `./tutorial_SAC.py` + Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) - Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) + Description: + + The implementation of Asynchronous Advantage Actor-Critic (A3C), using multi-threading for distributed policy learning on Actor-Critic structure. +* **Soft Actor-Critic (SAC)** + + Code: `./tutorial_SAC.py` -* Policy Gradient (PG/REINFORCE) + Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) - Code: `./tutorial_PG.py` + Description: - Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) + Actor policy in SAC is stochastic, with off-policy training. And 'soft' in SAC indicates the trade-off between the entropy and expected return. The additional consideration of entropy term helps with more explorative policy. And this implementation contains an automatic update for the entropy factor. + + This version of Soft Actor-Critic (SAC) implementation contains 5 networks: + + 2 Q-networks, 2 target Q-networks and 1 policy network. -* Deep Deterministic Policy Gradient (DDPG) - Code: `./tutorial_DDPG.py` +* **Vanilla Policy Gradient (PG or REINFORCE)** + + Code: `./tutorial_PG.py` - Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) + Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) + + Description: + + The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. + + To apply it on continuous action space, you need to change the last softmax layer and the choose_action function. +* **Deep Deterministic Policy Gradient (DDPG)** + + Code: `./tutorial_DDPG.py` -* Twin Delayed DDPG (TD3) + Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) - Code: `./tutorial_TD3.py` + Description: - Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) + An algorithm concurrently learns a Q-function and a policy. + + It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy. -* Trust Region Policy Optimization (TRPO) - Code: `./tutorial_TRPO.py` +* **Twin Delayed DDPG (TD3)** + + Code: `./tutorial_TD3.py` + + Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) + + Description: + + DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters. + + Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks: + + * Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), + + and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. + + * Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently + + than the Q-function. 
+ + * Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for + + the policy to exploit Q-function errors by smoothing out Q along changes in action. - Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) + The implementation of TD3 includes 6 networks: + + 2 Q-networks, 2 target Q-networks, 1 policy network, 1 target policy network. + + Actor policy in TD3 is deterministic, with Gaussian exploration noise. -* Proximal Policy Optimization (PPO) +* **Trust Region Policy Optimization (TRPO)** + + Code: `./tutorial_TRPO.py` + + Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) - Code: `./tutorial_PPO.py` + Description: - Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) + PG method with a large step can crash the policy performance, even with a small step can lead a large differences in policy. + + TRPO constraints the step in policy space using KL divergence (rather than in parameter space), which can monotonically improve performance and avoid a collapsed update. -* Distributed Proximal Policy Optimization (DPPO) +* **Proximal Policy Optimization (PPO)** + + Code: `./tutorial_PPO.py` + + Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) + + Description: - Code: `./tutorial_DPPO.py` + A simple version of Proximal Policy Optimization (PPO) using single thread. - Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) + PPO is a family of first-order methods that use a few other tricks to keep new policies close to old. + + PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO. -* Hindsight Experience Replay (HER) +* **Distributed Proximal Policy Optimization (DPPO)** + + Code: `./tutorial_DPPO.py` + + Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) + + Description: - To do. + A distributed version of OpenAI's Proximal Policy Optimization (PPO). + + Distribute the workers to collect data in parallel, then stop worker's roll-out and train PPO on collected data. + + -* etc +* More in recent weeks ## Environment: @@ -204,4 +322,3 @@ Our env wrapper: `./tutorial_wrappers.py` - @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO - @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers. 
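To make the PPO/DPPO descriptions above more concrete, here is a short sketch of the clipped surrogate objective. It is an illustrative example under assumed sample values, not code taken from `tutorial_PPO.py` or `tutorial_DPPO.py`:

```python
# Minimal sketch of PPO's clipped surrogate loss; the arrays are made-up
# sample data, not output of the tutorial scripts.
import numpy as np

epsilon = 0.2                                   # clip range
log_prob_new = np.array([-0.9, -1.2, -0.3])     # log pi_new(a|s) for sampled actions
log_prob_old = np.array([-1.0, -1.0, -1.0])     # log pi_old(a|s) at collection time
advantages = np.array([0.5, -0.2, 1.0])         # advantage estimates

ratio = np.exp(log_prob_new - log_prob_old)     # pi_new / pi_old
unclipped = ratio * advantages
clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages

# PPO maximises the elementwise minimum of the two terms,
# i.e. minimises the negative mean of that minimum.
surrogate_loss = -np.mean(np.minimum(unclipped, clipped))
print(surrogate_loss)
```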
-### More examples can be found in the [example list](https://tensorlayer.readthedocs.io/en/stable/user/examples.html)

diff --git a/examples/reinforcement_learning/a3c.png b/examples/reinforcement_learning/a3c.png
deleted file mode 100644
index 918b5f2d305bbbd47dc37ec1fe69319f8a4ef97b..0000000000000000000000000000000000000000
GIT binary patch
[22295 bytes of encoded image data for the deleted a3c.png omitted]
zWXQL!WO2#>iN}x6o0e4K6q(ZAyLa#F`1tY5$@Y>Nk+W;_S$TM@tgMMWa}*<1L;`27 zLI+5B=pHL`D<(F!5s0qMVG!nD!%iBgb$3l(DBdhdVk@uP_Jt^uM9y zz@fOaIoBg*Y;5e`cdtM0jjyjCnATcK!6^WvS?Ob<>n@b z{+W>GQ(TkXbs7 z*=I+qxIpv&7#`kloL#Y9otk*~6Hb5h;*r@eAUMh0kSXWZP}Rzb)41;sOvb7a~F!TNn*7DNZVRS_ zeT88ag$z4l9AWD$X}3biY2|vftW}zEDJqw;Bc}@!F>V?6k@umof1@{d`g0= z_7wAJAcl97bb;vT==k{h!pp(nOd!Y;9u*ZP)(i?7)@Y87!nifo^u?#)Os}o2tri`> z=#ZEwtN0Gu`NWSW9#z!Tq}0?z7o5%U?2ze)H`pppvK_Xb-dh7t^M}t~IKB4(Z3qju zM0*)KJrR4z{QO)w{n-t(+j+o)@bK{9YK2H_Lmg(kPH%7TR=Y`=8PWNm7MTi;L6!?n z&VJbUKZfqQp$c!QY3=CL9)u4RrigG+XTH;bjS*cA>?4KsU{L{{B3s-!;92h!8oCvJ z8ZKHrzwS=fEv~QE=_&VSRjWphEYgrL%0p*KlNleEmSL81Gd3oMkI&63_9)fRKX5ao zS~}ZZ-~6$X4NqD#nj2jZ+BC|$UR(Q9!rs=WLJ2YV;8@Sqx8q?~QK9c{lwbY)vzyymTyDUCX>j~-dpCL7GZz8+5Oyn;_r1})1! zYS`SFzZ)&780Lf%6I*|DDH6}jtZ8j+)x91uCKD{q{^5tb%-&N@-UKZh1H4Q0N6A$sHMMn6R9{S5_T22T4)0GOHbCGT zn*fL*Kf6;h`FoiK;P`FajGJ(9aO2&(cbkA%W45#pA3k8hT6{c@l&`(vc;w#j(2zyU z2fRnr1ArcCzt*02rw+c(CY*Lj1qB5Mx2R(SpLqI{dCA)iK^eB?Xc)%$`upnv;I+IO z)nmibU!$p94#3yX@%O4BGUl62x3C6EDE&g{~KDX4#BL|I!~8^^6bhKNXj zZ3l#7a4-O7SwbKXXj+750>cWK%gZ$h9l6ZR%X)kG?d__DSri!=8Hq53HoMdS%}KJj z$ZKn(tY5$W+?US_E2j-iOj?qZ*ZBDR%R(=VLx6qmI}-x~9%zM7BvQ(9xrh}e{Ioa| z`nj9B$~^6BBK7e`TnXX}D3>53*$H`1$0kYzm(dyP=<3dyx@H^mfJMUb@k@bEvVC#A z%j4G8M2Gx;)to$fR2_~vu0(PU7qRrC-27OJO|$RDuC-uUnEXenYxKp_Pqq~x;>69! zP#^&NdQ2t_ac9ifx%XLLAN2AH5o_W)zr^(Pm5|cZ?s=p`SoEJ2b>UOK!fE-R$yNrV zE*B$edhFOS=_xS4lptxpwM~m-c>{syi2o7@F;EqLn99#zCv~2CMC&XgU>BZ zjE|Fketw)xOBidXket*G@r12a)Ki{|ze3kmM#?H(8kj~#t^}19FUiNR2Oedv$a2|) z6G=rdH9fr=aKIWlVo!D(*8k0!+&ql&0Kz|faex_GaUUl%AfvlajY4_Oegh|tL?YQ> zWK_a4UuWXQ1ugPJb*H4N+5(!@|Ngx^@wlzc0p6;wJ|X&_fevaO3Ozs+(2BITPl#_Q zFK6Aa$w2@O$QaEzYiMK?0ku}cPL?D~rA5TP{pZhH6B}?=-;bHw@iHhdaPFY&F;wQz z)044+#`_wiT^s=#uiIprC%lLQ&0mU3)q=zt0cH-ur!lI(T zH{C8O;=OvMjY1Wqx*1R|WkU4b$pnF|ehB8`Oxn(1Dk4h0-+rsCrj{wNlgDH5rH_$; zYDHt)$xHCG+ZVqOZeuU5fNXm?J6j>kMLs$@ns^93JRLEE#@1G4u%r|YN0!g$kCwuq zmI`}HKa0Jj4d4XzaJz1IK?`ww1;W^)S_!Tw9FPQJTFk}OKBG*zDpsGJaClfB0H}%Vmlb=tep$kdS1PQl z^8oTFp-uufq8_E|Q=y1O^ftRXG8*JG^#Ts1a?(WWi_RKA$ zx5^Pcm@%dtBZ=x!>i%hf_Js<9i~~(LI%){INWvv!!}KJ{WZ>NPd8!N9oh_v#-wis$ z3I8+Gdk@)7EHi@tD!cw{k8L!ITag10bUt-XUfQ1fKdahhMY~SzfSrkrB!Smn$NglP IgU|Q>1!)Iz Date: Tue, 11 Jun 2019 14:07:55 +0100 Subject: [PATCH 53/57] readme --- examples/reinforcement_learning/README.md | 107 +++++++++++++--------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c1e4977ad..c07171fec 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -67,15 +67,16 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Non deep learning method with TD Learning, Off-Policy, e-Greedy Exploration. - + Central formula: - Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) - + See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. + ``` - + ​ * **Deep Q-Network (DQN)** @@ -87,15 +88,15 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Deep Q-Network (DQN) is a method of TD Learning, Off-Policy, e-Greedy Exploration (GLIE). - + Central formula: - Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)), - delta_w = R + lambda * Q(newS, newA). - + See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. 
+ ``` @@ -107,19 +108,23 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: - We implement Double DQN, Dueling DQN and Noisy DQN here. - - * The max operator in standard DQN uses the same values both to select and to evaluate an action by: - - ​ Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a). - - * Double DQN proposes to use following evaluation to address overestimation problem of max operator: - - ​ Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)). - - * Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. - - * Noisy DQN propose to explore by adding parameter noises. + * ``` + We implement Double DQN, Dueling DQN and Noisy DQN here. + + - The max operator in standard DQN uses the same values both to select and to evaluate an action by: + + Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a). + + - Double DQN proposes to use following evaluation to address overestimation problem of max operator: + + Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)). + + - Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. + + - Noisy DQN propose to explore by adding parameter noises. + + + ``` @@ -132,7 +137,9 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Prioritized experience replay is an efficient replay method that replay important transitions more frequently. Segment tree data structure is used to speed up indexing. + ``` @@ -144,7 +151,9 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Categorical 51 distributional RL algorithm is a distrbuted DQN, where 51 means the number of atoms. In this algorithm, instead of estimating actual expected value, value distribution over a series of continuous sub-intervals (atoms) is considered. + ``` @@ -170,7 +179,9 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` The implementation of Advantage Actor-Critic, using TD-error as the advantage. + ``` @@ -182,7 +193,9 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` The implementation of Asynchronous Advantage Actor-Critic (A3C), using multi-threading for distributed policy learning on Actor-Critic structure. + ``` @@ -194,11 +207,12 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Actor policy in SAC is stochastic, with off-policy training. And 'soft' in SAC indicates the trade-off between the entropy and expected return. The additional consideration of entropy term helps with more explorative policy. And this implementation contains an automatic update for the entropy factor. - + This version of Soft Actor-Critic (SAC) implementation contains 5 networks: - 2 Q-networks, 2 target Q-networks and 1 policy network. + ``` @@ -211,9 +225,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. 
- + To apply it on continuous action space, you need to change the last softmax layer and the choose_action function. + ``` @@ -240,27 +256,20 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters. - + Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks: - - * Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), - - and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. - - * Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently - - than the Q-function. - - * Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for - - the policy to exploit Q-function errors by smoothing out Q along changes in action. - + + - Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. + - Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently than the Q-function. + - Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action. + The implementation of TD3 includes 6 networks: - 2 Q-networks, 2 target Q-networks, 1 policy network, 1 target policy network. - + Actor policy in TD3 is deterministic, with Gaussian exploration noise. + ``` @@ -272,9 +281,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` PG method with a large step can crash the policy performance, even with a small step can lead a large differences in policy. - + TRPO constraints the step in policy space using KL divergence (rather than in parameter space), which can monotonically improve performance and avoid a collapsed update. + ``` @@ -286,11 +297,15 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` A simple version of Proximal Policy Optimization (PPO) using single thread. - + PPO is a family of first-order methods that use a few other tricks to keep new policies close to old. - + PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO. + + + ``` @@ -302,13 +317,15 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` A distributed version of OpenAI's Proximal Policy Optimization (PPO). - + Distribute the workers to collect data in parallel, then stop worker's roll-out and train PPO on collected data. 
+ ``` -* More in recent weeks +* **More in recent weeks** ## Environment: From d6116dec4855f8f60d0f99e3bb9ac07d8edbe764 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 11 Jun 2019 14:10:17 +0100 Subject: [PATCH 54/57] readme --- examples/reinforcement_learning/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c07171fec..c2e2ad8f5 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -122,8 +122,6 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t - Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. - Noisy DQN propose to explore by adding parameter noises. - - ``` @@ -166,7 +164,9 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` Retrace (lambda) is an off-policy algorithm that extend the idea of eligibility trace. It apply an importance sampling ratio truncated at 1 to several behaviour policies, which suffer from the variance explosion of standard IS and lead to safe and efficient learning. + ``` @@ -241,9 +241,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: + ``` An algorithm concurrently learns a Q-function and a policy. - + It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy. + ``` From 64da954c3351578c065f5707d0cc14b04ee5946b Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 11 Jun 2019 14:13:20 +0100 Subject: [PATCH 55/57] readme --- examples/reinforcement_learning/README.md | 38 ++++++++++++----------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index c2e2ad8f5..dc9b412f5 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -68,7 +68,7 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: ``` - Non deep learning method with TD Learning, Off-Policy, e-Greedy Exploration. + Q-learning is a non-deep-learning method with TD Learning, Off-Policy, e-Greedy Exploration. Central formula: Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) @@ -108,23 +108,25 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t Description: - * ``` - We implement Double DQN, Dueling DQN and Noisy DQN here. - - - The max operator in standard DQN uses the same values both to select and to evaluate an action by: - - Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a). - - - Double DQN proposes to use following evaluation to address overestimation problem of max operator: - - Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)). - - - Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. - - - Noisy DQN propose to explore by adding parameter noises. - ``` - - + ``` + We implement Double DQN, Dueling DQN and Noisy DQN here. + + -The max operator in standard DQN uses the same values both to select and to evaluate an action by: + + Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a). 
+ + -Double DQN proposes to use following evaluation to address overestimation problem of max operator: + + Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)). + + -Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately. + + -Noisy DQN propose to explore by adding parameter noises. + + + ``` + + * **Prioritized Experience Replay** From c83855336b7406b23df016cc848cbc649734bf8d Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 11 Jun 2019 15:00:18 +0100 Subject: [PATCH 56/57] changelog --- CHANGELOG.md | 8 ++++++++ examples/basic_tutorials/tutorial_cifar10_cnn_static.py | 2 +- examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py | 2 +- examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py | 2 +- examples/basic_tutorials/tutorial_mnist_mlp_static.py | 2 +- examples/basic_tutorials/tutorial_mnist_mlp_static_2.py | 2 +- examples/basic_tutorials/tutorial_mnist_siamese.py | 2 +- examples/basic_tutorials/tutorial_mnist_simple.py | 2 +- examples/data_process/tutorial_fast_affine_transform.py | 4 ++-- examples/data_process/tutorial_tf_dataset_voc.py | 2 +- examples/data_process/tutorial_tfrecord.py | 2 +- examples/data_process/tutorial_tfrecord2.py | 2 +- examples/data_process/tutorial_tfrecord3.py | 2 +- examples/database/dispatch_tasks.py | 1 - examples/database/task_script.py | 1 - .../tutorial_imagenet_inceptionV3_distributed.py | 8 ++++---- .../deprecated_tutorials/tutorial_mnist_distributed.py | 1 - .../tutorial_cifar10_distributed_trainer.py | 2 +- .../tutorial_mnist_distributed_trainer.py | 2 +- examples/keras_tfslim/tutorial_keras.py | 2 +- examples/pretrained_cnn/tutorial_models_mobilenetv1.py | 2 +- examples/pretrained_cnn/tutorial_models_squeezenetv1.py | 2 +- examples/pretrained_cnn/tutorial_models_vgg16.py | 2 +- examples/pretrained_cnn/tutorial_models_vgg19.py | 2 +- examples/pretrained_cnn/tutorial_models_vgg_static.py | 2 +- .../quantized_net/tutorial_binarynet_cifar10_tfrecord.py | 1 - examples/quantized_net/tutorial_binarynet_mnist_cnn.py | 1 - .../quantized_net/tutorial_dorefanet_cifar10_tfrecord.py | 1 - examples/quantized_net/tutorial_dorefanet_mnist_cnn.py | 1 - examples/quantized_net/tutorial_quanconv_cifar10.py | 2 +- examples/quantized_net/tutorial_quanconv_mnist.py | 1 - .../tutorial_ternaryweight_cifar10_tfrecord.py | 1 - .../quantized_net/tutorial_ternaryweight_mnist_cnn.py | 1 - .../tutorial_spatial_transformer_network_dynamic.py | 2 +- .../tutorial_spatial_transformer_network_static.py | 2 +- examples/text_classification/tutorial_imdb_fasttext.py | 2 +- examples/text_generation/tutorial_generate_text.py | 2 +- examples/text_ptb/tutorial_ptb_lstm.py | 2 +- examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py | 2 +- examples/text_word_embedding/tutorial_word2vec_basic.py | 2 +- examples/tutorial_work_with_onnx.py | 4 ++-- tensorlayer/activation.py | 1 - tensorlayer/cost.py | 1 - tensorlayer/db.py | 4 ++-- tensorlayer/distributed.py | 1 - tensorlayer/files/dataset_loaders/voc_dataset.py | 1 - tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py | 1 - tensorlayer/files/utils.py | 8 ++++---- tensorlayer/initializers.py | 1 + tensorlayer/layers/activation.py | 1 - tensorlayer/layers/convolution/binary_conv.py | 1 - tensorlayer/layers/convolution/deformable_conv.py | 1 - tensorlayer/layers/convolution/depthwise_conv.py | 1 - tensorlayer/layers/convolution/dorefa_conv.py | 1 - tensorlayer/layers/convolution/expert_conv.py | 1 - 
tensorlayer/layers/convolution/expert_deconv.py | 1 - tensorlayer/layers/convolution/group_conv.py | 1 - tensorlayer/layers/convolution/quan_conv.py | 1 - tensorlayer/layers/convolution/quan_conv_bn.py | 1 - tensorlayer/layers/convolution/separable_conv.py | 2 +- tensorlayer/layers/convolution/simplified_conv.py | 1 - tensorlayer/layers/convolution/simplified_deconv.py | 2 +- tensorlayer/layers/convolution/super_resolution.py | 1 - tensorlayer/layers/convolution/ternary_conv.py | 1 - tensorlayer/layers/core.py | 1 - tensorlayer/layers/dense/base_dense.py | 2 +- tensorlayer/layers/dense/binary_dense.py | 1 - tensorlayer/layers/dense/dorefa_dense.py | 1 - tensorlayer/layers/dense/dropconnect.py | 1 - tensorlayer/layers/dense/quan_dense.py | 1 - tensorlayer/layers/dense/quan_dense_bn.py | 1 - tensorlayer/layers/dense/ternary_dense.py | 1 - tensorlayer/layers/dropout.py | 1 - tensorlayer/layers/embedding.py | 2 +- tensorlayer/layers/extend.py | 1 - tensorlayer/layers/image_resampling.py | 1 - tensorlayer/layers/inputs.py | 2 +- tensorlayer/layers/lambda_layers.py | 1 - tensorlayer/layers/merge.py | 1 - tensorlayer/layers/noise.py | 1 - tensorlayer/layers/normalization.py | 3 +-- tensorlayer/layers/padding.py | 1 - tensorlayer/layers/pooling.py | 1 - tensorlayer/layers/quantize.py | 1 - tensorlayer/layers/recurrent.py | 1 - tensorlayer/layers/scale.py | 1 - tensorlayer/layers/shape.py | 1 - tensorlayer/layers/spatial_transformer.py | 4 ++-- tensorlayer/layers/stack.py | 1 - tensorlayer/layers/utils.py | 4 ++-- tensorlayer/models/core.py | 3 +-- tensorlayer/models/mobilenetv1.py | 1 - tensorlayer/models/squeezenetv1.py | 1 - tensorlayer/models/vgg.py | 2 +- tensorlayer/nlp.py | 4 ++-- tensorlayer/rein.py | 3 ++- tensorlayer/utils.py | 2 +- 97 files changed, 67 insertions(+), 109 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bac7c65fe..f8d552d64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -73,6 +73,8 @@ To release a new version, please update the changelog as followed: - Layer - `InstanceNorm`, `InstanceNorm1d`, `InstanceNorm2d`, `InstanceNorm3d` (PR #963) +* Reinforcement learning tutorials. 
(PR #995) + ### Changed - remove `tl.layers.initialize_global_variables(sess)` (PR #931) - change `tl.layers.core`, `tl.models.core` (PR #966) @@ -98,6 +100,10 @@ To release a new version, please update the changelog as followed: ### Contributors - @zsdonghao: #931 - @yd-yin: #963 +- @Tokarev-TT-33: # 995 +- @initial-h: # 995 +- @quantumiracle: #995 +- @Officium: #995 ## [2.0.0-alpha] - 2019-05-04 @@ -320,8 +326,10 @@ To release a new version, please update the changelog as followed: - AtrousDeConv2dLayer added (PR #662) - Fix bugs of using `tf.layers` in CNN (PR #686) - Optimizer: + - AMSGrad Optimizer added based on `On the Convergence of Adam and Beyond (ICLR 2018)` (PR #636) - Setup: + - Creation of installation flaggs `all`, `all_cpu`, and `all_gpu` (PR #660) - Test: - `test_utils_predict.py` added to reproduce and fix issue #288 (PR #566) diff --git a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py index c12c791a1..93794c414 100644 --- a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py +++ b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LocalResponseNorm, MaxPool2d) diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py index 1ffa7fbe0..13db1abae 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py index b752012b0..0d94b1dfa 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input, LayerList from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static.py b/examples/basic_tutorials/tutorial_mnist_mlp_static.py index c9c15f911..de811a8d8 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py index f0836c528..a9a2c7d48 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_siamese.py b/examples/basic_tutorials/tutorial_mnist_siamese.py index db43f1163..fe4abdc52 100644 --- 
a/examples/basic_tutorials/tutorial_mnist_siamese.py +++ b/examples/basic_tutorials/tutorial_mnist_siamese.py @@ -14,8 +14,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Flatten, Input from tensorlayer.models import Model diff --git a/examples/basic_tutorials/tutorial_mnist_simple.py b/examples/basic_tutorials/tutorial_mnist_simple.py index b1ccd052b..ceaee0c48 100644 --- a/examples/basic_tutorials/tutorial_mnist_simple.py +++ b/examples/basic_tutorials/tutorial_mnist_simple.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_fast_affine_transform.py b/examples/data_process/tutorial_fast_affine_transform.py index 52452ffd5..71890f5bd 100644 --- a/examples/data_process/tutorial_fast_affine_transform.py +++ b/examples/data_process/tutorial_fast_affine_transform.py @@ -8,10 +8,10 @@ import multiprocessing import time -import cv2 import numpy as np -import tensorflow as tf +import cv2 +import tensorflow as tf import tensorlayer as tl # tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_tf_dataset_voc.py b/examples/data_process/tutorial_tf_dataset_voc.py index fab1612f7..9779b1f60 100644 --- a/examples/data_process/tutorial_tf_dataset_voc.py +++ b/examples/data_process/tutorial_tf_dataset_voc.py @@ -13,8 +13,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl # tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/data_process/tutorial_tfrecord.py b/examples/data_process/tutorial_tfrecord.py index c0b0181f8..bcf3fe46a 100644 --- a/examples/data_process/tutorial_tfrecord.py +++ b/examples/data_process/tutorial_tfrecord.py @@ -22,9 +22,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl ## Save data ================================================================== diff --git a/examples/data_process/tutorial_tfrecord2.py b/examples/data_process/tutorial_tfrecord2.py index be41b697f..22b3d7757 100755 --- a/examples/data_process/tutorial_tfrecord2.py +++ b/examples/data_process/tutorial_tfrecord2.py @@ -14,10 +14,10 @@ import os import numpy as np + # import matplotlib # matplotlib.use('GTK') import tensorflow as tf - import tensorlayer as tl # Download data, and convert to TFRecord format, see ```tutorial_tfrecord.py``` diff --git a/examples/data_process/tutorial_tfrecord3.py b/examples/data_process/tutorial_tfrecord3.py index 9e5751a25..bc8752f2a 100644 --- a/examples/data_process/tutorial_tfrecord3.py +++ b/examples/data_process/tutorial_tfrecord3.py @@ -19,9 +19,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl diff --git a/examples/database/dispatch_tasks.py b/examples/database/dispatch_tasks.py index d1204bcd4..260257e77 100644 --- a/examples/database/dispatch_tasks.py +++ b/examples/database/dispatch_tasks.py @@ -6,7 +6,6 @@ import time import tensorflow as tf - import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/database/task_script.py b/examples/database/task_script.py index ad51dd3ed..58ef60d1a 100644 --- a/examples/database/task_script.py +++ b/examples/database/task_script.py @@ -1,7 +1,6 @@ """Sample task 
script.""" import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py index e54f565b5..15c0a3f3c 100644 --- a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py +++ b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py @@ -19,18 +19,18 @@ from xml.etree import ElementTree import numpy as np + import tensorflow as tf +import tensorlayer as tl from tensorflow.contrib import slim -from tensorflow.contrib.slim.python.slim.nets.inception_v3 import ( - inception_v3, inception_v3_arg_scope) +from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, + inception_v3_arg_scope) from tensorflow.python.framework.errors_impl import OutOfRangeError from tensorflow.python.training import session_run_hook from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook from tensorflow.python.training.monitored_session import \ SingularMonitoredSession -import tensorlayer as tl - tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_mnist_distributed.py b/examples/deprecated_tutorials/tutorial_mnist_distributed.py index 29d291ba4..18f7cdb92 100644 --- a/examples/deprecated_tutorials/tutorial_mnist_distributed.py +++ b/examples/deprecated_tutorials/tutorial_mnist_distributed.py @@ -13,7 +13,6 @@ """ import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py index 1ddc2d937..ce3aec007 100644 --- a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py +++ b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py @@ -15,8 +15,8 @@ import multiprocessing import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, FlattenLayer, InputLayer, MaxPool2d) diff --git a/examples/distributed_training/tutorial_mnist_distributed_trainer.py b/examples/distributed_training/tutorial_mnist_distributed_trainer.py index 0f1b8b6dd..0cf916370 100755 --- a/examples/distributed_training/tutorial_mnist_distributed_trainer.py +++ b/examples/distributed_training/tutorial_mnist_distributed_trainer.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/keras_tfslim/tutorial_keras.py b/examples/keras_tfslim/tutorial_keras.py index 0622bc745..33a9ca860 100644 --- a/examples/keras_tfslim/tutorial_keras.py +++ b/examples/keras_tfslim/tutorial_keras.py @@ -4,8 +4,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Input, Lambda diff --git a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py index 8d7b35a6b..6b797a075 100644 --- a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py @@ -10,8 +10,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from 
tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py index 9b6ee4e7f..755d6c28b 100644 --- a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg16.py b/examples/pretrained_cnn/tutorial_models_vgg16.py index e6bb1c22e..b1bd3823f 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg16.py +++ b/examples/pretrained_cnn/tutorial_models_vgg16.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg19.py b/examples/pretrained_cnn/tutorial_models_vgg19.py index 850412c38..922c3bdf5 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg19.py +++ b/examples/pretrained_cnn/tutorial_models_vgg19.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg_static.py b/examples/pretrained_cnn/tutorial_models_vgg_static.py index 40a3ed865..a0e056e4d 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg_static.py +++ b/examples/pretrained_cnn/tutorial_models_vgg_static.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py index 98532debb..d3205045a 100644 --- a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py index 248812e23..84fbf7fc9 100644 --- a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py index 9c8ab1239..fe7666bab 100644 --- a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py index 90d7b0893..d8cab9bc8 100644 --- a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git 
a/examples/quantized_net/tutorial_quanconv_cifar10.py b/examples/quantized_net/tutorial_quanconv_cifar10.py index 6eb35ed67..f93368467 100644 --- a/examples/quantized_net/tutorial_quanconv_cifar10.py +++ b/examples/quantized_net/tutorial_quanconv_cifar10.py @@ -41,8 +41,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl bitW = 8 diff --git a/examples/quantized_net/tutorial_quanconv_mnist.py b/examples/quantized_net/tutorial_quanconv_mnist.py index 4060c6137..66d52d13c 100644 --- a/examples/quantized_net/tutorial_quanconv_mnist.py +++ b/examples/quantized_net/tutorial_quanconv_mnist.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py index f1ee7b4bb..b695fa88a 100644 --- a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py @@ -42,7 +42,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py index e1c305db6..6850b9591 100644 --- a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py +++ b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py index e4b9b1dcf..3170585e4 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py @@ -3,8 +3,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import Model diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py index c09a3c46c..5f09db68b 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py @@ -3,8 +3,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import Model diff --git a/examples/text_classification/tutorial_imdb_fasttext.py b/examples/text_classification/tutorial_imdb_fasttext.py index 2c2c7aed0..731d2fce4 100644 --- a/examples/text_classification/tutorial_imdb_fasttext.py +++ b/examples/text_classification/tutorial_imdb_fasttext.py @@ -31,8 +31,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import * diff --git a/examples/text_generation/tutorial_generate_text.py b/examples/text_generation/tutorial_generate_text.py index 22a17ea37..4c42d0b12 100644 --- a/examples/text_generation/tutorial_generate_text.py +++ 
b/examples/text_generation/tutorial_generate_text.py @@ -28,8 +28,8 @@ import nltk import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * diff --git a/examples/text_ptb/tutorial_ptb_lstm.py b/examples/text_ptb/tutorial_ptb_lstm.py index de08399c9..77c7c3425 100644 --- a/examples/text_ptb/tutorial_ptb_lstm.py +++ b/examples/text_ptb/tutorial_ptb_lstm.py @@ -104,8 +104,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py index 0021a7bfc..9fccca66a 100644 --- a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py +++ b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py @@ -105,8 +105,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/text_word_embedding/tutorial_word2vec_basic.py b/examples/text_word_embedding/tutorial_word2vec_basic.py index 6310699ad..5a1dc842c 100644 --- a/examples/text_word_embedding/tutorial_word2vec_basic.py +++ b/examples/text_word_embedding/tutorial_word2vec_basic.py @@ -44,9 +44,9 @@ import time import numpy as np -import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf import tensorlayer as tl import wget diff --git a/examples/tutorial_work_with_onnx.py b/examples/tutorial_work_with_onnx.py index 522f2ad8c..46fd0cb42 100644 --- a/examples/tutorial_work_with_onnx.py +++ b/examples/tutorial_work_with_onnx.py @@ -117,13 +117,13 @@ import time import numpy as np -import tensorflow as tf -from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph import onnx +import tensorflow as tf import tensorlayer as tl from onnx_tf.backend import prepare from onnx_tf.frontend import tensorflow_graph_to_onnx_model +from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py index 7c7b833c3..4aef4a429 100644 --- a/tensorlayer/activation.py +++ b/tensorlayer/activation.py @@ -3,7 +3,6 @@ """A file containing various activation functions.""" import tensorflow as tf - from tensorlayer.decorators import deprecated __all__ = [ diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py index 753d58041..2664d8d72 100644 --- a/tensorlayer/cost.py +++ b/tensorlayer/cost.py @@ -6,7 +6,6 @@ import tensorflow as tf from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops, math_ops, nn_ops, standard_ops - from tensorlayer import logging __all__ = [ diff --git a/tensorlayer/db.py b/tensorlayer/db.py index 7e9561f2f..1de73bf6a 100644 --- a/tensorlayer/db.py +++ b/tensorlayer/db.py @@ -8,10 +8,10 @@ from datetime import datetime import numpy as np -import tensorflow as tf import gridfs import pymongo +import tensorflow as tf from tensorlayer import logging from tensorlayer.files import ( assign_weights, del_folder, exists_or_mkdir, load_hdf5_to_weights, net2static_graph, save_weights_to_hdf5, @@ -641,7 +641,7 @@ def run_top_task(self, task_name=None, sort=None, **kwargs): logging.info("[Database] Start Task: key: {} sort: {} push time: {}".format(task_name, sort, _datetime)) _script = _script.decode('utf-8') 
with tf.Graph().as_default(): # # as graph: # clear all TF graphs - exec(_script, globals()) + exec (_script, globals()) # set status to finished _ = self.db.Task.find_one_and_update({'_id': _id}, {'$set': {'status': 'finished'}}) diff --git a/tensorlayer/distributed.py b/tensorlayer/distributed.py index 544aac87e..d3fbdd38f 100644 --- a/tensorlayer/distributed.py +++ b/tensorlayer/distributed.py @@ -6,7 +6,6 @@ import tensorflow as tf from tensorflow.python.training import session_run_hook - from tensorlayer import logging from tensorlayer.decorators import deprecated from tensorlayer.lazy_imports import LazyImport diff --git a/tensorlayer/files/dataset_loaders/voc_dataset.py b/tensorlayer/files/dataset_loaders/voc_dataset.py index 458d5eb66..c5ccadbcf 100644 --- a/tensorlayer/files/dataset_loaders/voc_dataset.py +++ b/tensorlayer/files/dataset_loaders/voc_dataset.py @@ -4,7 +4,6 @@ import os import tensorflow as tf - from tensorlayer import logging, utils from tensorlayer.files.utils import (del_file, del_folder, folder_exists, load_file_list, diff --git a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py index 0261a8581..77c1f93f9 100644 --- a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py +++ b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py @@ -6,7 +6,6 @@ import tarfile from tensorflow.python.platform import gfile - from tensorlayer import logging from tensorlayer.files.utils import maybe_download_and_extract diff --git a/tensorlayer/files/utils.py b/tensorlayer/files/utils.py index d5c972dc6..e4b0f6f8e 100644 --- a/tensorlayer/files/utils.py +++ b/tensorlayer/files/utils.py @@ -18,16 +18,16 @@ import cloudpickle import h5py import numpy as np -import progressbar import scipy.io as sio -import tensorflow as tf from six.moves import cPickle + +import progressbar +import tensorflow as tf +import tensorlayer as tl from tensorflow.python.keras.saving import model_config as model_config_lib from tensorflow.python.platform import gfile from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export - -import tensorlayer as tl from tensorlayer import logging, nlp, utils, visualize # from six.moves import zip diff --git a/tensorlayer/initializers.py b/tensorlayer/initializers.py index 666777824..f68c05c1d 100644 --- a/tensorlayer/initializers.py +++ b/tensorlayer/initializers.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import numpy as np + import tensorflow as tf __all__ = [ diff --git a/tensorlayer/layers/activation.py b/tensorlayer/layers/activation.py index 44fcc47a9..9abb19ce7 100644 --- a/tensorlayer/layers/activation.py +++ b/tensorlayer/layers/activation.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.activation import leaky_relu6, leaky_twice_relu6 from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/binary_conv.py b/tensorlayer/layers/convolution/binary_conv.py index 23448cf6f..14e5a8721 100644 --- a/tensorlayer/layers/convolution/binary_conv.py +++ b/tensorlayer/layers/convolution/binary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/deformable_conv.py b/tensorlayer/layers/convolution/deformable_conv.py index 5f75bbe15..b9a8224db 100644 --- 
a/tensorlayer/layers/convolution/deformable_conv.py +++ b/tensorlayer/layers/convolution/deformable_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/depthwise_conv.py b/tensorlayer/layers/convolution/depthwise_conv.py index d6136ede3..4fe4dc34c 100644 --- a/tensorlayer/layers/convolution/depthwise_conv.py +++ b/tensorlayer/layers/convolution/depthwise_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/dorefa_conv.py b/tensorlayer/layers/convolution/dorefa_conv.py index ed9b32dd8..1f8944382 100644 --- a/tensorlayer/layers/convolution/dorefa_conv.py +++ b/tensorlayer/layers/convolution/dorefa_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_conv.py b/tensorlayer/layers/convolution/expert_conv.py index d7e59a0e8..fb27b9df6 100644 --- a/tensorlayer/layers/convolution/expert_conv.py +++ b/tensorlayer/layers/convolution/expert_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_deconv.py b/tensorlayer/layers/convolution/expert_deconv.py index cb5cd6773..a1571b2cb 100644 --- a/tensorlayer/layers/convolution/expert_deconv.py +++ b/tensorlayer/layers/convolution/expert_deconv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/group_conv.py b/tensorlayer/layers/convolution/group_conv.py index 34d8c10e6..2923a10ae 100644 --- a/tensorlayer/layers/convolution/group_conv.py +++ b/tensorlayer/layers/convolution/group_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/quan_conv.py b/tensorlayer/layers/convolution/quan_conv.py index 432764b63..662df2661 100644 --- a/tensorlayer/layers/convolution/quan_conv.py +++ b/tensorlayer/layers/convolution/quan_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/quan_conv_bn.py b/tensorlayer/layers/convolution/quan_conv_bn.py index 0ef5ac313..1c1593373 100644 --- a/tensorlayer/layers/convolution/quan_conv_bn.py +++ b/tensorlayer/layers/convolution/quan_conv_bn.py @@ -3,7 +3,6 @@ import tensorflow as tf from tensorflow.python.training import moving_averages - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/convolution/separable_conv.py b/tensorlayer/layers/convolution/separable_conv.py index b6ae62446..ff67672ba 100644 --- a/tensorlayer/layers/convolution/separable_conv.py +++ 
b/tensorlayer/layers/convolution/separable_conv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_conv.py b/tensorlayer/layers/convolution/simplified_conv.py index c00ff8fe7..8c8eebece 100644 --- a/tensorlayer/layers/convolution/simplified_conv.py +++ b/tensorlayer/layers/convolution/simplified_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_deconv.py b/tensorlayer/layers/convolution/simplified_deconv.py index 847062859..569fe0810 100644 --- a/tensorlayer/layers/convolution/simplified_deconv.py +++ b/tensorlayer/layers/convolution/simplified_deconv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/super_resolution.py b/tensorlayer/layers/convolution/super_resolution.py index 35fee8722..a3f51e2a8 100644 --- a/tensorlayer/layers/convolution/super_resolution.py +++ b/tensorlayer/layers/convolution/super_resolution.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/ternary_conv.py b/tensorlayer/layers/convolution/ternary_conv.py index 9a97c7bec..512350ba5 100644 --- a/tensorlayer/layers/convolution/ternary_conv.py +++ b/tensorlayer/layers/convolution/ternary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/core.py b/tensorlayer/layers/core.py index e5b2c27fb..8e13631b7 100644 --- a/tensorlayer/layers/core.py +++ b/tensorlayer/layers/core.py @@ -5,7 +5,6 @@ from abc import abstractmethod import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import (deprecated_alias, private_method, protected_method) diff --git a/tensorlayer/layers/dense/base_dense.py b/tensorlayer/layers/dense/base_dense.py index a5b800f04..bec9d3f6f 100644 --- a/tensorlayer/layers/dense/base_dense.py +++ b/tensorlayer/layers/dense/base_dense.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/binary_dense.py b/tensorlayer/layers/dense/binary_dense.py index 0492a01d9..74d5208cd 100644 --- a/tensorlayer/layers/dense/binary_dense.py +++ b/tensorlayer/layers/dense/binary_dense.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/dorefa_dense.py b/tensorlayer/layers/dense/dorefa_dense.py index 1e80e2339..73069d478 100644 --- a/tensorlayer/layers/dense/dorefa_dense.py +++ b/tensorlayer/layers/dense/dorefa_dense.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 
-*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/dropconnect.py b/tensorlayer/layers/dense/dropconnect.py index 4b16fba5b..371ed2e6b 100644 --- a/tensorlayer/layers/dense/dropconnect.py +++ b/tensorlayer/layers/dense/dropconnect.py @@ -4,7 +4,6 @@ import numbers import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/quan_dense.py b/tensorlayer/layers/dense/quan_dense.py index 2e6296434..8d5c594c7 100644 --- a/tensorlayer/layers/dense/quan_dense.py +++ b/tensorlayer/layers/dense/quan_dense.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/quan_dense_bn.py b/tensorlayer/layers/dense/quan_dense_bn.py index 7b517b9ba..bcbd70950 100644 --- a/tensorlayer/layers/dense/quan_dense_bn.py +++ b/tensorlayer/layers/dense/quan_dense_bn.py @@ -4,7 +4,6 @@ import tensorflow as tf # from tensorlayer.layers.core import LayersConfig from tensorflow.python.training import moving_averages - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/ternary_dense.py b/tensorlayer/layers/dense/ternary_dense.py index dce6be9eb..28d84297e 100644 --- a/tensorlayer/layers/dense/ternary_dense.py +++ b/tensorlayer/layers/dense/ternary_dense.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dropout.py b/tensorlayer/layers/dropout.py index 3724d8b43..25fe80a36 100644 --- a/tensorlayer/layers/dropout.py +++ b/tensorlayer/layers/dropout.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/embedding.py b/tensorlayer/layers/embedding.py index 80c5cadfa..a82c1a93b 100644 --- a/tensorlayer/layers/embedding.py +++ b/tensorlayer/layers/embedding.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/extend.py b/tensorlayer/layers/extend.py index 42395a537..09d5508db 100644 --- a/tensorlayer/layers/extend.py +++ b/tensorlayer/layers/extend.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/image_resampling.py b/tensorlayer/layers/image_resampling.py index 3b2a2825a..4713200d3 100644 --- a/tensorlayer/layers/image_resampling.py +++ b/tensorlayer/layers/image_resampling.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/inputs.py b/tensorlayer/layers/inputs.py index 0330347fe..4f2544b06 100644 --- a/tensorlayer/layers/inputs.py +++ b/tensorlayer/layers/inputs.py @@ 
-2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer, LayerNode diff --git a/tensorlayer/layers/lambda_layers.py b/tensorlayer/layers/lambda_layers.py index 17501a4e4..9b82ad603 100644 --- a/tensorlayer/layers/lambda_layers.py +++ b/tensorlayer/layers/lambda_layers.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.files import utils diff --git a/tensorlayer/layers/merge.py b/tensorlayer/layers/merge.py index 346a65962..2509d35a6 100644 --- a/tensorlayer/layers/merge.py +++ b/tensorlayer/layers/merge.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/noise.py b/tensorlayer/layers/noise.py index bd9c2df9c..c658f8e19 100644 --- a/tensorlayer/layers/noise.py +++ b/tensorlayer/layers/noise.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/normalization.py b/tensorlayer/layers/normalization.py index d8cec274c..0de0e8ed1 100644 --- a/tensorlayer/layers/normalization.py +++ b/tensorlayer/layers/normalization.py @@ -2,11 +2,10 @@ # -*- coding: utf-8 -*- import tensorflow as tf +import tensorlayer as tl from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.training import moving_averages - -import tensorlayer as tl from tensorlayer import logging from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/padding.py b/tensorlayer/layers/padding.py index db1bbb304..edcb720a5 100644 --- a/tensorlayer/layers/padding.py +++ b/tensorlayer/layers/padding.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/pooling.py b/tensorlayer/layers/pooling.py index 2046de6c5..a22cea358 100644 --- a/tensorlayer/layers/pooling.py +++ b/tensorlayer/layers/pooling.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/quantize.py b/tensorlayer/layers/quantize.py index 3b5b19635..47ad2a088 100644 --- a/tensorlayer/layers/quantize.py +++ b/tensorlayer/layers/quantize.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/recurrent.py b/tensorlayer/layers/recurrent.py index 16b7208d0..2364c6a7d 100644 --- a/tensorlayer/layers/recurrent.py +++ b/tensorlayer/layers/recurrent.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/scale.py b/tensorlayer/layers/scale.py index ac1800529..6546d70af 100644 --- a/tensorlayer/layers/scale.py +++ b/tensorlayer/layers/scale.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from 
tensorlayer.initializers import constant from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/shape.py b/tensorlayer/layers/shape.py index f8e7b47db..e308eb0c4 100644 --- a/tensorlayer/layers/shape.py +++ b/tensorlayer/layers/shape.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/spatial_transformer.py b/tensorlayer/layers/spatial_transformer.py index 0d0f578d0..262108a68 100644 --- a/tensorlayer/layers/spatial_transformer.py +++ b/tensorlayer/layers/spatial_transformer.py @@ -2,11 +2,11 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf from six.moves import xrange -from tensorflow.python.ops import array_ops +import tensorflow as tf import tensorlayer as tl +from tensorflow.python.ops import array_ops from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/stack.py b/tensorlayer/layers/stack.py index c31327989..c35e3837f 100644 --- a/tensorlayer/layers/stack.py +++ b/tensorlayer/layers/stack.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/utils.py b/tensorlayer/layers/utils.py index e5dd154b1..6d411589f 100644 --- a/tensorlayer/layers/utils.py +++ b/tensorlayer/layers/utils.py @@ -2,10 +2,10 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf -from tensorflow.python.ops.rnn_cell import LSTMStateTuple +import tensorflow as tf import tensorlayer as tl +from tensorflow.python.ops.rnn_cell import LSTMStateTuple from tensorlayer import logging from tensorlayer.decorators import deprecated, deprecated_alias diff --git a/tensorlayer/models/core.py b/tensorlayer/models/core.py index c811b9648..cbcff4bf3 100644 --- a/tensorlayer/models/core.py +++ b/tensorlayer/models/core.py @@ -3,9 +3,8 @@ from queue import Queue import tensorflow as tf -from tensorflow.python.framework import ops as tf_ops - import tensorlayer as tl +from tensorflow.python.framework import ops as tf_ops from tensorlayer import logging from tensorlayer.files import utils from tensorlayer.layers import Layer, ModelLayer diff --git a/tensorlayer/models/mobilenetv1.py b/tensorlayer/models/mobilenetv1.py index 8065eeef3..4908b3d89 100644 --- a/tensorlayer/models/mobilenetv1.py +++ b/tensorlayer/models/mobilenetv1.py @@ -5,7 +5,6 @@ import os import tensorflow as tf - from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) from tensorlayer.layers import (BatchNorm, Conv2d, DepthwiseConv2d, Flatten, GlobalMeanPool2d, Input, Reshape) diff --git a/tensorlayer/models/squeezenetv1.py b/tensorlayer/models/squeezenetv1.py index b38d42dc8..a2d7e4304 100644 --- a/tensorlayer/models/squeezenetv1.py +++ b/tensorlayer/models/squeezenetv1.py @@ -5,7 +5,6 @@ import os import tensorflow as tf - from tensorlayer import logging from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract) from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, Lambda, MaxPool2d) diff --git a/tensorlayer/models/vgg.py b/tensorlayer/models/vgg.py index b072841be..06648cb53 100644 --- a/tensorlayer/models/vgg.py +++ b/tensorlayer/models/vgg.py @@ -30,8 
+30,8 @@ import os import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.files import assign_weights, maybe_download_and_extract diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py index d96a7acf1..ed1ce975d 100755 --- a/tensorlayer/nlp.py +++ b/tensorlayer/nlp.py @@ -11,11 +11,11 @@ from collections import Counter import numpy as np -import tensorflow as tf from six.moves import urllib, xrange -from tensorflow.python.platform import gfile +import tensorflow as tf import tensorlayer as tl +from tensorflow.python.platform import gfile from tensorlayer.lazy_imports import LazyImport nltk = LazyImport("nltk") diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py index e5cbe6bd4..8ddce7316 100644 --- a/tensorlayer/rein.py +++ b/tensorlayer/rein.py @@ -2,9 +2,10 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf from six.moves import xrange +import tensorflow as tf + __all__ = [ 'discount_episode_rewards', 'cross_entropy_reward_loss', diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py index d6b8e6d78..35e054afb 100644 --- a/tensorlayer/utils.py +++ b/tensorlayer/utils.py @@ -11,9 +11,9 @@ from sys import platform as _platform import numpy as np -import tensorflow as tf from sklearn.metrics import accuracy_score, confusion_matrix, f1_score +import tensorflow as tf import tensorlayer as tl __all__ = [ From 4793c66b9a892913ae56bf6510895181c7cdc344 Mon Sep 17 00:00:00 2001 From: quantumiracle <1402434478@qq.com> Date: Tue, 11 Jun 2019 15:19:05 +0100 Subject: [PATCH 57/57] yapf --- .../tutorial_cifar10_cnn_static.py | 6 +- .../tutorial_mnist_mlp_dynamic.py | 14 +- .../tutorial_mnist_mlp_dynamic_2.py | 6 +- .../tutorial_mnist_mlp_static.py | 10 +- .../tutorial_mnist_mlp_static_2.py | 4 +- .../basic_tutorials/tutorial_mnist_siamese.py | 2 +- .../data_process/tutorial_tf_dataset_voc.py | 5 +- examples/data_process/tutorial_tfrecord.py | 3 +- examples/data_process/tutorial_tfrecord2.py | 1 + ...torial_imagenet_inceptionV3_distributed.py | 3 +- .../tutorial_cifar10_distributed_trainer.py | 3 +- examples/keras_tfslim/tutorial_keras.py | 5 +- .../pretrained_cnn/tutorial_models_vgg16.py | 1 - .../pretrained_cnn/tutorial_models_vgg19.py | 1 - .../tutorial_models_vgg_static.py | 1 - .../reinforcement_learning/baselines/SAC.py | 212 ++++++++------- .../reinforcement_learning/baselines/utils.py | 26 +- .../baselines/wrappers.py | 51 ++-- .../reinforcement_learning/tutorial_A3C.py | 64 +++-- .../reinforcement_learning/tutorial_AC.py | 51 ++-- .../reinforcement_learning/tutorial_C51.py | 73 ++--- .../reinforcement_learning/tutorial_DDPG.py | 23 +- .../reinforcement_learning/tutorial_DPPO.py | 26 +- .../reinforcement_learning/tutorial_DQN.py | 10 +- .../tutorial_DQN_variants.py | 75 +++--- .../reinforcement_learning/tutorial_PG.py | 38 +-- .../reinforcement_learning/tutorial_PPO.py | 34 ++- .../tutorial_Retrace.py | 67 ++--- .../reinforcement_learning/tutorial_SAC.py | 254 ++++++++++-------- .../reinforcement_learning/tutorial_TD3.py | 215 ++++++++------- .../reinforcement_learning/tutorial_TRPO.py | 31 ++- .../tutorial_atari_pong.py | 10 +- .../reinforcement_learning/tutorial_format.py | 38 ++- .../tutorial_prioritized_replay.py | 105 ++++---- .../tutorial_wrappers.py | 51 ++-- ...ial_spatial_transformer_network_dynamic.py | 1 + ...rial_spatial_transformer_network_static.py | 3 +- .../tutorial_imdb_fasttext.py | 10 +- .../tutorial_word2vec_basic.py | 23 +- 
.../files/dataset_loaders/celebA_dataset.py | 3 +- .../files/dataset_loaders/cyclegan_dataset.py | 3 +- .../dataset_loaders/flickr_1M_dataset.py | 6 +- .../dataset_loaders/flickr_25k_dataset.py | 6 +- .../files/dataset_loaders/mpii_dataset.py | 3 +- .../files/dataset_loaders/voc_dataset.py | 4 +- tensorlayer/layers/convolution/quan_conv.py | 3 +- .../layers/convolution/quan_conv_bn.py | 3 +- tensorlayer/layers/dense/quan_dense.py | 3 +- tensorlayer/layers/dense/quan_dense_bn.py | 3 +- 49 files changed, 853 insertions(+), 740 deletions(-) diff --git a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py index 93794c414..ecb1117ce 100644 --- a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py +++ b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py @@ -8,8 +8,7 @@ import tensorflow as tf import tensorlayer as tl -from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, - LocalResponseNorm, MaxPool2d) +from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LocalResponseNorm, MaxPool2d) from tensorlayer.models import Model # enable debug logging @@ -74,7 +73,6 @@ def get_model_batchnorm(inputs_shape): # get the network net = get_model([None, 24, 24, 3]) - # training settings batch_size = 128 n_epoch = 50000 @@ -82,7 +80,7 @@ def get_model_batchnorm(inputs_shape): print_freq = 5 n_step_epoch = int(len(y_train) / batch_size) n_step = n_epoch * n_step_epoch -shuffle_buffer_size = 128 # 100 +shuffle_buffer_size = 128 # 100 # init_learning_rate = 0.1 # learning_rate_decay_factor = 0.1 # num_epoch_decay = 350 diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py index c822aa012..f4ad787b7 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py @@ -13,18 +13,19 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1, 784)) + ## define the network class CustomModel(Model): def __init__(self): super(CustomModel, self).__init__() - self.dropout1 = Dropout(keep=0.8)#(self.innet) - self.dense1 = Dense(n_units=800, act=tf.nn.relu, in_channels=784)#(self.dropout1) - self.dropout2 = Dropout(keep=0.8)#(self.dense1) - self.dense2 = Dense(n_units=800, act=tf.nn.relu, in_channels=800)#(self.dropout2) - self.dropout3 = Dropout(keep=0.8)#(self.dense2) - self.dense3 = Dense(n_units=10, act=tf.nn.relu, in_channels=800)#(self.dropout3) + self.dropout1 = Dropout(keep=0.8) #(self.innet) + self.dense1 = Dense(n_units=800, act=tf.nn.relu, in_channels=784) #(self.dropout1) + self.dropout2 = Dropout(keep=0.8) #(self.dense1) + self.dense2 = Dense(n_units=800, act=tf.nn.relu, in_channels=800) #(self.dropout2) + self.dropout3 = Dropout(keep=0.8) #(self.dense2) + self.dense3 = Dense(n_units=10, act=tf.nn.relu, in_channels=800) #(self.dropout3) def forward(self, x, foo=None): z = self.dropout1(x) @@ -37,6 +38,7 @@ def forward(self, x, foo=None): out = tf.nn.relu(out) return out + MLP = CustomModel() ## start training diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py index 0d94b1dfa..e2d45943d 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py @@ -13,13 +13,14 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = 
tl.files.load_mnist_dataset(shape=(-1, 784)) + ## define the network class CustomModelHidden(Model): def __init__(self): super(CustomModelHidden, self).__init__() - self.dropout1 = Dropout(keep=0.8)#(self.innet) + self.dropout1 = Dropout(keep=0.8) #(self.innet) self.seq = LayerList( [ @@ -29,7 +30,7 @@ def __init__(self): ] ) - self.dropout3 = Dropout(keep=0.8)#(self.seq) + self.dropout3 = Dropout(keep=0.8) #(self.seq) def forward(self, x): z = self.dropout1(x) @@ -37,6 +38,7 @@ def forward(self, x): z = self.dropout3(z) return z + class CustomModelOut(Model): def __init__(self): diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static.py b/examples/basic_tutorials/tutorial_mnist_mlp_static.py index 34e17fdd4..08b72bd64 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static.py @@ -21,14 +21,18 @@ def get_model(inputs_shape): ni = Input(inputs_shape) nn = Dropout(keep=0.8)(ni) - nn = Dense(n_units=800, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=800, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer nn = Dropout(keep=0.8)(nn) - nn = Dense(n_units=800, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=800, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer nn = Dropout(keep=0.8)(nn) - nn = Dense(n_units=10, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=10, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer M = Model(inputs=ni, outputs=nn, name="mlp") return M + MLP = get_model([None, 784]) import pprint pprint.pprint(MLP.config) diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py index a9a2c7d48..67a519e4a 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py @@ -13,12 +13,12 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1, 784)) - ## define the network # the softmax is implemented internally in tl.cost.cross_entropy(y, y_) to # speed up computation, so we use identity here. 
# see tf.nn.sparse_softmax_cross_entropy_with_logits() + def hidden_model(inputs_shape): ni = Input(inputs_shape) nn = Dropout(keep=0.8)(ni) @@ -28,6 +28,7 @@ def hidden_model(inputs_shape): return Model(inputs=ni, outputs=nn, name="mlp_hidden") + def get_model(inputs_shape, hmodel): hidden = hmodel.as_layer() ni = Input(inputs_shape) @@ -37,6 +38,7 @@ def get_model(inputs_shape, hmodel): return Model(inputs=ni, outputs=nn, name="mlp") + MLP_hidden = hidden_model([None, 784]) MLP = get_model([None, 784], MLP_hidden) # MLP.print_layers() diff --git a/examples/basic_tutorials/tutorial_mnist_siamese.py b/examples/basic_tutorials/tutorial_mnist_siamese.py index fe4abdc52..e8d50ef94 100644 --- a/examples/basic_tutorials/tutorial_mnist_siamese.py +++ b/examples/basic_tutorials/tutorial_mnist_siamese.py @@ -33,7 +33,7 @@ def contrastive_loss(label, feature1, feature2): def compute_accuracy(label, feature1, feature2): - eucd = tf.sqrt(tf.reduce_sum((feature1 - feature2) ** 2, axis=1)) + eucd = tf.sqrt(tf.reduce_sum((feature1 - feature2)**2, axis=1)) pred = tf.cast(eucd < 0.5, label.dtype) return tf.reduce_mean(tf.cast(tf.equal(pred, label), tf.float32)) diff --git a/examples/data_process/tutorial_tf_dataset_voc.py b/examples/data_process/tutorial_tf_dataset_voc.py index 9779b1f60..c3ac07e06 100644 --- a/examples/data_process/tutorial_tf_dataset_voc.py +++ b/examples/data_process/tutorial_tf_dataset_voc.py @@ -108,5 +108,6 @@ def _map_fn(filename, annotation): ## save all images for i in range(len(im)): print(ann[i][1]) - tl.vis.draw_boxes_and_labels_to_image(im[i] * 255, ann[i][0], ann[i][1], [], classes, - True, save_name='_bbox_vis_%d.png' % i) + tl.vis.draw_boxes_and_labels_to_image( + im[i] * 255, ann[i][0], ann[i][1], [], classes, True, save_name='_bbox_vis_%d.png' % i + ) diff --git a/examples/data_process/tutorial_tfrecord.py b/examples/data_process/tutorial_tfrecord.py index bcf3fe46a..6c5c38162 100644 --- a/examples/data_process/tutorial_tfrecord.py +++ b/examples/data_process/tutorial_tfrecord.py @@ -79,8 +79,7 @@ def read_and_decode(filename): raw_dataset = tf.data.TFRecordDataset([filename]).shuffle(1000).batch(4) for serialized_example in raw_dataset: features = tf.io.parse_example( - serialized_example, - features={ + serialized_example, features={ 'label': tf.io.FixedLenFeature([], tf.int64), 'img_raw': tf.io.FixedLenFeature([], tf.string), } diff --git a/examples/data_process/tutorial_tfrecord2.py b/examples/data_process/tutorial_tfrecord2.py index 22b3d7757..6997be251 100755 --- a/examples/data_process/tutorial_tfrecord2.py +++ b/examples/data_process/tutorial_tfrecord2.py @@ -77,6 +77,7 @@ def read_and_decode(filename): label_batch = tf.cast(features['label'], tf.int32) yield img_batch, label_batch + img_batch, label_batch = next(read_and_decode("train.tfrecords")) print("img_batch : %s" % img_batch.shape) print("label_batch : %s" % label_batch.shape) diff --git a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py index 15c0a3f3c..1c2801306 100644 --- a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py +++ b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py @@ -23,8 +23,7 @@ import tensorflow as tf import tensorlayer as tl from tensorflow.contrib import slim -from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, - inception_v3_arg_scope) +from tensorflow.contrib.slim.python.slim.nets.inception_v3 import 
(inception_v3, inception_v3_arg_scope) from tensorflow.python.framework.errors_impl import OutOfRangeError from tensorflow.python.training import session_run_hook from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook diff --git a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py index ce3aec007..340e37b2f 100644 --- a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py +++ b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py @@ -18,8 +18,7 @@ import tensorflow as tf import tensorlayer as tl -from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, - FlattenLayer, InputLayer, MaxPool2d) +from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, FlattenLayer, InputLayer, MaxPool2d) tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/keras_tfslim/tutorial_keras.py b/examples/keras_tfslim/tutorial_keras.py index 33a9ca860..9b877738c 100644 --- a/examples/keras_tfslim/tutorial_keras.py +++ b/examples/keras_tfslim/tutorial_keras.py @@ -15,7 +15,6 @@ batch_size = 128 - # keras layers layers = [ tf.keras.layers.Dropout(0.8), @@ -23,12 +22,12 @@ tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(800, activation='relu'), tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='linear')] + tf.keras.layers.Dense(10, activation='linear') +] keras_block = tf.keras.Sequential(layers) # in order to compile keras model and get trainable_variables of the keras model _ = keras_block(np.random.random([batch_size, 784]).astype(np.float32)) - # build tl model using keras layers ni = Input([None, 784], dtype=tf.float32) nn = Lambda(fn=keras_block, fn_weights=keras_block.trainable_variables)(ni) diff --git a/examples/pretrained_cnn/tutorial_models_vgg16.py b/examples/pretrained_cnn/tutorial_models_vgg16.py index b1bd3823f..7749d5391 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg16.py +++ b/examples/pretrained_cnn/tutorial_models_vgg16.py @@ -12,7 +12,6 @@ tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg16(pretrained=True) diff --git a/examples/pretrained_cnn/tutorial_models_vgg19.py b/examples/pretrained_cnn/tutorial_models_vgg19.py index 922c3bdf5..09f2afa22 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg19.py +++ b/examples/pretrained_cnn/tutorial_models_vgg19.py @@ -12,7 +12,6 @@ tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg19(pretrained=True) diff --git a/examples/pretrained_cnn/tutorial_models_vgg_static.py b/examples/pretrained_cnn/tutorial_models_vgg_static.py index a0e056e4d..0e73b82ef 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg_static.py +++ b/examples/pretrained_cnn/tutorial_models_vgg_static.py @@ -12,7 +12,6 @@ tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg16(pretrained=True, mode='static') diff --git a/examples/reinforcement_learning/baselines/SAC.py b/examples/reinforcement_learning/baselines/SAC.py index df017edbf..5760298d3 100644 --- a/examples/reinforcement_learning/baselines/SAC.py +++ b/examples/reinforcement_learning/baselines/SAC.py @@ -51,35 +51,40 @@ args = parser.parse_args() - class SoftQNetwork(Model): + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): super(SoftQNetwork, self).__init__() input_dim = num_inputs + num_actions - w_init = 
tf.keras.initializers.glorot_normal(seed=None) # glorot initialization is better than uniform in practice + w_init = tf.keras.initializers.glorot_normal( + seed=None + ) # glorot initialization is better than uniform in practice # w_init = tf.random_uniform_initializer(-init_w, init_w) self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') - + def forward(self, input): x = self.linear1(input) x = self.linear2(x) x = self.linear3(x) return x - - + + class PolicyNetwork(Model): - def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): + + def __init__( + self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2 + ): super(PolicyNetwork, self).__init__() - + self.log_std_min = log_std_min self.log_std_max = log_std_max - + w_init = tf.keras.initializers.glorot_normal(seed=None) # w_init = tf.random_uniform_initializer(-init_w, init_w) - + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') @@ -92,60 +97,61 @@ def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w= self.action_range = action_range self.num_actions = num_actions - def forward(self, state): x = self.linear1(state) x = self.linear2(x) x = self.linear3(x) - mean = self.mean_linear(x) + mean = self.mean_linear(x) log_std = self.log_std_linear(x) log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) - + return mean, log_std - + def evaluate(self, state, epsilon=1e-6): ''' generate action with state for calculating gradients ''' state = state.astype(np.float32) mean, log_std = self.forward(state) - std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow - + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + normal = Normal(0, 1) - z = normal.sample() - action_0 = tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick - action = self.action_range*action_0 + z = normal.sample() + action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * action_0 # according to original paper, with an extra last term for normalizing different action range - log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. - action_0**2 + epsilon) - np.log(self.action_range) - # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); - # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 + + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. 
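To make the comment block above concrete, here is a minimal standalone sketch of the squashed-Gaussian log-probability. It assumes TensorFlow 2.x with tensorflow-probability installed; the batch size of 4 and the action dimension of 2 are made up for illustration and are not taken from this file.

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

mean = tf.zeros([4, 2])                    # (batch, action_dim)
std = tf.ones([4, 2])
z = tfd.Normal(0., 1.).sample(mean.shape)  # reparameterization noise
u = mean + std * z                         # pre-squash Gaussian sample
a = tf.math.tanh(u)                        # squashed action in (-1, 1)

# per-dimension Gaussian log-density has shape (batch, action_dim) ...
per_dim = tfd.Normal(mean, std).log_prob(u)

# ... so summing over the action axis gives the joint log-density of the
# diagonal Gaussian, identical to using MultivariateNormalDiag directly
joint = tf.reduce_sum(per_dim, axis=1)
mvn = tfd.MultivariateNormalDiag(mean, std).log_prob(u)
np.testing.assert_allclose(joint.numpy(), mvn.numpy(), rtol=1e-5)

# change of variables a = tanh(u):
#   log pi(a) = log N(u | mean, std) - sum_i log(1 - tanh(u_i)^2)
log_pi = joint - tf.reduce_sum(tf.math.log(1. - a ** 2 + 1e-6), axis=1)

Rescaling the squashed action by action_range multiplies the Jacobian by action_range in every dimension, which is why evaluate() additionally subtracts np.log(self.action_range) from the per-dimension log-probability before summing.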
- log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced return action, log_prob, z, mean, log_std - - + def get_action(self, state, deterministic): ''' generate action with state for interaction with envronment ''' mean, log_std = self.forward([state]) std = tf.math.exp(log_std) - + normal = Normal(0, 1) - z = normal.sample() - action = self.action_range * tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick - - action = self.action_range*mean if deterministic else action - return action.numpy()[0] + z = normal.sample() + action = self.action_range * tf.math.tanh( + mean + std * z + ) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * mean if deterministic else action + return action.numpy()[0] - def sample_action(self,): + def sample_action(self, ): ''' generate random actions for exploration ''' - a = tf.random.uniform([self.num_actions], -1, 1) + a = tf.random.uniform([self.num_actions], -1, 1) - return self.action_range*a.numpy() + return self.action_range * a.numpy() class SAC_Trainer(): - def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, policy_lr = 3e-4, alpha_lr = 3e-4): + + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr=3e-4, policy_lr=3e-4, alpha_lr=3e-4): self.replay_buffer = replay_buffer # initialize all networks @@ -168,7 +174,7 @@ def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, po self.policy_optimizer = tf.optimizers.Adam(policy_lr) self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) # self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) - + def target_ini(self, net, target_net): ''' hard-copy update for initializing target networks ''' for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): @@ -181,52 +187,53 @@ def target_soft_update(self, net, target_net, soft_tau): target_param.assign( # copy weight value into target parameters target_param * (1.0 - soft_tau) + param * soft_tau ) - return target_net - - def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2): + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): ''' update all networks in SAC ''' state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) reward = reward[:, np.newaxis] # expand dim done = done[:, np.newaxis] - reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std - + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std - # Training Q Function + # Training Q Function new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples - target_q_min = tf.minimum(self.target_soft_q_net1(target_q_input),self.target_soft_q_net2(target_q_input)) - self.alpha * next_log_prob - target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + target_q_min = tf.minimum( + self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input) + ) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # 
if done==1, only reward q_input = tf.concat([state, action], 1) # the dim 0 is number of samples - + with tf.GradientTape() as q1_tape: predicted_q_value1 = self.soft_q_net1(q_input) q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) - + with tf.GradientTape() as q2_tape: predicted_q_value2 = self.soft_q_net2(q_input) q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) - # Training Policy Function + # Training Policy Function with tf.GradientTape() as p_tape: new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples ''' implementation 1 ''' - predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input),self.soft_q_net2(new_q_input)) + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input)) ''' implementation 2 ''' # predicted_new_q_value = self.soft_q_net1(new_q_input) policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) - - # Updating alpha w.r.t entropy - # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) if auto_entropy is True: with tf.GradientTape() as alpha_tape: alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) @@ -238,10 +245,10 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy alpha_loss = 0 # Soft update the target value nets - self.target_soft_q_net1=self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) - self.target_soft_q_net2=self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) - def save_weights(self): # save trained weights + def save_weights(self): # save trained weights save_model(self.soft_q_net1, 'model_q_net1', 'SAC') save_model(self.soft_q_net2, 'model_q_net2', 'SAC') save_model(self.target_soft_q_net1, 'model_target_q_net1', 'SAC') @@ -254,7 +261,7 @@ def save_weights(self): # save trained weights # tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') # tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') - def load_weights(self): # load trained weights + def load_weights(self): # load trained weights # tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) # tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) # tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) @@ -266,6 +273,7 @@ def load_weights(self): # load trained weights load_model(self.target_soft_q_net2, 'model_target_q_net2', 'SAC') load_model(self.policy_net, 
'model_policy_net', 'SAC') + # def plot(frame_idx, rewards): # clear_output(True) # plt.figure(figsize=(20,5)) @@ -274,38 +282,36 @@ def load_weights(self): # load trained weights # plt.xlabel('Episode') # plt.ylabel('Episode Reward') # plt.savefig('sac.png') - # plt.show() - +# plt.show() # choose env ENV = 'Pendulum-v0' env = NormalizedActions(gym.make(ENV)) action_dim = env.action_space.shape[0] -state_dim = env.observation_space.shape[0] -action_range=1. +state_dim = env.observation_space.shape[0] +action_range = 1. replay_buffer_size = 5e5 replay_buffer = ReplayBuffer(replay_buffer_size) - # hyper-parameters for RL training -max_frames = 30000 # total number of steps for training -test_frames = 300 # total number of steps for testing -max_steps = 150 # maximum number of steps for one episode -batch_size = 64 # udpate batchsize -explore_steps = 100 # 500 for random action sampling in the beginning of training -update_itr = 3 # repeated updates for single step -hidden_dim = 32 # size of hidden layers for networks -soft_q_lr = 3e-4 # q_net learning rate -policy_lr = 3e-4 # policy_net learning rate -alpha_lr = 3e-4 # alpha learning rate -policy_target_update_interval = 3 # delayed update for the policy network and target networks +max_frames = 30000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks # explore_noise_scale = 1.0 # range of action noise for exploration # eval_noise_scale = 0.5 # range of action noise for evaluation of action value -reward_scale = 1. # value range of reward +reward_scale = 1. 
# value range of reward -AUTO_ENTROPY=True # automatically udpating variable alpha for entropy -DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic +AUTO_ENTROPY = True # automatically udpating variable alpha for entropy +DETERMINISTIC = False # stochastic action policy if False, otherwise deterministic sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ @@ -320,75 +326,79 @@ def load_weights(self): # load trained weights # training loop if args.train: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] while frame_idx < max_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + _ = sac_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward for step in range(max_steps): if frame_idx > explore_steps: - action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) else: action = sac_trainer.policy_net.sample_action() - next_state, reward, done, _ = env.step(action) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 + done = 1 if done ==True else 0 replay_buffer.push(state, action, reward, next_state, done) - + state = next_state episode_reward += reward frame_idx += 1 - + if len(replay_buffer) > batch_size: for i in range(update_itr): - sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) - + sac_trainer.update( + batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, + target_entropy=-1. 
* action_dim + ) + if frame_idx % 500 == 0: - plot(rewards, Algorithm_name = 'SAC', Env_name = ENV) - + plot(rewards, Algorithm_name='SAC', Env_name=ENV) + if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + print('Episode: ', frame_idx / max_steps, '| Episode Reward: ', episode_reward) rewards.append(episode_reward) sac_trainer.save_weights() if args.test: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] sac_trainer.load_weights() while frame_idx < test_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward - + _ = sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward for step in range(max_steps): - action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) - next_state, reward, done, _ = env.step(action) + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 - + done = 1 if done ==True else 0 + state = next_state episode_reward += reward frame_idx += 1 - + # if frame_idx % 50 == 0: # plot(frame_idx, rewards) - + if done: break - print('Episode: ', frame_idx/max_steps, '| Episode Reward: ', episode_reward) + print('Episode: ', frame_idx / max_steps, '| Episode Reward: ', episode_reward) rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/baselines/utils.py b/examples/reinforcement_learning/baselines/utils.py index 2a02ee1a1..89d8ffe5d 100644 --- a/examples/reinforcement_learning/baselines/utils.py +++ b/examples/reinforcement_learning/baselines/utils.py @@ -23,14 +23,14 @@ def plot(episode_rewards, Algorithm_name, Env_name): :Algorithm_name: string :Env_name: string ''' - plt.figure(figsize=(10,5)) - plt.title(Algorithm_name + '-' + Env_name ) + plt.figure(figsize=(10, 5)) + plt.title(Algorithm_name + '-' + Env_name) plt.plot(np.arange(len(episode_rewards)), episode_rewards) plt.xlabel('Episode') plt.ylabel('Episode Reward') if not os.path.exists('img'): os.makedirs('img') - plt.savefig( './img/' + Algorithm_name + '.png') + plt.savefig('./img/' + Algorithm_name + '.png') def save_model(model, Model_name, Algorithm_name): @@ -40,9 +40,10 @@ def save_model(model, Model_name, Algorithm_name): :Model_name: string, e.g. 'model_sac_q1' :Algorithm_name: string, e.g. 'SAC' ''' - if not os.path.exists('model/'+Algorithm_name): - os.makedirs('model/'+Algorithm_name) - tl.files.save_npz(model.trainable_weights, './model/' + Algorithm_name + '/'+Model_name) + if not os.path.exists('model/' + Algorithm_name): + os.makedirs('model/' + Algorithm_name) + tl.files.save_npz(model.trainable_weights, './model/' + Algorithm_name + '/' + Model_name) + def load_model(model, Model_name, Algorithm_name): ''' @@ -52,7 +53,7 @@ def load_model(model, Model_name, Algorithm_name): :Algorithm_name: string, e.g. 
'SAC' ''' try: - tl.files.load_and_assign_npz('./model/' + Algorithm_name + '/'+Model_name + '.npz', model) + tl.files.load_and_assign_npz('./model/' + Algorithm_name + '/' + Model_name + '.npz', model) except: print('Load Model Fails!') @@ -66,20 +67,21 @@ class ReplayBuffer: :next_state: (state_dim,) :done: (,), scalar (0 and 1) or bool (True and False) ''' + def __init__(self, capacity): self.capacity = capacity # mamimum number of samples self.buffer = [] - self.position = 0 # pointer - + self.position = 0 # pointer + def push(self, state, action, reward, next_state, done): if len(self.buffer) < self.capacity: self.buffer.append(None) self.buffer[self.position] = (state, action, reward, next_state, done) self.position = int((self.position + 1) % self.capacity) # as a ring buffer - + def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) - state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element ''' the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; @@ -87,6 +89,6 @@ def sample(self, batch_size): np.stack((1,2)) => array([1, 2]) ''' return state, action, reward, next_state, done - + def __len__(self): return len(self.buffer) diff --git a/examples/reinforcement_learning/baselines/wrappers.py b/examples/reinforcement_learning/baselines/wrappers.py index 8bb4b99fd..4ae724d3a 100644 --- a/examples/reinforcement_learning/baselines/wrappers.py +++ b/examples/reinforcement_learning/baselines/wrappers.py @@ -78,16 +78,14 @@ def _make_env(env_id, env_type, seed, reward_scale, frame_stack=True): def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): """Make vectorized env""" - env = SubprocVecEnv([ - partial(_make_env, env_id, env_type, seed + i, reward_scale, False) - for i in range(nenv) - ]) + env = SubprocVecEnv([partial(_make_env, env_id, env_type, seed + i, reward_scale, False) for i in range(nenv)]) if frame_stack: env = VecFrameStack(env, 4) return env class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): super(TimeLimit, self).__init__(env) self._max_episode_steps = max_episode_steps @@ -107,6 +105,7 @@ def reset(self, **kwargs): class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): """Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. @@ -137,6 +136,7 @@ def step(self, ac): class FireResetEnv(gym.Wrapper): + def __init__(self, env): """Take action on reset for environments that are fixed until firing.""" super(FireResetEnv, self).__init__(env) @@ -158,6 +158,7 @@ def step(self, ac): class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): """Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. 
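The body of EpisodicLifeEnv falls outside the context lines of this hunk. For readers following the wrapper changes, the usual logic (this file mirrors the OpenAI baselines Atari wrappers) is roughly the sketch below; the class name, the ale.lives() call, and the attribute names are the conventional ones rather than anything introduced by this patch:

import gym

class EpisodicLifeSketch(gym.Wrapper):

    def __init__(self, env):
        super(EpisodicLifeSketch, self).__init__(env)
        self.lives = 0
        self.was_real_done = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        lives = self.env.unwrapped.ale.lives()  # Atari-specific life counter
        if 0 < lives < self.lives:
            done = True                      # losing a life ends the training episode...
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        if self.was_real_done:
            obs = self.env.reset(**kwargs)   # ...but only a true game over resets the emulator
        else:
            obs, _, _, _ = self.env.step(0)  # no-op step to continue from the lost-life state
        self.lives = self.env.unwrapped.ale.lives()
        return obs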
@@ -195,6 +196,7 @@ def reset(self, **kwargs): class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): """Return only every `skip`-th frame""" super(MaxAndSkipEnv, self).__init__(env) @@ -226,6 +228,7 @@ def reset(self, **kwargs): class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): super(ClipRewardEnv, self).__init__(env) @@ -235,6 +238,7 @@ def reward(self, reward): class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, width=84, height=84, grayscale=True): """Warp frames to 84x84 as done in the Nature paper and later work.""" super(WarpFrame, self).__init__(env) @@ -242,9 +246,7 @@ def __init__(self, env, width=84, height=84, grayscale=True): self.height = height self.grayscale = grayscale shape = (self.height, self.width, 1 if self.grayscale else 3) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=np.uint8 - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) def observation(self, frame): if self.grayscale: @@ -257,6 +259,7 @@ def observation(self, frame): class FrameStack(gym.Wrapper): + def __init__(self, env, k): """Stack k last frames. Returns lazy array, which is much more memory efficient. @@ -267,9 +270,7 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape shape = shp[:-1] + (shp[-1] * k, ) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=env.observation_space.dtype - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) def reset(self): ob = self.env.reset() @@ -288,6 +289,7 @@ def _get_ob(self): class LazyFrames(object): + def __init__(self, frames): """This object ensures that common frames between the observations are only stored once. It exists purely to optimize memory usage which can be @@ -322,6 +324,7 @@ class RewardScaler(gym.RewardWrapper): """Bring rewards to a reasonable scale for PPO. This is incredibly important and effects performance drastically. 
""" + def __init__(self, env, scale=0.01): super(RewardScaler, self).__init__(env) self.scale = scale @@ -331,6 +334,7 @@ def reward(self, reward): class VecFrameStack(object): + def __init__(self, env, k): self.env = env self.k = k @@ -338,9 +342,7 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape shape = shp[:-1] + (shp[-1] * k, ) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=env.observation_space.dtype - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) def reset(self): ob = self.env.reset() @@ -387,6 +389,7 @@ class CloudpickleWrapper(object): """ Uses cloudpickle to serialize contents """ + def __init__(self, x): self.x = x @@ -400,6 +403,7 @@ def __setstate__(self, ob): class SubprocVecEnv(object): + def __init__(self, env_fns): """ envs: list of gym environments to run in subprocesses @@ -413,8 +417,7 @@ def __init__(self, env_fns): self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) zipped_args = zip(self.work_remotes, self.remotes, env_fns) self.ps = [ - Process(target=_worker, - args=(work_remote, remote, CloudpickleWrapper(env_fn))) + Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) for (work_remote, remote, env_fn) in zipped_args ] @@ -495,6 +498,7 @@ def step(self, actions): class Monitor(gym.Wrapper): + def __init__(self, env): super(Monitor, self).__init__(env) self._monitor_rewards = None @@ -507,29 +511,28 @@ def step(self, action): o_, r, done, info = self.env.step(action) self._monitor_rewards.append(r) if done: - info['episode'] = { - 'r': sum(self._monitor_rewards), - 'l': len(self._monitor_rewards)} + info['episode'] = {'r': sum(self._monitor_rewards), 'l': len(self._monitor_rewards)} return o_, r, done, info class NormalizedActions(gym.ActionWrapper): + def _action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = low + (action + 1.0) * 0.5 * (high - low) action = np.clip(action, low, high) - + return action def _reverse_action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = 2 * (action - low) / (high - low) - 1 action = np.clip(action, low, high) - + return action diff --git a/examples/reinforcement_learning/tutorial_A3C.py b/examples/reinforcement_learning/tutorial_A3C.py index 1e2354acf..f904e7c4b 100644 --- a/examples/reinforcement_learning/tutorial_A3C.py +++ b/examples/reinforcement_learning/tutorial_A3C.py @@ -76,26 +76,27 @@ LOG_DIR = './log' # the log file N_WORKERS = multiprocessing.cpu_count() # number of workers accroding to number of cores in cpu # N_WORKERS = 2 # manually set number of workers -MAX_GLOBAL_EP = 8 # number of training episodes +MAX_GLOBAL_EP = 8 # number of training episodes GLOBAL_NET_SCOPE = 'Global_Net' -UPDATE_GLOBAL_ITER = 10 # update global policy after several episodes -GAMMA = 0.99 # reward discount factor +UPDATE_GLOBAL_ITER = 10 # update global policy after several episodes +GAMMA = 0.99 # reward discount factor ENTROPY_BETA = 0.005 # factor for entropy boosted exploration LR_A = 0.00005 # learning rate for actor LR_C = 0.0001 # learning rate for critic GLOBAL_RUNNING_R = [] GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP - ################### Asynchronous Advantage Actor Critic (A3C) #################################### + class ACNet(object): - def __init__(self, scope, 
globalAC=None): + def __init__(self, scope, globalAC=None): self.scope = scope self.save_path = './model' w_init = tf.keras.initializers.glorot_normal(seed=None) # initializer, glorot=xavier + def get_actor(input_shape): # policy network with tf.name_scope(self.scope): ni = tl.layers.Input(input_shape, name='in') @@ -103,21 +104,26 @@ def get_actor(input_shape): # policy network nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2')(nn) mu = tl.layers.Dense(n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu')(nn) sigma = tl.layers.Dense(n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma')(nn) - return tl.models.Model(inputs=ni, outputs=[mu, sigma], name=scope+'/Actor') - self.actor = get_actor( [None, N_S]) + return tl.models.Model(inputs=ni, outputs=[mu, sigma], name=scope + '/Actor') + + self.actor = get_actor([None, N_S]) self.actor.train() # train mode for Dropout, BatchNorm - def get_critic(input_shape): # we use Value-function here, but not Q-function. + + def get_critic(input_shape): # we use Value-function here, but not Q-function. with tf.name_scope(self.scope): ni = tl.layers.Input(input_shape, name='in') nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc')(ni) nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='lc2')(nn) v = tl.layers.Dense(n_units=1, W_init=w_init, name='v')(nn) - return tl.models.Model(inputs=ni, outputs=v, name=scope+'/Critic') - self.critic = get_critic( [None, N_S]) - self.critic.train() # train mode for Dropout, BatchNorm - - @tf.function # convert numpy functions to tf.Operations in the TFgraph, return tensor - def update_global(self, buffer_s, buffer_a, buffer_v_target, globalAC): # refer to the global Actor-Crtic network for updating it with samples + return tl.models.Model(inputs=ni, outputs=v, name=scope + '/Critic') + + self.critic = get_critic([None, N_S]) + self.critic.train() # train mode for Dropout, BatchNorm + + @tf.function # convert numpy functions to tf.Operations in the TFgraph, return tensor + def update_global( + self, buffer_s, buffer_a, buffer_v_target, globalAC + ): # refer to the global Actor-Crtic network for updating it with samples ''' update the global critic ''' with tf.GradientTape() as tape: self.v = self.critic(buffer_s) @@ -127,7 +133,6 @@ def update_global(self, buffer_s, buffer_a, buffer_v_target, globalAC): # refer self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net # del tape # Drop the reference to the tape - ''' update the global actor ''' with tf.GradientTape() as tape: self.mu, self.sigma = self.actor(buffer_s) @@ -135,7 +140,7 @@ def update_global(self, buffer_s, buffer_a, buffer_v_target, globalAC): # refer self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 normal_dist = tfd.Normal(self.mu, self.sigma) # no tf.contrib for tf2.0 - self.a_his = buffer_a # float32 + self.a_his = buffer_a # float32 log_prob = normal_dist.log_prob(self.a_his) exp_v = log_prob * td # td is from the critic part, no gradients for it entropy = normal_dist.entropy() # encourage exploration @@ -144,7 +149,7 @@ def update_global(self, buffer_s, buffer_a, buffer_v_target, globalAC): # refer self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net return self.test # for test purpose - + @tf.function def 
pull_global(self, globalAC): # run by a local, pull weights from the global nets for l_p, g_p in zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): @@ -158,18 +163,19 @@ def choose_action(self, s): # run by a local with tf.name_scope('wrap_a_out'): self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 - normal_dist = tfd.Normal(self.mu, self.sigma) # for continuous action space + normal_dist = tfd.Normal(self.mu, self.sigma) # for continuous action space self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) return self.A.numpy()[0] - def save_ckpt(self): # save trained weights + def save_ckpt(self): # save trained weights tl.files.save_npz(self.actor.trainable_weights, name='model_actor.npz') tl.files.save_npz(self.critic.trainable_weights, name='model_critic.npz') - def load_ckpt(self): # load trained weights + def load_ckpt(self): # load trained weights tl.files.load_and_assign_npz(name='model_actor.npz', network=self.actor) tl.files.load_and_assign_npz(name='model_critic.npz', network=self.critic) + class Worker(object): def __init__(self, name, globalAC): @@ -189,11 +195,11 @@ def work(self, globalAC): # visualize Worker_0 during training if self.name == 'Worker_0' and total_step % 30 == 0: self.env.render() - s = s.astype('float32') # double to float - a = self.AC.choose_action(s) + s = s.astype('float32') # double to float + a = self.AC.choose_action(s) s_, r, done, _info = self.env.step(a) - - s_ = s_.astype('float32') # double to float + + s_ = s_.astype('float32') # double to float # set robot falls reward to -2 instead of -100 if r == -100: r = -2 @@ -207,7 +213,7 @@ def work(self, globalAC): if done: v_s_ = 0 # terminal else: - v_s_ = self.AC.critic(s_[np.newaxis, :])[0,0] # reduce dim from 2 to 0 + v_s_ = self.AC.critic(s_[np.newaxis, :])[0, 0] # reduce dim from 2 to 0 buffer_v_target = [] @@ -216,7 +222,7 @@ def work(self, globalAC): buffer_v_target.append(v_s_) buffer_v_target.reverse() - + buffer_s, buffer_a, buffer_v_target = ( np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) ) @@ -265,7 +271,7 @@ def work(self, globalAC): # ============================= TRAINING =============================== t0 = time.time() with tf.device("/cpu:0"): - + OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') @@ -283,7 +289,7 @@ def work(self, globalAC): for worker in workers: # t = threading.Thread(target=worker.work) job = lambda: worker.work(GLOBAL_AC) - t = threading.Thread(target=job) + t = threading.Thread(target=job) t.start() worker_threads.append(t) COORD.join(worker_threads) @@ -306,7 +312,7 @@ def work(self, globalAC): rall = 0 while True: env.render() - s = s.astype('float32') # double to float + s = s.astype('float32') # double to float a = GLOBAL_AC.choose_action(s) s, r, d, _ = env.step(a) rall += r diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py index 02b6b5a76..0bee2735d 100644 --- a/examples/reinforcement_learning/tutorial_AC.py +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -76,7 +76,6 @@ LR_A = 0.001 # learning rate for actor LR_C = 0.01 # learning rate for critic - ############################### Actor-Critic #################################### @@ -86,10 +85,15 @@ def __init__(self, n_features, n_actions, lr=0.001): def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, name='state') - nn = tl.layers.Dense(n_units=30, act=tf.nn.relu6, 
W_init=tf.random_uniform_initializer(0, 0.01), name='hidden')(ni) - nn = tl.layers.Dense(n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2')(nn) + nn = tl.layers.Dense( + n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' + )(ni) + nn = tl.layers.Dense( + n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' + )(nn) nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn) return tl.models.Model(inputs=ni, outputs=nn, name="Actor") + self.model = get_model([None, n_features]) self.model.train() self.optimizer = tf.optimizers.Adam(lr) @@ -97,10 +101,10 @@ def get_model(inputs_shape): def learn(self, s, a, td): with tf.GradientTape() as tape: _logits = self.model(np.array([s])) - ## cross-entropy loss weighted by td-error (advantage), + ## cross-entropy loss weighted by td-error (advantage), # the cross-entropy mearsures the difference of two probability distributions: the predicted logits and sampled action distribution, - # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. - _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) + # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. + _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) grad = tape.gradient(_exp_v, self.model.trainable_weights) self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) return _exp_v @@ -108,17 +112,17 @@ def learn(self, s, a, td): def choose_action(self, s): _logits = self.model(np.array([s])) _probs = tf.nn.softmax(_logits).numpy() - return tl.rein.choice_action_by_probs(_probs.ravel()) # sample according to probability distribution + return tl.rein.choice_action_by_probs(_probs.ravel()) # sample according to probability distribution def choose_action_greedy(self, s): _logits = self.model(np.array([s])) # logits: probability distribution of actions _probs = tf.nn.softmax(_logits).numpy() return np.argmax(_probs.ravel()) - def save_ckpt(self): # save trained weights + def save_ckpt(self): # save trained weights tl.files.save_npz(self.model.trainable_weights, name='model_actor.npz') - def load_ckpt(self): # load trained weights + def load_ckpt(self): # load trained weights tl.files.load_and_assign_npz(name='model_actor.npz', network=self.model) @@ -128,10 +132,15 @@ def __init__(self, n_features, lr=0.01): def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape, name='state') - nn = tl.layers.Dense(n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden')(ni) - nn = tl.layers.Dense(n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2')(nn) + nn = tl.layers.Dense( + n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' + )(ni) + nn = tl.layers.Dense( + n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' + )(nn) nn = tl.layers.Dense(n_units=1, act=None, name='value')(nn) return tl.models.Model(inputs=ni, outputs=nn, name="Critic") + self.model = get_model([1, n_features]) self.model.train() @@ -148,14 +157,15 @@ def learn(self, s, r, s_): self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) return td_error - def save_ckpt(self): # save trained weights + + def save_ckpt(self): # save trained weights 
tl.files.save_npz(self.model.trainable_weights, name='model_critic.npz') - def load_ckpt(self): # load trained weights + def load_ckpt(self): # load trained weights tl.files.load_and_assign_npz(name='model_critic.npz', network=self.model) -if __name__ == '__main__': +if __name__ == '__main__': ''' choose environment 1. Openai gym: @@ -190,7 +200,7 @@ def load_ckpt(self): # load trained weights t = 0 # number of step in this episode all_r = [] # rewards of all steps while True: - + if RENDER: env.render() a = actor.choose_action(s) @@ -208,17 +218,19 @@ def load_ckpt(self): # load trained weights all_r.append(r) - td_error = critic.learn(s, r, s_new) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] + td_error = critic.learn( + s, r, s_new + ) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] try: actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] - except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() actor.save_ckpt() critic.save_ckpt() # logging s = s_new t += 1 - + if done or t >= MAX_EP_STEPS: ep_rs_sum = sum(all_r) @@ -233,7 +245,6 @@ def load_ckpt(self): # load trained weights print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) - # Early Stopping for quick check if t >= MAX_EP_STEPS: print("Early Stopping") @@ -281,7 +292,7 @@ def load_ckpt(self): # load trained weights all_r.append(r) s = s_new t += 1 - + if done or t >= MAX_EP_STEPS: ep_rs_sum = sum(all_r) diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py index 34006541a..0ff50aa55 100644 --- a/examples/reinforcement_learning/tutorial_C51.py +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -42,12 +42,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') -parser.add_argument('--save_path', default='c51', - help='folder to save if mode == train else model path,' - 'qnet will be saved once target net update') +parser.add_argument( + '--save_path', default='c51', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) parser.add_argument('--seed', help='random seed', type=int, default=0) -parser.add_argument('--env_id', default='CartPole-v0', - help='CartPole-v0 or PongNoFrameskip-v4') +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() if args.mode == 'train': @@ -95,13 +95,13 @@ # ############################## C51 #################################### class MLP(tl.models.Model): + def __init__(self, name): super(MLP, self).__init__(name=name) - self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0], - W_init=tf.initializers.GlorotUniform()) - self.qvalue = tl.layers.Dense(out_dim * atom_num, - in_channels=64, name='q', - W_init=tf.initializers.GlorotUniform()) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0], W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense( + out_dim * atom_num, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform() + ) self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) def forward(self, ni): @@ -110,26 +110,30 @@ def forward(self, ni): class CNN(tl.models.Model): + def 
__init__(self, name): super(CNN, self).__init__(name=name) h, w, in_channels = in_dim dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) - self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', - in_channels=in_channels, name='conv2d_1', - W_init=tf.initializers.GlorotUniform()) - self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', - in_channels=32, name='conv2d_2', - W_init=tf.initializers.GlorotUniform()) - self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', - in_channels=64, name='conv2d_3', - W_init=tf.initializers.GlorotUniform()) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) self.flatten = tl.layers.Flatten(name='flatten') - self.preq = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_q', - W_init=tf.initializers.GlorotUniform()) - self.qvalue = tl.layers.Dense(out_dim * atom_num, - in_channels=256, name='q', - W_init=tf.initializers.GlorotUniform()) + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense( + out_dim * atom_num, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform() + ) self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) def forward(self, ni): @@ -139,6 +143,7 @@ def forward(self, ni): class ReplayBuffer(object): + def __init__(self, size): self._storage = [] self._maxsize = size @@ -230,8 +235,8 @@ def sync(net, net_tar): b_dist_ = np.exp(targetqnet(b_o_).numpy()) b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) b_tzj = np.clip( - reward_gamma * (1 - b_d[:, None]) * vrange[None, :] - + b_r[:, None], min_value, max_value) + reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + b_r[:, None], min_value, max_value + ) b_i = (b_tzj - min_value) / deltaz b_l = np.floor(b_i).astype('int64') b_u = np.ceil(b_i).astype('int64') @@ -266,9 +271,10 @@ def sync(net, net_tar): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) t = time.time() else: qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') @@ -296,6 +302,7 @@ def sync(net, net_tar): if info.get('episode'): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py index 32d2d6a4c..43efe0ad1 100644 --- a/examples/reinforcement_learning/tutorial_DDPG.py +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -60,7 +60,6 @@ 
TEST_PER_EPISODES = 10 # test the model per episodes VAR = 3 # control exploration - ############################### DDPG #################################### @@ -161,8 +160,8 @@ def learn(self): indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) bt = self.memory[indices, :] bs = bt[:, :self.s_dim] - ba = bt[:, self.s_dim: self.s_dim + self.a_dim] - br = bt[:, -self.s_dim - 1: -self.s_dim] + ba = bt[:, self.s_dim:self.s_dim + self.a_dim] + br = bt[:, -self.s_dim - 1:-self.s_dim] bs_ = bt[:, -self.s_dim:] with tf.GradientTape() as tape: @@ -177,7 +176,7 @@ def learn(self): with tf.GradientTape() as tape: a = self.actor(bs) q = self.critic([bs, a]) - a_loss = - tf.reduce_mean(q) # maximize the q + a_loss = -tf.reduce_mean(q) # maximize the q a_grads = tape.gradient(a_loss, self.actor.trainable_weights) self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) @@ -261,8 +260,12 @@ def load_ckpt(self): s = s_ ep_reward += r if j == MAX_EP_STEPS - 1: - print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' - .format(i, MAX_EPISODES, ep_reward, time.time() - t1), end='') + print( + '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i, MAX_EPISODES, ep_reward, + time.time() - t1 + ), end='' + ) plt.show() # test if i and not i % TEST_PER_EPISODES: @@ -277,8 +280,12 @@ def load_ckpt(self): s = s_ ep_reward += r if j == MAX_EP_STEPS - 1: - print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' - .format(i, MAX_EPISODES, ep_reward, time.time() - t1)) + print( + '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i, MAX_EPISODES, ep_reward, + time.time() - t1 + ) + ) reward_buffer.append(ep_reward) diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py index 0b84ff2a6..62eb7f7fb 100644 --- a/examples/reinforcement_learning/tutorial_DPPO.py +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -63,17 +63,18 @@ C_UPDATE_STEPS = 10 # critic update steps S_DIM, A_DIM = 3, 1 # state dimension, action dimension EPS = 1e-8 # epsilon -METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty - dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better - ][1] # choose the method for optimization +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization N_WORKER = 4 # parallel workers MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO UPDATE_STEP = 10 # loop update operation n-steps - ############################### DPPO #################################### + class PPO(object): ''' PPO class @@ -119,9 +120,10 @@ def a_train(self, tfs, tfa, tfadv): kl_mean = tf.reduce_mean(kl) aloss = -(tf.reduce_mean(surr - tflam * kl)) else: # clipping method, find this is better - aloss = -tf.reduce_mean(tf.minimum( - surr, - tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)) + aloss = -tf.reduce_mean( + tf.minimum(surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. 
+ METHOD['epsilon']) * tfadv) + ) a_gard = tape.gradient(aloss, self.actor.trainable_weights) tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) @@ -282,7 +284,7 @@ class Worker(object): def __init__(self, wid): self.wid = wid self.env = gym.make(GAME).unwrapped - self.env.seed(wid*100 + RANDOMSEED) + self.env.seed(wid * 100 + RANDOMSEED) self.ppo = GLOBAL_PPO def work(self): @@ -335,8 +337,12 @@ def work(self): GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1) GLOBAL_EP += 1 - print('Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}' - .format(GLOBAL_EP, EP_MAX, self.wid, ep_r, time.time() - t0)) + print( + 'Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + GLOBAL_EP, EP_MAX, self.wid, ep_r, + time.time() - t0 + ) + ) if __name__ == '__main__': diff --git a/examples/reinforcement_learning/tutorial_DQN.py b/examples/reinforcement_learning/tutorial_DQN.py index 2f34f5619..c7d6a10cd 100644 --- a/examples/reinforcement_learning/tutorial_DQN.py +++ b/examples/reinforcement_learning/tutorial_DQN.py @@ -67,7 +67,6 @@ render = False # display the game environment running_reward = None - ##################### DQN ########################## @@ -77,7 +76,6 @@ def to_one_hot(i, n_classes=None): return a - ## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. # encoding for state: 4x4 grid can be represented by one-hot vector with 16 integers. def get_model(inputs_shape): @@ -86,12 +84,14 @@ def get_model(inputs_shape): return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") -def save_ckpt(model): # save trained weights +def save_ckpt(model): # save trained weights tl.files.save_npz(model.trainable_weights, name='dqn_model.npz') -def load_ckpt(model): # load trained weights + +def load_ckpt(model): # load trained weights tl.files.load_and_assign_npz(name='dqn_model.npz', network=model) + if __name__ == '__main__': qnetwork = get_model([None, 16]) @@ -150,7 +150,7 @@ def load_ckpt(model): # load trained weights # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) - save_ckpt(qnetwork) # save model + save_ckpt(qnetwork) # save model if args.test: t0 = time.time() diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py index e7fb48876..f4bf7954e 100644 --- a/examples/reinforcement_learning/tutorial_DQN_variants.py +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -58,12 +58,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') -parser.add_argument('--save_path', default='dqn_variants', - help='folder to save if mode == train else model path,' - 'qnet will be saved once target net update') +parser.add_argument( + '--save_path', default='dqn_variants', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) parser.add_argument('--seed', help='random seed', type=int, default=0) -parser.add_argument('--env_id', default='CartPole-v0', - help='CartPole-v0 or PongNoFrameskip-v4') +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() if args.mode == 'train': @@ -107,13 +107,12 @@ # 
############################## DQN #################################### class MLP(tl.models.Model): + def __init__(self, name): super(MLP, self).__init__(name=name) self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) - self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', - W_init=tf.initializers.GlorotUniform()) - self.svalue = tl.layers.Dense(1, in_channels=64, name='s', - W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=64, name='s', W_init=tf.initializers.GlorotUniform()) self.noise_scale = 0 def forward(self, ni): @@ -144,30 +143,32 @@ def forward(self, ni): class CNN(tl.models.Model): + def __init__(self, name): super(CNN, self).__init__(name=name) h, w, in_channels = in_dim dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) - self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', - in_channels=in_channels, name='conv2d_1', - W_init=tf.initializers.GlorotUniform()) - self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', - in_channels=32, name='conv2d_2', - W_init=tf.initializers.GlorotUniform()) - self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', - in_channels=64, name='conv2d_3', - W_init=tf.initializers.GlorotUniform()) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) self.flatten = tl.layers.Flatten(name='flatten') - self.preq = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_q', - W_init=tf.initializers.GlorotUniform()) - self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', - W_init=tf.initializers.GlorotUniform()) - self.pres = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_s', - W_init=tf.initializers.GlorotUniform()) - self.svalue = tl.layers.Dense(1, in_channels=256, name='state', - W_init=tf.initializers.GlorotUniform()) + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) + self.pres = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_s', W_init=tf.initializers.GlorotUniform() + ) + self.svalue = tl.layers.Dense(1, in_channels=256, name='state', W_init=tf.initializers.GlorotUniform()) self.noise_scale = 0 def forward(self, ni): @@ -197,6 +198,7 @@ def forward(self, ni): class ReplayBuffer(object): + def __init__(self, size): self._storage = [] self._maxsize = size @@ -320,8 +322,7 @@ def softmax(x, dim): # calculate loss with tf.GradientTape() as q_tape: b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) - loss = tf.reduce_mean( - huber_loss(b_q - (b_r + reward_gamma * b_q_))) + loss = tf.reduce_mean(huber_loss(b_q - (b_r + reward_gamma * b_q_))) # backward gradients q_grad = q_tape.gradient(loss, trainabel_weights) @@ -337,9 +338,10 @@ def softmax(x, dim): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] fps = 
int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) t = time.time() else: qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') @@ -365,6 +367,7 @@ def softmax(x, dim): if info.get('episode'): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py index 77ee0d4f5..42c47aacc 100644 --- a/examples/reinforcement_learning/tutorial_PG.py +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -43,23 +43,23 @@ parser.add_argument('--test', dest='train', action='store_false') args = parser.parse_args() - ##################### hyper parameters #################### ENV_NAME = 'CartPole-v0' # environment name RANDOMSEED = 1 # random seed -DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold -RENDER = False # rendering wastes time +DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time num_episodes = 3000 - ############################### PG #################################### + class PolicyGradient: """ PG class """ + def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95): self.n_actions = n_actions self.n_features = n_features @@ -76,16 +76,22 @@ def get_model(inputs_shape): """ with tf.name_scope('inputs'): self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations") - self.tf_acts = tl.layers.Input([None, ], tf.int32, name="actions_num") - self.tf_vt = tl.layers.Input([None, ], tf.float32, name="actions_value") + self.tf_acts = tl.layers.Input([ + None, + ], tf.int32, name="actions_num") + self.tf_vt = tl.layers.Input([ + None, + ], tf.float32, name="actions_value") # fc1 - layer = tl.layers.Dense(n_units=30, act=tf.nn.tanh, - W_init=tf.random_normal_initializer(mean=0, stddev=0.3), - b_init=tf.constant_initializer(0.1), name='fc1')(self.tf_obs) + layer = tl.layers.Dense( + n_units=30, act=tf.nn.tanh, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), + b_init=tf.constant_initializer(0.1), name='fc1' + )(self.tf_obs) # fc2 - all_act = tl.layers.Dense(n_units=self.n_actions, act=None, - W_init=tf.random_normal_initializer(mean=0, stddev=0.3), - b_init=tf.constant_initializer(0.1), name='all_act')(layer) + all_act = tl.layers.Dense( + n_units=self.n_actions, act=None, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), + b_init=tf.constant_initializer(0.1), name='all_act' + )(layer) return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model') self.model = get_model([None, n_features]) @@ -192,7 +198,7 @@ def load_ckpt(self): tl.logging.set_verbosity(tl.logging.DEBUG) env = gym.make(ENV_NAME) - env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance + env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance env = env.unwrapped print(env.action_space) 
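Note: the PG model above only produces action logits; the update step combines them with the sampled actions (`tf_acts`) and per-step weights (`tf_vt`). A rough plain-TF2 sketch of such a REINFORCE-style step follows; the discounted, normalised return computation is an assumption suggested by the `reward_decay` argument, not code taken from the tutorial's `learn()`:

# Hedged sketch: a generic policy-gradient update in plain TF2.
# The return computation and normalisation are assumptions for illustration.
import numpy as np
import tensorflow as tf

def discounted_returns(rewards, gamma=0.95):
    """Discounted reward-to-go, then normalised (assumed convention)."""
    returns = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    returns -= returns.mean()
    returns /= (returns.std() + 1e-8)
    return returns

def pg_update(model, optimizer, obs, acts, rewards, gamma=0.95):
    """One REINFORCE-style step: cross-entropy weighted by returns."""
    vt = discounted_returns(np.asarray(rewards, dtype=np.float32), gamma)
    with tf.GradientTape() as tape:
        logits = model(np.asarray(obs, dtype=np.float32))
        neg_logp = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=np.asarray(acts, dtype=np.int32), logits=logits
        )
        loss = tf.reduce_mean(neg_logp * vt)  # larger return -> push action prob up
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss

Roughly, it would be called once per finished episode, e.g. pg_update(RL.model, tf.optimizers.Adam(0.01), ep_obs, ep_acts, ep_rewards), where the episode buffers are the assumed per-episode lists of observations, actions and rewards.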
@@ -239,8 +245,10 @@ def load_ckpt(self): # print("episode:", i_episode, " reward:", int(running_reward)) - print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % - (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)) + print( + "Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % + (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time) + ) reward_buffer.append(running_reward) vt = RL.learn() diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py index 76ddfe566..d95633234 100644 --- a/examples/reinforcement_learning/tutorial_PPO.py +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -60,13 +60,14 @@ C_UPDATE_STEPS = 10 # critic update steps S_DIM, A_DIM = 3, 1 # state dimension, action dimension EPS = 1e-8 # epsilon -METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty - dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better - ][1] # choose the method for optimization - +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization ############################### PPO #################################### + class PPO(object): ''' PPO class @@ -112,9 +113,10 @@ def a_train(self, tfs, tfa, tfadv): kl_mean = tf.reduce_mean(kl) aloss = -(tf.reduce_mean(surr - tflam * kl)) else: # clipping method, find this is better - aloss = -tf.reduce_mean(tf.minimum( - surr, - tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)) + aloss = -tf.reduce_mean( + tf.minimum(surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. 
+ METHOD['epsilon']) * tfadv) + ) a_gard = tape.gradient(aloss, self.actor.trainable_weights) tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) @@ -175,14 +177,16 @@ def update(self, s, a, r): if METHOD['name'] == 'kl_pen': for _ in range(A_UPDATE_STEPS): kl = self.a_train(s, a, adv) - if kl > 4 * METHOD['kl_target']: # this in in google's paper + if kl > 4 * METHOD['kl_target']: # this in in google's paper break - if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper + if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper METHOD['lam'] /= 2 elif kl > METHOD['kl_target'] * 1.5: METHOD['lam'] *= 2 - METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution - else: # clipping method, find this is better (OpenAI's paper) + METHOD['lam'] = np.clip( + METHOD['lam'], 1e-4, 10 + ) # sometimes explode, this clipping is MorvanZhou's solution + else: # clipping method, find this is better (OpenAI's paper) for _ in range(A_UPDATE_STEPS): self.a_train(s, a, adv) @@ -297,8 +301,12 @@ def load_ckpt(self): all_ep_r.append(ep_r) else: all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' - .format(ep, EP_MAX, ep_r, time.time() - t0)) + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + ep, EP_MAX, ep_r, + time.time() - t0 + ) + ) plt.ion() plt.cla() diff --git a/examples/reinforcement_learning/tutorial_Retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py index 9fdfc615e..e1e03cf1d 100644 --- a/examples/reinforcement_learning/tutorial_Retrace.py +++ b/examples/reinforcement_learning/tutorial_Retrace.py @@ -43,12 +43,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') -parser.add_argument('--save_path', default='retrace', - help='folder to save if mode == train else model path,' - 'qnet will be saved once target net update') +parser.add_argument( + '--save_path', default='retrace', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) parser.add_argument('--seed', help='random seed', type=int, default=0) -parser.add_argument('--env_id', default='CartPole-v0', - help='CartPole-v0 or PongNoFrameskip-v4') +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() if args.mode == 'train': @@ -86,11 +86,11 @@ # ############################## Retrace #################################### class MLP(tl.models.Model): + def __init__(self, name): super(MLP, self).__init__(name=name) self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) - self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', - W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) def forward(self, ni): feature = self.h1(ni) @@ -99,25 +99,28 @@ def forward(self, ni): class CNN(tl.models.Model): + def __init__(self, name): super(CNN, self).__init__(name=name) h, w, in_channels = in_dim dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) - self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', - in_channels=in_channels, name='conv2d_1', - W_init=tf.initializers.GlorotUniform()) - self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', - in_channels=32, name='conv2d_2', - 
W_init=tf.initializers.GlorotUniform()) - self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', - in_channels=64, name='conv2d_3', - W_init=tf.initializers.GlorotUniform()) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) self.flatten = tl.layers.Flatten(name='flatten') - self.preq = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_q', - W_init=tf.initializers.GlorotUniform()) - self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', - W_init=tf.initializers.GlorotUniform()) + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) def forward(self, ni): feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) @@ -126,6 +129,7 @@ def forward(self, ni): class ReplayBuffer(object): + def __init__(self, size): self._storage = [] self._maxsize = size @@ -152,11 +156,8 @@ def _encode_sample(self, idxes): b_d.append(d) b_pi.append(pi) return ( - np.stack(b_o).astype('float32') * ob_scale, - np.stack(b_a).astype('int32'), - np.stack(b_r).astype('float32'), - np.stack(b_o_).astype('float32') * ob_scale, - np.stack(b_d).astype('float32'), + np.stack(b_o).astype('float32') * ob_scale, np.stack(b_a).astype('int32'), np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, np.stack(b_d).astype('float32'), np.stack(b_pi).astype('float32') ) @@ -243,9 +244,10 @@ def sync(net, net_tar): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) t = time.time() else: qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') @@ -271,6 +273,7 @@ def sync(net, net_tar): if info.get('episode'): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py index c7b93df5e..24831e85f 100644 --- a/examples/reinforcement_learning/tutorial_SAC.py +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -68,28 +68,29 @@ ##################### hyper parameters #################### # choose env ENV = 'Pendulum-v0' -action_range=1. # scale action, [-action_range, action_range] +action_range = 1. 
# scale action, [-action_range, action_range] # RL training -max_frames = 40000 # total number of steps for training -test_frames = 300 # total number of steps for testing -max_steps = 150 # maximum number of steps for one episode -batch_size = 64 # udpate batchsize -explore_steps = 100 # 500 for random action sampling in the beginning of training -update_itr = 3 # repeated updates for single step -hidden_dim = 32 # size of hidden layers for networks -soft_q_lr = 3e-4 # q_net learning rate -policy_lr = 3e-4 # policy_net learning rate -alpha_lr = 3e-4 # alpha learning rate -policy_target_update_interval = 3 # delayed update for the policy network and target networks -reward_scale = 1. # value range of reward +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +reward_scale = 1. # value range of reward replay_buffer_size = 5e5 -AUTO_ENTROPY=True # automatically udpating variable alpha for entropy -DETERMINISTIC=False # stochastic action policy if False, otherwise deterministic +AUTO_ENTROPY = True # automatically udpating variable alpha for entropy +DETERMINISTIC = False # stochastic action policy if False, otherwise deterministic ############################### SAC #################################### + class ReplayBuffer: ''' a ring buffer for storing transitions and sampling for training @@ -99,20 +100,21 @@ class ReplayBuffer: :next_state: (state_dim,) :done: (,), scalar (0 and 1) or bool (True and False) ''' + def __init__(self, capacity): self.capacity = capacity self.buffer = [] self.position = 0 - + def push(self, state, action, reward, next_state, done): if len(self.buffer) < self.capacity: self.buffer.append(None) self.buffer[self.position] = (state, action, reward, next_state, done) self.position = int((self.position + 1) % self.capacity) # as a ring buffer - + def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) - state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element ''' the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; @@ -120,61 +122,69 @@ def sample(self, batch_size): np.stack((1,2)) => array([1, 2]) ''' return state, action, reward, next_state, done - + def __len__(self): return len(self.buffer) + class NormalizedActions(gym.ActionWrapper): ''' normalize the actions to be in reasonable range ''' + def _action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = low + (action + 1.0) * 0.5 * (high - low) action = np.clip(action, low, high) - + return action def _reverse_action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = 2 * (action - low) / (high - low) - 1 action = np.clip(action, low, high) - + return action - + class SoftQNetwork(Model): ''' the network for 
evaluate values of state-action pairs: Q(s,a) ''' + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): super(SoftQNetwork, self).__init__() input_dim = num_inputs + num_actions - w_init = tf.keras.initializers.glorot_normal(seed=None) # glorot initialization is better than uniform in practice + w_init = tf.keras.initializers.glorot_normal( + seed=None + ) # glorot initialization is better than uniform in practice # w_init = tf.random_uniform_initializer(-init_w, init_w) self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') - + def forward(self, input): x = self.linear1(input) x = self.linear2(x) x = self.linear3(x) return x - - + + class PolicyNetwork(Model): ''' the network for generating non-determinstic (Gaussian distributed) action from the state input ''' - def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2): + + def __init__( + self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2 + ): super(PolicyNetwork, self).__init__() - + self.log_std_min = log_std_min self.log_std_max = log_std_max - + w_init = tf.keras.initializers.glorot_normal(seed=None) # w_init = tf.random_uniform_initializer(-init_w, init_w) - + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') @@ -187,60 +197,61 @@ def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w= self.action_range = action_range self.num_actions = num_actions - def forward(self, state): x = self.linear1(state) x = self.linear2(x) x = self.linear3(x) - mean = self.mean_linear(x) + mean = self.mean_linear(x) log_std = self.log_std_linear(x) log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) - + return mean, log_std - + def evaluate(self, state, epsilon=1e-6): ''' generate action with state for calculating gradients ''' state = state.astype(np.float32) mean, log_std = self.forward(state) - std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow - + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + normal = Normal(0, 1) - z = normal.sample() - action_0 = tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick - action = self.action_range*action_0 + z = normal.sample() + action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * action_0 # according to original paper, with an extra last term for normalizing different action range - log_prob = Normal(mean, std).log_prob(mean+ std*z) - tf.math.log(1. - action_0**2 + epsilon) - np.log(self.action_range) - # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); - # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. 
- action_0**2 + + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. - log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced return action, log_prob, z, mean, log_std - - + def get_action(self, state, deterministic): ''' generate action with state for interaction with envronment ''' mean, log_std = self.forward([state]) std = tf.math.exp(log_std) - + normal = Normal(0, 1) - z = normal.sample() - action = self.action_range * tf.math.tanh(mean + std*z) # TanhNormal distribution as actions; reparameterization trick - - action = self.action_range*mean if deterministic else action - return action.numpy()[0] + z = normal.sample() + action = self.action_range * tf.math.tanh( + mean + std * z + ) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * mean if deterministic else action + return action.numpy()[0] - def sample_action(self,): + def sample_action(self, ): ''' generate random actions for exploration ''' - a = tf.random.uniform([self.num_actions], -1, 1) + a = tf.random.uniform([self.num_actions], -1, 1) - return self.action_range*a.numpy() + return self.action_range * a.numpy() class SAC_Trainer(): - def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, policy_lr = 3e-4, alpha_lr = 3e-4): + + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr=3e-4, policy_lr=3e-4, alpha_lr=3e-4): self.replay_buffer = replay_buffer # initialize all networks @@ -262,7 +273,7 @@ def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr = 3e-4, po self.soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) self.policy_optimizer = tf.optimizers.Adam(policy_lr) self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) - + def target_ini(self, net, target_net): ''' hard-copy update for initializing target networks ''' for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): @@ -275,52 +286,53 @@ def target_soft_update(self, net, target_net, soft_tau): target_param.assign( # copy weight value into target parameters target_param * (1.0 - soft_tau) + param * soft_tau ) - return target_net - - def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99,soft_tau=1e-2): + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): ''' update all networks in SAC ''' state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) reward = reward[:, np.newaxis] # expand dim done = done[:, np.newaxis] - reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std - + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std - # Training Q Function + # Training Q Function new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples - target_q_min = tf.minimum(self.target_soft_q_net1(target_q_input),self.target_soft_q_net2(target_q_input)) - self.alpha * 
next_log_prob - target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + target_q_min = tf.minimum( + self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input) + ) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward q_input = tf.concat([state, action], 1) # the dim 0 is number of samples - + with tf.GradientTape() as q1_tape: predicted_q_value1 = self.soft_q_net1(q_input) q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) - + with tf.GradientTape() as q2_tape: predicted_q_value2 = self.soft_q_net2(q_input) q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) - # Training Policy Function + # Training Policy Function with tf.GradientTape() as p_tape: new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples ''' implementation 1 ''' - predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input),self.soft_q_net2(new_q_input)) + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input)) # ''' implementation 2 ''' # predicted_new_q_value = self.soft_q_net1(new_q_input) policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) - - # Updating alpha w.r.t entropy - # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) if auto_entropy is True: with tf.GradientTape() as alpha_tape: alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) @@ -332,26 +344,27 @@ def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy alpha_loss = 0 # Soft update the target value nets - self.target_soft_q_net1=self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) - self.target_soft_q_net2=self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) - def save_weights(self): # save trained weights + def save_weights(self): # save trained weights tl.files.save_npz(self.soft_q_net1.trainable_weights, name='model_q_net1.npz') tl.files.save_npz(self.soft_q_net2.trainable_weights, name='model_q_net2.npz') tl.files.save_npz(self.target_soft_q_net1.trainable_weights, name='model_target_q_net1.npz') tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') - def load_weights(self): # load trained weights + def load_weights(self): # load trained weights tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) 
tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_soft_q_net2) tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + def plot(frame_idx, rewards): clear_output(True) - plt.figure(figsize=(20,5)) + plt.figure(figsize=(20, 5)) plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) plt.plot(rewards) plt.xlabel('Episode') @@ -359,11 +372,12 @@ def plot(frame_idx, rewards): plt.savefig('sac.png') # plt.show() + if __name__ == '__main__': # initialization of env env = NormalizedActions(gym.make(ENV)) action_dim = env.action_space.shape[0] - state_dim = env.observation_space.shape[0] + state_dim = env.observation_space.shape[0] # initialization of buffer replay_buffer = ReplayBuffer(replay_buffer_size) # initialization of trainer @@ -378,82 +392,98 @@ def plot(frame_idx, rewards): # training loop if args.train: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] t0 = time.time() while frame_idx < max_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward + _ = sac_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward for step in range(max_steps): if frame_idx > explore_steps: - action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) else: action = sac_trainer.policy_net.sample_action() - next_state, reward, done, _ = env.step(action) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 + done = 1 if done ==True else 0 # print('s:', state, action, reward, next_state, done) replay_buffer.push(state, action, reward, next_state, done) - + state = next_state episode_reward += reward frame_idx += 1 - + if len(replay_buffer) > batch_size: for i in range(update_itr): - sac_trainer.update(batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, target_entropy=-1.*action_dim) - + sac_trainer.update( + batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, + target_entropy=-1. 
* action_dim + ) + if frame_idx % 500 == 0: plot(frame_idx, rewards) - + if done: break - episode = int(frame_idx/max_steps) # current episode - all_episodes = int(max_frames/max_steps) # total episodes - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + episode = int(frame_idx / max_steps) # current episode + all_episodes = int(max_frames / max_steps) # total episodes + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + episode, all_episodes, episode_reward, + time.time() - t0 + ) + ) rewards.append(episode_reward) sac_trainer.save_weights() if args.test: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] t0 = time.time() sac_trainer.load_weights() while frame_idx < test_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward - + _ = sac_trainer.policy_net( + [state] + ) # need an extra call to make inside functions be able to use forward for step in range(max_steps): - action = sac_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC) - next_state, reward, done, _ = env.step(action) + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 - + done = 1 if done ==True else 0 + state = next_state episode_reward += reward frame_idx += 1 - + # if frame_idx % 50 == 0: # plot(frame_idx, rewards) - + if done: break - episode = int(frame_idx/max_steps) - all_episodes = int(test_frames/max_steps) - print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + episode = int(frame_idx / max_steps) + all_episodes = int(test_frames / max_steps) + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + episode, all_episodes, episode_reward, + time.time() - t0 + ) + ) rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py index cf7252397..e90e5b8fb 100644 --- a/examples/reinforcement_learning/tutorial_TD3.py +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -74,27 +74,27 @@ ##################### hyper parameters #################### # choose env ENV = 'Pendulum-v0' -action_range=1. # scale action, [-action_range, action_range] +action_range = 1. # scale action, [-action_range, action_range] # RL training -max_frames = 40000 # total number of steps for training -test_frames = 300 # total number of steps for testing -max_steps = 150 # maximum number of steps for one episode -batch_size = 64 # udpate batchsize -explore_steps = 500 # 500 for random action sampling in the beginning of training -update_itr = 3 # repeated updates for single step -hidden_dim = 32 # size of hidden layers for networks -q_lr = 3e-4 # q_net learning rate -policy_lr = 3e-4 # policy_net learning rate -policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks -explore_noise_scale = 1.0 # range of action noise for exploration -eval_noise_scale = 0.5 # range of action noise for evaluation of action value -reward_scale = 1. 
# value range of reward -replay_buffer_size = 5e5 # size of replay buffer - +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 500 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks +explore_noise_scale = 1.0 # range of action noise for exploration +eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. # value range of reward +replay_buffer_size = 5e5 # size of replay buffer ############################### TD3 #################################### + class ReplayBuffer: ''' a ring buffer for storing transitions and sampling for training @@ -104,20 +104,21 @@ class ReplayBuffer: :next_state: (state_dim,) :done: (,), scalar (0 and 1) or bool (True and False) ''' + def __init__(self, capacity): self.capacity = capacity self.buffer = [] self.position = 0 - + def push(self, state, action, reward, next_state, done): if len(self.buffer) < self.capacity: self.buffer.append(None) self.buffer[self.position] = (state, action, reward, next_state, done) self.position = int((self.position + 1) % self.capacity) # as a ring buffer - + def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) - state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element ''' the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; @@ -125,32 +126,36 @@ def sample(self, batch_size): np.stack((1,2)) => array([1, 2]) ''' return state, action, reward, next_state, done - + def __len__(self): return len(self.buffer) + class NormalizedActions(gym.ActionWrapper): ''' normalize the actions to be in reasonable range ''' + def _action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = low + (action + 1.0) * 0.5 * (high - low) action = np.clip(action, low, high) - + return action def _reverse_action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = 2 * (action - low) / (high - low) - 1 action = np.clip(action, low, high) - + return action - + + class QNetwork(Model): ''' the network for evaluate values of state-action pairs: Q(s,a) ''' + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): super(QNetwork, self).__init__() input_dim = num_inputs + num_actions @@ -160,21 +165,23 @@ def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') - + def forward(self, input): x = self.linear1(input) x = self.linear2(x) x = self.linear3(x) return x - + + class PolicyNetwork(Model): ''' the network for generating non-determinstic (Gaussian distributed) action from the state input ''' + 
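# A quick numeric check of the NormalizedActions rescaling defined above, assuming a
# one-dimensional Box action space with bounds [-2, 2]; the bounds are illustrative.
import numpy as np

low, high = np.array([-2.0]), np.array([2.0])

def to_env(a):    # [-1, 1] -> [low, high], mirrors _action()
    return np.clip(low + (a + 1.0) * 0.5 * (high - low), low, high)

def to_agent(a):  # [low, high] -> [-1, 1], mirrors _reverse_action()
    return np.clip(2.0 * (a - low) / (high - low) - 1.0, low, high)

print(to_env(np.array([-1.0, 0.0, 1.0])))  # -> [-2.  0.  2.]
print(to_agent(to_env(np.array([0.5]))))   # round trip recovers 0.5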
def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3): super(PolicyNetwork, self).__init__() # w_init = tf.keras.initializers.glorot_normal(seed=None) w_init = tf.random_uniform_initializer(-init_w, init_w) - + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') @@ -185,36 +192,34 @@ def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w= self.action_range = action_range self.num_actions = num_actions - def forward(self, state): x = self.linear1(state) x = self.linear2(x) x = self.linear3(x) - output = tf.nn.tanh(self.output_linear(x)) # unit range output [-1, 1] - + output = tf.nn.tanh(self.output_linear(x)) # unit range output [-1, 1] + return output - + def evaluate(self, state, eval_noise_scale): ''' generate action with state for calculating gradients; eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions. ''' state = state.astype(np.float32) - action = self.forward(state) - - action = self.action_range*action + action = self.forward(state) + + action = self.action_range * action # add noise normal = Normal(0, 1) - eval_noise_clip = 2*eval_noise_scale + eval_noise_clip = 2 * eval_noise_scale noise = normal.sample(action.shape) * eval_noise_scale noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip) action = action + noise return action - - + def get_action(self, state, explore_noise_scale): ''' generate action with state for interaction with envronment ''' action = self.forward([state]) @@ -223,19 +228,22 @@ def get_action(self, state, explore_noise_scale): # add noise normal = Normal(0, 1) noise = normal.sample(action.shape) * explore_noise_scale - action = self.action_range*action + noise + action = self.action_range * action + noise return action.numpy() - def sample_action(self,): + def sample_action(self, ): ''' generate random actions for exploration ''' - a = tf.random.uniform([self.num_actions], -1, 1) + a = tf.random.uniform([self.num_actions], -1, 1) - return self.action_range*a.numpy() + return self.action_range * a.numpy() class TD3_Trainer(): - def __init__(self, replay_buffer, hidden_dim, action_range, policy_target_update_interval=1, q_lr=3e-4, policy_lr=3e-4): + + def __init__( + self, replay_buffer, hidden_dim, action_range, policy_target_update_interval=1, q_lr=3e-4, policy_lr=3e-4 + ): self.replay_buffer = replay_buffer # initialize all networks @@ -252,44 +260,47 @@ def __init__(self, replay_buffer, hidden_dim, action_range, policy_target_update self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1) self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2) self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net) - + self.update_cnt = 0 self.policy_target_update_interval = policy_target_update_interval self.q_optimizer1 = tf.optimizers.Adam(q_lr) self.q_optimizer2 = tf.optimizers.Adam(q_lr) self.policy_optimizer = tf.optimizers.Adam(policy_lr) - + def target_ini(self, net, target_net): ''' hard-copy update for initializing target networks ''' for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): target_param.assign(param) return target_net - + def target_soft_update(self, net, target_net, soft_tau): ''' 
soft update the target net with Polyak averaging ''' for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): target_param.assign( # copy weight value into target parameters target_param * (1.0 - soft_tau) + param * soft_tau ) - return target_net + return target_net def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2): ''' update all networks in TD3 ''' - self.update_cnt+=1 + self.update_cnt += 1 state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) reward = reward[:, np.newaxis] # expand dim done = done[:, np.newaxis] - new_next_action = self.target_policy_net.evaluate(next_state, eval_noise_scale=eval_noise_scale) # clipped normal noise - reward = reward_scale * (reward - np.mean(reward, axis=0)) /np.std(reward, axis=0) # normalize with batch mean and std - - # Training Q Function - target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples - target_q_min = tf.minimum(self.target_q_net1(target_q_input),self.target_q_net2(target_q_input)) + new_next_action = self.target_policy_net.evaluate( + next_state, eval_noise_scale=eval_noise_scale + ) # clipped normal noise + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std + + # Training Q Function + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input)) - target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward q_input = tf.concat([state, action], 1) # input of q_net with tf.GradientTape() as q1_tape: @@ -303,26 +314,28 @@ def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value)) q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights) self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights)) - - # Training Policy Function - if self.update_cnt%self.policy_target_update_interval==0: + + # Training Policy Function + if self.update_cnt % self.policy_target_update_interval == 0: with tf.GradientTape() as p_tape: - new_action = self.policy_net.evaluate(state, eval_noise_scale=0.0) # no noise, deterministic policy gradients + new_action = self.policy_net.evaluate( + state, eval_noise_scale=0.0 + ) # no noise, deterministic policy gradients new_q_input = tf.concat([state, new_action], 1) # ''' implementation 1 ''' # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input)) ''' implementation 2 ''' predicted_new_q_value = self.q_net1(new_q_input) - policy_loss = - tf.reduce_mean(predicted_new_q_value) + policy_loss = -tf.reduce_mean(predicted_new_q_value) p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) - # Soft update the target nets - self.target_q_net1=self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) - self.target_q_net2=self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) - self.target_policy_net=self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) + # Soft update the target nets + self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) + 
self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) + self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) - def save_weights(self): # save trained weights + def save_weights(self): # save trained weights tl.files.save_npz(self.q_net1.trainable_weights, name='model_q_net1.npz') tl.files.save_npz(self.q_net2.trainable_weights, name='model_q_net2.npz') tl.files.save_npz(self.target_q_net1.trainable_weights, name='model_target_q_net1.npz') @@ -330,7 +343,7 @@ def save_weights(self): # save trained weights tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') tl.files.save_npz(self.target_policy_net.trainable_weights, name='model_target_policy_net.npz') - def load_weights(self): # load trained weights + def load_weights(self): # load trained weights tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.q_net1) tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.q_net2) tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_q_net1) @@ -339,10 +352,9 @@ def load_weights(self): # load trained weights tl.files.load_and_assign_npz(name='model_target_policy_net.npz', network=self.target_policy_net) - def plot(frame_idx, rewards): clear_output(True) - plt.figure(figsize=(20,5)) + plt.figure(figsize=(20, 5)) plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) plt.plot(rewards) plt.xlabel('Episode') @@ -350,12 +362,13 @@ def plot(frame_idx, rewards): plt.savefig('td3.png') # plt.show() + if __name__ == '__main__': # initialization of env env = NormalizedActions(gym.make(ENV)) action_dim = env.action_space.shape[0] - state_dim = env.observation_space.shape[0] + state_dim = env.observation_space.shape[0] # initialization of buffer replay_buffer = ReplayBuffer(replay_buffer_size) # initialization of trainer @@ -371,18 +384,19 @@ def plot(frame_idx, rewards): # training loop if args.train: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] t0 = time.time() while frame_idx < max_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=td3_trainer.policy_net([state]) # need an extra call here to make inside functions be able to use model.forward - _=td3_trainer.target_policy_net([state]) - + _ = td3_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward + _ = td3_trainer.target_policy_net([state]) for step in range(max_steps): if frame_idx > explore_steps: @@ -390,68 +404,69 @@ def plot(frame_idx, rewards): else: action = td3_trainer.policy_net.sample_action() - next_state, reward, done, _ = env.step(action) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 + done = 1 if done ==True else 0 replay_buffer.push(state, action, reward, next_state, done) - + state = next_state episode_reward += reward frame_idx += 1 - + if len(replay_buffer) > batch_size: for i in range(update_itr): td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.) 
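# A minimal sketch of the target-policy smoothing used by PolicyNetwork.evaluate()
# above: Gaussian noise, clipped to 2 * eval_noise_scale, is added to the target
# action. tf.random.normal stands in for the Normal distribution object used in the
# tutorial; the toy action values are illustrative.
import tensorflow as tf

def smooth_target_action(action, eval_noise_scale):
    noise = tf.random.normal(tf.shape(action)) * eval_noise_scale
    noise = tf.clip_by_value(noise, -2 * eval_noise_scale, 2 * eval_noise_scale)  # clip the noise, not the action
    return action + noise

a = tf.constant([[0.3, -0.7]])
print(smooth_target_action(a, eval_noise_scale=0.5))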
- + if frame_idx % 500 == 0: plot(frame_idx, rewards) - + if done: break - episode = int(frame_idx/max_steps) # current episode - all_episodes = int(max_frames/max_steps) # total episodes + episode = int(frame_idx / max_steps) # current episode + all_episodes = int(max_frames / max_steps) # total episodes print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 )) rewards.append(episode_reward) td3_trainer.save_weights() if args.test: - frame_idx = 0 - rewards = [] + frame_idx = 0 + rewards = [] t0 = time.time() td3_trainer.load_weights() while frame_idx < test_frames: - state = env.reset() + state = env.reset() state = state.astype(np.float32) episode_reward = 0 - if frame_idx <1 : + if frame_idx < 1: print('intialize') - _=td3_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward - _=td3_trainer.target_policy_net([state]) - + _ = td3_trainer.policy_net( + [state] + ) # need an extra call to make inside functions be able to use forward + _ = td3_trainer.target_policy_net([state]) for step in range(max_steps): action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) - next_state, reward, done, _ = env.step(action) + next_state, reward, done, _ = env.step(action) next_state = next_state.astype(np.float32) env.render() - done = 1 if done == True else 0 - + done = 1 if done ==True else 0 + state = next_state episode_reward += reward frame_idx += 1 - + # if frame_idx % 50 == 0: # plot(frame_idx, rewards) - + if done: break - episode = int(frame_idx/max_steps) - all_episodes = int(test_frames/max_steps) + episode = int(frame_idx / max_steps) + all_episodes = int(test_frames / max_steps) print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py index 47b3c874a..f64a0a0c0 100644 --- a/examples/reinforcement_learning/tutorial_TRPO.py +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -77,9 +77,9 @@ SAVE_FREQ = 10 # How often (in terms of gap between epochs) to save the current policy and value function EPS = 1e-8 # epsilon - ##################### functions #################### + def combined_shape(length, shape=None): """ combine length and shape based on shape type @@ -137,7 +137,7 @@ def input_layer_from_space(space): if isinstance(space, Box): return input_layer(space.shape) elif isinstance(space, Discrete): - return tl.layers.Input(dtype=tf.int32, shape=(None,)) + return tl.layers.Input(dtype=tf.int32, shape=(None, )) raise NotImplementedError @@ -150,7 +150,7 @@ def input_layers_from_spaces(*args): return [input_layer_from_space(space) for space in args] -def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): +def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): """ create Multi-Layer Perception :param x: tensorlayer input layer @@ -191,7 +191,7 @@ def gaussian_likelihood(x, mu, log_std): :param log_std: log std :return: gaussian likelihood """ - pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) + pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi)) return tf.reduce_sum(pre_sum, axis=1) @@ -202,7 +202,7 @@ def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): 
(https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) """ var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) - pre_sum = 0.5 * (((mu1 - mu0) ** 2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0 + pre_sum = 0.5 * (((mu1 - mu0)**2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0 all_kls = tf.reduce_sum(pre_sum, axis=1) return tf.reduce_mean(all_kls) @@ -222,7 +222,7 @@ def flat_concat(xs): :param xs: a list of tensor :return: flat tensor """ - return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0) + return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0) def assign_params_from_flat(x, params): @@ -334,8 +334,10 @@ def cal_outputs_1(self, states, actions, old_log_std_ph, old_mu_ph): """ -def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh, - output_activation=None): +def mlp_actor_critic( + x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh, + output_activation=None +): """ create actor and critic :param x: observation space @@ -354,6 +356,7 @@ def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_s raise ValueError('action space type error') class Critic: + def __init__(self, obs_space, hidden_layer_sizes, activation_funcs): inputs = input_layer_from_space(obs_space) self.model = tl.models.Model(inputs, mlp(inputs, list(hidden_layer_sizes) + [1], activation_funcs, None)) @@ -443,12 +446,11 @@ def get(self): # the next two lines implement the advantage normalization trick adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf) self.adv_buf = (self.adv_buf - adv_mean) / adv_std - return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, - self.logp_buf] + values_as_sorted_list(self.info_bufs) + return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf + ] + values_as_sorted_list(self.info_bufs) ##################### TRPO #################### - """ Trust Region Policy Optimization @@ -462,6 +464,7 @@ class TRPO: """ trpo class """ + def __init__(self, obs_space, act_space): obs_dim = obs_space.shape @@ -497,7 +500,7 @@ def get_action_ops(self, states): res0 = [pi, v, logp_pi] + values_as_sorted_list(info) res = [] for i in res0: - res.append(i + 0) # transfer to tensor + res.append(i + 0) # transfer to tensor return res # TRPO losses @@ -522,7 +525,7 @@ def v_loss(self, inputs): """ x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs v = self.critic.critic_cal_func(x_ph) - v_loss = tf.reduce_mean((ret_ph - v) ** 2) + v_loss = tf.reduce_mean((ret_ph - v)**2) return v_loss def train_vf(self, inputs): @@ -653,7 +656,7 @@ def set_and_eval(step): # trpo augments npg with backtracking line search, hard kl for j in range(BACKTRACK_ITERS): - kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF ** j) + kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF**j) if kl <= DELTA and pi_l_new <= pi_l_old: # Accepting new params at step of line search break diff --git a/examples/reinforcement_learning/tutorial_atari_pong.py b/examples/reinforcement_learning/tutorial_atari_pong.py index 7e1b28822..0ffee9174 100644 --- a/examples/reinforcement_learning/tutorial_atari_pong.py +++ b/examples/reinforcement_learning/tutorial_atari_pong.py @@ -68,6 +68,8 @@ def prepro(I): episode_number = 0 xs, ys, rs = [], [], [] + + # policy network def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape) @@ -75,12 +77,14 @@ def get_model(inputs_shape): nn = tl.layers.Dense(n_units=3, name='output')(nn) 
M = tl.models.Model(inputs=ni, outputs=nn, name="mlp") return M + + model = get_model([None, D]) train_weights = model.trainable_weights optimizer = tf.optimizers.RMSprop(lr=learning_rate, decay=decay_rate) -model.train() # set model to train mode (in case you add dropout into the model) +model.train() # set model to train mode (in case you add dropout into the model) start_time = time.time() game_number = 0 @@ -97,8 +101,8 @@ def get_model(inputs_shape): prob = tf.nn.softmax(_prob) # action. 1: STOP 2: UP 3: DOWN - # action = np.random.choice([1,2,3], p=prob.flatten()) - # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3]) + # action = np.random.choice([1,2,3], p=prob.flatten()) + # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3]) action = tl.rein.choice_action_by_probs(prob[0].numpy(), [1, 2, 3]) observation, reward, done, _ = env.step(action) diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py index a569ab252..f3e9a7e50 100644 --- a/examples/reinforcement_learning/tutorial_format.py +++ b/examples/reinforcement_learning/tutorial_format.py @@ -1,6 +1,5 @@ # the format of turorial algorithm # # please heavily annotate the code # - ''' Algorithm Name ------------------------ @@ -33,11 +32,9 @@ import numpy as np -import package import tensorflow as tf -import 'other -import name' +# import 'other package name' np.random.seed(2) tf.random.set_seed(2) # reproducible @@ -49,56 +46,53 @@ args = parser.parse_args() ##################### hyper parameters #################### -A=a # description of hyper parameter -B=b # description of hyper parameter +A = a # description of hyper parameter +B = b # description of hyper parameter ############################### Algorithm Name #################################### -class C(): # algorithm-specific classes + +class C(): # algorithm-specific classes ''' description of class ''' + def C1(): ''' description of function''' + def D(): # some common functions, could be extracted into utils afterwards ''' description of function ''' if __name__ == '__main__': - '''initialization of env, buffer, networks in algorithms''' - env=... - buffer=... - network1=... - network2=... 
- + env = 'env model' + buffer = 'buffer model' + network1 = 'network model1' + network2 = 'network model2' # training loop if args.train: t0 = time.time() - while: # loop of episodes - while: # loop of steps in episode + while NOT_FINISHED: # loop of episodes + while NOT_DONE: # loop of steps in episode ''' step ''' - ''' train ''' print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ .format(episode, all_episodes, episode_reward, time.time()-t0 )) - ''' plot , following the format of ./baselines/utils/plot()''' - plot(rewards, Algorithm_name = 'SAC', Env_name = 'Pendulum-v0') - + plot(rewards, Algorithm_name='SAC', Env_name='Pendulum-v0') ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/save_model() ''' model.save_weights() - # testing loop if args.test: t0 = time.time() ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/load_model() ''' model.load_weights() - while: # loop of episodes - while: # loop of steps in episode + while NOT_FINISHED: # loop of episodes + while NOT_DONE: # loop of steps in episode ''' step ''' print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ diff --git a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py index 34f3b85de..8f5f60404 100644 --- a/examples/reinforcement_learning/tutorial_prioritized_replay.py +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -45,12 +45,12 @@ parser = argparse.ArgumentParser() parser.add_argument('--mode', help='train or test', default='train') -parser.add_argument('--save_path', default='per', - help='folder to save if mode == train else model path,' - 'qnet will be saved once target net update') +parser.add_argument( + '--save_path', default='per', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) parser.add_argument('--seed', help='random seed', type=int, default=0) -parser.add_argument('--env_id', default='CartPole-v0', - help='CartPole-v0 or PongNoFrameskip-v4') +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') args = parser.parse_args() if args.mode == 'train': @@ -95,36 +95,39 @@ # ############################## PER #################################### class MLP(tl.models.Model): + def __init__(self, name): super(MLP, self).__init__(name=name) self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) - self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', - W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) def forward(self, ni): return self.qvalue(self.h1(ni)) class CNN(tl.models.Model): + def __init__(self, name): super(CNN, self).__init__(name=name) h, w, in_channels = in_dim dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) - self.conv1 = tl.layers.Conv2d(32, (8, 8), (4, 4), tf.nn.relu, 'VALID', - in_channels=in_channels, name='conv2d_1', - W_init=tf.initializers.GlorotUniform()) - self.conv2 = tl.layers.Conv2d(64, (4, 4), (2, 2), tf.nn.relu, 'VALID', - in_channels=32, name='conv2d_2', - W_init=tf.initializers.GlorotUniform()) - self.conv3 = tl.layers.Conv2d(64, (3, 3), (1, 1), tf.nn.relu, 'VALID', - in_channels=64, name='conv2d_3', - W_init=tf.initializers.GlorotUniform()) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', 
in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) self.flatten = tl.layers.Flatten(name='flatten') - self.preq = tl.layers.Dense(256, tf.nn.relu, - in_channels=dense_in_channels, name='pre_q', - W_init=tf.initializers.GlorotUniform()) - self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', - W_init=tf.initializers.GlorotUniform()) + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) def forward(self, ni): feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) @@ -132,6 +135,7 @@ def forward(self, ni): class SegmentTree(object): + def __init__(self, capacity, operation, neutral_element): """Build a Segment Tree data structure. @@ -172,14 +176,11 @@ def _reduce_helper(self, start, end, node, node_start, node_end): return self._reduce_helper(start, end, 2 * node, node_start, mid) else: if mid + 1 <= start: - return self._reduce_helper(start, end, - 2 * node + 1, mid + 1, node_end) + return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) else: return self._operation( - self._reduce_helper(start, mid, - 2 * node, node_start, mid), - self._reduce_helper(mid + 1, end, - 2 * node + 1, mid + 1, node_end) + self._reduce_helper(start, mid, 2 * node, node_start, mid), + self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) ) def reduce(self, start=0, end=None): @@ -211,10 +212,7 @@ def __setitem__(self, idx, val): self._value[idx] = val idx //= 2 while idx >= 1: - self._value[idx] = self._operation( - self._value[2 * idx], - self._value[2 * idx + 1] - ) + self._value[idx] = self._operation(self._value[2 * idx], self._value[2 * idx + 1]) idx //= 2 def __getitem__(self, idx): @@ -223,12 +221,9 @@ def __getitem__(self, idx): class SumSegmentTree(SegmentTree): + def __init__(self, capacity): - super(SumSegmentTree, self).__init__( - capacity=capacity, - operation=operator.add, - neutral_element=0.0 - ) + super(SumSegmentTree, self).__init__(capacity=capacity, operation=operator.add, neutral_element=0.0) def sum(self, start=0, end=None): """Returns arr[start] + ... + arr[end]""" @@ -264,12 +259,9 @@ def find_prefixsum_idx(self, prefixsum): class MinSegmentTree(SegmentTree): + def __init__(self, capacity): - super(MinSegmentTree, self).__init__( - capacity=capacity, - operation=min, - neutral_element=float('inf') - ) + super(MinSegmentTree, self).__init__(capacity=capacity, operation=min, neutral_element=float('inf')) def min(self, start=0, end=None): """Returns min(arr[start], ..., arr[end])""" @@ -278,6 +270,7 @@ def min(self, start=0, end=None): class ReplayBuffer(object): + def __init__(self, size): self._storage = [] self._maxsize = size @@ -317,6 +310,7 @@ def sample(self, batch_size): class PrioritizedReplayBuffer(ReplayBuffer): + def __init__(self, size, alpha, beta): """Create Prioritized Replay buffer. 
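# A small numpy sketch of the proportional prioritization and importance-sampling
# weights used by this buffer; alpha/beta values here are illustrative. The real
# implementation stores priority**alpha in the segment trees above so that sums and
# minima stay O(log n) instead of recomputing them over the whole buffer.
import numpy as np

priorities = np.array([1.0, 2.0, 0.5, 4.0])
alpha, beta = 0.6, 0.4

p = priorities**alpha
probs = p / p.sum()                              # P(i) proportional to priority_i ** alpha
idx = np.random.choice(len(p), size=2, p=probs)  # sample transitions proportionally

weights = (probs[idx] * len(p))**(-beta)         # importance-sampling correction
weights /= (probs.min() * len(p))**(-beta)       # normalize by the largest possible weight
print(idx, weights)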
@@ -350,8 +344,8 @@ def add(self, *args): """See ReplayBuffer.store_effect""" idx = self._next_idx super().add(*args) - self._it_sum[idx] = self._max_priority ** self._alpha - self._it_min[idx] = self._max_priority ** self._alpha + self._it_sum[idx] = self._max_priority**self._alpha + self._it_min[idx] = self._max_priority**self._alpha def _sample_proportional(self, batch_size): res = [] @@ -369,10 +363,10 @@ def sample(self, batch_size): it_sum = self._it_sum.sum() p_min = self._it_min.min() / it_sum - max_weight = (p_min * len(self._storage)) ** (-self.beta) + max_weight = (p_min * len(self._storage))**(-self.beta) p_samples = np.asarray([self._it_sum[idx] for idx in idxes]) / it_sum - weights = (p_samples * len(self._storage)) ** (-self.beta) / max_weight + weights = (p_samples * len(self._storage))**(-self.beta) / max_weight encoded_sample = self._encode_sample(idxes) return encoded_sample + (weights, idxes) @@ -382,8 +376,8 @@ def update_priorities(self, idxes, priorities): for idx, priority in zip(idxes, priorities): assert priority > 0 assert 0 <= idx < len(self._storage) - self._it_sum[idx] = priority ** self._alpha - self._it_min[idx] = priority ** self._alpha + self._it_sum[idx] = priority**self._alpha + self._it_min[idx] = priority**self._alpha self._max_priority = max(self._max_priority, priority) @@ -408,8 +402,7 @@ def sync(net, net_tar): targetqnet.infer() sync(qnet, targetqnet) optimizer = tf.optimizers.Adam(learning_rate=lr) - buffer = PrioritizedReplayBuffer( - buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) + buffer = PrioritizedReplayBuffer(buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) o = env.reset() nepisode = 0 @@ -466,9 +459,10 @@ def sync(net, net_tar): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] fps = int(length / (time.time() - t)) - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}, FPS: {}' - .format(i, nepisode, reward, length, fps)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) t = time.time() else: qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') @@ -494,6 +488,7 @@ def sync(net, net_tar): if info.get('episode'): nepisode += 1 reward, length = info['episode']['r'], info['episode']['l'] - print('Time steps so far: {}, episode so far: {}, ' - 'episode reward: {:.4f}, episode length: {}' - .format(i, nepisode, reward, length)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_wrappers.py b/examples/reinforcement_learning/tutorial_wrappers.py index d4843f273..a53e5102d 100644 --- a/examples/reinforcement_learning/tutorial_wrappers.py +++ b/examples/reinforcement_learning/tutorial_wrappers.py @@ -77,16 +77,14 @@ def _make_env(env_id, env_type, seed, reward_scale, frame_stack=True): def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): """Make vectorized env""" - env = SubprocVecEnv([ - partial(_make_env, env_id, env_type, seed + i, reward_scale, False) - for i in range(nenv) - ]) + env = SubprocVecEnv([partial(_make_env, env_id, env_type, seed + i, reward_scale, False) for i in range(nenv)]) if frame_stack: env = VecFrameStack(env, 4) return env class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): super(TimeLimit, self).__init__(env) 
self._max_episode_steps = max_episode_steps @@ -106,6 +104,7 @@ def reset(self, **kwargs): class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): """Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. @@ -136,6 +135,7 @@ def step(self, ac): class FireResetEnv(gym.Wrapper): + def __init__(self, env): """Take action on reset for environments that are fixed until firing.""" super(FireResetEnv, self).__init__(env) @@ -157,6 +157,7 @@ def step(self, ac): class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): """Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. @@ -194,6 +195,7 @@ def reset(self, **kwargs): class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): """Return only every `skip`-th frame""" super(MaxAndSkipEnv, self).__init__(env) @@ -225,6 +227,7 @@ def reset(self, **kwargs): class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): super(ClipRewardEnv, self).__init__(env) @@ -234,6 +237,7 @@ def reward(self, reward): class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, width=84, height=84, grayscale=True): """Warp frames to 84x84 as done in the Nature paper and later work.""" super(WarpFrame, self).__init__(env) @@ -241,9 +245,7 @@ def __init__(self, env, width=84, height=84, grayscale=True): self.height = height self.grayscale = grayscale shape = (self.height, self.width, 1 if self.grayscale else 3) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=np.uint8 - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) def observation(self, frame): if self.grayscale: @@ -256,6 +258,7 @@ def observation(self, frame): class FrameStack(gym.Wrapper): + def __init__(self, env, k): """Stack k last frames. Returns lazy array, which is much more memory efficient. @@ -266,9 +269,7 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape shape = shp[:-1] + (shp[-1] * k, ) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=env.observation_space.dtype - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) def reset(self): ob = self.env.reset() @@ -287,6 +288,7 @@ def _get_ob(self): class LazyFrames(object): + def __init__(self, frames): """This object ensures that common frames between the observations are only stored once. It exists purely to optimize memory usage which can be @@ -321,6 +323,7 @@ class RewardScaler(gym.RewardWrapper): """Bring rewards to a reasonable scale for PPO. This is incredibly important and effects performance drastically. 
""" + def __init__(self, env, scale=0.01): super(RewardScaler, self).__init__(env) self.scale = scale @@ -330,6 +333,7 @@ def reward(self, reward): class VecFrameStack(object): + def __init__(self, env, k): self.env = env self.k = k @@ -337,9 +341,7 @@ def __init__(self, env, k): self.frames = deque([], maxlen=k) shp = env.observation_space.shape shape = shp[:-1] + (shp[-1] * k, ) - self.observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=env.observation_space.dtype - ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) def reset(self): ob = self.env.reset() @@ -386,6 +388,7 @@ class CloudpickleWrapper(object): """ Uses cloudpickle to serialize contents """ + def __init__(self, x): self.x = x @@ -399,6 +402,7 @@ def __setstate__(self, ob): class SubprocVecEnv(object): + def __init__(self, env_fns): """ envs: list of gym environments to run in subprocesses @@ -412,8 +416,7 @@ def __init__(self, env_fns): self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) zipped_args = zip(self.work_remotes, self.remotes, env_fns) self.ps = [ - Process(target=_worker, - args=(work_remote, remote, CloudpickleWrapper(env_fn))) + Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) for (work_remote, remote, env_fn) in zipped_args ] @@ -494,6 +497,7 @@ def step(self, actions): class Monitor(gym.Wrapper): + def __init__(self, env): super(Monitor, self).__init__(env) self._monitor_rewards = None @@ -506,29 +510,28 @@ def step(self, action): o_, r, done, info = self.env.step(action) self._monitor_rewards.append(r) if done: - info['episode'] = { - 'r': sum(self._monitor_rewards), - 'l': len(self._monitor_rewards)} + info['episode'] = {'r': sum(self._monitor_rewards), 'l': len(self._monitor_rewards)} return o_, r, done, info class NormalizedActions(gym.ActionWrapper): + def _action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = low + (action + 1.0) * 0.5 * (high - low) action = np.clip(action, low, high) - + return action def _reverse_action(self, action): - low = self.action_space.low + low = self.action_space.low high = self.action_space.high - + action = 2 * (action - low) / (high - low) - 1 action = np.clip(action, low, high) - + return action diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py index 3170585e4..bc0bae141 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py @@ -55,6 +55,7 @@ def pad_distort_ims_fn(X): ##================== DEFINE MODEL ============================================## class Net(Model): + def __init__(self): super(Net, self).__init__() diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py index 5f09db68b..515e69967 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py @@ -13,6 +13,7 @@ X_train, y_train, X_val, y_val, X_test, y_test = \ tl.files.load_mnist_dataset(shape=(-1, 28, 28, 1)) + def pad_distort_im_fn(x): """ Zero pads an image to 40x40, and distort it. 
@@ -122,7 +123,7 @@ def get_model(inputs_shape): X_train_a = tf.expand_dims(X_train_a, 3) _logits, _ = net(X_train_a) # alternatively, you can use MLP(x, is_train=False) and remove MLP.eval() - train_loss += tl.cost.cross_entropy(_logits, y_train_a, name='eval_train_loss') + train_loss += tl.cost.cross_entropy(_logits, y_train_a, name='eval_train_loss') train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_train_a)) n_iter += 1 print(" train loss: %f" % (train_loss / n_iter)) diff --git a/examples/text_classification/tutorial_imdb_fasttext.py b/examples/text_classification/tutorial_imdb_fasttext.py index 731d2fce4..94de9a66f 100644 --- a/examples/text_classification/tutorial_imdb_fasttext.py +++ b/examples/text_classification/tutorial_imdb_fasttext.py @@ -82,6 +82,7 @@ def forward(self, x): z = self.dense2(z) return z + def augment_with_ngrams(unigrams, unigram_vocab_size, n_buckets, n=2): """Augment unigram features with hashed n-gram features.""" @@ -148,11 +149,12 @@ def train_test_and_save_model(): train_accuracy.append(accuracy) if len(train_accuracy) % N_STEPS_TO_PRINT == 0: - print("\t[%d/%d][%d]accuracy " % (epoch + 1, N_EPOCH, len(train_accuracy)), - np.mean(train_accuracy[-N_STEPS_TO_PRINT:])) + print( + "\t[%d/%d][%d]accuracy " % (epoch + 1, N_EPOCH, len(train_accuracy)), + np.mean(train_accuracy[-N_STEPS_TO_PRINT:]) + ) - print("\tSummary: time %.5fs, overall accuracy" % (time.time() - start_time), - np.mean(train_accuracy)) + print("\tSummary: time %.5fs, overall accuracy" % (time.time() - start_time), np.mean(train_accuracy)) # evaluation and testing model.eval() diff --git a/examples/text_word_embedding/tutorial_word2vec_basic.py b/examples/text_word_embedding/tutorial_word2vec_basic.py index 5a1dc842c..074bcb1fa 100644 --- a/examples/text_word_embedding/tutorial_word2vec_basic.py +++ b/examples/text_word_embedding/tutorial_word2vec_basic.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Vector Representations of Words. This is the minimalistic reimplementation of @@ -52,11 +51,9 @@ parser = argparse.ArgumentParser() -parser.add_argument("--model", - default='one', - type=str, - required=False, - help="The model name. It can be 'one', 'two', 'three', 'four'.") +parser.add_argument( + "--model", default='one', type=str, required=False, help="The model name. It can be 'one', 'two', 'three', 'four'." 
+) FLAGS = parser.parse_args() @@ -158,12 +155,14 @@ def main_word2vec_basic(): print() batch, labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0) + data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0 + ) for i in range(8): print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) batch, labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0) + data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0 + ) for i in range(8): print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) @@ -193,7 +192,7 @@ def main_word2vec_basic(): vocabulary_size=vocabulary_size, embedding_size=embedding_size, num_sampled=num_sampled, - activate_nce_loss=True, # nce loss is activated + activate_nce_loss=True, # nce loss is activated nce_loss_args={}, E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0), nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))), @@ -230,9 +229,8 @@ def main_word2vec_basic(): while step < num_steps: start_time = time.time() batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=batch_size, num_skips=num_skips, - skip_window=skip_window, data_index=data_index) - + data=data, batch_size=batch_size, num_skips=num_skips, skip_window=skip_window, data_index=data_index + ) # We perform one update step by evaluating the train_op (including it # in the list of returned values for sess.run() @@ -335,7 +333,6 @@ def predict(analogy): # Compute cosine distance between each pair of target and vocab. # dist has shape [N, vocab_size]. 
dist = tf.matmul(target, normalized_embeddings, transpose_b=True) - """Predict the top 4 answers for analogy questions.""" _, pred_idx = tf.nn.top_k(dist, n_answer) diff --git a/tensorlayer/files/dataset_loaders/celebA_dataset.py b/tensorlayer/files/dataset_loaders/celebA_dataset.py index 3563d58f9..d5dc5755f 100644 --- a/tensorlayer/files/dataset_loaders/celebA_dataset.py +++ b/tensorlayer/files/dataset_loaders/celebA_dataset.py @@ -5,8 +5,7 @@ import zipfile from tensorlayer import logging -from tensorlayer.files.utils import (download_file_from_google_drive, - exists_or_mkdir, load_file_list) +from tensorlayer.files.utils import (download_file_from_google_drive, exists_or_mkdir, load_file_list) __all__ = ['load_celebA_dataset'] diff --git a/tensorlayer/files/dataset_loaders/cyclegan_dataset.py b/tensorlayer/files/dataset_loaders/cyclegan_dataset.py index 6c465f6c5..e327b3b4c 100644 --- a/tensorlayer/files/dataset_loaders/cyclegan_dataset.py +++ b/tensorlayer/files/dataset_loaders/cyclegan_dataset.py @@ -6,8 +6,7 @@ import numpy as np from tensorlayer import logging, visualize -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, - maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, maybe_download_and_extract) __all__ = ['load_cyclegan_dataset'] diff --git a/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py b/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py index 9f466c0eb..f2e582ae5 100644 --- a/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py +++ b/tensorlayer/files/dataset_loaders/flickr_1M_dataset.py @@ -4,9 +4,9 @@ import os from tensorlayer import logging, visualize -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, - load_folder_list, - maybe_download_and_extract, read_file) +from tensorlayer.files.utils import ( + del_file, folder_exists, load_file_list, load_folder_list, maybe_download_and_extract, read_file +) __all__ = ['load_flickr1M_dataset'] diff --git a/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py b/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py index 0492371b0..8049a0653 100644 --- a/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py +++ b/tensorlayer/files/dataset_loaders/flickr_25k_dataset.py @@ -4,9 +4,9 @@ import os from tensorlayer import logging, visualize -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, - maybe_download_and_extract, natural_keys, - read_file) +from tensorlayer.files.utils import ( + del_file, folder_exists, load_file_list, maybe_download_and_extract, natural_keys, read_file +) __all__ = ['load_flickr25k_dataset'] diff --git a/tensorlayer/files/dataset_loaders/mpii_dataset.py b/tensorlayer/files/dataset_loaders/mpii_dataset.py index 8b90dcdec..a6f88f609 100644 --- a/tensorlayer/files/dataset_loaders/mpii_dataset.py +++ b/tensorlayer/files/dataset_loaders/mpii_dataset.py @@ -4,8 +4,7 @@ import os from tensorlayer import logging -from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, - maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, folder_exists, load_file_list, maybe_download_and_extract) __all__ = ['load_mpii_pose_dataset'] diff --git a/tensorlayer/files/dataset_loaders/voc_dataset.py b/tensorlayer/files/dataset_loaders/voc_dataset.py index c5ccadbcf..e5124b4df 100644 --- a/tensorlayer/files/dataset_loaders/voc_dataset.py +++ b/tensorlayer/files/dataset_loaders/voc_dataset.py @@ -5,9 +5,7 @@ import tensorflow as tf from 
tensorlayer import logging, utils -from tensorlayer.files.utils import (del_file, del_folder, folder_exists, - load_file_list, - maybe_download_and_extract) +from tensorlayer.files.utils import (del_file, del_folder, folder_exists, load_file_list, maybe_download_and_extract) __all__ = ['load_voc_dataset'] diff --git a/tensorlayer/layers/convolution/quan_conv.py b/tensorlayer/layers/convolution/quan_conv.py index b5398c642..55112993e 100644 --- a/tensorlayer/layers/convolution/quan_conv.py +++ b/tensorlayer/layers/convolution/quan_conv.py @@ -6,8 +6,7 @@ from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, - quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) __all__ = ['QuanConv2d'] diff --git a/tensorlayer/layers/convolution/quan_conv_bn.py b/tensorlayer/layers/convolution/quan_conv_bn.py index 1c1593373..bc2aec938 100644 --- a/tensorlayer/layers/convolution/quan_conv_bn.py +++ b/tensorlayer/layers/convolution/quan_conv_bn.py @@ -6,8 +6,7 @@ from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, - quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) # from tensorlayer.layers.core import LayersConfig diff --git a/tensorlayer/layers/dense/quan_dense.py b/tensorlayer/layers/dense/quan_dense.py index 2985f4dba..9eabf201f 100644 --- a/tensorlayer/layers/dense/quan_dense.py +++ b/tensorlayer/layers/dense/quan_dense.py @@ -6,8 +6,7 @@ from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, - quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) __all__ = [ 'QuanDense', diff --git a/tensorlayer/layers/dense/quan_dense_bn.py b/tensorlayer/layers/dense/quan_dense_bn.py index bcbd70950..9fef11c84 100644 --- a/tensorlayer/layers/dense/quan_dense_bn.py +++ b/tensorlayer/layers/dense/quan_dense_bn.py @@ -7,8 +7,7 @@ from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer -from tensorlayer.layers.utils import (quantize_active_overflow, - quantize_weight_overflow) +from tensorlayer.layers.utils import (quantize_active_overflow, quantize_weight_overflow) __all__ = [ 'QuanDenseLayerWithBN',