diff --git a/CHANGELOG.md b/CHANGELOG.md index e2c63e979..fa75b2d2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -90,11 +90,16 @@ To release a new version, please update the changelog as followed: ### Changed - Add version_info in model.config. (PR #992) - Replace tf.nn.func with tf.nn.func.\_\_name\_\_ in model config. +- Add Reinforcement learning tutorials. (PR #995) ### Fixed ### Contributors - @warshallrho: +- @quantumiracle: #995 +- @Tokarev-TT-33: #995 +- @initial-h: #995 +- @Officium: #995 ## [2.0.2] - 2019-6-5 @@ -124,6 +129,8 @@ A maintain release. - Layer - `InstanceNorm`, `InstanceNorm1d`, `InstanceNorm2d`, `InstanceNorm3d` (PR #963) +* Reinforcement learning tutorials. (PR #995) + ### Changed - remove `tl.layers.initialize_global_variables(sess)` (PR #931) - update `tutorial_generate_text.py`, `tutorial_ptb_lstm.py`. remove `tutorial_ptb_lstm_state_is_tuple.py` (PR #958) @@ -144,8 +151,15 @@ A maintain release. - @warshallrho: #PR966 - @zsdonghao: #931 - @yd-yin: #963 +<<<<<<< HEAD +- @Tokarev-TT-33: # 995 +- @initial-h: # 995 +- @quantumiracle: #995 +- @Officium: #995 +======= - @1FengL: #958 - @dvklopfenstein: #971 +>>>>>>> 560dbb8a17963023a3b1d59a79e1c2752530114a ## [2.0.0] - 2019-05-04 @@ -371,8 +385,10 @@ To many PR for this update, please check [here](https://github.com/tensorlayer/t - AtrousDeConv2dLayer added (PR #662) - Fix bugs of using `tf.layers` in CNN (PR #686) - Optimizer: + - AMSGrad Optimizer added based on `On the Convergence of Adam and Beyond (ICLR 2018)` (PR #636) - Setup: + - Creation of installation flaggs `all`, `all_cpu`, and `all_gpu` (PR #660) - Test: - `test_utils_predict.py` added to reproduce and fix issue #288 (PR #566) diff --git a/examples/README.md b/examples/README.md index 82fc62055..04c0aefa5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1 +1,11 @@ +
+ + +
+ +
+
+ +
+ # [Click Me](https://github.com/tensorlayer/awesome-tensorlayer) diff --git a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py index c12c791a1..ecb1117ce 100644 --- a/examples/basic_tutorials/tutorial_cifar10_cnn_static.py +++ b/examples/basic_tutorials/tutorial_cifar10_cnn_static.py @@ -5,11 +5,10 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl -from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, - LocalResponseNorm, MaxPool2d) +from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LocalResponseNorm, MaxPool2d) from tensorlayer.models import Model # enable debug logging @@ -74,7 +73,6 @@ def get_model_batchnorm(inputs_shape): # get the network net = get_model([None, 24, 24, 3]) - # training settings batch_size = 128 n_epoch = 50000 @@ -82,7 +80,7 @@ def get_model_batchnorm(inputs_shape): print_freq = 5 n_step_epoch = int(len(y_train) / batch_size) n_step = n_epoch * n_step_epoch -shuffle_buffer_size = 128 # 100 +shuffle_buffer_size = 128 # 100 # init_learning_rate = 0.1 # learning_rate_decay_factor = 0.1 # num_epoch_decay = 350 diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py index f680b11c1..f4ad787b7 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model @@ -13,18 +13,19 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1, 784)) + ## define the network class CustomModel(Model): def __init__(self): super(CustomModel, self).__init__() - self.dropout1 = Dropout(keep=0.8)#(self.innet) - self.dense1 = Dense(n_units=800, act=tf.nn.relu, in_channels=784)#(self.dropout1) - self.dropout2 = Dropout(keep=0.8)#(self.dense1) - self.dense2 = Dense(n_units=800, act=tf.nn.relu, in_channels=800)#(self.dropout2) - self.dropout3 = Dropout(keep=0.8)#(self.dense2) - self.dense3 = Dense(n_units=10, act=tf.nn.relu, in_channels=800)#(self.dropout3) + self.dropout1 = Dropout(keep=0.8) #(self.innet) + self.dense1 = Dense(n_units=800, act=tf.nn.relu, in_channels=784) #(self.dropout1) + self.dropout2 = Dropout(keep=0.8) #(self.dense1) + self.dense2 = Dense(n_units=800, act=tf.nn.relu, in_channels=800) #(self.dropout2) + self.dropout3 = Dropout(keep=0.8) #(self.dense2) + self.dense3 = Dense(n_units=10, act=tf.nn.relu, in_channels=800) #(self.dropout3) def forward(self, x, foo=None): z = self.dropout1(x) @@ -37,6 +38,7 @@ def forward(self, x, foo=None): out = tf.nn.relu(out) return out + MLP = CustomModel() ## start training diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py index b752012b0..e2d45943d 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_dynamic_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input, LayerList from tensorlayer.models import Model @@ -13,13 +13,14 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = 
tl.files.load_mnist_dataset(shape=(-1, 784)) + ## define the network class CustomModelHidden(Model): def __init__(self): super(CustomModelHidden, self).__init__() - self.dropout1 = Dropout(keep=0.8)#(self.innet) + self.dropout1 = Dropout(keep=0.8) #(self.innet) self.seq = LayerList( [ @@ -29,7 +30,7 @@ def __init__(self): ] ) - self.dropout3 = Dropout(keep=0.8)#(self.seq) + self.dropout3 = Dropout(keep=0.8) #(self.seq) def forward(self, x): z = self.dropout1(x) @@ -37,6 +38,7 @@ def forward(self, x): z = self.dropout3(z) return z + class CustomModelOut(Model): def __init__(self): diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static.py b/examples/basic_tutorials/tutorial_mnist_mlp_static.py index d0738c3b1..08b72bd64 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model @@ -21,14 +21,18 @@ def get_model(inputs_shape): ni = Input(inputs_shape) nn = Dropout(keep=0.8)(ni) - nn = Dense(n_units=800, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=800, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer nn = Dropout(keep=0.8)(nn) - nn = Dense(n_units=800, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=800, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer nn = Dropout(keep=0.8)(nn) - nn = Dense(n_units=10, act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer + nn = Dense(n_units=10, + act=tf.nn.relu)(nn) # in_channels is optional in this case as it can be inferred by the previous layer M = Model(inputs=ni, outputs=nn, name="mlp") return M + MLP = get_model([None, 784]) import pprint pprint.pprint(MLP.config) diff --git a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py index f0836c528..67a519e4a 100644 --- a/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py +++ b/examples/basic_tutorials/tutorial_mnist_mlp_static_2.py @@ -1,8 +1,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Input from tensorlayer.models import Model @@ -13,12 +13,12 @@ ## prepare MNIST data X_train, y_train, X_val, y_val, X_test, y_test = tl.files.load_mnist_dataset(shape=(-1, 784)) - ## define the network # the softmax is implemented internally in tl.cost.cross_entropy(y, y_) to # speed up computation, so we use identity here. 
# see tf.nn.sparse_softmax_cross_entropy_with_logits() + def hidden_model(inputs_shape): ni = Input(inputs_shape) nn = Dropout(keep=0.8)(ni) @@ -28,6 +28,7 @@ def hidden_model(inputs_shape): return Model(inputs=ni, outputs=nn, name="mlp_hidden") + def get_model(inputs_shape, hmodel): hidden = hmodel.as_layer() ni = Input(inputs_shape) @@ -37,6 +38,7 @@ def get_model(inputs_shape, hmodel): return Model(inputs=ni, outputs=nn, name="mlp") + MLP_hidden = hidden_model([None, 784]) MLP = get_model([None, 784], MLP_hidden) # MLP.print_layers() diff --git a/examples/basic_tutorials/tutorial_mnist_siamese.py b/examples/basic_tutorials/tutorial_mnist_siamese.py index db43f1163..e8d50ef94 100644 --- a/examples/basic_tutorials/tutorial_mnist_siamese.py +++ b/examples/basic_tutorials/tutorial_mnist_siamese.py @@ -14,8 +14,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Dense, Dropout, Flatten, Input from tensorlayer.models import Model @@ -33,7 +33,7 @@ def contrastive_loss(label, feature1, feature2): def compute_accuracy(label, feature1, feature2): - eucd = tf.sqrt(tf.reduce_sum((feature1 - feature2) ** 2, axis=1)) + eucd = tf.sqrt(tf.reduce_sum((feature1 - feature2)**2, axis=1)) pred = tf.cast(eucd < 0.5, label.dtype) return tf.reduce_mean(tf.cast(tf.equal(pred, label), tf.float32)) diff --git a/examples/basic_tutorials/tutorial_mnist_simple.py b/examples/basic_tutorials/tutorial_mnist_simple.py index 04e233819..ceaee0c48 100644 --- a/examples/basic_tutorials/tutorial_mnist_simple.py +++ b/examples/basic_tutorials/tutorial_mnist_simple.py @@ -1,9 +1,10 @@ #! /usr/bin/python # -*- coding: utf-8 -*- +import numpy as np + import tensorflow as tf import tensorlayer as tl -import numpy as np tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/data/__init__.py b/examples/data_process/data/__init__.py index 8b31b202a..83d5401c3 100644 --- a/examples/data_process/data/__init__.py +++ b/examples/data_process/data/__init__.py @@ -1,4 +1,3 @@ from __future__ import absolute_import -# from . import imagenet_classes from . 
import * diff --git a/examples/data_process/tutorial_fast_affine_transform.py b/examples/data_process/tutorial_fast_affine_transform.py index 52452ffd5..71890f5bd 100644 --- a/examples/data_process/tutorial_fast_affine_transform.py +++ b/examples/data_process/tutorial_fast_affine_transform.py @@ -8,10 +8,10 @@ import multiprocessing import time -import cv2 import numpy as np -import tensorflow as tf +import cv2 +import tensorflow as tf import tensorlayer as tl # tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/data_process/tutorial_tf_dataset_voc.py b/examples/data_process/tutorial_tf_dataset_voc.py index fab1612f7..c3ac07e06 100644 --- a/examples/data_process/tutorial_tf_dataset_voc.py +++ b/examples/data_process/tutorial_tf_dataset_voc.py @@ -13,8 +13,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl # tf.logging.set_verbosity(tf.logging.DEBUG) @@ -108,5 +108,6 @@ def _map_fn(filename, annotation): ## save all images for i in range(len(im)): print(ann[i][1]) - tl.vis.draw_boxes_and_labels_to_image(im[i] * 255, ann[i][0], ann[i][1], [], classes, - True, save_name='_bbox_vis_%d.png' % i) + tl.vis.draw_boxes_and_labels_to_image( + im[i] * 255, ann[i][0], ann[i][1], [], classes, True, save_name='_bbox_vis_%d.png' % i + ) diff --git a/examples/data_process/tutorial_tfrecord.py b/examples/data_process/tutorial_tfrecord.py index 4cb832c1d..6c5c38162 100644 --- a/examples/data_process/tutorial_tfrecord.py +++ b/examples/data_process/tutorial_tfrecord.py @@ -22,9 +22,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl ## Save data ================================================================== @@ -79,8 +79,7 @@ def read_and_decode(filename): raw_dataset = tf.data.TFRecordDataset([filename]).shuffle(1000).batch(4) for serialized_example in raw_dataset: features = tf.io.parse_example( - serialized_example, - features={ + serialized_example, features={ 'label': tf.io.FixedLenFeature([], tf.int64), 'img_raw': tf.io.FixedLenFeature([], tf.string), } @@ -97,4 +96,3 @@ def read_and_decode(filename): print("img_batch : %s" % img_batch.shape) print("label_batch : %s" % label_batch.shape) tl.visualize.images2d(img_batch, second=1, saveable=False, name='batch', dtype=None, fig_idx=2020121) - diff --git a/examples/data_process/tutorial_tfrecord2.py b/examples/data_process/tutorial_tfrecord2.py index be41b697f..6997be251 100755 --- a/examples/data_process/tutorial_tfrecord2.py +++ b/examples/data_process/tutorial_tfrecord2.py @@ -14,10 +14,10 @@ import os import numpy as np + # import matplotlib # matplotlib.use('GTK') import tensorflow as tf - import tensorlayer as tl # Download data, and convert to TFRecord format, see ```tutorial_tfrecord.py``` @@ -77,6 +77,7 @@ def read_and_decode(filename): label_batch = tf.cast(features['label'], tf.int32) yield img_batch, label_batch + img_batch, label_batch = next(read_and_decode("train.tfrecords")) print("img_batch : %s" % img_batch.shape) print("label_batch : %s" % label_batch.shape) diff --git a/examples/data_process/tutorial_tfrecord3.py b/examples/data_process/tutorial_tfrecord3.py index 9e5751a25..bc8752f2a 100644 --- a/examples/data_process/tutorial_tfrecord3.py +++ b/examples/data_process/tutorial_tfrecord3.py @@ -19,9 +19,9 @@ import os import numpy as np -import tensorflow as tf from PIL import Image +import tensorflow as tf import tensorlayer as tl diff --git a/examples/database/dispatch_tasks.py 
b/examples/database/dispatch_tasks.py index d1204bcd4..260257e77 100644 --- a/examples/database/dispatch_tasks.py +++ b/examples/database/dispatch_tasks.py @@ -6,7 +6,6 @@ import time import tensorflow as tf - import tensorlayer as tl tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/database/task_script.py b/examples/database/task_script.py index ad51dd3ed..58ef60d1a 100644 --- a/examples/database/task_script.py +++ b/examples/database/task_script.py @@ -1,7 +1,6 @@ """Sample task script.""" import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py index 936ae9702..1c2801306 100644 --- a/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py +++ b/examples/deprecated_tutorials/tutorial_imagenet_inceptionV3_distributed.py @@ -19,18 +19,17 @@ from xml.etree import ElementTree import numpy as np + import tensorflow as tf +import tensorlayer as tl from tensorflow.contrib import slim -from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, - inception_v3_arg_scope) +from tensorflow.contrib.slim.python.slim.nets.inception_v3 import (inception_v3, inception_v3_arg_scope) from tensorflow.python.framework.errors_impl import OutOfRangeError from tensorflow.python.training import session_run_hook from tensorflow.python.training.basic_session_run_hooks import StopAtStepHook from tensorflow.python.training.monitored_session import \ SingularMonitoredSession -import tensorlayer as tl - tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/deprecated_tutorials/tutorial_mnist_distributed.py b/examples/deprecated_tutorials/tutorial_mnist_distributed.py index 29d291ba4..18f7cdb92 100644 --- a/examples/deprecated_tutorials/tutorial_mnist_distributed.py +++ b/examples/deprecated_tutorials/tutorial_mnist_distributed.py @@ -13,7 +13,6 @@ """ import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py index 1ddc2d937..340e37b2f 100644 --- a/examples/distributed_training/tutorial_cifar10_distributed_trainer.py +++ b/examples/distributed_training/tutorial_cifar10_distributed_trainer.py @@ -15,11 +15,10 @@ import multiprocessing import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl -from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, - FlattenLayer, InputLayer, MaxPool2d) +from tensorlayer.layers import (BatchNormLayer, Conv2d, DenseLayer, FlattenLayer, InputLayer, MaxPool2d) tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/examples/distributed_training/tutorial_mnist_distributed_trainer.py b/examples/distributed_training/tutorial_mnist_distributed_trainer.py index 0f1b8b6dd..0cf916370 100755 --- a/examples/distributed_training/tutorial_mnist_distributed_trainer.py +++ b/examples/distributed_training/tutorial_mnist_distributed_trainer.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/keras_tfslim/tutorial_keras.py b/examples/keras_tfslim/tutorial_keras.py index 
0622bc745..9b877738c 100644 --- a/examples/keras_tfslim/tutorial_keras.py +++ b/examples/keras_tfslim/tutorial_keras.py @@ -4,8 +4,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import Input, Lambda @@ -15,7 +15,6 @@ batch_size = 128 - # keras layers layers = [ tf.keras.layers.Dropout(0.8), @@ -23,12 +22,12 @@ tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(800, activation='relu'), tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='linear')] + tf.keras.layers.Dense(10, activation='linear') +] keras_block = tf.keras.Sequential(layers) # in order to compile keras model and get trainable_variables of the keras model _ = keras_block(np.random.random([batch_size, 784]).astype(np.float32)) - # build tl model using keras layers ni = Input([None, 784], dtype=tf.float32) nn = Lambda(fn=keras_block, fn_weights=keras_block.trainable_variables)(ni) diff --git a/examples/pretrained_cnn/data/__init__.py b/examples/pretrained_cnn/data/__init__.py index 8b31b202a..83d5401c3 100644 --- a/examples/pretrained_cnn/data/__init__.py +++ b/examples/pretrained_cnn/data/__init__.py @@ -1,4 +1,3 @@ from __future__ import absolute_import -# from . import imagenet_classes from . import * diff --git a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py index 8d7b35a6b..6b797a075 100644 --- a/examples/pretrained_cnn/tutorial_models_mobilenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_mobilenetv1.py @@ -10,8 +10,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py index 9b6ee4e7f..755d6c28b 100644 --- a/examples/pretrained_cnn/tutorial_models_squeezenetv1.py +++ b/examples/pretrained_cnn/tutorial_models_squeezenetv1.py @@ -5,8 +5,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names diff --git a/examples/pretrained_cnn/tutorial_models_vgg16.py b/examples/pretrained_cnn/tutorial_models_vgg16.py index e6bb1c22e..7749d5391 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg16.py +++ b/examples/pretrained_cnn/tutorial_models_vgg16.py @@ -5,14 +5,13 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg16(pretrained=True) diff --git a/examples/pretrained_cnn/tutorial_models_vgg19.py b/examples/pretrained_cnn/tutorial_models_vgg19.py index 850412c38..09f2afa22 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg19.py +++ b/examples/pretrained_cnn/tutorial_models_vgg19.py @@ -5,14 +5,13 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg19(pretrained=True) diff --git a/examples/pretrained_cnn/tutorial_models_vgg_static.py b/examples/pretrained_cnn/tutorial_models_vgg_static.py index 40a3ed865..0e73b82ef 100644 --- a/examples/pretrained_cnn/tutorial_models_vgg_static.py +++ 
b/examples/pretrained_cnn/tutorial_models_vgg_static.py @@ -5,14 +5,13 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.models.imagenet_classes import class_names tl.logging.set_verbosity(tl.logging.DEBUG) - # get the whole model vgg = tl.models.vgg16(pretrained=True, mode='static') diff --git a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py index 98532debb..d3205045a 100644 --- a/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_binarynet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py index 248812e23..84fbf7fc9 100644 --- a/examples/quantized_net/tutorial_binarynet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_binarynet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py index 9c8ab1239..fe7666bab 100644 --- a/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_dorefanet_cifar10_tfrecord.py @@ -43,7 +43,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py index 90d7b0893..d8cab9bc8 100644 --- a/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py +++ b/examples/quantized_net/tutorial_dorefanet_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_quanconv_cifar10.py b/examples/quantized_net/tutorial_quanconv_cifar10.py index 6eb35ed67..f93368467 100644 --- a/examples/quantized_net/tutorial_quanconv_cifar10.py +++ b/examples/quantized_net/tutorial_quanconv_cifar10.py @@ -41,8 +41,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl bitW = 8 diff --git a/examples/quantized_net/tutorial_quanconv_mnist.py b/examples/quantized_net/tutorial_quanconv_mnist.py index 4060c6137..66d52d13c 100644 --- a/examples/quantized_net/tutorial_quanconv_mnist.py +++ b/examples/quantized_net/tutorial_quanconv_mnist.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py index f1ee7b4bb..b695fa88a 100644 --- a/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py +++ b/examples/quantized_net/tutorial_ternaryweight_cifar10_tfrecord.py @@ -42,7 +42,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py index e1c305db6..6850b9591 100644 --- a/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py +++ 
b/examples/quantized_net/tutorial_ternaryweight_mnist_cnn.py @@ -4,7 +4,6 @@ import time import tensorflow as tf - import tensorlayer as tl tf.logging.set_verbosity(tf.logging.DEBUG) diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md index d99b5fec3..dc9b412f5 100644 --- a/examples/reinforcement_learning/README.md +++ b/examples/reinforcement_learning/README.md @@ -1,2 +1,345 @@ +# Reinforcement Learning Tutorial with Tensorlayer + +
+ + +
+ +
+
+ +
+This repository contains implementations of the most popular reinforcement learning algorithms, built with TensorLayer 2.0 and supporting [TensorFlow 2.0](https://www.tensorflow.org/alpha/guide/effective_tf2). We aim to keep the tutorial for each algorithm simple and straightforward to use, as this not only benefits newcomers to reinforcement learning but also lets experienced researchers test their new ideas quickly.
+
+## Prerequisites:
+
+* python 3.5
+* tensorflow >= 2.0.0 or tensorflow-gpu >= 2.0.0a0
+* tensorlayer >= 2.0.1
+* tensorflow-probability
+* tf-nightly-2.0-preview
+
+*** If you meet the error `AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after installing tensorflow-probability, try:
+
+`pip install --upgrade tf-nightly-2.0-preview tfp-nightly`
+
+## Status: Beta
+
+We are currently open to any suggestions or pull requests that make the reinforcement learning tutorial with TensorLayer 2.0 a better code repository for both new learners and senior researchers. Some of the algorithms mentioned in this document may not be available yet, since we are still implementing more RL algorithms and optimizing their performance. However, the algorithms listed above will come out in a few weeks, and the repository will keep adding more advanced RL algorithms in the future.
+
+## To Use:
+
+For each tutorial, open a terminal and run:
+
+  `python ***.py --train` for training and `python ***.py --test` for testing.
+
+The tutorial algorithms follow the same basic structure, as shown in the file [`./tutorial_format.py`](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py).
+
+## Table of Contents:
+
+| Algorithms      | Observation Space | Action Space | Tutorial Env   |
+| --------------- | ----------------- | ------------ | -------------- |
+| Q-learning      | Discrete          | Discrete     | FrozenLake     |
+| C51             | Discrete          | Discrete     | Pong, CartPole |
+| DQN             | Discrete          | Discrete     | FrozenLake     |
+| Variants of DQN | Discrete          | Discrete     | Pong, CartPole |
+| Retrace         | Discrete          | Discrete     | Pong, CartPole |
+| PER             | Discrete          | Discrete     | Pong, CartPole |
+| Actor-Critic    | Continuous        | Discrete     | CartPole       |
+| A3C             | Continuous        | Continuous   | BipedalWalker  |
+| DDPG            | Continuous        | Continuous   | Pendulum       |
+| TD3             | Continuous        | Continuous   | Pendulum       |
+| SAC             | Continuous        | Continuous   | Pendulum       |
+| PG              | Continuous        | Discrete     | CartPole       |
+| TRPO            | Continuous        | Continuous   | Pendulum       |
+| PPO             | Continuous        | Continuous   | Pendulum       |
+| DPPO            | Continuous        | Continuous   | Pendulum       |
+
+
+## Examples of RL Algorithms:
+
+* **Q-learning**
+
+  Code: `./tutorial_Qlearning.py`
+
+  Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)
+
+  Description:
+
+  ```
+  Q-learning is a non-deep-learning method based on TD learning, off-policy updates and e-greedy exploration.
+
+  Central formula:
+  Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_{a} Q(newS, a) - Q(S, A))
+
+  See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
+  ```
+
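A minimal sketch of the tabular update above, for illustration only (it is not one of the tutorial scripts); the `FrozenLake-v0` environment and the hyper-parameter values are assumptions:

```python
import gym
import numpy as np

env = gym.make('FrozenLake-v0')  # small discrete env, matching the table above
Q = np.zeros([env.observation_space.n, env.action_space.n])
alpha, gamma, epsilon = 0.1, 0.95, 0.1  # assumed learning rate, discount, exploration rate

for episode in range(1000):
    s = env.reset()
    done = False
    while not done:
        # e-greedy exploration: random action with probability epsilon, otherwise greedy
        a = env.action_space.sample() if np.random.rand() < epsilon else int(np.argmax(Q[s]))
        s2, r, done, _ = env.step(a)
        # Q(S, A) <- Q(S, A) + alpha * (R + gamma * max_a Q(newS, a) - Q(S, A))
        Q[s, a] += alpha * (r + gamma * np.max(Q[s2]) - Q[s, a])
        s = s2
```
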
+* **Deep Q-Network (DQN)**
+
+  Code: `./tutorial_DQN.py`
+
+  Paper: [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
+
+  [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
+
+  Description:
+
+  ```
+  Deep Q-Network (DQN) is a method of TD learning with off-policy updates and e-greedy exploration (GLIE).
+
+  Central formula:
+  Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_{a} Q(newS, a) - Q(S, A)),
+  delta_w = alpha * (R + lambda * max_{a} Q(newS, a) - Q(S, A)) * dQ(S, A) / dw.
+
+  See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
+  ```
+
+* **Double DQN / Dueling DQN / Noisy DQN**
+
+  Code: `./tutorial_DQN_variants.py`
+
+  Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
+
+  Description:
+
+  ```
+  We implement Double DQN, Dueling DQN and Noisy DQN here.
+
+  - The max operator in standard DQN uses the same values both to select and to evaluate an action:
+
+    Q(s_t, a_t) = R_{t+1} + gamma * max_{a} Q_target(s_{t+1}, a).
+
+  - Double DQN uses the following target to address the overestimation caused by the max operator:
+
+    Q(s_t, a_t) = R_{t+1} + gamma * Q_target(s_{t+1}, argmax_{a} Q(s_{t+1}, a)).
+
+  - Dueling DQN uses a dueling architecture in which the state value and the advantage of each action are estimated separately.
+
+  - Noisy DQN explores by adding parameter noise.
+  ```
+
+* **Prioritized Experience Replay**
+
+  Code: `./tutorial_prioritized_replay.py`
+
+  Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
+
+  Description:
+
+  ```
+  Prioritized experience replay is an efficient replay method that replays important transitions more frequently. A segment tree data structure is used to speed up indexing.
+  ```
+
+* **Distributional DQN (C51)**
+
+  Code: `./tutorial_C51.py`
+
+  Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf)
+
+  Description:
+
+  ```
+  Categorical 51 (C51) is a distributional DQN, where 51 is the number of atoms. Instead of estimating the expected value directly, the algorithm models the value distribution over a fixed set of support points (atoms).
+  ```
+
+* **Retrace(lambda) DQN**
+
+  Code: `./tutorial_Retrace.py`
+
+  Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647)
+
+  Description:
+
+  ```
+  Retrace(lambda) is an off-policy algorithm that extends the idea of eligibility traces. It applies importance sampling ratios truncated at 1, which avoids the variance explosion of standard importance sampling and allows safe and efficient learning from several behaviour policies.
+  ```
+
+* **Actor-Critic (AC)**
+
+  Code: `./tutorial_AC.py`
+
+  Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf)
+
+  Description:
+
+  ```
+  An implementation of Advantage Actor-Critic, using the TD error as the advantage.
+  ```
+
+* **Asynchronous Advantage Actor-Critic (A3C)**
+
+  Code: `./tutorial_A3C.py`
+
+  Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
+
+  Description:
+
+  ```
+  An implementation of Asynchronous Advantage Actor-Critic (A3C), using multi-threading for distributed policy learning on top of the Actor-Critic structure.
+  ```
+
+* **Soft Actor-Critic (SAC)**
+
+  Code: `./tutorial_SAC.py`
+
+  Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf)
+
+  Description:
+
+  ```
+  The actor policy in SAC is stochastic, and training is off-policy. The 'soft' in SAC refers to the trade-off between entropy and expected return: the additional entropy term encourages a more explorative policy. This implementation also updates the entropy factor automatically.
+
+  This version of the Soft Actor-Critic (SAC) implementation contains 5 networks:
+  2 Q-networks, 2 target Q-networks and 1 policy network.
+  ```
+
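The automatic entropy-factor update mentioned above can be summarised by this minimal TensorFlow 2 sketch (the learning rate, `action_dim`, and the use of `stop_gradient` are illustrative assumptions; the full version appears in the SAC code added below):

```python
import tensorflow as tf

action_dim = 1                      # e.g. Pendulum-v0 has a 1-D action space
target_entropy = -1.0 * action_dim  # common heuristic: -|A|
log_alpha = tf.Variable(0.0)        # learn log(alpha) so that alpha stays positive
alpha_optimizer = tf.optimizers.Adam(3e-4)

def update_alpha(log_prob):
    """log_prob: log pi(a|s) for a batch of freshly sampled actions."""
    with tf.GradientTape() as tape:
        alpha_loss = -tf.reduce_mean(log_alpha * tf.stop_gradient(log_prob + target_entropy))
    grads = tape.gradient(alpha_loss, [log_alpha])
    alpha_optimizer.apply_gradients(zip(grads, [log_alpha]))
    return tf.math.exp(log_alpha)   # alpha used in the actor and critic losses
```
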
+* **Vanilla Policy Gradient (PG or REINFORCE)**
+
+  Code: `./tutorial_PG.py`
+
+  Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf)
+
+  Description:
+
+  ```
+  The policy gradient algorithm works by updating the policy parameters via stochastic gradient ascent on policy performance. It is an on-policy algorithm that can be used for environments with either discrete or continuous action spaces.
+
+  To apply it to a continuous action space, you need to change the last softmax layer and the choose_action function.
+  ```
+
+* **Deep Deterministic Policy Gradient (DDPG)**
+
+  Code: `./tutorial_DDPG.py`
+
+  Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf)
+
+  Description:
+
+  ```
+  DDPG concurrently learns a Q-function and a policy.
+
+  It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy.
+  ```
+
+* **Twin Delayed DDPG (TD3)**
+
+  Code: `./tutorial_TD3.py`
+
+  Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf)
+
+  Description:
+
+  ```
+  DDPG suffers from problems such as overestimation of Q-values and sensitivity to hyper-parameters.
+
+  Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks:
+
+  - Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions.
+  - Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and the target networks) less frequently than the Q-functions.
+  - Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, making it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action.
+
+  The implementation of TD3 includes 6 networks:
+  2 Q-networks, 2 target Q-networks, 1 policy network, 1 target policy network.
+
+  The actor policy in TD3 is deterministic, with Gaussian exploration noise.
+  ```
+
+* **Trust Region Policy Optimization (TRPO)**
+
+  Code: `./tutorial_TRPO.py`
+
+  Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf)
+
+  Description:
+
+  ```
+  A policy-gradient update with too large a step can collapse the policy's performance, while even a small step in parameter space can produce a large change in the policy.
+
+  TRPO constrains the update in policy space using the KL divergence (rather than in parameter space), which allows monotonic performance improvement and avoids collapsed updates.
+  ```
+
+* **Proximal Policy Optimization (PPO)**
+
+  Code: `./tutorial_PPO.py`
+
+  Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
+
+  Description:
+
+  ```
+  A simple, single-threaded version of Proximal Policy Optimization (PPO).
+
+  PPO is a family of first-order methods that use a few tricks to keep new policies close to the old ones.
+
+  PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO.
+  ```
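The clipped surrogate objective that PPO-style methods optimize can be sketched as follows (the function name and epsilon value are illustrative, not taken from `tutorial_PPO.py`):

```python
import tensorflow as tf

def clipped_surrogate_loss(new_logp, old_logp, advantage, epsilon=0.2):
    """PPO-clip objective, returned as a loss (negative of the objective) for minimization."""
    ratio = tf.exp(new_logp - old_logp)                        # pi_new(a|s) / pi_old(a|s)
    clipped = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # take the pessimistic (smaller) of the unclipped and clipped surrogates
    surrogate = tf.minimum(ratio * advantage, clipped * advantage)
    return -tf.reduce_mean(surrogate)
```
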
+* **Distributed Proximal Policy Optimization (DPPO)**
+
+  Code: `./tutorial_DPPO.py`
+
+  Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf)
+
+  Description:
+
+  ```
+  A distributed version of OpenAI's Proximal Policy Optimization (PPO).
+
+  Workers are distributed to collect data in parallel; their roll-outs are then stopped and PPO is trained on the collected data.
+  ```
+
+* **More in recent weeks**
+
+## Environment:
+
+We typically use game environments from [OpenAI Gym](https://gym.openai.com/) for our tutorials. Other environment sources, such as the [DeepMind Control Suite](https://github.com/deepmind/dm_control) and [Marathon-Envs in Unity](https://github.com/Unity-Technologies/marathon-envs), provide wrappers that convert them into the Gym environment format; see [here](https://github.com/martinseilair/dm_control2gym) and [here](https://github.com/Unity-Technologies/marathon-envs/tree/master/gym-unity).
+
+Our env wrapper: `./tutorial_wrappers.py`
+
+## Authors
+- @xxxx XXXXX : AC, A3C
+- @quantumiracle Zihan Ding: SAC, TD3.
+- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang: PG, DDPG, PPO, DPPO, TRPO
+- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers.
-### More examples can be found in [example List](https://tensorlayer.readthedocs.io/en/stable/user/examples.html)
diff --git a/examples/reinforcement_learning/baselines/SAC.py b/examples/reinforcement_learning/baselines/SAC.py
new file mode 100644
index 000000000..5760298d3
--- /dev/null
+++ b/examples/reinforcement_learning/baselines/SAC.py
@@ -0,0 +1,404 @@
+'''
+Soft Actor-Critic
+using target Q instead of V net: 2 Q nets, 2 target Q nets, 1 policy net
+adding alpha loss
+
+paper: https://arxiv.org/pdf/1812.05905.pdf
+Actor policy is stochastic.
+ +Env: Openai Gym Pendulum-v0, continuous action space + +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 + +&& +pip install box2d box2d-kengz --user + +To run: +python tutorial_sac.py --train/test +''' + +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model +from utils import * +from wrappers import NormalizedActions + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + + +class SoftQNetwork(Model): + + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(SoftQNetwork, self).__init__() + input_dim = num_inputs + num_actions + w_init = tf.keras.initializers.glorot_normal( + seed=None + ) # glorot initialization is better than uniform in practice + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + + +class PolicyNetwork(Model): + + def __init__( + self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2 + ): + super(PolicyNetwork, self).__init__() + + self.log_std_min = log_std_min + self.log_std_max = log_std_max + + w_init = tf.keras.initializers.glorot_normal(seed=None) + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.mean_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_mean') + self.log_std_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_logstd') + + self.action_range = action_range + self.num_actions = num_actions + + def forward(self, state): + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + mean = self.mean_linear(x) + log_std = self.log_std_linear(x) + log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) + + return mean, log_std + + def evaluate(self, state, epsilon=1e-6): + ''' generate action with state for calculating gradients ''' + state = state.astype(np.float32) + mean, log_std = self.forward(state) + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + + normal = Normal(0, 1) + z = 
normal.sample() + action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * action_0 + # according to original paper, with an extra last term for normalizing different action range + log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 + + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + + return action, log_prob, z, mean, log_std + + def get_action(self, state, deterministic): + ''' generate action with state for interaction with envronment ''' + mean, log_std = self.forward([state]) + std = tf.math.exp(log_std) + + normal = Normal(0, 1) + z = normal.sample() + action = self.action_range * tf.math.tanh( + mean + std * z + ) # TanhNormal distribution as actions; reparameterization trick + + action = self.action_range * mean if deterministic else action + return action.numpy()[0] + + def sample_action(self, ): + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + + return self.action_range * a.numpy() + + +class SAC_Trainer(): + + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr=3e-4, policy_lr=3e-4, alpha_lr=3e-4): + self.replay_buffer = replay_buffer + + # initialize all networks + self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') + self.alpha = tf.math.exp(self.log_alpha) + print('Soft Q Network (1,2): ', self.soft_q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) + self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) + + self.soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + self.soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) + # self.alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): + ''' update all networks in SAC ''' + state, action, reward, next_state, done = 
self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std + + # Training Q Function + new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum( + self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input) + ) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_input = tf.concat([state, action], 1) # the dim 0 is number of samples + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.soft_q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) + self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.soft_q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) + self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) + + # Training Policy Function + with tf.GradientTape() as p_tape: + new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) + new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples + ''' implementation 1 ''' + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input)) + ''' implementation 2 ''' + # predicted_new_q_value = self.soft_q_net1(new_q_input) + policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + with tf.GradientTape() as alpha_tape: + alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) + alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) + self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) + self.alpha = tf.math.exp(self.log_alpha) + else: # fixed alpha + self.alpha = 1. 
+ alpha_loss = 0 + + # Soft update the target value nets + self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + + def save_weights(self): # save trained weights + save_model(self.soft_q_net1, 'model_q_net1', 'SAC') + save_model(self.soft_q_net2, 'model_q_net2', 'SAC') + save_model(self.target_soft_q_net1, 'model_target_q_net1', 'SAC') + save_model(self.target_soft_q_net2, 'model_target_q_net2', 'SAC') + save_model(self.policy_net, 'model_policy_net', 'SAC') + + # tl.files.save_npz(self.soft_q_net1.trainable_weights, name='model_q_net1.npz') + # tl.files.save_npz(self.soft_q_net2.trainable_weights, name='model_q_net2.npz') + # tl.files.save_npz(self.target_soft_q_net1.trainable_weights, name='model_target_q_net1.npz') + # tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') + # tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + + def load_weights(self): # load trained weights + # tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) + # tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) + # tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) + # tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_soft_q_net2) + # tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + load_model(self.soft_q_net1, 'model_q_net1', 'SAC') + load_model(self.soft_q_net2, 'model_q_net2', 'SAC') + load_model(self.target_soft_q_net1, 'model_target_q_net1', 'SAC') + load_model(self.target_soft_q_net2, 'model_target_q_net2', 'SAC') + load_model(self.policy_net, 'model_policy_net', 'SAC') + + +# def plot(frame_idx, rewards): +# clear_output(True) +# plt.figure(figsize=(20,5)) +# plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) +# plt.plot(rewards) +# plt.xlabel('Episode') +# plt.ylabel('Episode Reward') +# plt.savefig('sac.png') +# plt.show() + +# choose env +ENV = 'Pendulum-v0' +env = NormalizedActions(gym.make(ENV)) +action_dim = env.action_space.shape[0] +state_dim = env.observation_space.shape[0] +action_range = 1. + +replay_buffer_size = 5e5 +replay_buffer = ReplayBuffer(replay_buffer_size) + +# hyper-parameters for RL training +max_frames = 30000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +# explore_noise_scale = 1.0 # range of action noise for exploration +# eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. 
# value range of reward + +AUTO_ENTROPY = True # automatically udpating variable alpha for entropy +DETERMINISTIC = False # stochastic action policy if False, otherwise deterministic + + +sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ +soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) + +#set train mode +sac_trainer.soft_q_net1.train() +sac_trainer.soft_q_net2.train() +sac_trainer.target_soft_q_net1.train() +sac_trainer.target_soft_q_net2.train() +sac_trainer.policy_net.train() + +# training loop +if args.train: + frame_idx = 0 + rewards = [] + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = sac_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + sac_trainer.update( + batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, + target_entropy=-1. * action_dim + ) + + if frame_idx % 500 == 0: + plot(rewards, Algorithm_name='SAC', Env_name=ENV) + + if done: + break + print('Episode: ', frame_idx / max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) + sac_trainer.save_weights() + +if args.test: + frame_idx = 0 + rewards = [] + sac_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = sac_trainer.policy_net([state]) # need an extra call to make inside functions be able to use forward + + for step in range(max_steps): + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + print('Episode: ', frame_idx / max_steps, '| Episode Reward: ', episode_reward) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/baselines/utils.py b/examples/reinforcement_learning/baselines/utils.py new file mode 100644 index 000000000..89d8ffe5d --- /dev/null +++ b/examples/reinforcement_learning/baselines/utils.py @@ -0,0 +1,94 @@ +""" +Functions for utilization. 
+ +# Requirements +tensorflow==2.0.0a0 +tensorlayer==2.0.1 + +""" +import os +import random +import time + +import matplotlib.pyplot as plt +import numpy as np + +import tensorlayer as tl + + +def plot(episode_rewards, Algorithm_name, Env_name): + ''' + plot the learning curve, saved as ./img/Algorithm_name.png + :episode_rewards: array of floats + :Algorithm_name: string + :Env_name: string + ''' + plt.figure(figsize=(10, 5)) + plt.title(Algorithm_name + '-' + Env_name) + plt.plot(np.arange(len(episode_rewards)), episode_rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + if not os.path.exists('img'): + os.makedirs('img') + plt.savefig('./img/' + Algorithm_name + '.png') + + +def save_model(model, Model_name, Algorithm_name): + ''' + save trained neural network model + :model: tensorlayer.models.Model + :Model_name: string, e.g. 'model_sac_q1' + :Algorithm_name: string, e.g. 'SAC' + ''' + if not os.path.exists('model/' + Algorithm_name): + os.makedirs('model/' + Algorithm_name) + tl.files.save_npz(model.trainable_weights, './model/' + Algorithm_name + '/' + Model_name) + + +def load_model(model, Model_name, Algorithm_name): + ''' + load saved neural network model + :model: tensorlayer.models.Model + :Model_name: string, e.g. 'model_sac_q1' + :Algorithm_name: string, e.g. 'SAC' + ''' + try: + tl.files.load_and_assign_npz('./model/' + Algorithm_name + '/' + Model_name + '.npz', model) + except: + print('Load Model Fails!') + + +class ReplayBuffer: + ''' + a ring buffer for storing transitions and sampling for training + :state: (state_dim,) + :action: (action_dim,) + :reward: (,), scalar + :next_state: (state_dim,) + :done: (,), scalar (0 and 1) or bool (True and False) + ''' + + def __init__(self, capacity): + self.capacity = capacity # mamimum number of samples + self.buffer = [] + self.position = 0 # pointer + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) diff --git a/examples/reinforcement_learning/baselines/wrappers.py b/examples/reinforcement_learning/baselines/wrappers.py new file mode 100644 index 000000000..4ae724d3a --- /dev/null +++ b/examples/reinforcement_learning/baselines/wrappers.py @@ -0,0 +1,565 @@ +"""Env wrappers +Note that this file is adapted from `https://pypi.org/project/gym-vec-env` and +`https://github.com/openai/baselines/blob/master/baselines/common/*wrappers.py` +""" +from collections import deque +from functools import partial +from multiprocessing import Pipe, Process, cpu_count +from sys import platform + +import numpy as np + +import cv2 +import gym +from gym import spaces + +__all__ = ( + 'build_env', # build env + 'TimeLimit', # Time limit wrapper + 'NoopResetEnv', # Run random number of no-ops on reset + 'FireResetEnv', # Reset wrapper for envs with fire action + 'EpisodicLifeEnv', # 
end-of-life == end-of-episode wrapper + 'MaxAndSkipEnv', # skip frame wrapper + 'ClipRewardEnv', # clip reward wrapper + 'WarpFrame', # warp observation wrapper + 'FrameStack', # stack frame wrapper + 'LazyFrames', # lazy store wrapper + 'RewardScaler', # reward scale + 'SubprocVecEnv', # vectorized env wrapper + 'VecFrameStack', # stack frames in vectorized env + 'Monitor', # Episode reward and length monitor + 'NormalizedActions', # normalized action to actual space +) +cv2.ocl.setUseOpenCL(False) +# env_id -> env_type +id2type = dict() +for _env in gym.envs.registry.all(): + id2type[_env.id] = _env._entry_point.split(':')[0].rsplit('.', 1)[1] + + +def build_env(env_id, vectorized=False, seed=0, reward_scale=1.0, nenv=0): + """Build env based on options""" + env_type = id2type[env_id] + nenv = nenv or cpu_count() // (1 + (platform == 'darwin')) + stack = env_type == 'atari' + if not vectorized: + env = _make_env(env_id, env_type, seed, reward_scale, stack) + else: + env = _make_vec_env(env_id, env_type, nenv, seed, reward_scale, stack) + + return env + + +def _make_env(env_id, env_type, seed, reward_scale, frame_stack=True): + """Make single env""" + if env_type == 'atari': + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + env = Monitor(env) + # deepmind wrap + env = EpisodicLifeEnv(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env) + env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) + elif env_type == 'classic_control': + env = Monitor(gym.make(env_id)) + else: + raise NotImplementedError + if reward_scale != 1: + env = RewardScaler(env, reward_scale) + env.seed(seed) + return env + + +def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): + """Make vectorized env""" + env = SubprocVecEnv([partial(_make_env, env_id, env_type, seed + i, reward_scale, False) for i in range(nenv)]) + if frame_stack: + env = VecFrameStack(env, 4) + return env + + +class TimeLimit(gym.Wrapper): + + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + + +class NoopResetEnv(gym.Wrapper): + + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. 
+ """ + super(NoopResetEnv, self).__init__(env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + super(EpisodicLifeEnv, self).__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + # for Qbert sometimes we stay in lives == 0 condition for a few + # frames so it's important to keep lives > 0, so that we only reset + # once the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
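        For example, on a three-life Atari game this wrapper reports done=True
        at every life loss, but `was_real_done` is only True on the last one,
        and only then does reset() restart the underlying game.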
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + shape = (2, ) + env.observation_space.shape + self._obs_buffer = np.zeros(shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = info = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + + def __init__(self, env): + super(ClipRewardEnv, self).__init__(env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + + def __init__(self, env, width=84, height=84, grayscale=True): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + super(WarpFrame, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + shape = (self.height, self.width, 1 if self.grayscale else 3) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) + + def observation(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + size = (self.width, self.height) + frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) + if self.grayscale: + frame = np.expand_dims(frame, -1) + return frame + + +class FrameStack(gym.Wrapper): + + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. + See Also `LazyFrames` + """ + super(FrameStack, self).__init__(env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class LazyFrames(object): + + def __init__(self, frames): + """This object ensures that common frames between the observations are + only stored once. It exists purely to optimize memory usage which can be + huge for DQN's 1M frames replay buffers. + + This object should only be converted to numpy array before being passed + to the model. You'd not believe how complex the previous solution was. 
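        Rough illustration (back-of-envelope, not a measurement): with 4-frame
        stacks of 84x84 uint8 observations, a materialised stack costs about
        84*84*4 ~= 28 KB per stored observation, whereas sharing frames between
        neighbouring observations as done here stores each unique ~7 KB frame
        only once.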
+ """ + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + +class RewardScaler(gym.RewardWrapper): + """Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance drastically. + """ + + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + + +class VecFrameStack(object): + + def __init__(self, env, k): + self.env = env + self.k = k + self.action_space = env.action_space + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +def _worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send(ob) + elif cmd == 'reset_task': + ob = env._reset_task() + remote.send(ob) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents + """ + + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class SubprocVecEnv(object): + + def __init__(self, env_fns): + """ + envs: list of gym environments to run in subprocesses + """ + self.num_envs = len(env_fns) + + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.nenvs = nenvs + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + zipped_args = zip(self.work_remotes, self.remotes, env_fns) + self.ps = [ + Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zipped_args + ] + + for p in self.ps: + # if the main process crashes, we should not cause things to hang + p.daemon = True + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, action_space = self.remotes[0].recv() + self.observation_space = observation_space + self.action_space = action_space + + def _step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + You should not call this if a step_async run is + already pending. 
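        In normal use you call step(actions), defined below, which simply runs
        _step_async(actions) followed by _step_wait().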
+ """ + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def _step_wait(self): + """ + Wait for the step taken with step_async(). + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a tuple of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + """ + Reset all the environments and return an array of + observations, or a tuple of observation arrays. + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. + """ + for remote in self.remotes: + remote.send(('reset', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def _reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def __len__(self): + return self.nenvs + + def step(self, actions): + self._step_async(actions) + return self._step_wait() + + +class Monitor(gym.Wrapper): + + def __init__(self, env): + super(Monitor, self).__init__(env) + self._monitor_rewards = None + + def reset(self, **kwargs): + self._monitor_rewards = [] + return self.env.reset(**kwargs) + + def step(self, action): + o_, r, done, info = self.env.step(action) + self._monitor_rewards.append(r) + if done: + info['episode'] = {'r': sum(self._monitor_rewards), 'l': len(self._monitor_rewards)} + return o_, r, done, info + + +class NormalizedActions(gym.ActionWrapper): + + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +def unit_test(): + env_id = 'CartPole-v0' + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, False) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + done = False + while not done: + a = unwrapped_env.action_space.sample() + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + env_id = 'PongNoFrameskip-v4' + nenv = 2 + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, True, nenv=nenv) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + for _ in range(1000): + a = [unwrapped_env.action_space.sample() for _ in range(nenv)] + a = np.asarray(a, 'int64') + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + +if __name__ == '__main__': + unit_test() diff --git a/examples/reinforcement_learning/tutorial_A3C.py b/examples/reinforcement_learning/tutorial_A3C.py new file mode 100644 index 000000000..f904e7c4b --- /dev/null +++ b/examples/reinforcement_learning/tutorial_A3C.py @@ 
-0,0 +1,321 @@ +""" +Asynchronous Advantage Actor Critic (A3C) with Continuous Action Space. + +Actor Critic History +---------------------- +A3C > DDPG (for continuous action space) > AC + +Advantage +---------- +Train faster and more stable than AC. + +Disadvantage +------------- +Have bias. + +Reference +---------- +Original Paper: https://arxiv.org/pdf/1602.01783.pdf +MorvanZhou's tutorial: https://morvanzhou.github.io/tutorials/ +MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/experiments/Solve_BipedalWalker/A3C.py + +Environment +----------- +BipedalWalker-v2 : https://gym.openai.com/envs/BipedalWalker-v2 + +Reward is given for moving forward, total 300+ points up to the far end. +If the robot falls, it gets -100. Applying motor torque costs a small amount of +points, more optimal agent will get better score. State consists of hull angle +speed, angular velocity, horizontal speed, vertical speed, position of joints +and joints angular speed, legs contact with ground, and 10 lidar rangefinder +measurements. There's no coordinates in the state vector. + +Prerequisites +-------------- +tensorflow 2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer 2.0.0 +&& +pip install box2d box2d-kengz --user + +To run +------ +python tutorial_A3C.py --train/test + +""" + +import argparse +import multiprocessing +import threading +import time + +import numpy as np + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import DenseLayer, InputLayer + +tfd = tfp.distributions + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### + +GAME = 'BipedalWalker-v2' # BipedalWalkerHardcore-v2 BipedalWalker-v2 LunarLanderContinuous-v2 +LOG_DIR = './log' # the log file +N_WORKERS = multiprocessing.cpu_count() # number of workers accroding to number of cores in cpu +# N_WORKERS = 2 # manually set number of workers +MAX_GLOBAL_EP = 8 # number of training episodes +GLOBAL_NET_SCOPE = 'Global_Net' +UPDATE_GLOBAL_ITER = 10 # update global policy after several episodes +GAMMA = 0.99 # reward discount factor +ENTROPY_BETA = 0.005 # factor for entropy boosted exploration +LR_A = 0.00005 # learning rate for actor +LR_C = 0.0001 # learning rate for critic +GLOBAL_RUNNING_R = [] +GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP + +################### Asynchronous Advantage Actor Critic (A3C) #################################### + + +class ACNet(object): + + def __init__(self, scope, globalAC=None): + self.scope = scope + self.save_path = './model' + + w_init = tf.keras.initializers.glorot_normal(seed=None) # initializer, glorot=xavier + + def get_actor(input_shape): # policy network + with tf.name_scope(self.scope): + ni = tl.layers.Input(input_shape, name='in') + nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='la')(ni) + nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2')(nn) + mu = tl.layers.Dense(n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu')(nn) + sigma = 
tl.layers.Dense(n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma')(nn) + return tl.models.Model(inputs=ni, outputs=[mu, sigma], name=scope + '/Actor') + + self.actor = get_actor([None, N_S]) + self.actor.train() # train mode for Dropout, BatchNorm + + def get_critic(input_shape): # we use Value-function here, but not Q-function. + with tf.name_scope(self.scope): + ni = tl.layers.Input(input_shape, name='in') + nn = tl.layers.Dense(n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc')(ni) + nn = tl.layers.Dense(n_units=300, act=tf.nn.relu6, W_init=w_init, name='lc2')(nn) + v = tl.layers.Dense(n_units=1, W_init=w_init, name='v')(nn) + return tl.models.Model(inputs=ni, outputs=v, name=scope + '/Critic') + + self.critic = get_critic([None, N_S]) + self.critic.train() # train mode for Dropout, BatchNorm + + @tf.function # convert numpy functions to tf.Operations in the TFgraph, return tensor + def update_global( + self, buffer_s, buffer_a, buffer_v_target, globalAC + ): # refer to the global Actor-Crtic network for updating it with samples + ''' update the global critic ''' + with tf.GradientTape() as tape: + self.v = self.critic(buffer_s) + self.v_target = buffer_v_target + td = tf.subtract(self.v_target, self.v, name='TD_error') + self.c_loss = tf.reduce_mean(tf.square(td)) + self.c_grads = tape.gradient(self.c_loss, self.critic.trainable_weights) + OPT_C.apply_gradients(zip(self.c_grads, globalAC.critic.trainable_weights)) # local grads applies to global net + # del tape # Drop the reference to the tape + ''' update the global actor ''' + with tf.GradientTape() as tape: + self.mu, self.sigma = self.actor(buffer_s) + self.test = self.sigma[0] + self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 + + normal_dist = tfd.Normal(self.mu, self.sigma) # no tf.contrib for tf2.0 + self.a_his = buffer_a # float32 + log_prob = normal_dist.log_prob(self.a_his) + exp_v = log_prob * td # td is from the critic part, no gradients for it + entropy = normal_dist.entropy() # encourage exploration + self.exp_v = ENTROPY_BETA * entropy + exp_v + self.a_loss = tf.reduce_mean(-self.exp_v) + self.a_grads = tape.gradient(self.a_loss, self.actor.trainable_weights) + OPT_A.apply_gradients(zip(self.a_grads, globalAC.actor.trainable_weights)) # local grads applies to global net + return self.test # for test purpose + + @tf.function + def pull_global(self, globalAC): # run by a local, pull weights from the global nets + for l_p, g_p in zip(self.actor.trainable_weights, globalAC.actor.trainable_weights): + l_p.assign(g_p) + for l_p, g_p in zip(self.critic.trainable_weights, globalAC.critic.trainable_weights): + l_p.assign(g_p) + + def choose_action(self, s): # run by a local + s = s[np.newaxis, :] + self.mu, self.sigma = self.actor(s) + + with tf.name_scope('wrap_a_out'): + self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 + normal_dist = tfd.Normal(self.mu, self.sigma) # for continuous action space + self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) + return self.A.numpy()[0] + + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.actor.trainable_weights, name='model_actor.npz') + tl.files.save_npz(self.critic.trainable_weights, name='model_critic.npz') + + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_actor.npz', network=self.actor) + tl.files.load_and_assign_npz(name='model_critic.npz', network=self.critic) + + +class Worker(object): + + def __init__(self, name, globalAC): + self.env = gym.make(GAME) 
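        # Each worker owns its own environment and its own local ACNet: the
        # local net computes gradients from this worker's rollout,
        # update_global() applies those gradients directly to the global
        # actor/critic through OPT_A/OPT_C, and pull_global() then copies the
        # updated global weights back into the local net.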
+ self.name = name + self.AC = ACNet(name, globalAC) + + # def work(self): + def work(self, globalAC): + global GLOBAL_RUNNING_R, GLOBAL_EP + total_step = 1 + buffer_s, buffer_a, buffer_r = [], [], [] + while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: + s = self.env.reset() + ep_r = 0 + while True: + # visualize Worker_0 during training + if self.name == 'Worker_0' and total_step % 30 == 0: + self.env.render() + s = s.astype('float32') # double to float + a = self.AC.choose_action(s) + s_, r, done, _info = self.env.step(a) + + s_ = s_.astype('float32') # double to float + # set robot falls reward to -2 instead of -100 + if r == -100: r = -2 + + ep_r += r + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append(r) + + if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net + + if done: + v_s_ = 0 # terminal + else: + v_s_ = self.AC.critic(s_[np.newaxis, :])[0, 0] # reduce dim from 2 to 0 + + buffer_v_target = [] + + for r in buffer_r[::-1]: # reverse buffer r + v_s_ = r + GAMMA * v_s_ + buffer_v_target.append(v_s_) + + buffer_v_target.reverse() + + buffer_s, buffer_a, buffer_v_target = ( + np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) + ) + # update gradients on global network + self.AC.update_global(buffer_s, buffer_a, buffer_v_target.astype('float32'), globalAC) + buffer_s, buffer_a, buffer_r = [], [], [] + + # update local network from global network + self.AC.pull_global(globalAC) + + s = s_ + total_step += 1 + if done: + if len(GLOBAL_RUNNING_R) == 0: # record running episode reward + GLOBAL_RUNNING_R.append(ep_r) + else: # moving average + GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) + # print( + # self.name, + # "Episode: ", + # GLOBAL_EP, + # # "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move + # '| reward: %.1f' % ep_r, + # "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], + # # '| sigma:', test, # debug + # # 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', + # ) + print('{}, Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(self.name, GLOBAL_EP, MAX_GLOBAL_EP, ep_r, time.time()-t0 )) + GLOBAL_EP += 1 + break + + +if __name__ == "__main__": + + env = gym.make(GAME) + + N_S = env.observation_space.shape[0] + N_A = env.action_space.shape[0] + + A_BOUND = [env.action_space.low, env.action_space.high] + A_BOUND[0] = A_BOUND[0].reshape(1, N_A) + A_BOUND[1] = A_BOUND[1].reshape(1, N_A) + # print(A_BOUND) + if args.train: + # ============================= TRAINING =============================== + t0 = time.time() + with tf.device("/cpu:0"): + + OPT_A = tf.optimizers.RMSprop(LR_A, name='RMSPropA') + OPT_C = tf.optimizers.RMSprop(LR_C, name='RMSPropC') + + GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params + workers = [] + # Create worker + for i in range(N_WORKERS): + i_name = 'Worker_%i' % i # worker name + workers.append(Worker(i_name, GLOBAL_AC)) + + COORD = tf.train.Coordinator() + + # start TF threading + worker_threads = [] + for worker in workers: + # t = threading.Thread(target=worker.work) + job = lambda: worker.work(GLOBAL_AC) + t = threading.Thread(target=job) + t.start() + worker_threads.append(t) + COORD.join(worker_threads) + import matplotlib.pyplot as plt + plt.plot(GLOBAL_RUNNING_R) + plt.xlabel('episode') + plt.ylabel('global running reward') + plt.savefig('a3c.png') + plt.show() + + GLOBAL_AC.save_ckpt() + + if args.test: + # ============================= EVALUATION ============================= + # env = 
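        # Worked example (illustrative numbers) for the bootstrapped n-step
        # targets computed in work() above: with GAMMA = 0.99, buffer_r = [1, 1, 1]
        # and a critic bootstrap v_s_ = 10 at the cut-off state, the reversed
        # loop gives 1 + 0.99*10 = 10.9, then 11.791, then 12.673, so after
        # reversing, buffer_v_target = [12.673, 11.791, 10.9].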
gym.make(GAME) + # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) + GLOBAL_AC.load_ckpt() + while True: + s = env.reset() + rall = 0 + while True: + env.render() + s = s.astype('float32') # double to float + a = GLOBAL_AC.choose_action(s) + s, r, d, _ = env.step(a) + rall += r + if d: + print("reward", rall) + break diff --git a/examples/reinforcement_learning/tutorial_AC.py b/examples/reinforcement_learning/tutorial_AC.py new file mode 100644 index 000000000..0bee2735d --- /dev/null +++ b/examples/reinforcement_learning/tutorial_AC.py @@ -0,0 +1,327 @@ +""" +Actor-Critic +------------- +It uses TD-error as the Advantage. + +Actor Critic History +---------------------- +A3C > DDPG > AC + +Advantage +---------- +AC converge faster than Policy Gradient. + +Disadvantage (IMPORTANT) +------------------------ +The Policy is oscillated (difficult to converge), DDPG can solve +this problem using advantage of DQN. + +Reference +---------- +paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf +View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ + +Environment +------------ +CartPole-v0: https://gym.openai.com/envs/CartPole-v0 + +A pole is attached by an un-actuated joint to a cart, which moves along a +frictionless track. The system is controlled by applying a force of +1 or -1 +to the cart. The pendulum starts upright, and the goal is to prevent it from +falling over. + +A reward of +1 is provided for every timestep that the pole remains upright. +The episode ends when the pole is more than 15 degrees from vertical, or the +cart moves more than 2.4 units from the center. + + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_AC.py --train/test + +""" +import argparse +import time + +import numpy as np + +import gym +import tensorflow as tf +import tensorlayer as tl + +tl.logging.set_verbosity(tl.logging.DEBUG) + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### + +OUTPUT_GRAPH = False +MAX_EPISODE = 3000 # number of overall episodes for training +DISPLAY_REWARD_THRESHOLD = 100 # renders environment if running reward is greater then this threshold +MAX_EP_STEPS = 1000 # maximum time step in one episode +RENDER = False # rendering wastes time +LAMBDA = 0.9 # reward discount in TD error +LR_A = 0.001 # learning rate for actor +LR_C = 0.01 # learning rate for critic + +############################### Actor-Critic #################################### + + +class Actor(object): + + def __init__(self, n_features, n_actions, lr=0.001): + + def get_model(inputs_shape): + ni = tl.layers.Input(inputs_shape, name='state') + nn = tl.layers.Dense( + n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' + )(ni) + nn = tl.layers.Dense( + n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' + )(nn) + nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn) + return tl.models.Model(inputs=ni, outputs=nn, name="Actor") + + self.model = get_model([None, n_features]) + self.model.train() + self.optimizer = tf.optimizers.Adam(lr) + + def learn(self, s, a, td): + with 
tf.GradientTape() as tape: + _logits = self.model(np.array([s])) + ## cross-entropy loss weighted by td-error (advantage), + # the cross-entropy mearsures the difference of two probability distributions: the predicted logits and sampled action distribution, + # then weighted by the td-error: small difference of real and predict actions for large td-error (advantage); and vice versa. + _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) + grad = tape.gradient(_exp_v, self.model.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) + return _exp_v + + def choose_action(self, s): + _logits = self.model(np.array([s])) + _probs = tf.nn.softmax(_logits).numpy() + return tl.rein.choice_action_by_probs(_probs.ravel()) # sample according to probability distribution + + def choose_action_greedy(self, s): + _logits = self.model(np.array([s])) # logits: probability distribution of actions + _probs = tf.nn.softmax(_logits).numpy() + return np.argmax(_probs.ravel()) + + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.model.trainable_weights, name='model_actor.npz') + + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_actor.npz', network=self.model) + + +class Critic(object): + + def __init__(self, n_features, lr=0.01): + + def get_model(inputs_shape): + ni = tl.layers.Input(inputs_shape, name='state') + nn = tl.layers.Dense( + n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden' + )(ni) + nn = tl.layers.Dense( + n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2' + )(nn) + nn = tl.layers.Dense(n_units=1, act=None, name='value')(nn) + return tl.models.Model(inputs=ni, outputs=nn, name="Critic") + + self.model = get_model([1, n_features]) + self.model.train() + + self.optimizer = tf.optimizers.Adam(lr) + + def learn(self, s, r, s_): + v_ = self.model(np.array([s_])) + with tf.GradientTape() as tape: + v = self.model(np.array([s])) + ## TD_error = r + lambd * V(newS) - V(S) + td_error = r + LAMBDA * v_ - v + loss = tf.square(td_error) + grad = tape.gradient(loss, self.model.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) + + return td_error + + def save_ckpt(self): # save trained weights + tl.files.save_npz(self.model.trainable_weights, name='model_critic.npz') + + def load_ckpt(self): # load trained weights + tl.files.load_and_assign_npz(name='model_critic.npz', network=self.model) + + +if __name__ == '__main__': + ''' + choose environment + 1. Openai gym: + env = gym.make() + 2. 
DeepMind Control Suite: + env = dm_control2gym.make() + ''' + env = gym.make('CartPole-v0') + # dm_control2gym.create_render_mode('example mode', show=True, return_pixel=False, height=240, width=320, camera_id=-1, overlays=(), + # depth=False, scene_option=None) + # env = dm_control2gym.make(domain_name="cartpole", task_name="balance") + env.seed(2) # reproducible + # env = env.unwrapped + N_F = env.observation_space.shape[0] + # N_A = env.action_space.shape[0] + N_A = env.action_space.n + + print("observation dimension: %d" % N_F) # 4 + print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] + print("observation low : %s" % env.observation_space.low) # [-2.4 , -inf , -0.41887902 , -inf] + print("num of actions: %d" % N_A) # 2 : left or right + + actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) + # we need a good teacher, so the teacher should learn faster than the actor + critic = Critic(n_features=N_F, lr=LR_C) + + if args.train: + t0 = time.time() + for i_episode in range(MAX_EPISODE): + # episode_time = time.time() + s = env.reset().astype(np.float32) + t = 0 # number of step in this episode + all_r = [] # rewards of all steps + while True: + + if RENDER: env.render() + + a = actor.choose_action(s) + + s_new, r, done, info = env.step(a) + s_new = s_new.astype(np.float32) + + if done: r = -20 + # these may helpful in some tasks + # if abs(s_new[0]) >= env.observation_space.high[0]: + # # cart moves more than 2.4 units from the center + # r = -20 + # reward for the distance between cart to the center + # r -= abs(s_new[0]) * .1 + + all_r.append(r) + + td_error = critic.learn( + s, r, s_new + ) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] + try: + actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] + except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() + actor.save_ckpt() + critic.save_ckpt() + # logging + + s = s_new + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(all_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + # start rending if running_reward greater than a threshold + # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True + # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) + + # Early Stopping for quick check + if t >= MAX_EP_STEPS: + print("Early Stopping") + s = env.reset().astype(np.float32) + rall = 0 + while True: + env.render() + # a = actor.choose_action(s) + a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task + s_new, r, done, info = env.step(a) + s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) + rall += r + s = s_new + if done: + print("reward", rall) + s = env.reset().astype(np.float32) + rall = 0 + break + actor.save_ckpt() + critic.save_ckpt() + + if args.test: + actor.load_ckpt() + critic.load_ckpt() + t0 = time.time() + + for i_episode in range(MAX_EPISODE): + episode_time = time.time() + s = env.reset().astype(np.float32) + t = 0 # number of step in this episode + all_r = [] # rewards of all steps + while True: + if RENDER: env.render() + a = actor.choose_action(s) + s_new, r, done, info = env.step(a) + s_new = 
s_new.astype(np.float32) + if done: r = -20 + # these may helpful in some tasks + # if abs(s_new[0]) >= env.observation_space.high[0]: + # # cart moves more than 2.4 units from the center + # r = -20 + # reward for the distance between cart to the center + # r -= abs(s_new[0]) * .1 + + all_r.append(r) + s = s_new + t += 1 + + if done or t >= MAX_EP_STEPS: + ep_rs_sum = sum(all_r) + + if 'running_reward' not in globals(): + running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 + # start rending if running_reward greater than a threshold + # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True + # print("Episode: %d reward: %f running_reward %f took: %.5f" % \ + # (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0 )) + + # Early Stopping for quick check + if t >= MAX_EP_STEPS: + print("Early Stopping") + s = env.reset().astype(np.float32) + rall = 0 + while True: + env.render() + # a = actor.choose_action(s) + a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task + s_new, r, done, info = env.step(a) + s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) + rall += r + s = s_new + if done: + print("reward", rall) + s = env.reset().astype(np.float32) + rall = 0 + break diff --git a/examples/reinforcement_learning/tutorial_C51.py b/examples/reinforcement_learning/tutorial_C51.py new file mode 100644 index 000000000..0ff50aa55 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_C51.py @@ -0,0 +1,308 @@ +""" +C51 Algorithm +------------------------ +Categorical 51 distributional RL algorithm, 51 means the number of atoms. In +this algorithm, instead of estimating actual expected value, value distribution +over a series of continuous sub-intervals (atoms) is considered. + + +Reference: +------------------------ +Bellemare M G, Dabney W, Munos R. A distributional perspective on reinforcement +learning[C]//Proceedings of the 34th International Conference on Machine +Learning-Volume 70. JMLR. org, 2017: 449-458. 
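The value distribution is kept on a fixed support of atom_num atoms z_i spaced
evenly over [min_value, max_value]. The distributional Bellman target
T z_i = r + gamma * z_i usually falls between atoms, so its probability mass is
projected back onto the support by splitting it between the two neighbouring
atoms in proportion to their distance (the floor/ceil b_l/b_u step in the
training loop below).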
+ + +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_C51.py --mode=train +python tutorial_C51.py --mode=test --save_path=c51/8000.npz +""" +import argparse +import os +import random +import time + +import numpy as np + +import tensorflow as tf +import tensorlayer as tl +from tutorial_wrappers import build_env + +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument( + '--save_path', default='c51', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times before learning +atom_num = 51 +min_value = -10 +max_value = 10 +vrange = np.linspace(min_value, max_value, atom_num) +deltaz = float(max_value - min_value) / (atom_num - 1) + + +# ############################## C51 #################################### +class MLP(tl.models.Model): + + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0], W_init=tf.initializers.GlorotUniform()) + self.qvalue = tl.layers.Dense( + out_dim * atom_num, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform() + ) + self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) + + def forward(self, ni): + qvalues = self.qvalue(self.h1(ni)) + return tf.nn.log_softmax(self.reshape(qvalues), 2) + + +class CNN(tl.models.Model): + + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 
2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense( + out_dim * atom_num, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform() + ) + self.reshape = tl.layers.Reshape((-1, out_dim, atom_num)) + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + qvalues = self.qvalue(self.preq(feature)) + return tf.nn.log_softmax(self.reshape(qvalues), 2) + + +class ReplayBuffer(object): + + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + # encode sample to numpy.array with right dtype + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + # allow sampling with replacement + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qdist = np.exp(qnet(obv).numpy()) + qvalues = (qdist * vrange).sum(-1) + a = qvalues.argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # q estimation, see Algorithm 1 in paper for detail + b_dist_ = np.exp(targetqnet(b_o_).numpy()) + b_a_ = (b_dist_ * vrange).sum(-1).argmax(1) + b_tzj = np.clip( + reward_gamma * (1 - b_d[:, None]) * vrange[None, :] + b_r[:, None], min_value, max_value + ) + b_i = 
(b_tzj - min_value) / deltaz + b_l = np.floor(b_i).astype('int64') + b_u = np.ceil(b_i).astype('int64') + templ = b_dist_[range(batch_size), b_a_, :] * (b_u - b_i) + tempu = b_dist_[range(batch_size), b_a_, :] * (b_i - b_l) + b_m = np.zeros((batch_size, atom_num)) + # TODO: aggregate value by index and batch update (scatter_add) + for j in range(batch_size): + for k in range(atom_num): + b_m[j][b_l[j][k]] += templ[j][k] + b_m[j][b_u[j][k]] += tempu[j][k] + b_m = tf.convert_to_tensor(b_m, dtype='float32') + + # calculate loss + with tf.GradientTape() as q_tape: + b_index = np.stack([range(batch_size), b_a], 1) + b_index = tf.convert_to_tensor(b_index, 'int64') + b_dist_a = tf.gather_nd(qnet(b_o), b_index) + loss = -tf.reduce_mean(tf.reduce_sum(b_dist_a * b_m, 1)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qdist = np.exp(qnet(obv).numpy()) + qvalues = (qdist * vrange).sum(-1) + a = qvalues.argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py new file mode 100644 index 000000000..43efe0ad1 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DDPG.py @@ -0,0 +1,315 @@ +""" +Deep Deterministic Policy Gradient (DDPG) +----------------------------------------- +An algorithm concurrently learns a Q-function and a policy. +It uses off-policy data and the Bellman equation to learn the Q-function, +and uses the Q-function to learn the policy. + +Reference +--------- +Deterministic Policy Gradient Algorithms, Silver et al. 2014 +Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 
2016 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ + +Environment +----------- +Openai Gym Pendulum-v0, continual action space + +Prerequisites +------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_DDPG.py --train/test + +""" + +import argparse +import os +import time + +import matplotlib.pyplot as plt +import numpy as np + +import gym +import tensorflow as tf +import tensorlayer as tl + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + +LR_A = 0.001 # learning rate for actor +LR_C = 0.002 # learning rate for critic +GAMMA = 0.9 # reward discount +TAU = 0.01 # soft replacement +MEMORY_CAPACITY = 10000 # size of replay buffer +BATCH_SIZE = 32 # update batchsize + +MAX_EPISODES = 200 # total number of episodes for training +MAX_EP_STEPS = 200 # total number of steps for each episode +TEST_PER_EPISODES = 10 # test the model per episodes +VAR = 3 # control exploration + +############################### DDPG #################################### + + +class DDPG(object): + """ + DDPG class + """ + + def __init__(self, a_dim, s_dim, a_bound): + self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32) + self.pointer = 0 + self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound + + W_init = tf.random_normal_initializer(mean=0, stddev=0.3) + b_init = tf.constant_initializer(0.1) + + def get_actor(input_state_shape, name=''): + """ + Build actor network + :param input_state_shape: state + :param name: name + :return: act + """ + inputs = tl.layers.Input(input_state_shape, name='A_input') + x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs) + x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x) + x = tl.layers.Lambda(lambda x: np.array(a_bound) * x)(x) + return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name) + + def get_critic(input_state_shape, input_action_shape, name=''): + """ + Build critic network + :param input_state_shape: state + :param input_action_shape: act + :param name: name + :return: Q value Q(s,a) + """ + s = tl.layers.Input(input_state_shape, name='C_s_input') + a = tl.layers.Input(input_action_shape, name='C_a_input') + x = tl.layers.Concat(1)([s, a]) + x = tl.layers.Dense(n_units=60, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(x) + x = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(x) + return tl.models.Model(inputs=[s, a], outputs=x, name='Critic' + name) + + self.actor = get_actor([None, s_dim]) + self.critic = get_critic([None, s_dim], [None, a_dim]) + self.actor.train() + self.critic.train() + + def copy_para(from_model, to_model): + """ + Copy parameters for soft updating + :param from_model: latest model + :param to_model: target model + :return: None + """ + for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): + j.assign(i) + + self.actor_target = get_actor([None, s_dim], name='_target') + copy_para(self.actor, self.actor_target) + self.actor_target.eval() + + self.critic_target = get_critic([None, s_dim], [None, a_dim], name='_target') + 
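        # Target networks stabilise the TD targets: actor_target/critic_target
        # start as copies of the online networks and afterwards are only moved
        # by the exponential moving average defined below, i.e. roughly
        #   theta_target <- (1 - TAU) * theta_target + TAU * theta
        # on each call to ema_update(), with TAU = 0.01.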
copy_para(self.critic, self.critic_target) + self.critic_target.eval() + + self.R = tl.layers.Input([None, 1], tf.float32, 'r') + + self.ema = tf.train.ExponentialMovingAverage(decay=1 - TAU) # soft replacement + + self.actor_opt = tf.optimizers.Adam(LR_A) + self.critic_opt = tf.optimizers.Adam(LR_C) + + def ema_update(self): + """ + Soft updating by exponential smoothing + :return: None + """ + paras = self.actor.trainable_weights + self.critic.trainable_weights + self.ema.apply(paras) + for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): + i.assign(self.ema.average(j)) + + def choose_action(self, s): + """ + Choose action + :param s: state + :return: act + """ + return self.actor(np.array([s], dtype=np.float32))[0] + + def learn(self): + """ + Update parameters + :return: None + """ + indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE) + bt = self.memory[indices, :] + bs = bt[:, :self.s_dim] + ba = bt[:, self.s_dim:self.s_dim + self.a_dim] + br = bt[:, -self.s_dim - 1:-self.s_dim] + bs_ = bt[:, -self.s_dim:] + + with tf.GradientTape() as tape: + a_ = self.actor_target(bs_) + q_ = self.critic_target([bs_, a_]) + y = br + GAMMA * q_ + q = self.critic([bs, ba]) + td_error = tf.losses.mean_squared_error(y, q) + c_grads = tape.gradient(td_error, self.critic.trainable_weights) + self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) + + with tf.GradientTape() as tape: + a = self.actor(bs) + q = self.critic([bs, a]) + a_loss = -tf.reduce_mean(q) # maximize the q + a_grads = tape.gradient(a_loss, self.actor.trainable_weights) + self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) + + self.ema_update() + + def store_transition(self, s, a, r, s_): + """ + Store data in data buffer + :param s: state + :param a: act + :param r: reward + :param s_: next state + :return: None + """ + s = s.astype(np.float32) + s_ = s_.astype(np.float32) + transition = np.hstack((s, a, [r], s_)) + index = self.pointer % MEMORY_CAPACITY # replace the old memory with new memory + self.memory[index, :] = transition + self.pointer += 1 + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + + tl.files.save_weights_to_hdf5('model/ddpg_actor.hdf5', self.actor) + tl.files.save_weights_to_hdf5('model/ddpg_actor_target.hdf5', self.actor_target) + tl.files.save_weights_to_hdf5('model/ddpg_critic.hdf5', self.critic) + tl.files.save_weights_to_hdf5('model/ddpg_critic_target.hdf5', self.critic_target) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor_target.hdf5', self.actor_target) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic.hdf5', self.critic) + tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic_target.hdf5', self.critic_target) + + +if __name__ == '__main__': + + env = gym.make(ENV_NAME) + env = env.unwrapped + + # reproducible + env.seed(RANDOMSEED) + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + + s_dim = env.observation_space.shape[0] + a_dim = env.action_space.shape[0] + a_bound = env.action_space.high + + ddpg = DDPG(a_dim, s_dim, a_bound) + + if args.train: # train + + reward_buffer = [] + t0 = time.time() + for i in range(MAX_EPISODES): + t1 = time.time() + s = env.reset() + ep_reward = 0 + for j in range(MAX_EP_STEPS): + # Add exploration noise + 
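                # The deterministic actor output is perturbed with Gaussian
                # noise of standard deviation VAR and clipped to Pendulum-v0's
                # action range [-2, 2]; this noise is the only source of
                # exploration, since the policy itself is deterministic.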
a = ddpg.choose_action(s) + a = np.clip(np.random.normal(a, VAR), -2, 2) # add randomness to action selection for exploration + s_, r, done, info = env.step(a) + + ddpg.store_transition(s, a, r / 10, s_) + + if ddpg.pointer > MEMORY_CAPACITY: + ddpg.learn() + + s = s_ + ep_reward += r + if j == MAX_EP_STEPS - 1: + print( + '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i, MAX_EPISODES, ep_reward, + time.time() - t1 + ), end='' + ) + plt.show() + # test + if i and not i % TEST_PER_EPISODES: + t1 = time.time() + s = env.reset() + ep_reward = 0 + for j in range(MAX_EP_STEPS): + + a = ddpg.choose_action(s) # without exploration noise + s_, r, done, info = env.step(a) + + s = s_ + ep_reward += r + if j == MAX_EP_STEPS - 1: + print( + '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + i, MAX_EPISODES, ep_reward, + time.time() - t1 + ) + ) + + reward_buffer.append(ep_reward) + + if reward_buffer: + plt.ion() + plt.cla() + plt.title('DDPG') + plt.plot(np.array(range(len(reward_buffer))) * TEST_PER_EPISODES, reward_buffer) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + plt.ioff() + plt.show() + print('\nRunning time: ', time.time() - t0) + ddpg.save_ckpt() + + # test + ddpg.load_ckpt() + while True: + s = env.reset() + for i in range(MAX_EP_STEPS): + env.render() + s, r, done, info = env.step(ddpg.choose_action(s)) + if done: + break diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py new file mode 100644 index 000000000..62eb7f7fb --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DPPO.py @@ -0,0 +1,394 @@ +""" +Distributed Proximal Policy Optimization (DPPO) +---------------------------- +A distributed version of OpenAI's Proximal Policy Optimization (PPO). +Workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data. +Restart workers once PPO is updated. + +Reference +--------- +Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 2017 +Proximal Policy Optimization Algorithms, Schulman et al. 2017 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 
2016 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials + +Environment +----------- +Openai Gym Pendulum-v0, continual action space + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_DPPO.py --train/test + + +""" + +import argparse +import os +import queue +import threading +import time + +import matplotlib.pyplot as plt +import numpy as np + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +GAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + +EP_MAX = 1000 # total number of episodes for training +EP_LEN = 200 # total number of steps for each episode +GAMMA = 0.9 # reward discount +A_LR = 0.0001 # learning rate for actor +C_LR = 0.0002 # learning rate for critic +BATCH = 32 # update batchsize +A_UPDATE_STEPS = 10 # actor update steps +C_UPDATE_STEPS = 10 # critic update steps +S_DIM, A_DIM = 3, 1 # state dimension, action dimension +EPS = 1e-8 # epsilon +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization + +N_WORKER = 4 # parallel workers +MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO +UPDATE_STEP = 10 # loop update operation n-steps + +############################### DPPO #################################### + + +class PPO(object): + ''' + PPO class + ''' + + def __init__(self): + + # critic + tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state') + l1 = tl.layers.Dense(100, tf.nn.relu)(tfs) + v = tl.layers.Dense(1)(l1) + self.critic = tl.models.Model(tfs, v) + self.critic.train() + + # actor + self.actor = self._build_anet('pi', trainable=True) + self.actor_old = self._build_anet('oldpi', trainable=False) + + def a_train(self, tfs, tfa, tfadv): + ''' + Update policy network + :param tfs: state + :param tfa: act + :param tfadv: advantage + :return: + ''' + tfs = np.array(tfs, np.float32) + tfa = np.array(tfa, np.float32) + tfadv = np.array(tfadv, np.float32) + with tf.GradientTape() as tape: + mu, sigma = self.actor(tfs) + pi = tfp.distributions.Normal(mu, sigma) + + mu_old, sigma_old = self.actor_old(tfs) + oldpi = tfp.distributions.Normal(mu_old, sigma_old) + + # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) + ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS) + surr = ratio * tfadv + if METHOD['name'] == 'kl_pen': + tflam = METHOD['lam'] + kl = tfp.distributions.kl_divergence(oldpi, pi) + kl_mean = tf.reduce_mean(kl) + aloss = -(tf.reduce_mean(surr - tflam * kl)) + else: # clipping method, find this is better + aloss = -tf.reduce_mean( + tf.minimum(surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. 
+ METHOD['epsilon']) * tfadv) + ) + a_gard = tape.gradient(aloss, self.actor.trainable_weights) + + tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) + + if METHOD['name'] == 'kl_pen': + return kl_mean + + def update_old_pi(self): + ''' + Update old policy parameter + :return: None + ''' + for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights): + oldp.assign(p) + + def c_train(self, tfdc_r, s): + ''' + Update actor network + :param tfdc_r: cumulative reward + :param s: state + :return: None + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + with tf.GradientTape() as tape: + advantage = tfdc_r - self.critic(s) + closs = tf.reduce_mean(tf.square(advantage)) + grad = tape.gradient(closs, self.critic.trainable_weights) + tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights)) + + def cal_adv(self, tfs, tfdc_r): + ''' + Calculate advantage + :param tfs: state + :param tfdc_r: cumulative reward + :return: advantage + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + advantage = tfdc_r - self.critic(tfs) + return advantage.numpy() + + def update(self): + ''' + Update parameter with the constraint of KL divergent + :return: None + ''' + global GLOBAL_UPDATE_COUNTER + while not COORD.should_stop(): + if GLOBAL_EP < EP_MAX: + UPDATE_EVENT.wait() # wait until get batch of data + self.update_old_pi() # copy pi to old pi + data = [QUEUE.get() for _ in range(QUEUE.qsize())] # collect data from all workers + data = np.vstack(data) + + s, a, r = data[:, :S_DIM].astype(np.float32), \ + data[:, S_DIM: S_DIM + A_DIM].astype(np.float32), \ + data[:, -1:].astype(np.float32) + + adv = self.cal_adv(s, r) + # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful + + # update actor + if METHOD['name'] == 'kl_pen': + for _ in range(A_UPDATE_STEPS): + kl = self.a_train(s, a, adv) + if kl > 4 * METHOD['kl_target']: # this in in google's paper + break + if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper + METHOD['lam'] /= 2 + elif kl > METHOD['kl_target'] * 1.5: + METHOD['lam'] *= 2 + + # sometimes explode, this clipping is MorvanZhou's solution + METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) + + else: # clipping method, find this is better (OpenAI's paper) + for _ in range(A_UPDATE_STEPS): + self.a_train(s, a, adv) + + # update critic + for _ in range(C_UPDATE_STEPS): + self.c_train(r, s) + + UPDATE_EVENT.clear() # updating finished + GLOBAL_UPDATE_COUNTER = 0 # reset counter + ROLLING_EVENT.set() # set roll-out available + + def _build_anet(self, name, trainable): + ''' + Build policy network + :param name: name + :param trainable: trainable flag + :return: policy network + ''' + tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state') + l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs) + a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1) + mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a) + sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1) + model = tl.models.Model(tfs, [mu, sigma], name) + + if trainable: + model.train() + else: + model.eval() + return model + + def choose_action(self, s): + ''' + Choose action + :param s: state + :return: clipped act + ''' + s = s[np.newaxis, :].astype(np.float32) + mu, sigma = self.actor(s) + pi = tfp.distributions.Normal(mu, sigma) + a = tf.squeeze(pi.sample(1), axis=0)[0] # choosing action + return np.clip(a, -2, 2) + + def get_v(self, s): + ''' + Compute value + :param s: 
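# a_train() above implements PPO's clipped surrogate objective. The same computation
# in plain numpy, to make the clipping explicit: `ratio` is the probability ratio
# pi(a|s) / pi_old(a|s) and `eps` matches METHOD['epsilon'] = 0.2 used here. The
# sample values are made up for illustration.
import numpy as np

def clipped_surrogate_loss(ratio, adv, eps=0.2):
    unclipped = ratio * adv
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * adv
    return -np.mean(np.minimum(unclipped, clipped))  # negated because we minimize

ratio = np.array([0.7, 1.0, 1.5])   # new-policy / old-policy probabilities
adv = np.array([1.0, -0.5, 2.0])    # advantage estimates
print(clipped_surrogate_loss(ratio, adv))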
state + :return: value + ''' + s = s.astype(np.float32) + if s.ndim < 2: s = s[np.newaxis, :] + return self.critic(s)[0, 0] + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/dppo_actor.hdf5', self.actor) + tl.files.save_weights_to_hdf5('model/dppo_actor_old.hdf5', self.actor_old) + tl.files.save_weights_to_hdf5('model/dppo_critic.hdf5', self.critic) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/dppo_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/dppo_actor_old.hdf5', self.actor_old) + tl.files.load_hdf5_to_weights_in_order('model/dppo_critic.hdf5', self.critic) + + +'''--------------------------------------------------------------''' + + +class Worker(object): + ''' + Worker class for distributional running + ''' + + def __init__(self, wid): + self.wid = wid + self.env = gym.make(GAME).unwrapped + self.env.seed(wid * 100 + RANDOMSEED) + self.ppo = GLOBAL_PPO + + def work(self): + ''' + Define a worker + :return: None + ''' + global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER + while not COORD.should_stop(): + s = self.env.reset() + ep_r = 0 + buffer_s, buffer_a, buffer_r = [], [], [] + t0 = time.time() + for t in range(EP_LEN): + if not ROLLING_EVENT.is_set(): # while global PPO is updating + ROLLING_EVENT.wait() # wait until PPO is updated + buffer_s, buffer_a, buffer_r = [], [], [] # clear history buffer, use new policy to collect data + a = self.ppo.choose_action(s) + s_, r, done, _ = self.env.step(a) + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append((r + 8) / 8) # normalize reward, find to be useful + s = s_ + ep_r += r + + GLOBAL_UPDATE_COUNTER += 1 # count to minimum batch size, no need to wait other workers + if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: + v_s_ = self.ppo.get_v(s_) + discounted_r = [] # compute discounted reward + for r in buffer_r[::-1]: + v_s_ = r + GAMMA * v_s_ + discounted_r.append(v_s_) + discounted_r.reverse() + + bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] + buffer_s, buffer_a, buffer_r = [], [], [] + QUEUE.put(np.hstack((bs, ba, br))) # put data in the queue + if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE: + ROLLING_EVENT.clear() # stop collecting data + UPDATE_EVENT.set() # globalPPO update + + if GLOBAL_EP >= EP_MAX: # stop training + COORD.request_stop() + break + + # record reward changes, plot later + if len(GLOBAL_RUNNING_R) == 0: + GLOBAL_RUNNING_R.append(ep_r) + else: + GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1) + GLOBAL_EP += 1 + + print( + 'Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + GLOBAL_EP, EP_MAX, self.wid, ep_r, + time.time() - t0 + ) + ) + + +if __name__ == '__main__': + + # reproducible + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + + GLOBAL_PPO = PPO() + if args.train: # train + UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event() + UPDATE_EVENT.clear() # not update now + ROLLING_EVENT.set() # start to roll out + workers = [Worker(wid=i) for i in range(N_WORKER)] + + GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0 + GLOBAL_RUNNING_R = [] + COORD = tf.train.Coordinator() + QUEUE = queue.Queue() # workers putting data in this queue + threads = [] + for worker in workers: # worker threads + t = threading.Thread(target=worker.work, args=()) + t.start() # 
training + threads.append(t) + # add a PPO updating thread + threads.append(threading.Thread(target=GLOBAL_PPO.update, )) + threads[-1].start() + COORD.join(threads) + + GLOBAL_PPO.save_ckpt() + + # plot reward change and test + plt.title('DPPO') + plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R) + plt.xlabel('Episode') + plt.ylabel('Moving reward') + plt.ylim(-2000, 0) + plt.show() + + # test + GLOBAL_PPO.load_ckpt() + env = gym.make(GAME) + while True: + s = env.reset() + for t in range(EP_LEN): + env.render() + s, r, done, info = env.step(GLOBAL_PPO.choose_action(s)) + if done: + break diff --git a/examples/reinforcement_learning/tutorial_DQN.py b/examples/reinforcement_learning/tutorial_DQN.py new file mode 100644 index 000000000..c7d6a10cd --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DQN.py @@ -0,0 +1,183 @@ +""" +Deep Q-Network Q(a, s) +----------------------- +TD Learning, Off-Policy, e-Greedy Exploration (GLIE). + +Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) +delta_w = R + lambda * Q(newS, newA) + +See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. + +Reference +---------- +original paper: https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf +EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw +CN: https://zhuanlan.zhihu.com/p/25710327 + +Note: Policy Network has been proved to be better than Q-Learning, see tutorial_atari_pong.py + +Environment +----------- +# The FrozenLake v0 environment +https://gym.openai.com/envs/FrozenLake-v0 +The agent controls the movement of a character in a grid world. Some tiles of +the grid are walkable, and others lead to the agent falling into the water. +Additionally, the movement direction of the agent is uncertain and only partially +depends on the chosen direction. The agent is rewarded for finding a walkable +path to a goal tile. +SFFF (S: starting point, safe) +FHFH (F: frozen surface, safe) +FFFH (H: hole, fall to your doom) +HFFG (G: goal, where the frisbee is located) +The episode ends when you reach the goal or fall in a hole. You receive a reward +of 1 if you reach the goal, and zero otherwise. + +Prerequisites +-------------- +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + +To run +------- +python tutorial_DQN.py --train/test + + +""" +import argparse +import time + +import numpy as np + +import gym +import tensorflow as tf +import tensorlayer as tl + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +tl.logging.set_verbosity(tl.logging.DEBUG) + +##################### hyper parameters #################### +lambd = .99 # decay factor +e = 0.1 # e-Greedy Exploration, the larger the more random +num_episodes = 10000 +render = False # display the game environment +running_reward = None + +##################### DQN ########################## + + +def to_one_hot(i, n_classes=None): + a = np.zeros(n_classes, 'uint8') + a[i] = 1 + return a + + +## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. +# encoding for state: 4x4 grid can be represented by one-hot vector with 16 integers. 
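# The FrozenLake DQN below encodes the discrete state (0..15) as a one-hot vector
# and regresses the chosen action's Q-value towards r + lambda * max_a' Q(s', a').
# A tiny numpy illustration of that target construction, independent of
# TensorLayer; the Q values and transition below are arbitrary.
import numpy as np

def to_one_hot(i, n_classes):
    v = np.zeros(n_classes, dtype=np.float32)
    v[i] = 1.0
    return v

lambd = 0.99                               # discount factor, as in the tutorial
allQ = np.array([[0.1, 0.3, 0.2, 0.0]])    # Q(s, .) from the network (illustrative)
Q1 = np.array([[0.0, 0.5, 0.1, 0.2]])      # Q(s', .) from the network (illustrative)
a, r = 1, 0.0                              # chosen action and observed reward

targetQ = allQ.copy()
targetQ[0, a] = r + lambd * np.max(Q1)     # only the taken action's target changes
print(to_one_hot(3, 16), targetQ)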
+def get_model(inputs_shape): + ni = tl.layers.Input(inputs_shape, name='observation') + nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s')(ni) + return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") + + +def save_ckpt(model): # save trained weights + tl.files.save_npz(model.trainable_weights, name='dqn_model.npz') + + +def load_ckpt(model): # load trained weights + tl.files.load_and_assign_npz(name='dqn_model.npz', network=model) + + +if __name__ == '__main__': + + qnetwork = get_model([None, 16]) + qnetwork.train() + train_weights = qnetwork.trainable_weights + + optimizer = tf.optimizers.SGD(learning_rate=0.1) + env = gym.make('FrozenLake-v0') + + if args.train: + t0 = time.time() + for i in range(num_episodes): + ## Reset environment and get first new observation + # episode_time = time.time() + s = env.reset() # observation is state, integer 0 ~ 15 + rAll = 0 + for j in range(99): # step index, maximum step is 99 + if render: env.render() + ## Choose an action by greedily (with e chance of random action) from the Q-network + allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() + a = np.argmax(allQ, 1) + + ## e-Greedy Exploration !!! sample random action + if np.random.rand(1) < e: + a[0] = env.action_space.sample() + ## Get new state and reward from environment + s1, r, d, _ = env.step(a[0]) + ## Obtain the Q' values by feeding the new state through our network + Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).numpy() + + ## Obtain maxQ' and set our target value for chosen action. + maxQ1 = np.max(Q1) # in Q-Learning, policy is greedy, so we use "max" to select the next action. + targetQ = allQ + targetQ[0, a[0]] = r + lambd * maxQ1 + ## Train network using target and predicted Q values + # it is not real target Q value, it is just an estimation, + # but check the Q-Learning update formula: + # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) + # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equals to force Q'(s,a) ≈ Q(s,a) + with tf.GradientTape() as tape: + _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)) + _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) + grad = tape.gradient(_loss, train_weights) + optimizer.apply_gradients(zip(grad, train_weights)) + + rAll += r + s = s1 + ## Reduce chance of random action if an episode is done. + if d ==True: + e = 1. 
/ ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration + break + + ## Note that, the rewards here with random action + running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 + # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) + save_ckpt(qnetwork) # save model + + if args.test: + t0 = time.time() + load_ckpt(qnetwork) # load model + for i in range(num_episodes): + ## Reset environment and get first new observation + episode_time = time.time() + s = env.reset() # observation is state, integer 0 ~ 15 + rAll = 0 + for j in range(99): # step index, maximum step is 99 + if render: env.render() + ## Choose an action by greedily (with e chance of random action) from the Q-network + allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).numpy() + a = np.argmax(allQ, 1) # no epsilon, only greedy for testing + + ## Get new state and reward from environment + s1, r, d, _ = env.step(a[0]) + rAll += r + s = s1 + ## Reduce chance of random action if an episode is done. + if d ==True: + e = 1. / ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration + break + + ## Note that, the rewards here with random action + running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 + # print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + # (i, num_episodes, rAll, running_reward, time.time() - episode_time)) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Average Reward: {:.4f} | Running Time: {:.4f}'\ + .format(i, num_episodes, rAll, running_reward, time.time()-t0 )) diff --git a/examples/reinforcement_learning/tutorial_DQN_variants.py b/examples/reinforcement_learning/tutorial_DQN_variants.py new file mode 100644 index 000000000..f4bf7954e --- /dev/null +++ b/examples/reinforcement_learning/tutorial_DQN_variants.py @@ -0,0 +1,373 @@ +""" +DQN and its variants +------------------------ +We implement Double DQN, Dueling DQN and Noisy DQN here. + +The max operator in standard DQN uses the same values both to select and to +evaluate an action by +Q(s_t, a_t) = R_{t+1} + \gamma * max_{a}Q_{tar}(s_{t+1}, a). +Double DQN propose to use following evaluation to address overestimation problem +of max operator: +Q(s_t, a_t) = R_{t+1} + \gamma * Q_{tar}(s_{t+1}, max_{a}Q(s_{t+1}, a)). + +Dueling DQN uses dueling architecture where the value of state and the advantage +of each action is estimated separately. + +Noisy DQN propose to explore by adding parameter noises. + + +Reference: +------------------------ +1. Double DQN + Van Hasselt H, Guez A, Silver D. Deep reinforcement learning with double + q-learning[C]//Thirtieth AAAI Conference on Artificial Intelligence. 2016. +2. Dueling DQN + Wang Z, Schaul T, Hessel M, et al. Dueling network architectures for deep + reinforcement learning[J]. arXiv preprint arXiv:1511.06581, 2015. +3. Noisy DQN + Plappert M, Houthooft R, Dhariwal P, et al. Parameter space noise for + exploration[J]. arXiv preprint arXiv:1706.01905, 2017. 
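# The file below implements Double DQN, where the online network selects the next
# action and the target network evaluates it:
# y = r + gamma * Q_target(s', argmax_a Q_online(s', a)).
# A numpy sketch of that target versus the standard max-based DQN target; the Q
# arrays are illustrative.
import numpy as np

gamma = 0.99
q_online_next = np.array([[1.0, 2.5, 2.4]])   # Q_online(s', .)
q_target_next = np.array([[0.9, 1.8, 2.6]])   # Q_target(s', .)
r, done = 0.5, 0.0

# standard DQN: both selection and evaluation use the target net
y_dqn = r + gamma * (1 - done) * np.max(q_target_next)

# Double DQN: select with the online net, evaluate with the target net
a_star = np.argmax(q_online_next, axis=1)[0]
y_double = r + gamma * (1 - done) * q_target_next[0, a_star]
print(y_dqn, y_double)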
+ + +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_DQN_variantes.py --mode=train +python tutorial_DQN_variantes.py --mode=test --save_path=dqn_variants/8000.npz +""" +import argparse +import os +import random +import time + +import numpy as np + +import tensorflow as tf +import tensorlayer as tl +from tutorial_wrappers import build_env + +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument( + '--save_path', default='dqn_variants', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +noise_update_freq = 50 # how frequency param noise net update + + +# ############################## DQN #################################### +class MLP(tl.models.Model): + + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) + self.svalue = tl.layers.Dense(1, in_channels=64, name='s', W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 0 + + def forward(self, ni): + feature = self.h1(ni) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(feature) + svalue = self.svalue(feature) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.qvalue, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + 
out = svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + return out + + +class CNN(tl.models.Model): + + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) + self.pres = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_s', W_init=tf.initializers.GlorotUniform() + ) + self.svalue = tl.layers.Dense(1, in_channels=256, name='state', W_init=tf.initializers.GlorotUniform()) + self.noise_scale = 0 + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + + # apply noise to all linear layer + if self.noise_scale != 0: + noises = [] + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + noise = tf.random.normal(tf.shape(var), 0, self.noise_scale) + noises.append(noise) + var.assign_add(noise) + + qvalue = self.qvalue(self.preq(feature)) + svalue = self.svalue(self.pres(feature)) + + if self.noise_scale != 0: + idx = 0 + for layer in [self.preq, self.qvalue, self.pres, self.svalue]: + for var in layer.trainable_weights: + var.assign_sub(noises[idx]) + idx += 1 + + # dueling network + return svalue + qvalue - tf.reduce_mean(qvalue, 1, keepdims=True) + + +class ReplayBuffer(object): + + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +def log_softmax(x, dim): + temp = x - np.max(x, dim, keepdims=True) + return temp - np.log(np.exp(temp).sum(dim, keepdims=True)) + + +def softmax(x, dim): + temp = np.exp(x - np.max(x, dim, keepdims=True)) + return 
temp / temp.sum(dim, keepdims=True) + + +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + noise_scale = 1e-2 + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + # noise schedule is based on KL divergence between perturbed and + # non-perturbed policy, see https://arxiv.org/pdf/1706.01905.pdf + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + if i < explore_timesteps: + qnet.noise_scale = noise_scale + q_ptb = qnet(obv).numpy() + qnet.noise_scale = 0 + if i % noise_update_freq == 0: + q = qnet(obv).numpy() + kl_ptb = (log_softmax(q, 1) - log_softmax(q_ptb, 1)) + kl_ptb = np.sum(kl_ptb * softmax(q, 1), 1).mean() + kl_explore = -np.log(1 - eps + eps / out_dim) + if kl_ptb < kl_explore: + noise_scale *= 1.01 + else: + noise_scale /= 1.01 + a = q_ptb.argmax(1)[0] + else: + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d = buffer.sample(batch_size) + + # double q estimation + b_a_ = tf.one_hot(tf.argmax(qnet(b_o_), 1), out_dim) + b_q_ = (1 - b_d) * tf.reduce_sum(targetqnet(b_o_) * b_a_, 1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean(huber_loss(b_q - (b_r + reward_gamma * b_q_))) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py new file 
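# The Noisy-DQN branch above adapts the parameter-noise scale by comparing the KL
# divergence between the perturbed and unperturbed policies with a reference KL
# implied by epsilon-greedy exploration (see https://arxiv.org/pdf/1706.01905.pdf,
# cited above). A numpy sketch of that adaptation rule with made-up Q values;
# softmax/log_softmax are the same helpers defined in the tutorial.
import numpy as np

def softmax(x, dim):
    t = np.exp(x - np.max(x, dim, keepdims=True))
    return t / t.sum(dim, keepdims=True)

def log_softmax(x, dim):
    t = x - np.max(x, dim, keepdims=True)
    return t - np.log(np.exp(t).sum(dim, keepdims=True))

q = np.array([[1.0, 0.2, -0.3]])       # unperturbed Q(s, .)
q_ptb = np.array([[0.8, 0.5, -0.2]])   # Q(s, .) with parameter noise applied
eps, n_actions, noise_scale = 0.1, 3, 1e-2

kl_ptb = np.sum(softmax(q, 1) * (log_softmax(q, 1) - log_softmax(q_ptb, 1)), 1).mean()
kl_explore = -np.log(1 - eps + eps / n_actions)   # KL of eps-greedy vs. greedy policy
noise_scale *= 1.01 if kl_ptb < kl_explore else 1 / 1.01
print(kl_ptb, kl_explore, noise_scale)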
mode 100644 index 000000000..42c47aacc --- /dev/null +++ b/examples/reinforcement_learning/tutorial_PG.py @@ -0,0 +1,280 @@ +""" +Vanilla Policy Gradient(VPG or REINFORCE) +----------------------------------------- +The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. +It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. +Here is an example on discrete action space game CartPole-v0. +To apply it on continuous action space, you need to change the last softmax layer and the choose_action function. + +Reference +--------- +Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ + +Environment +----------- +Openai Gym CartPole-v0, discrete action space + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_PG.py --train/test + +""" + +import argparse +import os +import time + +import matplotlib.pyplot as plt +import numpy as np + +import gym +import tensorflow as tf +import tensorlayer as tl + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = 'CartPole-v0' # environment name +RANDOMSEED = 1 # random seed + +DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold +RENDER = False # rendering wastes time +num_episodes = 3000 + +############################### PG #################################### + + +class PolicyGradient: + """ + PG class + """ + + def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95): + self.n_actions = n_actions + self.n_features = n_features + self.lr = learning_rate + self.gamma = reward_decay + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] + + def get_model(inputs_shape): + """ + Build a neural network model. + :param inputs_shape: state_shape + :return: act + """ + with tf.name_scope('inputs'): + self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations") + self.tf_acts = tl.layers.Input([ + None, + ], tf.int32, name="actions_num") + self.tf_vt = tl.layers.Input([ + None, + ], tf.float32, name="actions_value") + # fc1 + layer = tl.layers.Dense( + n_units=30, act=tf.nn.tanh, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), + b_init=tf.constant_initializer(0.1), name='fc1' + )(self.tf_obs) + # fc2 + all_act = tl.layers.Dense( + n_units=self.n_actions, act=None, W_init=tf.random_normal_initializer(mean=0, stddev=0.3), + b_init=tf.constant_initializer(0.1), name='all_act' + )(layer) + return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model') + + self.model = get_model([None, n_features]) + self.model.train() + self.optimizer = tf.optimizers.Adam(self.lr) + + def choose_action(self, s): + """ + choose action with probabilities. 
+ :param s: state + :return: act + """ + _logits = self.model(np.array([s], np.float32)) + _probs = tf.nn.softmax(_logits).numpy() + return tl.rein.choice_action_by_probs(_probs.ravel()) + + def choose_action_greedy(self, s): + """ + choose action with greedy policy + :param s: state + :return: act + """ + _probs = tf.nn.softmax(self.model(np.array([s], np.float32))).numpy() + return np.argmax(_probs.ravel()) + + def store_transition(self, s, a, r): + """ + store data in memory buffer + :param s: state + :param a: act + :param r: reward + :return: + """ + self.ep_obs.append(np.array([s], np.float32)) + self.ep_as.append(a) + self.ep_rs.append(r) + + def learn(self): + """ + update policy parameters via stochastic gradient ascent + :return: None + """ + # discount and normalize episode reward + discounted_ep_rs_norm = self._discount_and_norm_rewards() + + with tf.GradientTape() as tape: + + _logits = self.model(np.vstack(self.ep_obs)) + # to maximize total reward (log_p * R) is to minimize -(log_p * R), and the tf only have minimize(loss) + neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=_logits, labels=np.array(self.ep_as)) + # this is negative log of chosen action + + # or in this way: + # neg_log_prob = tf.reduce_sum(-tf.log(self.all_act_prob)*tf.one_hot(self.tf_acts, self.n_actions), axis=1) + + loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss + + grad = tape.gradient(loss, self.model.trainable_weights) + self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) + + self.ep_obs, self.ep_as, self.ep_rs = [], [], [] # empty episode data + return discounted_ep_rs_norm + + def _discount_and_norm_rewards(self): + """ + compute discount_and_norm_rewards + :return: discount_and_norm_rewards + """ + # discount episode rewards + discounted_ep_rs = np.zeros_like(self.ep_rs) + running_add = 0 + for t in reversed(range(0, len(self.ep_rs))): + running_add = running_add * self.gamma + self.ep_rs[t] + discounted_ep_rs[t] = running_add + + # normalize episode rewards + discounted_ep_rs -= np.mean(discounted_ep_rs) + discounted_ep_rs /= np.std(discounted_ep_rs) + return discounted_ep_rs + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/pg_policy.hdf5', self.model) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/pg_policy.hdf5', self.model) + + +if __name__ == '__main__': + + # reproducible + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + + tl.logging.set_verbosity(tl.logging.DEBUG) + + env = gym.make(ENV_NAME) + env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance + env = env.unwrapped + + print(env.action_space) + print(env.observation_space) + print(env.observation_space.high) + print(env.observation_space.low) + + RL = PolicyGradient( + n_actions=env.action_space.n, + n_features=env.observation_space.shape[0], + learning_rate=0.02, + reward_decay=0.99, + # output_graph=True, + ) + + if args.train: + reward_buffer = [] + + for i_episode in range(num_episodes): + + episode_time = time.time() + observation = env.reset() + + while True: + if RENDER: + env.render() + + action = RL.choose_action(observation) + + observation_, reward, done, info = env.step(action) + + RL.store_transition(observation, action, reward) + + if done: + ep_rs_sum = sum(RL.ep_rs) + + if 'running_reward' not in globals(): + 
running_reward = ep_rs_sum + else: + running_reward = running_reward * 0.99 + ep_rs_sum * 0.01 + + if running_reward > DISPLAY_REWARD_THRESHOLD: + RENDER = True # rendering + + # print("episode:", i_episode, " reward:", int(running_reward)) + + print( + "Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " % + (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time) + ) + reward_buffer.append(running_reward) + + vt = RL.learn() + + plt.ion() + plt.cla() + plt.title('PG') + plt.plot(reward_buffer, ) # plot the episode vt + plt.xlabel('episode steps') + plt.ylabel('normalized state-action value') + plt.show() + plt.pause(0.1) + + break + + observation = observation_ + RL.save_ckpt() + plt.ioff() + plt.show() + + # test + RL.load_ckpt() + observation = env.reset() + while True: + env.render() + action = RL.choose_action(observation) + observation, reward, done, info = env.step(action) + if done: + observation = env.reset() diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py new file mode 100644 index 000000000..d95633234 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_PPO.py @@ -0,0 +1,332 @@ +""" +Proximal Policy Optimization (PPO) +---------------------------- +A simple version of Proximal Policy Optimization (PPO) using single thread. +PPO is a family of first-order methods that use a few other tricks to keep new policies close to old. +PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO. + +Reference +--------- +Proximal Policy Optimization Algorithms, Schulman et al. 2017 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 2016 +Emergence of Locomotion Behaviours in Rich Environments, Heess et al. 
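# PolicyGradient._discount_and_norm_rewards() above turns an episode's reward list
# into discounted returns and then standardises them, which reduces gradient
# variance in REINFORCE. An equivalent standalone numpy version; the reward
# sequence is illustrative, and the small epsilon in the denominator is added here
# only for numerical safety.
import numpy as np

def discount_and_norm(rewards, gamma=0.99):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = running * gamma + rewards[t]   # reward-to-go
        returns[t] = running
    returns -= returns.mean()
    returns /= returns.std() + 1e-8
    return returns

print(discount_and_norm([1.0, 1.0, 1.0, 0.0, 1.0]))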
2017 +MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials + +Environment +----------- +Openai Gym Pendulum-v0, continual action space + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_PPO.py --train/test + +""" + +import argparse +import os +import time + +import matplotlib.pyplot as plt +import numpy as np + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = 'Pendulum-v0' # environment name +RANDOMSEED = 1 # random seed + +EP_MAX = 1000 # total number of episodes for training +EP_LEN = 200 # total number of steps for each episode +GAMMA = 0.9 # reward discount +A_LR = 0.0001 # learning rate for actor +C_LR = 0.0002 # learning rate for critic +BATCH = 32 # update batchsize +A_UPDATE_STEPS = 10 # actor update steps +C_UPDATE_STEPS = 10 # critic update steps +S_DIM, A_DIM = 3, 1 # state dimension, action dimension +EPS = 1e-8 # epsilon +METHOD = [ + dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty + dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better +][1] # choose the method for optimization + +############################### PPO #################################### + + +class PPO(object): + ''' + PPO class + ''' + + def __init__(self): + + # critic + tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state') + l1 = tl.layers.Dense(100, tf.nn.relu)(tfs) + v = tl.layers.Dense(1)(l1) + self.critic = tl.models.Model(tfs, v) + self.critic.train() + + # actor + self.actor = self._build_anet('pi', trainable=True) + self.actor_old = self._build_anet('oldpi', trainable=False) + + def a_train(self, tfs, tfa, tfadv): + ''' + Update policy network + :param tfs: state + :param tfa: act + :param tfadv: advantage + :return: + ''' + tfs = np.array(tfs, np.float32) + tfa = np.array(tfa, np.float32) + tfadv = np.array(tfadv, np.float32) + with tf.GradientTape() as tape: + mu, sigma = self.actor(tfs) + pi = tfp.distributions.Normal(mu, sigma) + + mu_old, sigma_old = self.actor_old(tfs) + oldpi = tfp.distributions.Normal(mu_old, sigma_old) + + # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa)) + ratio = pi.prob(tfa) / (oldpi.prob(tfa) + EPS) + surr = ratio * tfadv + if METHOD['name'] == 'kl_pen': + tflam = METHOD['lam'] + kl = tfp.distributions.kl_divergence(oldpi, pi) + kl_mean = tf.reduce_mean(kl) + aloss = -(tf.reduce_mean(surr - tflam * kl)) + else: # clipping method, find this is better + aloss = -tf.reduce_mean( + tf.minimum(surr, + tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. 
+ METHOD['epsilon']) * tfadv) + ) + a_gard = tape.gradient(aloss, self.actor.trainable_weights) + + tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights)) + + if METHOD['name'] == 'kl_pen': + return kl_mean + + def update_old_pi(self): + ''' + Update old policy parameter + :return: None + ''' + for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights): + oldp.assign(p) + + def c_train(self, tfdc_r, s): + ''' + Update actor network + :param tfdc_r: cumulative reward + :param s: state + :return: None + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + with tf.GradientTape() as tape: + v = self.critic(s) + advantage = tfdc_r - v + closs = tf.reduce_mean(tf.square(advantage)) + # print('tfdc_r value', tfdc_r) + grad = tape.gradient(closs, self.critic.trainable_weights) + tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights)) + + def cal_adv(self, tfs, tfdc_r): + ''' + Calculate advantage + :param tfs: state + :param tfdc_r: cumulative reward + :return: advantage + ''' + tfdc_r = np.array(tfdc_r, dtype=np.float32) + advantage = tfdc_r - self.critic(tfs) + return advantage.numpy() + + def update(self, s, a, r): + ''' + Update parameter with the constraint of KL divergent + :param s: state + :param a: act + :param r: reward + :return: None + ''' + s, a, r = s.astype(np.float32), a.astype(np.float32), r.astype(np.float32) + + self.update_old_pi() + adv = self.cal_adv(s, r) + # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful + + # update actor + if METHOD['name'] == 'kl_pen': + for _ in range(A_UPDATE_STEPS): + kl = self.a_train(s, a, adv) + if kl > 4 * METHOD['kl_target']: # this in in google's paper + break + if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper + METHOD['lam'] /= 2 + elif kl > METHOD['kl_target'] * 1.5: + METHOD['lam'] *= 2 + METHOD['lam'] = np.clip( + METHOD['lam'], 1e-4, 10 + ) # sometimes explode, this clipping is MorvanZhou's solution + else: # clipping method, find this is better (OpenAI's paper) + for _ in range(A_UPDATE_STEPS): + self.a_train(s, a, adv) + + # update critic + for _ in range(C_UPDATE_STEPS): + self.c_train(r, s) + + def _build_anet(self, name, trainable): + ''' + Build policy network + :param name: name + :param trainable: trainable flag + :return: policy network + ''' + tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state') + l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs) + a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1) + mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a) + sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1) + model = tl.models.Model(tfs, [mu, sigma], name) + + if trainable: + model.train() + else: + model.eval() + return model + + def choose_action(self, s): + ''' + Choose action + :param s: state + :return: clipped act + ''' + s = s[np.newaxis, :].astype(np.float32) + mu, sigma = self.actor(s) + pi = tfp.distributions.Normal(mu, sigma) + a = tf.squeeze(pi.sample(1), axis=0)[0] # choosing action + return np.clip(a, -2, 2) + + def get_v(self, s): + ''' + Compute value + :param s: state + :return: value + ''' + s = s.astype(np.float32) + if s.ndim < 2: s = s[np.newaxis, :] + return self.critic(s)[0, 0] + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + tl.files.save_weights_to_hdf5('model/ppo_actor.hdf5', self.actor) + 
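# update() above also supports the KL-penalty variant ('kl_pen'), where the penalty
# coefficient lambda is adapted so the measured KL stays near kl_target: halved when
# KL is well below target, doubled when well above, then clipped to a sane range.
# A compact sketch of that adaptation rule; the KL values are illustrative.
import numpy as np

def adapt_lambda(lam, kl, kl_target=0.01):
    if kl < kl_target / 1.5:
        lam /= 2
    elif kl > kl_target * 1.5:
        lam *= 2
    return float(np.clip(lam, 1e-4, 10))   # keep lambda from exploding or vanishing

lam = 0.5
for kl in [0.002, 0.02, 0.05]:
    lam = adapt_lambda(lam, kl)
    print(kl, lam)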
tl.files.save_weights_to_hdf5('model/ppo_actor_old.hdf5', self.actor_old) + tl.files.save_weights_to_hdf5('model/ppo_critic.hdf5', self.critic) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/ppo_actor.hdf5', self.actor) + tl.files.load_hdf5_to_weights_in_order('model/ppo_actor_old.hdf5', self.actor_old) + tl.files.load_hdf5_to_weights_in_order('model/ppo_critic.hdf5', self.critic) + + +if __name__ == '__main__': + + env = gym.make(ENV_NAME).unwrapped + + # reproducible + env.seed(RANDOMSEED) + np.random.seed(RANDOMSEED) + tf.random.set_seed(RANDOMSEED) + + ppo = PPO() + + if args.train: + all_ep_r = [] + for ep in range(EP_MAX): + s = env.reset() + buffer_s, buffer_a, buffer_r = [], [], [] + ep_r = 0 + t0 = time.time() + for t in range(EP_LEN): # in one episode + # env.render() + a = ppo.choose_action(s) + s_, r, done, _ = env.step(a) + buffer_s.append(s) + buffer_a.append(a) + buffer_r.append((r + 8) / 8) # normalize reward, find to be useful + s = s_ + ep_r += r + + # update ppo + if (t + 1) % BATCH == 0 or t == EP_LEN - 1: + v_s_ = ppo.get_v(s_) + discounted_r = [] + for r in buffer_r[::-1]: + v_s_ = r + GAMMA * v_s_ + discounted_r.append(v_s_) + discounted_r.reverse() + + bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis] + buffer_s, buffer_a, buffer_r = [], [], [] + ppo.update(bs, ba, br) + if ep == 0: + all_ep_r.append(ep_r) + else: + all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1) + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + ep, EP_MAX, ep_r, + time.time() - t0 + ) + ) + + plt.ion() + plt.cla() + plt.title('PPO') + plt.plot(np.arange(len(all_ep_r)), all_ep_r) + plt.ylim(-2000, 0) + plt.xlabel('Episode') + plt.ylabel('Moving averaged episode reward') + plt.show() + plt.pause(0.1) + ppo.save_ckpt() + plt.ioff() + plt.show() + + # test + ppo.load_ckpt() + while True: + s = env.reset() + for i in range(EP_LEN): + env.render() + s, r, done, _ = env.step(ppo.choose_action(s)) + if done: + break diff --git a/examples/reinforcement_learning/tutorial_frozenlake_q_table.py b/examples/reinforcement_learning/tutorial_Qlearning.py similarity index 96% rename from examples/reinforcement_learning/tutorial_frozenlake_q_table.py rename to examples/reinforcement_learning/tutorial_Qlearning.py index a5b44059a..a8decb273 100644 --- a/examples/reinforcement_learning/tutorial_frozenlake_q_table.py +++ b/examples/reinforcement_learning/tutorial_Qlearning.py @@ -11,6 +11,9 @@ EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw CN: https://zhuanlan.zhihu.com/p/25710327 +tensorflow==2.0.0a0 +tensorlayer==2.0.0 + """ import time @@ -52,7 +55,7 @@ break rList.append(rAll) running_reward = r if running_reward is None else running_reward * 0.99 + r * 0.01 - print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs %s" % \ - (i, num_episodes, rAll, running_reward, time.time() - episode_time, '' if rAll == 0 else ' !!!!!!!!')) + print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs " % \ + (i, num_episodes, rAll, running_reward, time.time() - episode_time)) print("Final Q-Table Values:/n %s" % Q) diff --git a/examples/reinforcement_learning/tutorial_Retrace.py b/examples/reinforcement_learning/tutorial_Retrace.py new file mode 100644 index 000000000..e1e03cf1d --- /dev/null +++ 
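# The PPO rollout above builds its critic regression targets by bootstrapping from
# the critic's value of the last state and discounting backwards through the
# collected rewards. The same computation in isolation; the reward list and
# terminal value are illustrative.
import numpy as np

def bootstrapped_returns(rewards, v_last, gamma=0.9):
    discounted = []
    running = v_last                       # start from V(s_T) estimated by the critic
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.append(running)
    discounted.reverse()
    return np.array(discounted, dtype=np.float32)[:, np.newaxis]

print(bootstrapped_returns([0.1, 0.2, -0.3], v_last=1.5))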
b/examples/reinforcement_learning/tutorial_Retrace.py @@ -0,0 +1,279 @@ +""" +Retrace(\lambda) algorithm +------------------------ +Retrace(\lambda) is an off-policy algorithm that extend the idea of eligibility +trace. It apply an importance sampling ratio truncated at 1 to several behaviour +policies, which suffer from the variance explosion of standard IS and lead to +safe and efficient learning. + + +Reference: +------------------------ +Munos R, Stepleton T, Harutyunyan A, et al. Safe and efficient off-policy +reinforcement learning[C]//Advances in Neural Information Processing Systems. +2016: 1054-1062. + + +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_Retrace.py --mode=train +python tutorial_Retrace.py --mode=test --save_path=retrace/8000.npz +""" +import argparse +import os +import random +import time + +import numpy as np + +import tensorflow as tf +import tensorlayer as tl +from tutorial_wrappers import build_env + +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument( + '--save_path', default='retrace', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +retrace_lambda = 1.0 + + +# ############################## Retrace #################################### +class MLP(tl.models.Model): + + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + feature = self.h1(ni) + qvalue = self.qvalue(feature) + return qvalue, tf.nn.softmax(qvalue, 1) + + +class CNN(tl.models.Model): + + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + 
W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + qvalue = self.qvalue(self.preq(feature)) + return qvalue, tf.nn.softmax(qvalue, 1) + + +class ReplayBuffer(object): + + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d, b_pi = [], [], [], [], [], [] + for i in idxes: + o, a, r, o_, d, pi = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + b_pi.append(pi) + return ( + np.stack(b_o).astype('float32') * ob_scale, np.stack(b_a).astype('int32'), np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, np.stack(b_d).astype('float32'), + np.stack(b_pi).astype('float32') + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = ReplayBuffer(buffer_size) + + o = env.reset() + nepisode = 0 + t = time.time() + for i in range(1, number_timesteps + 1): + # select action based on boltzmann exploration + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + qs, pi = qnet(obv) + a = np.random.multinomial(1, pi.numpy()[0]).argmax() + pi = pi.numpy()[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done, pi) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, b_old_pi = buffer.sample(batch_size) + + # q estimation based on 1 step retrace(\lambda) + b_q_, b_pi_ = targetqnet(b_o_) + b_v_ = (b_q_ * b_pi_).numpy().sum(1) + b_q, b_pi = targetqnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1).numpy() + 
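# The next few lines form the 1-step Retrace(lambda) target with lambda = 1: the
# importance ratio pi(a|s) / mu(a|s) is truncated at 1, and the TD error is scaled
# by that truncated ratio before being added back to Q(s, a). A numpy sketch with
# made-up batch values.
import numpy as np

gamma, batch_size = 0.99, 2
pi = np.array([[0.2, 0.8], [0.6, 0.4]])       # current (target-net) policy
old_pi = np.array([[0.5, 0.5], [0.3, 0.7]])   # behaviour policy stored in the buffer
a = np.array([1, 0])                          # actions actually taken
r = np.array([1.0, 0.0])
done = np.array([0.0, 0.0])
v_next = np.array([0.8, 0.4])                 # sum_a pi(a|s') * Q(s', a)
q = np.array([0.5, 0.3])                      # Q(s, a) for the taken actions

c = np.clip(pi / (old_pi + 1e-8), None, 1.0)[range(batch_size), a]  # truncated IS ratio
td = r + gamma * (1 - done) * v_next - q
q_target = q + c * td
print(c, q_target)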
c = np.clip(b_pi.numpy() / (b_old_pi + 1e-8), None, 1) + c = c[range(batch_size), b_a] + td = b_r + reward_gamma * (1 - b_d) * b_v_ - b_q + q_target = c * td + b_q + + # calculate loss + with tf.GradientTape() as q_tape: + b_q, _ = qnet(b_o) + b_q = tf.reduce_sum(b_q * tf.one_hot(b_a, out_dim), 1) + loss = tf.reduce_mean(huber_loss(b_q - q_target)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv)[0].numpy().argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_SAC.py b/examples/reinforcement_learning/tutorial_SAC.py new file mode 100644 index 000000000..24831e85f --- /dev/null +++ b/examples/reinforcement_learning/tutorial_SAC.py @@ -0,0 +1,489 @@ +''' +Soft Actor-Critic (SAC) +------------------ +Actor policy in SAC is stochastic, with off-policy training. +And 'soft' in SAC indicates the trade-off between the entropy and expected return. +The additional consideration of entropy term helps with more explorative policy. +And this implementation contains an automatic update for the entropy factor. + +This version of Soft Actor-Critic (SAC) implementation contains 5 networks: +2 Q net, 2 target Q net, 1 policy net. +It uses alpha loss. 
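# "Alpha loss" above refers to SAC's automatic tuning of the entropy coefficient:
# alpha is adjusted so the policy's entropy stays near a target value, commonly
# -action_dim. The TensorFlow sketch below shows the general technique under that
# assumption; it is not a copy of the update code further down in this file, which
# may differ in details.
import tensorflow as tf

action_dim = 1
target_entropy = -float(action_dim)            # common heuristic for the entropy target
log_alpha = tf.Variable(0.0, dtype=tf.float32)
opt = tf.optimizers.Adam(3e-4)

log_prob = tf.constant([[-1.2], [-0.3]])       # log pi(a|s) of sampled actions (illustrative)
with tf.GradientTape() as tape:
    alpha_loss = -tf.reduce_mean(tf.exp(log_alpha) * (log_prob + target_entropy))
grads = tape.gradient(alpha_loss, [log_alpha])
opt.apply_gradients(zip(grads, [log_alpha]))
print(float(tf.exp(log_alpha)))                # the entropy coefficient actually used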
+ + +Reference +--------- +paper: https://arxiv.org/pdf/1812.05905.pdf + +Environment +--- +Openai Gym Pendulum-v0, continuous action space +https://gym.openai.com/envs/Pendulum-v0/ + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +&& +pip install box2d box2d-kengz --user + +To run +------ +python tutorial_SAC.py --train/test +''' + +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +random.seed(2) +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### +# choose env +ENV = 'Pendulum-v0' +action_range = 1. # scale action, [-action_range, action_range] + +# RL training +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 100 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +soft_q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +alpha_lr = 3e-4 # alpha learning rate +policy_target_update_interval = 3 # delayed update for the policy network and target networks +reward_scale = 1. 
# value range of reward +replay_buffer_size = 5e5 + +AUTO_ENTROPY = True # automatically udpating variable alpha for entropy +DETERMINISTIC = False # stochastic action policy if False, otherwise deterministic + +############################### SAC #################################### + + +class ReplayBuffer: + ''' + a ring buffer for storing transitions and sampling for training + :state: (state_dim,) + :action: (action_dim,) + :reward: (,), scalar + :next_state: (state_dim,) + :done: (,), scalar (0 and 1) or bool (True and False) + ''' + + def __init__(self, capacity): + self.capacity = capacity + self.buffer = [] + self.position = 0 + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) + + +class NormalizedActions(gym.ActionWrapper): + ''' normalize the actions to be in reasonable range ''' + + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +class SoftQNetwork(Model): + ''' the network for evaluate values of state-action pairs: Q(s,a) ''' + + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(SoftQNetwork, self).__init__() + input_dim = num_inputs + num_actions + w_init = tf.keras.initializers.glorot_normal( + seed=None + ) # glorot initialization is better than uniform in practice + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + + +class PolicyNetwork(Model): + ''' the network for generating non-determinstic (Gaussian distributed) action from the state input ''' + + def __init__( + self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2 + ): + super(PolicyNetwork, self).__init__() + + self.log_std_min = log_std_min + self.log_std_max = log_std_max + + w_init = tf.keras.initializers.glorot_normal(seed=None) + # w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, 
name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.mean_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_mean') + self.log_std_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_logstd') + + self.action_range = action_range + self.num_actions = num_actions + + def forward(self, state): + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + mean = self.mean_linear(x) + log_std = self.log_std_linear(x) + log_std = tf.clip_by_value(log_std, self.log_std_min, self.log_std_max) + + return mean, log_std + + def evaluate(self, state, epsilon=1e-6): + ''' generate action with state for calculating gradients ''' + state = state.astype(np.float32) + mean, log_std = self.forward(state) + std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow + + normal = Normal(0, 1) + z = normal.sample() + action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick + action = self.action_range * action_0 + # according to original paper, with an extra last term for normalizing different action range + log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 + + epsilon) - np.log(self.action_range) + # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); + # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, + # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. + log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced + + return action, log_prob, z, mean, log_std + + def get_action(self, state, deterministic): + ''' generate action with state for interaction with envronment ''' + mean, log_std = self.forward([state]) + std = tf.math.exp(log_std) + + normal = Normal(0, 1) + z = normal.sample() + action = self.action_range * tf.math.tanh( + mean + std * z + ) # TanhNormal distribution as actions; reparameterization trick + + action = self.action_range * mean if deterministic else action + return action.numpy()[0] + + def sample_action(self, ): + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + + return self.action_range * a.numpy() + + +class SAC_Trainer(): + + def __init__(self, replay_buffer, hidden_dim, action_range, soft_q_lr=3e-4, policy_lr=3e-4, alpha_lr=3e-4): + self.replay_buffer = replay_buffer + + # initialize all networks + self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha') + self.alpha = tf.math.exp(self.log_alpha) + print('Soft Q Network (1,2): ', self.soft_q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_soft_q_net1 = self.target_ini(self.soft_q_net1, self.target_soft_q_net1) + self.target_soft_q_net2 = self.target_ini(self.soft_q_net2, self.target_soft_q_net2) + + 
self.soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) + self.soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + self.alpha_optimizer = tf.optimizers.Adam(alpha_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): + ''' update all networks in SAC ''' + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std + + # Training Q Function + new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum( + self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input) + ) - self.alpha * next_log_prob + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_input = tf.concat([state, action], 1) # the dim 0 is number of samples + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.soft_q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) + self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.soft_q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) + self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) + + # Training Policy Function + with tf.GradientTape() as p_tape: + new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) + new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples + ''' implementation 1 ''' + predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input)) + # ''' implementation 2 ''' + # predicted_new_q_value = self.soft_q_net1(new_q_input) + policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Updating alpha w.r.t entropy + # alpha: trade-off between exploration (max entropy) and exploitation (max Q) + if auto_entropy is True: + with tf.GradientTape() as alpha_tape: + alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) + alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) + self.alpha_optimizer.apply_gradients(zip(alpha_grad, 
[self.log_alpha])) + self.alpha = tf.math.exp(self.log_alpha) + else: # fixed alpha + self.alpha = 1. + alpha_loss = 0 + + # Soft update the target value nets + self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) + self.target_soft_q_net2 = self.target_soft_update(self.soft_q_net2, self.target_soft_q_net2, soft_tau) + + def save_weights(self): # save trained weights + tl.files.save_npz(self.soft_q_net1.trainable_weights, name='model_q_net1.npz') + tl.files.save_npz(self.soft_q_net2.trainable_weights, name='model_q_net2.npz') + tl.files.save_npz(self.target_soft_q_net1.trainable_weights, name='model_target_q_net1.npz') + tl.files.save_npz(self.target_soft_q_net2.trainable_weights, name='model_target_q_net2.npz') + tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + + def load_weights(self): # load trained weights + tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.soft_q_net1) + tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.soft_q_net2) + tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_soft_q_net1) + tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_soft_q_net2) + tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + + +def plot(frame_idx, rewards): + clear_output(True) + plt.figure(figsize=(20, 5)) + plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1])) + plt.plot(rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + plt.savefig('sac.png') + # plt.show() + + +if __name__ == '__main__': + # initialization of env + env = NormalizedActions(gym.make(ENV)) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + # initialization of buffer + replay_buffer = ReplayBuffer(replay_buffer_size) + # initialization of trainer + sac_trainer=SAC_Trainer(replay_buffer, hidden_dim=hidden_dim, action_range=action_range, \ + soft_q_lr=soft_q_lr, policy_lr=policy_lr, alpha_lr=alpha_lr ) + #set train mode + sac_trainer.soft_q_net1.train() + sac_trainer.soft_q_net2.train() + sac_trainer.target_soft_q_net1.train() + sac_trainer.target_soft_q_net2.train() + sac_trainer.policy_net.train() + + # training loop + if args.train: + frame_idx = 0 + rewards = [] + t0 = time.time() + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = sac_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward + + for step in range(max_steps): + if frame_idx > explore_steps: + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + else: + action = sac_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + # print('s:', state, action, reward, next_state, done) + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + sac_trainer.update( + batch_size, reward_scale=reward_scale, auto_entropy=AUTO_ENTROPY, + target_entropy=-1. 
* action_dim + ) + + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx / max_steps) # current episode + all_episodes = int(max_frames / max_steps) # total episodes + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + episode, all_episodes, episode_reward, + time.time() - t0 + ) + ) + rewards.append(episode_reward) + sac_trainer.save_weights() + + if args.test: + frame_idx = 0 + rewards = [] + t0 = time.time() + sac_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = sac_trainer.policy_net( + [state] + ) # need an extra call to make inside functions be able to use forward + + for step in range(max_steps): + action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx / max_steps) + all_episodes = int(test_frames / max_steps) + print( + 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( + episode, all_episodes, episode_reward, + time.time() - t0 + ) + ) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_TD3.py b/examples/reinforcement_learning/tutorial_TD3.py new file mode 100644 index 000000000..e90e5b8fb --- /dev/null +++ b/examples/reinforcement_learning/tutorial_TD3.py @@ -0,0 +1,472 @@ +''' +Twin Delayed DDPG (TD3) +------------------------ +DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters. +Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks: +* Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), +and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions. + +* Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently +than the Q-function. + +* Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for +the policy to exploit Q-function errors by smoothing out Q along changes in action. + +The implementation of TD3 includes 6 networks: 2 Q-net, 2 target Q-net, 1 policy net, 1 target policy net +Actor policy in TD3 is deterministic, with Gaussian exploration noise. 
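+
+Target policy smoothing (Trick Three) is implemented in PolicyNetwork.evaluate():
+    a'(s') = mu_target(s') + clip(eps, -c, c),  eps ~ eval_noise_scale * N(0, 1),  c = 2 * eval_noise_scale,
+and Trick One takes the minimum of the two target Q-values of (s', a') as the Bellman target.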
+ +Reference +--------- +original paper: https://arxiv.org/pdf/1802.09477.pdf + + +Environment +--- +Openai Gym Pendulum-v0, continuous action space +https://gym.openai.com/envs/Pendulum-v0/ + +Prerequisites +--- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +&& +pip install box2d box2d-kengz --user + +To run +------- +python tutorial_TD3.py --train/test + +''' + +import argparse +import math +import random +import time + +import matplotlib.pyplot as plt +import numpy as np +from IPython.display import clear_output + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from tensorlayer.layers import Dense +from tensorlayer.models import Model + +tfd = tfp.distributions +Normal = tfd.Normal + +tl.logging.set_verbosity(tl.logging.DEBUG) + +random.seed(2) +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### +# choose env +ENV = 'Pendulum-v0' +action_range = 1. # scale action, [-action_range, action_range] + +# RL training +max_frames = 40000 # total number of steps for training +test_frames = 300 # total number of steps for testing +max_steps = 150 # maximum number of steps for one episode +batch_size = 64 # udpate batchsize +explore_steps = 500 # 500 for random action sampling in the beginning of training +update_itr = 3 # repeated updates for single step +hidden_dim = 32 # size of hidden layers for networks +q_lr = 3e-4 # q_net learning rate +policy_lr = 3e-4 # policy_net learning rate +policy_target_update_interval = 3 # delayed steps for updating the policy network and target networks +explore_noise_scale = 1.0 # range of action noise for exploration +eval_noise_scale = 0.5 # range of action noise for evaluation of action value +reward_scale = 1. 
# value range of reward +replay_buffer_size = 5e5 # size of replay buffer + +############################### TD3 #################################### + + +class ReplayBuffer: + ''' + a ring buffer for storing transitions and sampling for training + :state: (state_dim,) + :action: (action_dim,) + :reward: (,), scalar + :next_state: (state_dim,) + :done: (,), scalar (0 and 1) or bool (True and False) + ''' + + def __init__(self, capacity): + self.capacity = capacity + self.buffer = [] + self.position = 0 + + def push(self, state, action, reward, next_state, done): + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = int((self.position + 1) % self.capacity) # as a ring buffer + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = map(np.stack, zip(*batch)) # stack for each element + ''' + the * serves as unpack: sum(a,b) <=> batch=(a,b), sum(*batch) ; + zip: a=[1,2], b=[2,3], zip(a,b) => [(1, 2), (2, 3)] ; + the map serves as mapping the function on each list element: map(square, [2,3]) => [4,9] ; + np.stack((1,2)) => array([1, 2]) + ''' + return state, action, reward, next_state, done + + def __len__(self): + return len(self.buffer) + + +class NormalizedActions(gym.ActionWrapper): + ''' normalize the actions to be in reasonable range ''' + + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +class QNetwork(Model): + ''' the network for evaluate values of state-action pairs: Q(s,a) ''' + + def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3): + super(QNetwork, self).__init__() + input_dim = num_inputs + num_actions + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2') + self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3') + + def forward(self, input): + x = self.linear1(input) + x = self.linear2(x) + x = self.linear3(x) + return x + + +class PolicyNetwork(Model): + ''' the network for generating non-determinstic (Gaussian distributed) action from the state input ''' + + def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3): + super(PolicyNetwork, self).__init__() + + # w_init = tf.keras.initializers.glorot_normal(seed=None) + w_init = tf.random_uniform_initializer(-init_w, init_w) + + self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1') + self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2') + self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3') + + self.output_linear = Dense(n_units=num_actions, W_init=w_init, \ + b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output') + + self.action_range = 
action_range + self.num_actions = num_actions + + def forward(self, state): + x = self.linear1(state) + x = self.linear2(x) + x = self.linear3(x) + + output = tf.nn.tanh(self.output_linear(x)) # unit range output [-1, 1] + + return output + + def evaluate(self, state, eval_noise_scale): + ''' + generate action with state for calculating gradients; + eval_noise_scale: as the trick of target policy smoothing, for generating noisy actions. + ''' + state = state.astype(np.float32) + action = self.forward(state) + + action = self.action_range * action + + # add noise + normal = Normal(0, 1) + eval_noise_clip = 2 * eval_noise_scale + noise = normal.sample(action.shape) * eval_noise_scale + noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip) + action = action + noise + + return action + + def get_action(self, state, explore_noise_scale): + ''' generate action with state for interaction with envronment ''' + action = self.forward([state]) + action = action.numpy()[0] + + # add noise + normal = Normal(0, 1) + noise = normal.sample(action.shape) * explore_noise_scale + action = self.action_range * action + noise + + return action.numpy() + + def sample_action(self, ): + ''' generate random actions for exploration ''' + a = tf.random.uniform([self.num_actions], -1, 1) + + return self.action_range * a.numpy() + + +class TD3_Trainer(): + + def __init__( + self, replay_buffer, hidden_dim, action_range, policy_target_update_interval=1, q_lr=3e-4, policy_lr=3e-4 + ): + self.replay_buffer = replay_buffer + + # initialize all networks + self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim) + self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim) + self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim) + self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim) + self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range) + print('Q Network (1,2): ', self.q_net1) + print('Policy Network: ', self.policy_net) + + # initialize weights of target networks + self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1) + self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2) + self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net) + + self.update_cnt = 0 + self.policy_target_update_interval = policy_target_update_interval + + self.q_optimizer1 = tf.optimizers.Adam(q_lr) + self.q_optimizer2 = tf.optimizers.Adam(q_lr) + self.policy_optimizer = tf.optimizers.Adam(policy_lr) + + def target_ini(self, net, target_net): + ''' hard-copy update for initializing target networks ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign(param) + return target_net + + def target_soft_update(self, net, target_net, soft_tau): + ''' soft update the target net with Polyak averaging ''' + for target_param, param in zip(target_net.trainable_weights, net.trainable_weights): + target_param.assign( # copy weight value into target parameters + target_param * (1.0 - soft_tau) + param * soft_tau + ) + return target_net + + def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2): + ''' update all networks in TD3 ''' + self.update_cnt += 1 + state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) + + reward = reward[:, np.newaxis] # expand dim + done = done[:, np.newaxis] + + new_next_action = self.target_policy_net.evaluate( + 
next_state, eval_noise_scale=eval_noise_scale + ) # clipped normal noise + reward = reward_scale * (reward - + np.mean(reward, axis=0)) / np.std(reward, axis=0) # normalize with batch mean and std + + # Training Q Function + target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples + target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input)) + + target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward + q_input = tf.concat([state, action], 1) # input of q_net + + with tf.GradientTape() as q1_tape: + predicted_q_value1 = self.q_net1(q_input) + q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value)) + q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights) + self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights)) + + with tf.GradientTape() as q2_tape: + predicted_q_value2 = self.q_net2(q_input) + q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value)) + q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights) + self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights)) + + # Training Policy Function + if self.update_cnt % self.policy_target_update_interval == 0: + with tf.GradientTape() as p_tape: + new_action = self.policy_net.evaluate( + state, eval_noise_scale=0.0 + ) # no noise, deterministic policy gradients + new_q_input = tf.concat([state, new_action], 1) + # ''' implementation 1 ''' + # predicted_new_q_value = tf.minimum(self.q_net1(new_q_input),self.q_net2(new_q_input)) + ''' implementation 2 ''' + predicted_new_q_value = self.q_net1(new_q_input) + policy_loss = -tf.reduce_mean(predicted_new_q_value) + p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) + self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) + + # Soft update the target nets + self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau) + self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau) + self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau) + + def save_weights(self): # save trained weights + tl.files.save_npz(self.q_net1.trainable_weights, name='model_q_net1.npz') + tl.files.save_npz(self.q_net2.trainable_weights, name='model_q_net2.npz') + tl.files.save_npz(self.target_q_net1.trainable_weights, name='model_target_q_net1.npz') + tl.files.save_npz(self.target_q_net2.trainable_weights, name='model_target_q_net2.npz') + tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz') + tl.files.save_npz(self.target_policy_net.trainable_weights, name='model_target_policy_net.npz') + + def load_weights(self): # load trained weights + tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.q_net1) + tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.q_net2) + tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_q_net1) + tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_q_net2) + tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net) + tl.files.load_and_assign_npz(name='model_target_policy_net.npz', network=self.target_policy_net) + + +def plot(frame_idx, rewards): + clear_output(True) + plt.figure(figsize=(20, 5)) + plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1])) + plt.plot(rewards) + plt.xlabel('Episode') + plt.ylabel('Episode Reward') + plt.savefig('td3.png') + # plt.show() + + +if __name__ == '__main__': + + # initialization of env + env = NormalizedActions(gym.make(ENV)) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + # initialization of buffer + replay_buffer = ReplayBuffer(replay_buffer_size) + # initialization of trainer + td3_trainer=TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval, \ + action_range=action_range, q_lr=q_lr, policy_lr=policy_lr ) + # set train mode + td3_trainer.q_net1.train() + td3_trainer.q_net2.train() + td3_trainer.target_q_net1.train() + td3_trainer.target_q_net2.train() + td3_trainer.policy_net.train() + td3_trainer.target_policy_net.train() + + # training loop + if args.train: + frame_idx = 0 + rewards = [] + t0 = time.time() + while frame_idx < max_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = td3_trainer.policy_net( + [state] + ) # need an extra call here to make inside functions be able to use model.forward + _ = td3_trainer.target_policy_net([state]) + + for step in range(max_steps): + if frame_idx > explore_steps: + action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) + else: + action = td3_trainer.policy_net.sample_action() + + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + + replay_buffer.push(state, action, reward, next_state, done) + + state = next_state + episode_reward += reward + frame_idx += 1 + + if len(replay_buffer) > batch_size: + for i in range(update_itr): + td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.) 
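+                        # update() trains both Q networks on every call; the policy and the
+                        # target networks are refreshed only every policy_target_update_interval
+                        # calls (delayed updates, Trick Two above)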
+ + if frame_idx % 500 == 0: + plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx / max_steps) # current episode + all_episodes = int(max_frames / max_steps) # total episodes + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 )) + rewards.append(episode_reward) + td3_trainer.save_weights() + + if args.test: + frame_idx = 0 + rewards = [] + t0 = time.time() + + td3_trainer.load_weights() + + while frame_idx < test_frames: + state = env.reset() + state = state.astype(np.float32) + episode_reward = 0 + if frame_idx < 1: + print('intialize') + _ = td3_trainer.policy_net( + [state] + ) # need an extra call to make inside functions be able to use forward + _ = td3_trainer.target_policy_net([state]) + + for step in range(max_steps): + action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0) + next_state, reward, done, _ = env.step(action) + next_state = next_state.astype(np.float32) + env.render() + done = 1 if done ==True else 0 + + state = next_state + episode_reward += reward + frame_idx += 1 + + # if frame_idx % 50 == 0: + # plot(frame_idx, rewards) + + if done: + break + episode = int(frame_idx / max_steps) + all_episodes = int(test_frames / max_steps) + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) + rewards.append(episode_reward) diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py new file mode 100644 index 000000000..f64a0a0c0 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_TRPO.py @@ -0,0 +1,746 @@ +""" +Trust Region Policy Optimization (TRPO) +--------------------------------------- +PG method with a large step can collapse the policy performance, +even with a small step can lead a large differences in policy. +TRPO constraint the step in policy space using KL divergence (rather than in parameter space), +which can monotonically improve performance and avoid a collapsed update. + +Reference +--------- +Trust Region Policy Optimization, Schulman et al. 2015 +High Dimensional Continuous Control Using Generalized Advantage Estimation, Schulman et al. 
2016 +Approximately Optimal Approximate Reinforcement Learning, Kakade and Langford 2002 +openai/spinningup : http://spinningup.openai.com/en/latest/algorithms/trpo.html + +Environment +----------- +Openai Gym Pendulum-v0, continual action space + +Prerequisites +-------------- +tensorflow >=2.0.0a0 +tensorflow-probability 0.6.0 +tensorlayer >=2.0.0 + +To run +------ +python tutorial_TRPO.py --train/test + +""" +import argparse +import copy +import os +import time + +import matplotlib.pyplot as plt +import numpy as np +import scipy.signal + +import gym +import tensorflow as tf +import tensorflow_probability as tfp +import tensorlayer as tl +from gym.spaces import Box, Discrete + +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=True) +parser.add_argument('--test', dest='train', action='store_false') + +parser.add_argument('--env', type=str, default='Pendulum-v0') # environment name +parser.add_argument('--hid', type=int, default=64) # size of each hidden layer +parser.add_argument('--l', type=int, default=2) # hidden layer length +parser.add_argument('--gamma', type=float, default=0.99) # reward discount +parser.add_argument('--seed', '-s', type=int, default=1) # random seed +parser.add_argument('--steps', type=int, default=4000) # total number of steps for each episode +parser.add_argument('--epochs', type=int, default=500) # total number of episodes for training +args = parser.parse_args() + +##################### hyper parameters #################### + +ENV_NAME = args.env # environment name +HIDDEN_SIZES = [args.hid] * args.l # hidden layer size +SEED = args.seed # random seed +STEPS_PER_EPOCH = args.steps # total number of steps for each episode +EPOCHS = args.epochs # total number of episodes for training +GAMMA = args.gamma # reward discount + +DELTA = 0.01 # KL-divergence limit for TRPO update. 
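+# The TRPO step maximizes the surrogate advantage subject to mean KL(pi_old, pi_new) <= DELTA;
+# below this is solved with conjugate gradient on a damped Fisher-vector product plus a
+# backtracking line search (see BACKTRACK_ITERS / BACKTRACK_COEFF).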
+VF_LR = 1e-3 # Learning rate for value function optimizer +TRAIN_V_ITERS = 80 # Number of gradient descent steps to take on value function per epoch +DAMPING_COEFF = 0.1 # Artifact for numerical stability +CG_ITERS = 10 # Number of iterations of conjugate gradient to perform +BACKTRACK_ITERS = 10 # Maximum number of steps allowed in the backtracking line search +BACKTRACK_COEFF = 0.8 # How far back to step during backtracking line search +LAM = 0.97 # Lambda for GAE-Lambda +MAX_EP_LEN = 1000 # Maximum length of trajectory +SAVE_FREQ = 10 # How often (in terms of gap between epochs) to save the current policy and value function +EPS = 1e-8 # epsilon + +##################### functions #################### + + +def combined_shape(length, shape=None): + """ + combine length and shape based on shape type + :param length: int length + :param shape: shape, can be either scalar or array + :return: shape + """ + if shape is None: + return length, + return (length, shape) if np.isscalar(shape) else (length, *shape) + + +def keys_as_sorted_list(dict): + """ + sorted keys of the dict + :param dict: dict input + :return: sorted key list + """ + return sorted(list(dict.keys())) + + +def values_as_sorted_list(dict): + """ + sorted values of the dict + :param dict: dict input + :return: sorted value list + """ + return [dict[k] for k in keys_as_sorted_list(dict)] + + +def input_layer(dim=None): + """ + create tensorlayer input layer from dimension input + :param dim: dimension int + :return: tensorlayer input layer + """ + return tl.layers.Input(dtype=tf.float32, shape=combined_shape(None, dim)) + + +def input_layers(*args): + """ + create tensorlayer input layers from a list of dimensions + :param args: a list of dimensions + :return: list of input layers + """ + return [input_layer(dim) for dim in args] + + +def input_layer_from_space(space): + """ + create tensorlayer input layers from env.space input + :param space: env.space + :return: tensorlayer input layer + """ + if isinstance(space, Box): + return input_layer(space.shape) + elif isinstance(space, Discrete): + return tl.layers.Input(dtype=tf.int32, shape=(None, )) + raise NotImplementedError + + +def input_layers_from_spaces(*args): + """ + create tensorlayer input layers from a list of env.space inputs + :param args: a list of env.space inputs + :return: tensorlayer input layer list + """ + return [input_layer_from_space(space) for space in args] + + +def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None): + """ + create Multi-Layer Perception + :param x: tensorlayer input layer + :param hidden_sizes: hidden layer size + :param activation: hidden layer activation function + :param output_activation: activation function for the output layer + :return: output layer + """ + for h in hidden_sizes[:-1]: + x = tl.layers.Dense(n_units=h, act=activation)(x) + return tl.layers.Dense(n_units=hidden_sizes[-1], act=output_activation)(x) + + +def get_vars(model: tl.models.Model): + """ + get trainable parameters of the model + :param model: tensorlayer model + :return: a list of trainable parameters of the model + """ + return model.trainable_weights + + +def count_vars(model: tl.models.Model): + """ + count trainable parameters of the model + :param model: tensorlayer model + :return: counts + """ + v = get_vars(model) + return sum([np.prod(var.shape.as_list()) for var in v]) + + +def gaussian_likelihood(x, mu, log_std): + """ + calculate gaussian likelihood + :param x: input distribution + :param mu: mu + :param log_std: log std + 
:return: gaussian likelihood + """ + pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi)) + return tf.reduce_sum(pre_sum, axis=1) + + +def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): + """ + tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, + where distributions are specified by means and log stds. + (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) + """ + var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) + pre_sum = 0.5 * (((mu1 - mu0)**2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0 + all_kls = tf.reduce_sum(pre_sum, axis=1) + return tf.reduce_mean(all_kls) + + +def categorical_kl(logp0, logp1): + """ + tf symbol for mean KL divergence between two batches of categorical probability distributions, + where the distributions are input as log probs. + """ + all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) + return tf.reduce_mean(all_kls) + + +def flat_concat(xs): + """ + flat concat input + :param xs: a list of tensor + :return: flat tensor + """ + return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0) + + +def assign_params_from_flat(x, params): + """ + assign params from flat input + :param x: + :param params: + :return: group + """ + flat_size = lambda p: int(np.prod(p.shape.as_list())) # the 'int' is important for scalars + splits = tf.split(x, [flat_size(p) for p in params]) + new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] + return tf.group([p.assign(p_new) for p, p_new in zip(params, new_params)]) + + +def discount_cumsum(x, discount): + """ + magic from rllab for computing discounted cumulative sums of vectors. + + input: + vector x, + [x0, + x1, + x2] + + output: + [x0 + discount * x1 + discount^2 * x2, + x1 + discount * x2, + x2] + """ + return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] + + +""" +Policies +""" + + +class MlpCategoricalPolicy: + """ + Categorical Policy for discrete input + """ + + def __init__(self, x, a, hidden_sizes, activation, output_activation): + self.act_dim = a.n + x = input_layer_from_space(x) + logits = mlp(x, list(hidden_sizes) + [self.act_dim], activation, None) + self.model = tl.models.Model(x, logits) + self.model.train() + + def cal_outputs_0(self, states): + states = states.astype(np.float32) + logits = self.model(states) + logp_all = tf.nn.log_softmax(logits) + pi = tf.squeeze(tfp.distributions.Multinomial(1, logits), axis=1) + logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=self.act_dim) * logp_all, axis=1) + info = {'logp_all': logp_all} + return pi, logp_pi, info, logp_all + + def cal_outputs_1(self, states, actions, old_logp_all): + pi, logp_pi, info, logp_all = self.cal_outputs_0(states) + logp = tf.reduce_sum(tf.one_hot(actions, depth=self.act_dim) * logp_all, axis=1) + d_kl = categorical_kl(logp_all, old_logp_all) + + info_phs = {'logp_all': old_logp_all} + + return pi, logp, logp_pi, info, info_phs, d_kl + + +class MlpGaussianPolicy: + """ + Gaussian Policy for continuous input + """ + + def __init__(self, x, a, hidden_sizes, activation, output_activation): + act_dim = a.shape[0] + + x = input_layer_from_space(x) + mu = mlp(x, list(hidden_sizes) + [act_dim], activation, output_activation) + self.model = tl.models.Model(x, mu) + self.model.train() + + self._log_std = tf.Variable(-0.5 * np.ones(act_dim, dtype=np.float32)) + self.model.trainable_weights.append(self._log_std) + + def cal_outputs_0(self, states): + states = 
states.astype(np.float32) + mu = self.model(states) + std = tf.exp(self._log_std) + pi = mu + tf.random.normal(tf.shape(mu)) * std + logp_pi = gaussian_likelihood(pi, mu, self._log_std) + + info = {'mu': mu, 'log_std': self._log_std} + + return pi, logp_pi, info, mu, self._log_std + + def cal_outputs_1(self, states, actions, old_log_std_ph, old_mu_ph): + pi, logp_pi, info, mu, log_std = self.cal_outputs_0(states) + logp = gaussian_likelihood(actions, mu, log_std) + d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) + + info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} + + return pi, logp, logp_pi, info, info_phs, d_kl + + +""" +Actor-Critics +""" + + +def mlp_actor_critic( + x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh, + output_activation=None +): + """ + create actor and critic + :param x: observation space + :param a: action space + :param hidden_sizes: hidden layer size + :param activation: hidden layer activation function + :param output_activation: activation function for the output layer + :return: acter class and critic class + """ + # default policy builder depends on action space + if isinstance(a, Box): + actor = MlpGaussianPolicy(x, a, hidden_sizes, activation, output_activation) + elif isinstance(a, Discrete): + actor = MlpCategoricalPolicy(x, a, hidden_sizes, activation, output_activation) + else: + raise ValueError('action space type error') + + class Critic: + + def __init__(self, obs_space, hidden_layer_sizes, activation_funcs): + inputs = input_layer_from_space(obs_space) + self.model = tl.models.Model(inputs, mlp(inputs, list(hidden_layer_sizes) + [1], activation_funcs, None)) + self.model.train() + + def critic_cal_func(self, states): + states = states.astype(np.float32) + return tf.squeeze(self.model(states), axis=1) + + critic = Critic(x, hidden_sizes, activation) + + return actor, critic + + +class GAEBuffer: + """ + A buffer for storing trajectories experienced by a TRPO agent interacting + with the environment, and using Generalized Advantage Estimation (GAE-Lambda) + for calculating the advantages of state-action pairs. + """ + + def __init__(self, obs_dim, act_dim, size, info_shapes, gamma=0.99, lam=0.95): + self.obs_buf = np.zeros(combined_shape(size, obs_dim), dtype=np.float32) + self.act_buf = np.zeros(combined_shape(size, act_dim), dtype=np.float32) + self.adv_buf = np.zeros(size, dtype=np.float32) + self.rew_buf = np.zeros(size, dtype=np.float32) + self.ret_buf = np.zeros(size, dtype=np.float32) + self.val_buf = np.zeros(size, dtype=np.float32) + self.logp_buf = np.zeros(size, dtype=np.float32) + self.info_bufs = {k: np.zeros([size] + list(v), dtype=np.float32) for k, v in info_shapes.items()} + self.sorted_info_keys = keys_as_sorted_list(self.info_bufs) + self.gamma, self.lam = gamma, lam + self.ptr, self.path_start_idx, self.max_size = 0, 0, size + + def store(self, obs, act, rew, val, logp, info): + """ + Append one timestep of agent-environment interaction to the buffer. + """ + assert self.ptr < self.max_size # buffer has to have room so you can store + self.obs_buf[self.ptr] = obs + self.act_buf[self.ptr] = act + self.rew_buf[self.ptr] = rew + self.val_buf[self.ptr] = val + self.logp_buf[self.ptr] = logp + for i, k in enumerate(self.sorted_info_keys): + self.info_bufs[k][self.ptr] = info[i] + self.ptr += 1 + + def finish_path(self, last_val=0): + """ + Call this at the end of a trajectory, or when one gets cut off + by an epoch ending. 
This looks back in the buffer to where the + trajectory started, and uses rewards and value estimates from + the whole trajectory to compute advantage estimates with GAE-Lambda, + as well as compute the rewards-to-go for each state, to use as + the targets for the value function. + + The "last_val" argument should be 0 if the trajectory ended + because the agent reached a terminal state (died), and otherwise + should be V(s_T), the value function estimated for the last state. + This allows us to bootstrap the reward-to-go calculation to account + for timesteps beyond the arbitrary episode horizon (or epoch cutoff). + """ + + path_slice = slice(self.path_start_idx, self.ptr) + rews = np.append(self.rew_buf[path_slice], last_val) + vals = np.append(self.val_buf[path_slice], last_val) + + # the next two lines implement GAE-Lambda advantage calculation + deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] + self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam) + + # the next line computes rewards-to-go, to be targets for the value function + self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1] + + self.path_start_idx = self.ptr + + def get(self): + """ + Call this at the end of an epoch to get all of the data from + the buffer, with advantages appropriately normalized (shifted to have + mean zero and std one). Also, resets some pointers in the buffer. + """ + assert self.ptr == self.max_size # buffer has to be full before you can get + self.ptr, self.path_start_idx = 0, 0 + + # the next two lines implement the advantage normalization trick + adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf) + self.adv_buf = (self.adv_buf - adv_mean) / adv_std + return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf + ] + values_as_sorted_list(self.info_bufs) + + +##################### TRPO #################### +""" + +Trust Region Policy Optimization + +(with support for Natural Policy Gradient) + +""" + + +class TRPO: + """ + trpo class + """ + + def __init__(self, obs_space, act_space): + + obs_dim = obs_space.shape + act_dim = act_space.shape + + # # Main models and functions + self.actor, self.critic = mlp_actor_critic(obs_space, act_space, HIDDEN_SIZES) + + if isinstance(act_space, Box): + act_dim = env.action_space.shape[0] + info_shapes = {'mu': [act_dim], 'log_std': [act_dim]} + + elif isinstance(env.action_space, Discrete): + act_dim = env.action_space.n + info_shapes = {'logp_all': [act_dim]} + else: + raise Exception('info_shape error') + + self.buf = GAEBuffer(obs_dim, act_dim, STEPS_PER_EPOCH, info_shapes, GAMMA, LAM) + + # Optimizer for value function + self.critic_optimizer = tf.optimizers.Adam(learning_rate=VF_LR) + + # Every step, get: action, value, logprob, & info for pdist (for computing kl div) + def get_action_ops(self, states): + """ + get action + :param states: state input + :return: pi, v, logp_pi and other outputs + """ + pi, logp_pi, info, *_ = self.actor.cal_outputs_0(states) + v = self.critic.critic_cal_func(states) + res0 = [pi, v, logp_pi] + values_as_sorted_list(info) + res = [] + for i in res0: + res.append(i + 0) # transfer to tensor + return res + + # TRPO losses + def pi_loss(self, inputs): + """ + calculate pi loss + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: pi loss + """ + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) + ratio = 
tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) + pi_loss = -tf.reduce_mean(ratio * adv_ph) + return pi_loss + + def v_loss(self, inputs): + """ + calculate value loss + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: v loss + """ + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + v = self.critic.critic_cal_func(x_ph) + v_loss = tf.reduce_mean((ret_ph - v)**2) + return v_loss + + def train_vf(self, inputs): + """ + train v function + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: None + """ + with tf.GradientTape() as tape: + loss = self.v_loss(inputs) + grad = tape.gradient(loss, self.critic.model.trainable_weights) + self.critic_optimizer.apply_gradients(zip(grad, self.critic.model.trainable_weights)) + + # Symbols needed for CG solver + def gradient(self, inputs): + """ + pi gradients + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :return: gradient + """ + pi_params = self.actor.model.trainable_weights + with tf.GradientTape() as tape: + loss = self.pi_loss(inputs) + grad = tape.gradient(loss, pi_params) + gradient = flat_concat(grad) + return gradient + + def hvp(self, inputs, v_ph): + """ + calculate hvp + :param inputs: a list of x_ph, a_ph, adv_ph, ret_ph, logp_old_ph and other inputs + :param v_ph: v input + :return: hvp + """ + pi_params = self.actor.model.trainable_weights + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + + with tf.GradientTape() as tape1: + with tf.GradientTape() as tape0: + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) + g = flat_concat(tape0.gradient(d_kl, pi_params)) + l = tf.reduce_sum(g * v_ph) + hvp = flat_concat(tape1.gradient(l, pi_params)) + + if DAMPING_COEFF > 0: + hvp += DAMPING_COEFF * v_ph + return hvp + + # Symbols for getting and setting params + def get_pi_params(self): + """ + get actor trainable parameters + :return: flat actor trainable parameters + """ + pi_params = self.actor.model.trainable_weights + return flat_concat(pi_params) + + def set_pi_params(self, v_ph): + """ + set actor trainable parameters + :param v_ph: inputs + :return: None + """ + pi_params = self.actor.model.trainable_weights + assign_params_from_flat(v_ph, pi_params) + + def save_ckpt(self): + """ + save trained weights + :return: None + """ + if not os.path.exists('model'): + os.makedirs('model') + + tl.files.save_weights_to_hdf5('model/trpo_actor.hdf5', self.actor.model) + tl.files.save_weights_to_hdf5('model/trpo_critic.hdf5', self.critic.model) + + def load_ckpt(self): + """ + load trained weights + :return: None + """ + tl.files.load_hdf5_to_weights_in_order('model/trpo_actor.hdf5', self.actor.model) + tl.files.load_hdf5_to_weights_in_order('model/trpo_critic.hdf5', self.critic.model) + + def cg(self, Ax, b): + """ + Conjugate gradient algorithm + (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) + """ + x = np.zeros_like(b) + r = copy.deepcopy(b) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
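+        # here Ax is the damped Fisher-vector product self.hvp and b is the flat policy
+        # gradient, so cg() approximately solves H x = g for the natural-gradient step direction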
+ p = copy.deepcopy(r) + r_dot_old = np.dot(r, r) + for _ in range(CG_ITERS): + z = Ax(p) + alpha = r_dot_old / (np.dot(p, z) + EPS) + x += alpha * p + r -= alpha * z + r_dot_new = np.dot(r, r) + p = r + (r_dot_new / r_dot_old) * p + r_dot_old = r_dot_new + return x + + def update(self): + """ + update trpo + :return: + """ + # Prepare hessian func, gradient eval + inputs = self.buf.get() + Hx = lambda x: self.hvp(inputs, x) + g, pi_l_old, v_l_old = self.gradient(inputs), self.pi_loss(inputs), self.v_loss(inputs) + + # Core calculations for TRPO or NPG + x = self.cg(Hx, g) + alpha = np.sqrt(2 * DELTA / (np.dot(x, Hx(x)) + EPS)) + old_params = self.get_pi_params() + + def set_and_eval(step): + aa = alpha * x * step + par = old_params - aa + self.set_pi_params(par) + x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs + pi, logp, logp_pi, info, info_phs, d_kl = self.actor.cal_outputs_1(x_ph, a_ph, *info_values) + loss = self.pi_loss(inputs) + return [d_kl, loss] + + # trpo augments npg with backtracking line search, hard kl + for j in range(BACKTRACK_ITERS): + kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF**j) + if kl <= DELTA and pi_l_new <= pi_l_old: + # Accepting new params at step of line search + break + + if j == BACKTRACK_ITERS - 1: + # Line search failed! Keeping old params. + kl, pi_l_new = set_and_eval(step=0.) + + # Value function updates + for _ in range(TRAIN_V_ITERS): + self.train_vf(inputs) + + +if __name__ == '__main__': + + tf.random.set_seed(SEED) + np.random.seed(SEED) + + env = gym.make(ENV_NAME) + env.seed(SEED) + + agent = TRPO(env.observation_space, env.action_space) + + if args.train: + start_time = time.time() + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + reward_list = [] + # Main loop: collect experience in env and update/log each epoch + for epoch in range(EPOCHS): + t0 = time.time() + rew = 0 + for t in range(STEPS_PER_EPOCH): + agent_outs = agent.get_action_ops(o.reshape(1, -1)) + a, v_t, logp_t, info_t = np.array(agent_outs[0][0], np.float32), \ + np.array(agent_outs[1], np.float32), \ + np.array(agent_outs[2], np.float32), \ + np.array(agent_outs[3:], np.float32) + + # save and log + agent.buf.store(o, a, r, v_t, logp_t, info_t) + + o, r, d, _ = env.step(a) + ep_ret += r + ep_len += 1 + + terminal = d or (ep_len == MAX_EP_LEN) + if terminal or (t == STEPS_PER_EPOCH - 1): + if not (terminal): + print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) + # if trajectory didn't reach terminal state, bootstrap value target + last_val = r if d else agent.critic.critic_cal_func(o.reshape(1, -1)) + agent.buf.finish_path(last_val) + rew = ep_ret + o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 + + # Save model + if (epoch % SAVE_FREQ == 0) or (epoch == EPOCHS - 1): + agent.save_ckpt() + + # Perform TRPO or NPG update! 
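+            # update(): natural gradient direction from cg(), step size sqrt(2 * DELTA / x^T H x),
+            # backtracking line search until KL <= DELTA and the surrogate loss does not get worse,
+            # then TRAIN_V_ITERS gradient steps on the value function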
+ agent.update() + print('epoch [{}/{}] ep_ret: {} time: {}'.format(epoch, EPOCHS, rew, time.time() - t0)) + + reward_list.append(rew) + plt.clf() + plt.ion() + plt.plot(reward_list) + plt.title('TRPO ' + str(DELTA)) + plt.ylim(-2000, 0) + plt.show() + plt.pause(0.1) + agent.save_ckpt() + plt.ioff() + plt.show() + + # test + agent.load_ckpt() + while True: + o = env.reset() + for i in range(STEPS_PER_EPOCH): + env.render() + agent_outs = agent.get_action_ops(o.reshape(1, -1)) + a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:] + o, r, d, _ = env.step(a) + if d: + break diff --git a/examples/reinforcement_learning/tutorial_atari_pong.py b/examples/reinforcement_learning/tutorial_atari_pong.py index ad8e264df..0ffee9174 100644 --- a/examples/reinforcement_learning/tutorial_atari_pong.py +++ b/examples/reinforcement_learning/tutorial_atari_pong.py @@ -29,16 +29,11 @@ import time import numpy as np -import tensorflow as tf import gym +import tensorflow as tf import tensorlayer as tl -## enable eager mode -tf.enable_eager_execution() - - -tf.logging.set_verbosity(tf.logging.DEBUG) # enable logging tl.logging.set_verbosity(tl.logging.DEBUG) # hyper-parameters @@ -52,7 +47,7 @@ render = False # display the game environment # resume = True # load existing policy network model_file_name = "model_pong" -np.set_printoptions(threshold=np.nan) +np.set_printoptions(threshold=np.inf) def prepro(I): @@ -73,35 +68,23 @@ def prepro(I): episode_number = 0 xs, ys, rs = [], [], [] -# observation for training and inference -# t_states = tf.placeholder(tf.float32, shape=[None, D]) -# policy network + +# policy network def get_model(inputs_shape): ni = tl.layers.Input(inputs_shape) nn = tl.layers.Dense(n_units=H, act=tf.nn.relu, name='hidden')(ni) nn = tl.layers.Dense(n_units=3, name='output')(nn) M = tl.models.Model(inputs=ni, outputs=nn, name="mlp") return M + + model = get_model([None, D]) train_weights = model.trainable_weights -# probs = model(t_states, is_train=True).outputs -# sampling_prob = tf.nn.softmax(probs) - -# t_actions = tf.placeholder(tf.int32, shape=[None]) -# t_discount_rewards = tf.placeholder(tf.float32, shape=[None]) -# loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards) -optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)#.minimize(loss) - -# with tf.Session() as sess: -# sess.run(tf.global_variables_initializer()) - # if resume: TODO - # load_params = tl.files.load_npz(name=model_file_name+'.npz') - # tl.files.assign_params(sess, load_params, network) - # tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network) - # network.print_params() - # network.print_layers() -model.train() # set model to train mode (in case you add dropout into the model) + +optimizer = tf.optimizers.RMSprop(lr=learning_rate, decay=decay_rate) + +model.train() # set model to train mode (in case you add dropout into the model) start_time = time.time() game_number = 0 @@ -114,14 +97,12 @@ def get_model(inputs_shape): x = x.reshape(1, D) prev_x = cur_x - # prob = sess.run(sampling_prob, feed_dict={t_states: x}) - _prob = model(x).outputs + _prob = model(x) prob = tf.nn.softmax(_prob) # action. 
1: STOP 2: UP 3: DOWN - # action = np.random.choice([1,2,3], p=prob.flatten()) - # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3]) - # action = np.random.choice([1,2,3], p=prob.numpy()) + # action = np.random.choice([1,2,3], p=prob.flatten()) + # action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3]) action = tl.rein.choice_action_by_probs(prob[0].numpy(), [1, 2, 3]) observation, reward, done, _ = env.step(action) @@ -145,12 +126,8 @@ def get_model(inputs_shape): xs, ys, rs = [], [], [] - # sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR}) - # t_actions = tf.placeholder(tf.int32, shape=[None]) - # t_discount_rewards = tf.placeholder(tf.float32, shape=[None]) - # loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards) with tf.GradientTape() as tape: - _prob = model(epx).outputs + _prob = model(epx) _loss = tl.rein.cross_entropy_reward_loss(_prob, epy, disR) grad = tape.gradient(_loss, train_weights) optimizer.apply_gradients(zip(grad, train_weights)) diff --git a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py b/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py deleted file mode 100644 index 2f1f96d67..000000000 --- a/examples/reinforcement_learning/tutorial_bipedalwalker_a3c_continuous_action.py +++ /dev/null @@ -1,296 +0,0 @@ -""" -Asynchronous Advantage Actor Critic (A3C) with Continuous Action Space. - -Actor Critic History ----------------------- -A3C > DDPG (for continuous action space) > AC - -Advantage ----------- -Train faster and more stable than AC. - -Disadvantage -------------- -Have bias. - -Reference ----------- -MorvanZhou's tutorial: https://morvanzhou.github.io/tutorials/ -MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/blob/master/experiments/Solve_BipedalWalker/A3C.py - -Environment ------------ -BipedalWalker-v2 : https://gym.openai.com/envs/BipedalWalker-v2 - -Reward is given for moving forward, total 300+ points up to the far end. -If the robot falls, it gets -100. Applying motor torque costs a small amount of -points, more optimal agent will get better score. State consists of hull angle -speed, angular velocity, horizontal speed, vertical speed, position of joints -and joints angular speed, legs contact with ground, and 10 lidar rangefinder -measurements. There's no coordinates in the state vector. 
- -""" - -import multiprocessing -import threading - -import numpy as np -import tensorflow as tf - -import gym -import tensorlayer as tl -from tensorlayer.layers import DenseLayer, InputLayer - -tf.logging.set_verbosity(tf.logging.DEBUG) -tl.logging.set_verbosity(tl.logging.DEBUG) - -GAME = 'BipedalWalker-v2' # BipedalWalkerHardcore-v2 -OUTPUT_GRAPH = False -LOG_DIR = './log' -N_WORKERS = multiprocessing.cpu_count() -# N_WORKERS = 4 -MAX_GLOBAL_EP = 20000 # 8000 -GLOBAL_NET_SCOPE = 'Global_Net' -UPDATE_GLOBAL_ITER = 10 -GAMMA = 0.999 -ENTROPY_BETA = 0.005 -LR_A = 0.00002 # learning rate for actor -LR_C = 0.0001 # learning rate for critic -GLOBAL_RUNNING_R = [] -GLOBAL_EP = 0 # will increase during training, stop training when it >= MAX_GLOBAL_EP - -env = gym.make(GAME) - -N_S = env.observation_space.shape[0] -N_A = env.action_space.shape[0] -# A_BOUND = [env.action_space.low, env.action_space.high] -A_BOUND = [env.action_space.low, env.action_space.high] -A_BOUND[0] = A_BOUND[0].reshape(1, 4) -A_BOUND[1] = A_BOUND[1].reshape(1, 4) - -# print(env.unwrapped.hull.position[0]) -# exit() - - -class ACNet(object): - - def __init__(self, scope, globalAC=None): - self.scope = scope - if scope == GLOBAL_NET_SCOPE: - ## global network only do inference - with tf.variable_scope(scope): - self.s = tf.placeholder(tf.float32, [None, N_S], 'S') - self._build_net() - self.a_params = tl.layers.get_variables_with_name(scope + '/actor', True, False) - self.c_params = tl.layers.get_variables_with_name(scope + '/critic', True, False) - - normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) # for continuous action space - - with tf.name_scope('choose_a'): # use local params to choose action - self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) - - else: - ## worker network calculate gradient locally, update on global network - with tf.variable_scope(scope): - self.s = tf.placeholder(tf.float32, [None, N_S], 'S') - self.a_his = tf.placeholder(tf.float32, [None, N_A], 'A') - self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') - - self._build_net() - - td = tf.subtract(self.v_target, self.v, name='TD_error') - with tf.name_scope('c_loss'): - self.c_loss = tf.reduce_mean(tf.square(td)) - - with tf.name_scope('wrap_a_out'): - self.test = self.sigma[0] - self.mu, self.sigma = self.mu * A_BOUND[1], self.sigma + 1e-5 - - normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma) # for continuous action space - - with tf.name_scope('a_loss'): - log_prob = normal_dist.log_prob(self.a_his) - exp_v = log_prob * td - entropy = normal_dist.entropy() # encourage exploration - self.exp_v = ENTROPY_BETA * entropy + exp_v - self.a_loss = tf.reduce_mean(-self.exp_v) - - with tf.name_scope('choose_a'): # use local params to choose action - self.A = tf.clip_by_value(tf.squeeze(normal_dist.sample(1), axis=0), *A_BOUND) - - with tf.name_scope('local_grad'): - self.a_params = tl.layers.get_variables_with_name(scope + '/actor', True, False) - self.c_params = tl.layers.get_variables_with_name(scope + '/critic', True, False) - self.a_grads = tf.gradients(self.a_loss, self.a_params) - self.c_grads = tf.gradients(self.c_loss, self.c_params) - - with tf.name_scope('sync'): - with tf.name_scope('pull'): - self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, globalAC.a_params)] - self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, globalAC.c_params)] - with tf.name_scope('push'): - self.update_a_op = 
OPT_A.apply_gradients(zip(self.a_grads, globalAC.a_params)) - self.update_c_op = OPT_C.apply_gradients(zip(self.c_grads, globalAC.c_params)) - - def _build_net(self): - w_init = tf.contrib.layers.xavier_initializer() - with tf.variable_scope('actor'): # Policy network - nn = InputLayer(self.s, name='in') - nn = DenseLayer(nn, n_units=500, act=tf.nn.relu6, W_init=w_init, name='la') - nn = DenseLayer(nn, n_units=300, act=tf.nn.relu6, W_init=w_init, name='la2') - mu = DenseLayer(nn, n_units=N_A, act=tf.nn.tanh, W_init=w_init, name='mu') - sigma = DenseLayer(nn, n_units=N_A, act=tf.nn.softplus, W_init=w_init, name='sigma') - self.mu = mu.outputs - self.sigma = sigma.outputs - - with tf.variable_scope('critic'): # we use Value-function here, but not Q-function. - nn = InputLayer(self.s, name='in') - nn = DenseLayer(nn, n_units=500, act=tf.nn.relu6, W_init=w_init, name='lc') - nn = DenseLayer(nn, n_units=200, act=tf.nn.relu6, W_init=w_init, name='lc2') - v = DenseLayer(nn, n_units=1, W_init=w_init, name='v') - self.v = v.outputs - - def update_global(self, feed_dict): # run by a local - _, _, t = sess.run( - [self.update_a_op, self.update_c_op, self.test], feed_dict - ) # local grads applies to global net - return t - - def pull_global(self): # run by a local - sess.run([self.pull_a_params_op, self.pull_c_params_op]) - - def choose_action(self, s): # run by a local - s = s[np.newaxis, :] - return sess.run(self.A, {self.s: s})[0] - - def save_ckpt(self): - tl.files.exists_or_mkdir(self.scope) - tl.files.save_ckpt( - sess=sess, mode_name='model.ckpt', var_list=self.a_params + self.c_params, save_dir=self.scope, - printable=True - ) - - def load_ckpt(self): - tl.files.load_ckpt(sess=sess, var_list=self.a_params + self.c_params, save_dir=self.scope, printable=True) - # tl.files.load_ckpt(sess=sess, mode_name='model.ckpt', var_list=self.a_params+self.c_params, save_dir=self.scope, is_latest=False, printable=True) - - -class Worker(object): - - def __init__(self, name, globalAC): - self.env = gym.make(GAME) - self.name = name - self.AC = ACNet(name, globalAC) - - def work(self): - global GLOBAL_RUNNING_R, GLOBAL_EP - total_step = 1 - buffer_s, buffer_a, buffer_r = [], [], [] - while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP: - s = self.env.reset() - ep_r = 0 - while True: - # visualize Worker_0 during training - if self.name == 'Worker_0' and total_step % 30 == 0: - self.env.render() - a = self.AC.choose_action(s) - s_, r, done, _info = self.env.step(a) - - # set robot falls reward to -2 instead of -100 - if r == -100: r = -2 - - ep_r += r - buffer_s.append(s) - buffer_a.append(a) - buffer_r.append(r) - - if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net - - if done: - v_s_ = 0 # terminal - else: - v_s_ = sess.run(self.AC.v, {self.AC.s: s_[np.newaxis, :]})[0, 0] - - buffer_v_target = [] - - for r in buffer_r[::-1]: # reverse buffer r - v_s_ = r + GAMMA * v_s_ - buffer_v_target.append(v_s_) - - buffer_v_target.reverse() - - buffer_s, buffer_a, buffer_v_target = ( - np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target) - ) - feed_dict = {self.AC.s: buffer_s, self.AC.a_his: buffer_a, self.AC.v_target: buffer_v_target} - # update gradients on global network - self.AC.update_global(feed_dict) - buffer_s, buffer_a, buffer_r = [], [], [] - - # update local network from global network - self.AC.pull_global() - - s = s_ - total_step += 1 - if done: - if len(GLOBAL_RUNNING_R) == 0: # record running episode reward - GLOBAL_RUNNING_R.append(ep_r) - 
else: - GLOBAL_RUNNING_R.append(0.95 * GLOBAL_RUNNING_R[-1] + 0.05 * ep_r) - print( - self.name, - "episode:", - GLOBAL_EP, - "| pos: %i" % self.env.unwrapped.hull.position[0], # number of move - '| reward: %.1f' % ep_r, - "| running_reward: %.1f" % GLOBAL_RUNNING_R[-1], - # '| sigma:', test, # debug - 'WIN ' * 5 if self.env.unwrapped.hull.position[0] >= 88 else '', - ) - GLOBAL_EP += 1 - break - - -if __name__ == "__main__": - sess = tf.Session() - - # ============================= TRAINING =============================== - with tf.device("/cpu:0"): - OPT_A = tf.train.RMSPropOptimizer(LR_A, name='RMSPropA') - OPT_C = tf.train.RMSPropOptimizer(LR_C, name='RMSPropC') - GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) # we only need its params - workers = [] - # Create worker - for i in range(N_WORKERS): - i_name = 'Worker_%i' % i # worker name - workers.append(Worker(i_name, GLOBAL_AC)) - - COORD = tf.train.Coordinator() - sess.run(tf.global_variables_initializer()) - - # start TF threading - worker_threads = [] - for worker in workers: - t = threading.Thread(target=worker.work) - t.start() - worker_threads.append(t) - COORD.join(worker_threads) - - GLOBAL_AC.save_ckpt() - - # ============================= EVALUATION ============================= - # env = gym.make(GAME) - # GLOBAL_AC = ACNet(GLOBAL_NET_SCOPE) - # sess.run(tf.global_variables_initializer()) - # GLOBAL_AC.load_ckpt() - # while True: - # s = env.reset() - # rall = 0 - # while True: - # env.render() - # a = GLOBAL_AC.choose_action(s) - # s, r, d, _ = env.step(a) - # rall += r - # if d: - # print("reward", rall) - # break diff --git a/examples/reinforcement_learning/tutorial_cartpole_ac.py b/examples/reinforcement_learning/tutorial_cartpole_ac.py deleted file mode 100644 index 4d8b6f8ea..000000000 --- a/examples/reinforcement_learning/tutorial_cartpole_ac.py +++ /dev/null @@ -1,249 +0,0 @@ -"""Actor-Critic using TD-error as the Advantage, Reinforcement Learning. - -Actor Critic History ----------------------- -A3C > DDPG > AC - -Advantage ----------- -AC converge faster than Policy Gradient. - -Disadvantage (IMPORTANT) ------------------------- -The Policy is oscillated (difficult to converge), DDPG can solve -this problem using advantage of DQN. - -Reference ----------- -View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ - -Environment ------------- -CartPole-v0: https://gym.openai.com/envs/CartPole-v0 - -A pole is attached by an un-actuated joint to a cart, which moves along a -frictionless track. The system is controlled by applying a force of +1 or -1 -to the cart. The pendulum starts upright, and the goal is to prevent it from -falling over. - -A reward of +1 is provided for every timestep that the pole remains upright. -The episode ends when the pole is more than 15 degrees from vertical, or the -cart moves more than 2.4 units from the center. 
- -""" -import time - -import numpy as np -import tensorflow as tf - -import gym -import tensorlayer as tl - -## enable eager mode -tf.enable_eager_execution() - - -tf.logging.set_verbosity(tf.logging.DEBUG) -tl.logging.set_verbosity(tl.logging.DEBUG) - -np.random.seed(2) -tf.set_random_seed(2) # reproducible - -# hyper-parameters -OUTPUT_GRAPH = False -MAX_EPISODE = 3000 -DISPLAY_REWARD_THRESHOLD = 100 # renders environment if running reward is greater then this threshold -MAX_EP_STEPS = 1000 # maximum time step in one episode -RENDER = False # rendering wastes time -LAMBDA = 0.9 # reward discount in TD error -LR_A = 0.001 # learning rate for actor -LR_C = 0.01 # learning rate for critic - -env = gym.make('CartPole-v0') -env.seed(2) # reproducible -# env = env.unwrapped - -N_F = env.observation_space.shape[0] -N_A = env.action_space.n -# env.action_space.sample() random sample - -print("observation dimension: %d" % N_F) # 4 -print("observation high: %s" % env.observation_space.high) # [ 2.4 , inf , 0.41887902 , inf] -print("observation low : %s" % env.observation_space.low) # [-2.4 , -inf , -0.41887902 , -inf] -print("num of actions: %d" % N_A) # 2 : left or right - - -class Actor(object): - - def __init__(self, n_features, n_actions, lr=0.001): - # self.sess = sess - # self.s = tf.placeholder(tf.float32, [1, n_features], "state") - # self.a = tf.placeholder(tf.int32, [None], "act") - # self.td_error = tf.placeholder(tf.float32, [None], "td_error") # TD_error - - # with tf.variable_scope('Actor'): # Policy network - # n = InputLayer(self.s, name='in') - # n = DenseLayer(n, n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden') - # # n = DenseLayer(n, n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2') - # n = DenseLayer(n, n_units=n_actions, name='Pi') - - def get_model(inputs_shape): - ni = tl.layers.Input(inputs_shape, name='state') - nn = tl.layers.Dense(n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden')(ni) - nn = tl.layers.Dense(n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2')(nn) - nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn) - return tl.models.Model(inputs=ni, outputs=nn, name="Actor") - self.model = get_model([1, n_features]) - self.model.train() - # self.acts_logits = n.outputs - # self.acts_prob = tf.nn.softmax(self.acts_logits) - - # Hao Dong - # with tf.variable_scope('loss'): - # self.exp_v = tl.rein.cross_entropy_reward_loss( - # logits=self.acts_logits, actions=self.a, rewards=self.td_error, name='actor_weighted_loss' - # ) - - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(self.exp_v) - self.optimizer = tf.train.AdamOptimizer(lr) - # Morvan Zhou (the same) - # with tf.variable_scope('exp_v'): - # # log_prob = tf.log(self.acts_prob[0, self.a[0]]) - # # self.exp_v = tf.reduce_mean(log_prob * self.td_error[0]) # advantage (TD_error) guided loss - # self.exp_v = tl.rein.log_weight(probs=self.acts_prob[0, self.a[0]], weights=self.td_error) - # - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) - - def learn(self, s, a, td): - # _, exp_v = self.sess.run([self.train_op, self.exp_v], {self.s: [s], self.a: [a], self.td_error: td[0]}) - with tf.GradientTape() as tape: - _logits = self.model([s]).outputs - # _probs = tf.nn.softmax(_logits) - _exp_v = 
tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0]) - grad = tape.gradient(_exp_v, self.model.trainable_weights) - self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) - return _exp_v - - def choose_action(self, s): - # probs = self.sess.run(self.acts_prob, {self.s: [s]}) # get probabilities of all actions - _logits = self.model([s]).outputs - _probs = tf.nn.softmax(_logits).numpy() - return tl.rein.choice_action_by_probs(_probs.ravel()) - - def choose_action_greedy(self, s): - # probs = self.sess.run(self.acts_prob, {self.s: [s]}) # get probabilities of all actions - _logits = self.model([s]).outputs - _probs = tf.nn.softmax(_logits).numpy() - return np.argmax(_probs.ravel()) - - -class Critic(object): - - def __init__(self, n_features, lr=0.01): - # self.sess = sess - # self.s = tf.placeholder(tf.float32, [1, n_features], "state") - # self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") - # self.r = tf.placeholder(tf.float32, None, 'r') - - # with tf.variable_scope('Critic'): # we use Value-function here, not Action-Value-function - # n = InputLayer(self.s, name='in') - # n = DenseLayer(n, n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden') - # # n = DenseLayer(n, n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2') - # n = DenseLayer(n, n_units=1, act=None, name='V') - # self.v = n.outputs - def get_model(inputs_shape): - ni = tl.layers.Input(inputs_shape, name='state') - nn = tl.layers.Dense(n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden')(ni) - nn = tl.layers.Dense(n_units=5, act=tf.nn.relu, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2')(nn) - nn = tl.layers.Dense(n_units=1, act=None, name='value')(nn) - return tl.models.Model(inputs=ni, outputs=nn, name="Critic") - self.model = get_model([1, n_features]) - self.model.train() - # with tf.variable_scope('squared_TD_error'): - # # TD_error = r + lambd * V(newS) - V(S) - # self.td_error = self.r + LAMBDA * self.v_ - self.v - # self.loss = tf.square(self.td_error) - # with tf.variable_scope('train'): - # self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) - self.optimizer = tf.train.AdamOptimizer(lr) - - def learn(self, s, r, s_): - # v_ = self.sess.run(self.v, {self.s: [s_]}) - v_ = self.model([s_]).outputs - # td_error, _ = self.sess.run([self.td_error, self.train_op], {self.s: [s], self.v_: v_, self.r: r}) - with tf.GradientTape() as tape: - v = self.model([s]).outputs - # TD_error = r + lambd * V(newS) - V(S) - td_error = r + LAMBDA * v_ - v - loss = tf.square(td_error) - grad = tape.gradient(loss, self.model.trainable_weights) - self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) - - return td_error - -actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) -# we need a good teacher, so the teacher should learn faster than the actor -critic = Critic(n_features=N_F, lr=LR_C) - - -for i_episode in range(MAX_EPISODE): - episode_time = time.time() - s = env.reset().astype(np.float32) - t = 0 # number of step in this episode - all_r = [] # rewards of all steps - while True: - if RENDER: env.render() - - a = actor.choose_action(s) - - s_new, r, done, info = env.step(a) - s_new = s_new.astype(np.float32) - - if done: r = -20 - # these may helpful in some tasks - # if abs(s_new[0]) >= env.observation_space.high[0]: - # # cart moves more than 2.4 units from the center - # r = -20 - # reward for the distance between cart to the center - 
# r -= abs(s_new[0]) * .1 - - all_r.append(r) - - td_error = critic.learn(s, r, s_new) # learn Value-function : gradient = grad[r + lambda * V(s_new) - V(s)] - actor.learn(s, a, td_error) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] - - s = s_new - t += 1 - - if done or t >= MAX_EP_STEPS: - ep_rs_sum = sum(all_r) - - if 'running_reward' not in globals(): - running_reward = ep_rs_sum - else: - running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 - # start rending if running_reward greater than a threshold - # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True - print("Episode: %d reward: %f running_reward %f took: %.5f" % \ - (i_episode, ep_rs_sum, running_reward, time.time() - episode_time)) - - # Early Stopping for quick check - if t >= MAX_EP_STEPS: - print("Early Stopping") - s = env.reset().astype(np.float32) - rall = 0 - while True: - env.render() - # a = actor.choose_action(s) - a = actor.choose_action_greedy(s) # Hao Dong: it is important for this task - s_new, r, done, info = env.step(a) - s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32) - rall += r - s = s_new - if done: - print("reward", rall) - s = env.reset().astype(np.float32) - rall = 0 - break diff --git a/examples/reinforcement_learning/tutorial_format.py b/examples/reinforcement_learning/tutorial_format.py new file mode 100644 index 000000000..f3e9a7e50 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_format.py @@ -0,0 +1,99 @@ +# the format of turorial algorithm # +# please heavily annotate the code # +''' +Algorithm Name +------------------------ +Briefly describe the algorithms, add some details. + +Reference +--------- +original paper: e.g. https://arxiv.org/pdf/1802.09477.pdf +website: ... + + +Environment +----------- +e.g. Openai Gym Pendulum-v0, continuous action space + +Prerequisites +--------------- +tensorflow >=2.0.0a0 +tensorlayer >=2.0.0 +... 
+ +To run +------- +python tutorial_***.py --train/test + +''' + +import argparse +import time + +import numpy as np + +import tensorflow as tf + +# import 'other package name' + +np.random.seed(2) +tf.random.set_seed(2) # reproducible + +# add arguments in command --train/test +parser = argparse.ArgumentParser(description='Train or test neural net motor controller.') +parser.add_argument('--train', dest='train', action='store_true', default=False) +parser.add_argument('--test', dest='test', action='store_true', default=True) +args = parser.parse_args() + +##################### hyper parameters #################### +A = a # description of hyper parameter +B = b # description of hyper parameter + +############################### Algorithm Name #################################### + + +class C(): # algorithm-specific classes + ''' description of class ''' + + def C1(): + ''' description of function''' + + +def D(): # some common functions, could be extracted into utils afterwards + ''' description of function ''' + + +if __name__ == '__main__': + '''initialization of env, buffer, networks in algorithms''' + env = 'env model' + buffer = 'buffer model' + network1 = 'network model1' + network2 = 'network model2' + + # training loop + if args.train: + t0 = time.time() + while NOT_FINISHED: # loop of episodes + while NOT_DONE: # loop of steps in episode + ''' step ''' + ''' train ''' + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 )) + ''' plot , following the format of ./baselines/utils/plot()''' + plot(rewards, Algorithm_name='SAC', Env_name='Pendulum-v0') + ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/save_model() ''' + model.save_weights() + + # testing loop + if args.test: + t0 = time.time() + ''' save weights, implemented in defined classes above, following the format of ./baselines/utils/load_model() ''' + model.load_weights() + + while NOT_FINISHED: # loop of episodes + while NOT_DONE: # loop of steps in episode + ''' step ''' + + print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'\ + .format(episode, all_episodes, episode_reward, time.time()-t0 ) ) diff --git a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py b/examples/reinforcement_learning/tutorial_frozenlake_dqn.py deleted file mode 100644 index 9411da423..000000000 --- a/examples/reinforcement_learning/tutorial_frozenlake_dqn.py +++ /dev/null @@ -1,132 +0,0 @@ -"""Q-Network Q(a, s) - TD Learning, Off-Policy, e-Greedy Exploration (GLIE). - -Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)) -delta_w = R + lambda * Q(newS, newA) - -See David Silver RL Tutorial Lecture 5 - Q-Learning for more details. - -EN: https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.5m3361vlw -CN: https://zhuanlan.zhihu.com/p/25710327 - -Note: Policy Network has been proved to be better than Q-Learning, see tutorial_atari_pong.py - -# The FrozenLake v0 environment -https://gym.openai.com/envs/FrozenLake-v0 -The agent controls the movement of a character in a grid world. Some tiles of -the grid are walkable, and others lead to the agent falling into the water. -Additionally, the movement direction of the agent is uncertain and only partially -depends on the chosen direction. The agent is rewarded for finding a walkable -path to a goal tile. 
-SFFF (S: starting point, safe) -FHFH (F: frozen surface, safe) -FFFH (H: hole, fall to your doom) -HFFG (G: goal, where the frisbee is located) -The episode ends when you reach the goal or fall in a hole. You receive a reward -of 1 if you reach the goal, and zero otherwise. - -""" -import time - -import numpy as np -import tensorflow as tf - -import gym -import tensorlayer as tl - -## enable eager mode -tf.enable_eager_execution() - - -tf.logging.set_verbosity(tf.logging.DEBUG) -tl.logging.set_verbosity(tl.logging.DEBUG) - -env = gym.make('FrozenLake-v0') - -def to_one_hot(i, n_classes=None): - a = np.zeros(n_classes, 'uint8') - a[i] = 1 - return a - -render = False # display the game environment -running_reward = None - - # tf.reset_default_graph() -## Define Q-network q(a,s) that ouput the rewards of 4 actions by given state, i.e. Action-Value Function. -# 4x4 grid can be represented by one-hot vector with 16 integers. - # inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32) - # net = InputLayer(inputs, name='observation') - # net = DenseLayer(net, 4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s') - # y = net.outputs # action-value / rewards of 4 actions -def get_model(inputs_shape): - ni = tl.layers.Input(inputs_shape, name='observation') - nn = tl.layers.Dense(4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s')(ni) - return tl.models.Model(inputs=ni, outputs=nn, name="Q-Network") -qnetwork = get_model([1, 16]) -qnetwork.train() -train_weights = qnetwork.trainable_weights - -# chose action greedily with reward. in Q-Learning, policy is greedy, so we use "max" to select the next action. - # predict = tf.argmax(y, 1) - -## Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values. - # nextQ = tf.placeholder(shape=[1, 4], dtype=tf.float32) - # loss = tl.cost.mean_squared_error(nextQ, y, is_mean=False) # tf.reduce_sum(tf.square(nextQ - y)) - # train_op = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss) -optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1) - -## Set learning parameters -lambd = .99 # decay factor -e = 0.1 # e-Greedy Exploration, the larger the more random -num_episodes = 10000 - -# with tf.Session() as sess: - # tl.layers.initialize_global_variables(sess) -for i in range(num_episodes): - ## Reset environment and get first new observation - episode_time = time.time() - s = env.reset() # observation is state, integer 0 ~ 15 - rAll = 0 - for j in range(99): # step index, maximum step is 99 - if render: env.render() - ## Choose an action by greedily (with e chance of random action) from the Q-network - # a, allQ = sess.run([predict, y], feed_dict={inputs: [to_one_hot(s, 16)]}) - allQ = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).outputs.numpy() - a = np.argmax(allQ, 1) - - ## e-Greedy Exploration !!! sample random action - if np.random.rand(1) < e: - a[0] = env.action_space.sample() - ## Get new state and reward from environment - s1, r, d, _ = env.step(a[0]) - ## Obtain the Q' values by feeding the new state through our network - # Q1 = sess.run(y, feed_dict={inputs: [to_one_hot(s1, 16)]}) - Q1 = qnetwork(np.asarray([to_one_hot(s1, 16)], dtype=np.float32)).outputs.numpy() - - ## Obtain maxQ' and set our target value for chosen action. - maxQ1 = np.max(Q1) # in Q-Learning, policy is greedy, so we use "max" to select the next action. 
- targetQ = allQ - targetQ[0, a[0]] = r + lambd * maxQ1 - ## Train network using target and predicted Q values - # it is not real target Q value, it is just an estimation, - # but check the Q-Learning update formula: - # Q'(s,a) <- Q(s,a) + alpha(r + lambd * maxQ(s',a') - Q(s, a)) - # minimizing |r + lambd * maxQ(s',a') - Q(s, a)|^2 equal to force - # Q'(s,a) ≈ Q(s,a) - # _ = sess.run(train_op, {inputs: [to_one_hot(s, 16)], nextQ: targetQ}) - with tf.GradientTape() as tape: - _qvalues = qnetwork(np.asarray([to_one_hot(s, 16)], dtype=np.float32)).outputs - _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False) - grad = tape.gradient(_loss, train_weights) - optimizer.apply_gradients(zip(grad, train_weights)) - - rAll += r - s = s1 - ## Reduce chance of random action if an episode is done. - if d ==True: - e = 1. / ((i / 50) + 10) # reduce e, GLIE: Greey in the limit with infinite Exploration - break - - ## Note that, the rewards here with random action - running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01 - print("Episode [%d/%d] sum reward: %f running reward: %f took: %.5fs %s" % \ - (i, num_episodes, rAll, running_reward, time.time() - episode_time, '' if rAll == 0 else ' !!!!!!!!')) diff --git a/examples/reinforcement_learning/tutorial_prioritized_replay.py b/examples/reinforcement_learning/tutorial_prioritized_replay.py new file mode 100644 index 000000000..8f5f60404 --- /dev/null +++ b/examples/reinforcement_learning/tutorial_prioritized_replay.py @@ -0,0 +1,494 @@ +""" +Prioritized Experience Replay +------------------------ +Prioritized experience replay is an efficient replay method that replay +important transitions more frequently. Segment tree data structure is used to +speed up indexing. + + +Reference: +------------------------ +Schaul T, Quan J, Antonoglou I, et al. Prioritized experience replay[J]. arXiv +preprint arXiv:1511.05952, 2015. + +Dhariwal P, Hesse C, Klimov O, et al. Openai baselines (2017)[J]. URL +https://github. com/opfenai/baselines. 
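+
+Each stored transition is sampled with probability proportional to priority**alpha,
+and the induced bias is corrected with importance-sampling weights controlled by beta,
+which is annealed towards 1 during training (see PrioritizedReplayBuffer.sample below).
+
+Rough usage sketch of the buffer defined in this file (the identifiers and numbers
+here are placeholders, not part of the training script below):
+    buffer = PrioritizedReplayBuffer(size=1024, alpha=0.6, beta=0.4)
+    buffer.add(o, a, r, o_, done)
+    b_o, b_a, b_r, b_o_, b_d, weights, idxes = buffer.sample(batch_size=32)
+    buffer.update_priorities(idxes, np.clip(np.abs(td_errors), 1e-6, None))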
+ + +Environment: +------------------------ +Cartpole and Pong in OpenAI Gym + + +Requirements: +------------------------ +tensorflow>=2.0.0a0 +tensorlayer>=2.0.0 + + +To run: +------------------------ +python tutorial_prioritized_replay.py --mode=train +python tutorial_prioritized_replay.py --mode=test --save_path=per/8000.npz +""" +import argparse +import operator +import os +import random +import time + +import numpy as np + +import tensorflow as tf +import tensorlayer as tl +from tutorial_wrappers import build_env + +parser = argparse.ArgumentParser() +parser.add_argument('--mode', help='train or test', default='train') +parser.add_argument( + '--save_path', default='per', help='folder to save if mode == train else model path,' + 'qnet will be saved once target net update' +) +parser.add_argument('--seed', help='random seed', type=int, default=0) +parser.add_argument('--env_id', default='CartPole-v0', help='CartPole-v0 or PongNoFrameskip-v4') +args = parser.parse_args() + +if args.mode == 'train': + os.makedirs(args.save_path, exist_ok=True) +random.seed(args.seed) +np.random.seed(args.seed) +tf.random.set_seed(args.seed) # reproducible +env_id = args.env_id +env = build_env(env_id, seed=args.seed) + +# #################### hyper parameters #################### +if env_id == 'CartPole-v0': + qnet_type = 'MLP' + number_timesteps = 10000 # total number of time steps to train on + explore_timesteps = 100 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 5e-3 # learning rate + buffer_size = 1000 # replay buffer size + target_q_update_freq = 50 # how frequency target q net update + ob_scale = 1.0 # scale observations +else: + # reward will increase obviously after 1e5 time steps + qnet_type = 'CNN' + number_timesteps = int(1e6) # total number of time steps to train on + explore_timesteps = 1e5 + # epsilon-greedy schedule, final exploit prob is 0.99 + epsilon = lambda i_iter: 1 - 0.99 * min(1, i_iter / explore_timesteps) + lr = 1e-4 # learning rate + buffer_size = 10000 # replay buffer size + target_q_update_freq = 200 # how frequency target q net update + ob_scale = 1.0 / 255 # scale observations + +in_dim = env.observation_space.shape +out_dim = env.action_space.n +reward_gamma = 0.99 # reward discount +batch_size = 32 # batch size for sampling from replay buffer +warm_start = buffer_size / 10 # sample times befor learning +prioritized_replay_alpha = 0.6 # alpha in PER +prioritized_replay_beta0 = 0.4 # initial beta in PER + + +# ############################## PER #################################### +class MLP(tl.models.Model): + + def __init__(self, name): + super(MLP, self).__init__(name=name) + self.h1 = tl.layers.Dense(64, tf.nn.tanh, in_channels=in_dim[0]) + self.qvalue = tl.layers.Dense(out_dim, in_channels=64, name='q', W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + return self.qvalue(self.h1(ni)) + + +class CNN(tl.models.Model): + + def __init__(self, name): + super(CNN, self).__init__(name=name) + h, w, in_channels = in_dim + dense_in_channels = 64 * ((h - 28) // 8) * ((w - 28) // 8) + self.conv1 = tl.layers.Conv2d( + 32, (8, 8), (4, 4), tf.nn.relu, 'VALID', in_channels=in_channels, name='conv2d_1', + W_init=tf.initializers.GlorotUniform() + ) + self.conv2 = tl.layers.Conv2d( + 64, (4, 4), (2, 2), tf.nn.relu, 'VALID', in_channels=32, name='conv2d_2', + W_init=tf.initializers.GlorotUniform() + ) + self.conv3 = tl.layers.Conv2d( + 64, (3, 3), (1, 1), tf.nn.relu, 'VALID', 
in_channels=64, name='conv2d_3', + W_init=tf.initializers.GlorotUniform() + ) + self.flatten = tl.layers.Flatten(name='flatten') + self.preq = tl.layers.Dense( + 256, tf.nn.relu, in_channels=dense_in_channels, name='pre_q', W_init=tf.initializers.GlorotUniform() + ) + self.qvalue = tl.layers.Dense(out_dim, in_channels=256, name='q', W_init=tf.initializers.GlorotUniform()) + + def forward(self, ni): + feature = self.flatten(self.conv3(self.conv2(self.conv1(ni)))) + return self.qvalue(self.preq(feature)) + + +class SegmentTree(object): + + def __init__(self, capacity, operation, neutral_element): + """Build a Segment Tree data structure. + + https://en.wikipedia.org/wiki/Segment_tree + + Can be used as regular array, but with two + important differences: + + a) setting item's value is slightly slower. + It is O(lg capacity) instead of O(1). + b) user has access to an efficient ( O(log segment size) ) + `reduce` operation which reduces `operation` over + a contiguous subsequence of items in the array. + + Paramters + --------- + capacity: int + Total size of the array - must be a power of two. + operation: lambda obj, obj -> obj + and operation for combining elements (eg. sum, max) + must form a mathematical group together with the set of + possible values for array elements (i.e. be associative) + neutral_element: obj + neutral element for the operation above. eg. float('-inf') + for max and 0 for sum. + """ + assert capacity > 0 and capacity & (capacity - 1) == 0, \ + "capacity must be positive and a power of 2." + self._capacity = capacity + self._value = [neutral_element for _ in range(2 * capacity)] + self._operation = operation + + def _reduce_helper(self, start, end, node, node_start, node_end): + if start == node_start and end == node_end: + return self._value[node] + mid = (node_start + node_end) // 2 + if end <= mid: + return self._reduce_helper(start, end, 2 * node, node_start, mid) + else: + if mid + 1 <= start: + return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) + else: + return self._operation( + self._reduce_helper(start, mid, 2 * node, node_start, mid), + self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) + ) + + def reduce(self, start=0, end=None): + """Returns result of applying `self.operation` + to a contiguous subsequence of the array. + + Parameters + ---------- + start: int + beginning of the subsequence + end: int + end of the subsequences + + Returns + ------- + reduced: obj + result of reducing self.operation over the specified range of array. + """ + if end is None: + end = self._capacity + if end < 0: + end += self._capacity + end -= 1 + return self._reduce_helper(start, end, 1, 0, self._capacity - 1) + + def __setitem__(self, idx, val): + # index of the leaf + idx += self._capacity + self._value[idx] = val + idx //= 2 + while idx >= 1: + self._value[idx] = self._operation(self._value[2 * idx], self._value[2 * idx + 1]) + idx //= 2 + + def __getitem__(self, idx): + assert 0 <= idx < self._capacity + return self._value[self._capacity + idx] + + +class SumSegmentTree(SegmentTree): + + def __init__(self, capacity): + super(SumSegmentTree, self).__init__(capacity=capacity, operation=operator.add, neutral_element=0.0) + + def sum(self, start=0, end=None): + """Returns arr[start] + ... + arr[end]""" + return super(SumSegmentTree, self).reduce(start, end) + + def find_prefixsum_idx(self, prefixsum): + """Find the highest index `i` in the array such that + sum(arr[0] + arr[1] + ... 
+ arr[i - i]) <= prefixsum + + if array values are probabilities, this function + allows to sample indexes according to the discrete + probability efficiently. + + Parameters + ---------- + perfixsum: float + upperbound on the sum of array prefix + + Returns + ------- + idx: int + highest index satisfying the prefixsum constraint + """ + assert 0 <= prefixsum <= self.sum() + 1e-5 + idx = 1 + while idx < self._capacity: # while non-leaf + if self._value[2 * idx] > prefixsum: + idx = 2 * idx + else: + prefixsum -= self._value[2 * idx] + idx = 2 * idx + 1 + return idx - self._capacity + + +class MinSegmentTree(SegmentTree): + + def __init__(self, capacity): + super(MinSegmentTree, self).__init__(capacity=capacity, operation=min, neutral_element=float('inf')) + + def min(self, start=0, end=None): + """Returns min(arr[start], ..., arr[end])""" + + return super(MinSegmentTree, self).reduce(start, end) + + +class ReplayBuffer(object): + + def __init__(self, size): + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, *args): + if self._next_idx >= len(self._storage): + self._storage.append(args) + else: + self._storage[self._next_idx] = args + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + b_o, b_a, b_r, b_o_, b_d = [], [], [], [], [] + for i in idxes: + o, a, r, o_, d = self._storage[i] + b_o.append(o) + b_a.append(a) + b_r.append(r) + b_o_.append(o_) + b_d.append(d) + return ( + np.stack(b_o).astype('float32') * ob_scale, + np.stack(b_a).astype('int32'), + np.stack(b_r).astype('float32'), + np.stack(b_o_).astype('float32') * ob_scale, + np.stack(b_d).astype('float32'), + ) + + def sample(self, batch_size): + indexes = range(len(self._storage)) + idxes = [random.choice(indexes) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +class PrioritizedReplayBuffer(ReplayBuffer): + + def __init__(self, size, alpha, beta): + """Create Prioritized Replay buffer. + + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. 
+ alpha: float + how much prioritization is used + (0 - no prioritization, 1 - full prioritization) + + See Also + -------- + ReplayBuffer.__init__ + """ + super(PrioritizedReplayBuffer, self).__init__(size) + assert alpha >= 0 + self._alpha = alpha + + it_capacity = 1 + while it_capacity < size: + it_capacity *= 2 + + self._it_sum = SumSegmentTree(it_capacity) + self._it_min = MinSegmentTree(it_capacity) + self._max_priority = 1.0 + self.beta = beta + + def add(self, *args): + """See ReplayBuffer.store_effect""" + idx = self._next_idx + super().add(*args) + self._it_sum[idx] = self._max_priority**self._alpha + self._it_min[idx] = self._max_priority**self._alpha + + def _sample_proportional(self, batch_size): + res = [] + p_total = self._it_sum.sum(0, len(self._storage) - 1) + every_range_len = p_total / batch_size + for i in range(batch_size): + mass = random.random() * every_range_len + i * every_range_len + idx = self._it_sum.find_prefixsum_idx(mass) + res.append(idx) + return res + + def sample(self, batch_size): + """Sample a batch of experiences""" + idxes = self._sample_proportional(batch_size) + + it_sum = self._it_sum.sum() + p_min = self._it_min.min() / it_sum + max_weight = (p_min * len(self._storage))**(-self.beta) + + p_samples = np.asarray([self._it_sum[idx] for idx in idxes]) / it_sum + weights = (p_samples * len(self._storage))**(-self.beta) / max_weight + encoded_sample = self._encode_sample(idxes) + return encoded_sample + (weights, idxes) + + def update_priorities(self, idxes, priorities): + """Update priorities of sampled transitions""" + assert len(idxes) == len(priorities) + for idx, priority in zip(idxes, priorities): + assert priority > 0 + assert 0 <= idx < len(self._storage) + self._it_sum[idx] = priority**self._alpha + self._it_min[idx] = priority**self._alpha + + self._max_priority = max(self._max_priority, priority) + + +def huber_loss(x): + """Loss function for value""" + return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5) + + +def sync(net, net_tar): + """Copy q network to target q network""" + for var, var_tar in zip(net.trainable_weights, net_tar.trainable_weights): + var_tar.assign(var) + + +if __name__ == '__main__': + if args.mode == 'train': + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + qnet.train() + trainabel_weights = qnet.trainable_weights + targetqnet = MLP('targetq') if qnet_type == 'MLP' else CNN('targetq') + targetqnet.infer() + sync(qnet, targetqnet) + optimizer = tf.optimizers.Adam(learning_rate=lr) + buffer = PrioritizedReplayBuffer(buffer_size, prioritized_replay_alpha, prioritized_replay_beta0) + + o = env.reset() + nepisode = 0 + t = time.time() + for i in range(1, number_timesteps + 1): + eps = epsilon(i) + buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps + + # select action + if random.random() < eps: + a = int(random.random() * out_dim) + else: + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action and feed to replay buffer + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + buffer.add(o, a, r, o_, done) + + if i >= warm_start: + # sync q net and target q net + if i % target_q_update_freq == 0: + sync(qnet, targetqnet) + path = os.path.join(args.save_path, '{}.npz'.format(i)) + tl.files.save_npz(qnet.trainable_weights, name=path) + + # sample from replay buffer + b_o, b_a, b_r, b_o_, b_d, weights, idxs \ + = buffer.sample(batch_size) + + # q estimation + b_q_ = (1 - b_d) * tf.reduce_max(targetqnet(b_o_), 
1) + + # calculate loss + with tf.GradientTape() as q_tape: + b_q = tf.reduce_sum(qnet(b_o) * tf.one_hot(b_a, out_dim), 1) + abs_td_error = tf.abs(b_q - (b_r + reward_gamma * b_q_)) + priorities = np.clip(abs_td_error.numpy(), 1e-6, None) + buffer.update_priorities(idxs, priorities) + loss = tf.reduce_mean(weights * huber_loss(abs_td_error)) + + # backward gradients + q_grad = q_tape.gradient(loss, trainabel_weights) + optimizer.apply_gradients(zip(q_grad, trainabel_weights)) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + fps = int(length / (time.time() - t)) + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}, FPS: {}'.format(i, nepisode, reward, length, fps) + ) + t = time.time() + else: + qnet = MLP('q') if qnet_type == 'MLP' else CNN('q') + tl.files.load_and_assign_npz(name=args.save_path, network=qnet) + qnet.eval() + + nepisode = 0 + o = env.reset() + for i in range(1, number_timesteps + 1): + obv = np.expand_dims(o, 0).astype('float32') * ob_scale + a = qnet(obv).numpy().argmax(1)[0] + + # execute action + # note that `_` tail in var name means next + o_, r, done, info = env.step(a) + + if done: + o = env.reset() + else: + o = o_ + + # episode in info is real (unwrapped) message + if info.get('episode'): + nepisode += 1 + reward, length = info['episode']['r'], info['episode']['l'] + print( + 'Time steps so far: {}, episode so far: {}, ' + 'episode reward: {:.4f}, episode length: {}'.format(i, nepisode, reward, length) + ) diff --git a/examples/reinforcement_learning/tutorial_wrappers.py b/examples/reinforcement_learning/tutorial_wrappers.py new file mode 100644 index 000000000..a53e5102d --- /dev/null +++ b/examples/reinforcement_learning/tutorial_wrappers.py @@ -0,0 +1,564 @@ +"""Env wrappers +Note that this file is adapted from `https://pypi.org/project/gym-vec-env` and +`https://github.com/openai/baselines/blob/master/baselines/common/*wrappers.py` +""" +from collections import deque +from functools import partial +from multiprocessing import Pipe, Process, cpu_count +from sys import platform + +import numpy as np + +import cv2 +import gym +from gym import spaces + +__all__ = ( + 'build_env', # build env + 'TimeLimit', # Time limit wrapper + 'NoopResetEnv', # Run random number of no-ops on reset + 'FireResetEnv', # Reset wrapper for envs with fire action + 'EpisodicLifeEnv', # end-of-life == end-of-episode wrapper + 'MaxAndSkipEnv', # skip frame wrapper + 'ClipRewardEnv', # clip reward wrapper + 'WarpFrame', # warp observation wrapper + 'FrameStack', # stack frame wrapper + 'LazyFrames', # lazy store wrapper + 'RewardScaler', # reward scale + 'SubprocVecEnv', # vectorized env wrapper + 'VecFrameStack', # stack frames in vectorized env + 'Monitor', # Episode reward and length monitor +) +cv2.ocl.setUseOpenCL(False) +# env_id -> env_type +id2type = dict() +for _env in gym.envs.registry.all(): + id2type[_env.id] = _env._entry_point.split(':')[0].rsplit('.', 1)[1] + + +def build_env(env_id, vectorized=False, seed=0, reward_scale=1.0, nenv=0): + """Build env based on options""" + env_type = id2type[env_id] + nenv = nenv or cpu_count() // (1 + (platform == 'darwin')) + stack = env_type == 'atari' + if not vectorized: + env = _make_env(env_id, env_type, seed, reward_scale, stack) + else: + env = _make_vec_env(env_id, env_type, nenv, seed, reward_scale, stack) + + return env + + +def 
_make_env(env_id, env_type, seed, reward_scale, frame_stack=True): + """Make single env""" + if env_type == 'atari': + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + env = Monitor(env) + # deepmind wrap + env = EpisodicLifeEnv(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env) + env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) + elif env_type == 'classic_control': + env = Monitor(gym.make(env_id)) + else: + raise NotImplementedError + if reward_scale != 1: + env = RewardScaler(env, reward_scale) + env.seed(seed) + return env + + +def _make_vec_env(env_id, env_type, nenv, seed, reward_scale, frame_stack=True): + """Make vectorized env""" + env = SubprocVecEnv([partial(_make_env, env_id, env_type, seed + i, reward_scale, False) for i in range(nenv)]) + if frame_stack: + env = VecFrameStack(env, 4) + return env + + +class TimeLimit(gym.Wrapper): + + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + + +class NoopResetEnv(gym.Wrapper): + + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. + """ + super(NoopResetEnv, self).__init__(env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super(FireResetEnv, self).__init__(env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
+ """ + super(EpisodicLifeEnv, self).__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if 0 < lives < self.lives: + # for Qbert sometimes we stay in lives == 0 condition for a few + # frames so it's important to keep lives > 0, so that we only reset + # once the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + super(MaxAndSkipEnv, self).__init__(env) + # most recent raw observations (for max pooling across time steps) + shape = (2, ) + env.observation_space.shape + self._obs_buffer = np.zeros(shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = info = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class ClipRewardEnv(gym.RewardWrapper): + + def __init__(self, env): + super(ClipRewardEnv, self).__init__(env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + + def __init__(self, env, width=84, height=84, grayscale=True): + """Warp frames to 84x84 as done in the Nature paper and later work.""" + super(WarpFrame, self).__init__(env) + self.width = width + self.height = height + self.grayscale = grayscale + shape = (self.height, self.width, 1 if self.grayscale else 3) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8) + + def observation(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + size = (self.width, self.height) + frame = cv2.resize(frame, size, interpolation=cv2.INTER_AREA) + if self.grayscale: + frame = np.expand_dims(frame, -1) + return frame + + +class FrameStack(gym.Wrapper): + + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. 
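+        For example, with k=4 on top of WarpFrame (84x84 grayscale frames) the
+        observation shape becomes (84, 84, 4).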
+ See Also `LazyFrames` + """ + super(FrameStack, self).__init__(env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +class LazyFrames(object): + + def __init__(self, frames): + """This object ensures that common frames between the observations are + only stored once. It exists purely to optimize memory usage which can be + huge for DQN's 1M frames replay buffers. + + This object should only be converted to numpy array before being passed + to the model. You'd not believe how complex the previous solution was. + """ + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + +class RewardScaler(gym.RewardWrapper): + """Bring rewards to a reasonable scale for PPO. + This is incredibly important and effects performance drastically. + """ + + def __init__(self, env, scale=0.01): + super(RewardScaler, self).__init__(env) + self.scale = scale + + def reward(self, reward): + return reward * self.scale + + +class VecFrameStack(object): + + def __init__(self, env, k): + self.env = env + self.k = k + self.action_space = env.action_space + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + shape = shp[:-1] + (shp[-1] * k, ) + self.observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return np.asarray(self._get_ob()) + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return np.asarray(self._get_ob()), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + + +def _worker(remote, parent_remote, env_fn_wrapper): + parent_remote.close() + env = env_fn_wrapper.x() + while True: + cmd, data = remote.recv() + if cmd == 'step': + ob, reward, done, info = env.step(data) + if done: + ob = env.reset() + remote.send((ob, reward, done, info)) + elif cmd == 'reset': + ob = env.reset() + remote.send(ob) + elif cmd == 'reset_task': + ob = env._reset_task() + remote.send(ob) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents + """ + + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +class SubprocVecEnv(object): + + def __init__(self, env_fns): + """ + envs: list of gym environments to 
run in subprocesses + """ + self.num_envs = len(env_fns) + + self.waiting = False + self.closed = False + nenvs = len(env_fns) + self.nenvs = nenvs + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) + zipped_args = zip(self.work_remotes, self.remotes, env_fns) + self.ps = [ + Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zipped_args + ] + + for p in self.ps: + # if the main process crashes, we should not cause things to hang + p.daemon = True + p.start() + for remote in self.work_remotes: + remote.close() + + self.remotes[0].send(('get_spaces', None)) + observation_space, action_space = self.remotes[0].recv() + self.observation_space = observation_space + self.action_space = action_space + + def _step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + You should not call this if a step_async run is + already pending. + """ + for remote, action in zip(self.remotes, actions): + remote.send(('step', action)) + self.waiting = True + + def _step_wait(self): + """ + Wait for the step taken with step_async(). + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a tuple of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + results = [remote.recv() for remote in self.remotes] + self.waiting = False + obs, rews, dones, infos = zip(*results) + return np.stack(obs), np.stack(rews), np.stack(dones), infos + + def reset(self): + """ + Reset all the environments and return an array of + observations, or a tuple of observation arrays. + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. 
+ """ + for remote in self.remotes: + remote.send(('reset', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def _reset_task(self): + for remote in self.remotes: + remote.send(('reset_task', None)) + return np.stack([remote.recv() for remote in self.remotes]) + + def close(self): + if self.closed: + return + if self.waiting: + for remote in self.remotes: + remote.recv() + for remote in self.remotes: + remote.send(('close', None)) + for p in self.ps: + p.join() + self.closed = True + + def __len__(self): + return self.nenvs + + def step(self, actions): + self._step_async(actions) + return self._step_wait() + + +class Monitor(gym.Wrapper): + + def __init__(self, env): + super(Monitor, self).__init__(env) + self._monitor_rewards = None + + def reset(self, **kwargs): + self._monitor_rewards = [] + return self.env.reset(**kwargs) + + def step(self, action): + o_, r, done, info = self.env.step(action) + self._monitor_rewards.append(r) + if done: + info['episode'] = {'r': sum(self._monitor_rewards), 'l': len(self._monitor_rewards)} + return o_, r, done, info + + +class NormalizedActions(gym.ActionWrapper): + + def _action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = low + (action + 1.0) * 0.5 * (high - low) + action = np.clip(action, low, high) + + return action + + def _reverse_action(self, action): + low = self.action_space.low + high = self.action_space.high + + action = 2 * (action - low) / (high - low) - 1 + action = np.clip(action, low, high) + + return action + + +def unit_test(): + env_id = 'CartPole-v0' + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, False) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + done = False + while not done: + a = unwrapped_env.action_space.sample() + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + env_id = 'PongNoFrameskip-v4' + nenv = 2 + unwrapped_env = gym.make(env_id) + wrapped_env = build_env(env_id, True, nenv=nenv) + o = wrapped_env.reset() + print('Reset {} observation shape {}'.format(env_id, o.shape)) + for _ in range(1000): + a = [unwrapped_env.action_space.sample() for _ in range(nenv)] + a = np.asarray(a, 'int64') + o_, r, done, info = wrapped_env.step(a) + print('Take action {} get reward {} info {}'.format(a, r, info)) + + +if __name__ == '__main__': + unit_test() diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py index aecc69f61..bc0bae141 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_dynamic.py @@ -1,7 +1,9 @@ #! 
/usr/bin/python # -*- coding: utf8 -*- import time + import numpy as np + import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * @@ -53,6 +55,7 @@ def pad_distort_ims_fn(X): ##================== DEFINE MODEL ============================================## class Net(Model): + def __init__(self): super(Net, self).__init__() diff --git a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py index c9a93629f..515e69967 100644 --- a/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py +++ b/examples/spatial_transformer_network/tutorial_spatial_transformer_network_static.py @@ -1,7 +1,9 @@ #! /usr/bin/python # -*- coding: utf8 -*- import time + import numpy as np + import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * @@ -11,6 +13,7 @@ X_train, y_train, X_val, y_val, X_test, y_test = \ tl.files.load_mnist_dataset(shape=(-1, 28, 28, 1)) + def pad_distort_im_fn(x): """ Zero pads an image to 40x40, and distort it. @@ -120,7 +123,7 @@ def get_model(inputs_shape): X_train_a = tf.expand_dims(X_train_a, 3) _logits, _ = net(X_train_a) # alternatively, you can use MLP(x, is_train=False) and remove MLP.eval() - train_loss += tl.cost.cross_entropy(_logits, y_train_a, name='eval_train_loss') + train_loss += tl.cost.cross_entropy(_logits, y_train_a, name='eval_train_loss') train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_train_a)) n_iter += 1 print(" train loss: %f" % (train_loss / n_iter)) diff --git a/examples/text_classification/tutorial_imdb_fasttext.py b/examples/text_classification/tutorial_imdb_fasttext.py index 2c2c7aed0..94de9a66f 100644 --- a/examples/text_classification/tutorial_imdb_fasttext.py +++ b/examples/text_classification/tutorial_imdb_fasttext.py @@ -31,8 +31,8 @@ import time import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer.layers import * from tensorlayer.models import * @@ -82,6 +82,7 @@ def forward(self, x): z = self.dense2(z) return z + def augment_with_ngrams(unigrams, unigram_vocab_size, n_buckets, n=2): """Augment unigram features with hashed n-gram features.""" @@ -148,11 +149,12 @@ def train_test_and_save_model(): train_accuracy.append(accuracy) if len(train_accuracy) % N_STEPS_TO_PRINT == 0: - print("\t[%d/%d][%d]accuracy " % (epoch + 1, N_EPOCH, len(train_accuracy)), - np.mean(train_accuracy[-N_STEPS_TO_PRINT:])) + print( + "\t[%d/%d][%d]accuracy " % (epoch + 1, N_EPOCH, len(train_accuracy)), + np.mean(train_accuracy[-N_STEPS_TO_PRINT:]) + ) - print("\tSummary: time %.5fs, overall accuracy" % (time.time() - start_time), - np.mean(train_accuracy)) + print("\tSummary: time %.5fs, overall accuracy" % (time.time() - start_time), np.mean(train_accuracy)) # evaluation and testing model.eval() diff --git a/examples/text_generation/data/__init__.py b/examples/text_generation/data/__init__.py index 7acccd1ee..5feb25700 100644 --- a/examples/text_generation/data/__init__.py +++ b/examples/text_generation/data/__init__.py @@ -1,4 +1,5 @@ from __future__ import absolute_import from . import imagenet_classes + # from . 
import
diff --git a/examples/text_generation/tutorial_generate_text.py b/examples/text_generation/tutorial_generate_text.py
index d157b1ed5..2df0ed018 100644
--- a/examples/text_generation/tutorial_generate_text.py
+++ b/examples/text_generation/tutorial_generate_text.py
@@ -28,8 +28,8 @@
 import nltk
 import numpy as np
-import tensorflow as tf
+import tensorflow as tf
 import tensorlayer as tl
 from tensorlayer.layers import *
 from tensorlayer.models import Model
diff --git a/examples/text_ptb/tutorial_ptb_lstm.py b/examples/text_ptb/tutorial_ptb_lstm.py
index 6f215abba..2fa1f331e 100644
--- a/examples/text_ptb/tutorial_ptb_lstm.py
+++ b/examples/text_ptb/tutorial_ptb_lstm.py
@@ -104,8 +104,8 @@
 import time
 import numpy as np
-import tensorflow as tf
+import tensorflow as tf
 import tensorlayer as tl
 from tensorlayer.models import Model
diff --git a/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py
new file mode 100644
index 000000000..9fccca66a
--- /dev/null
+++ b/examples/text_ptb/tutorial_ptb_lstm_state_is_tuple.py
@@ -0,0 +1,618 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+"""Example of Synced sequence input and output.
+
+This is a reimplementation of the official TensorFlow PTB example in:
+tensorflow/models/rnn/ptb
+
+The batch_size can be seen as the number of concurrent computations.
+As the following example shows, the first batch learns the sequence information by using items 0 to 9.
+The second batch learns the sequence information by using items 10 to 19.
+So it ignores the information from 9 to 10!
+Only if we set batch_size = 1 will it consider all the information from 0 to 20.
+
+The meaning of batch_size here is not the same as in the MNIST example. In the MNIST example,
+batch_size reflects how many examples we consider in each iteration, while in the
+PTB example, batch_size is how many concurrent processes (segments)
+are used to speed up the computation.
+
+Some information will be ignored if batch_size > 1; however, if your dataset
+is "long" enough (a text corpus usually has billions of words), the ignored
+information will not affect the final result.
+
+In the PTB tutorial, we set batch_size = 20, so we cut the dataset into 20 segments.
+At the beginning of each epoch, we initialize (reset) the 20 RNN states for the 20
+segments, then go through the 20 segments separately.
+
+The training data will be generated as follows:
+
+>>> train_data = [i for i in range(20)]
+>>> for batch in tl.iterate.ptb_iterator(train_data, batch_size=2, num_steps=3):
+>>>     x, y = batch
+>>>     print(x, '\n', y)
+... [[ 0 1 2] <---x 1st subset/ iteration
+... [10 11 12]]
+... [[ 1 2 3] <---y
+... [11 12 13]]
+...
+... [[ 3 4 5] <--- 1st batch input 2nd subset/ iteration
+... [13 14 15]] <--- 2nd batch input
+... [[ 4 5 6] <--- 1st batch target
+... [14 15 16]] <--- 2nd batch target
+...
+... [[ 6 7 8] 3rd subset/ iteration
+... [16 17 18]]
+... [[ 7 8 9]
+... [17 18 19]]
+
+Hao Dong: This example can also be considered as pre-training of the word
+embedding matrix.
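+
+For example, with the small config used below (batch_size = 20, num_steps = 20),
+the 929,589 training words are cut into 20 segments, and one epoch takes roughly
+
+>>> epoch_size = ((929589 // 20) - 1) // 20
+>>> print(epoch_size)
+... 2323
+
+iterations, which is the same ``epoch_size`` computed inside ``main()``.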
+ +About RNN +---------- +$ Karpathy Blog : http://karpathy.github.io/2015/05/21/rnn-effectiveness/ + +More TensorFlow official RNN examples can be found here +--------------------------------------------------------- +$ RNN for PTB : https://www.tensorflow.org/versions/master/tutorials/recurrent/index.html#recurrent-neural-networks +$ Seq2seq : https://www.tensorflow.org/versions/master/tutorials/seq2seq/index.html#sequence-to-sequence-models +$ translation : tensorflow/models/rnn/translate + +tensorflow (0.9.0) + +Example / benchmark for building a PTB LSTM model. + +Trains the model described in: +(Zaremba, et. al.) Recurrent Neural Network Regularization +http://arxiv.org/abs/1409.2329 + +There are 3 supported model configurations: +=========================================== +| config | epochs | train | valid | test +=========================================== +| small | 13 | 37.99 | 121.39 | 115.91 +| medium | 39 | 48.45 | 86.16 | 82.07 +| large | 55 | 37.87 | 82.62 | 78.29 +The exact results may vary depending on the random initialization. + +The hyperparameters used in the model: +- init_scale - the initial scale of the weights +- learning_rate - the initial value of the learning rate +- max_grad_norm - the maximum permissible norm of the gradient +- num_layers - the number of LSTM layers +- num_steps - the number of unrolled steps of LSTM +- hidden_size - the number of LSTM units +- max_epoch - the number of epochs trained with the initial learning rate +- max_max_epoch - the total number of epochs for training +- keep_prob - the probability of keeping weights in the dropout layer +- lr_decay - the decay of the learning rate for each epoch after "max_epoch" +- batch_size - the batch size + +The data required for this example is in the data/ dir of the +PTB dataset from Tomas Mikolov's webpage: + +$ wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz +$ tar xvf simple-examples.tgz + +A) use the zero_state function on the cell object + +B) for an rnn, all time steps share weights. We use one matrix to keep all +gate weights. Split by column into 4 parts to get the 4 gate weight matrices. + +""" + +import sys +import time + +import numpy as np + +import tensorflow as tf +import tensorlayer as tl + +tf.logging.set_verbosity(tf.logging.DEBUG) +tl.logging.set_verbosity(tl.logging.DEBUG) + +flags = tf.app.flags + +flags.DEFINE_string("model", "small", "A type of model. Possible options are: small, medium, large.") + +if (tf.VERSION >= '1.5'): + # parse flags + flags.FLAGS(sys.argv, known_only=True) + flags.ArgumentParser() + +FLAGS = flags.FLAGS + +tf.logging.set_verbosity(tf.logging.DEBUG) + + +def main(_): + """ + The core of the model consists of an LSTM cell that processes one word at + a time and computes probabilities of the possible continuations of the + sentence. The memory state of the network is initialized with a vector + of zeros and gets updated after reading each word. Also, for computational + reasons, we will process data in mini-batches of size batch_size. + """ + if FLAGS.model == "small": + init_scale = 0.1 + learning_rate = 1. 
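+        # Note: this is only the initial learning rate. In the training loop below it is
+        # decayed per epoch as  learning_rate * lr_decay ** max(epoch - max_epoch, 0),
+        # see the sess.run(tf.assign(lr, ...)) call in main().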
+        max_grad_norm = 5
+        num_steps = 20
+        hidden_size = 200
+        max_epoch = 4
+        max_max_epoch = 13
+        keep_prob = 1.0
+        lr_decay = 0.5
+        batch_size = 20
+        vocab_size = 10000
+    elif FLAGS.model == "medium":
+        init_scale = 0.05
+        learning_rate = 1.0
+        max_grad_norm = 5
+        # num_layers = 2
+        num_steps = 35
+        hidden_size = 650
+        max_epoch = 6
+        max_max_epoch = 39
+        keep_prob = 0.5
+        lr_decay = 0.8
+        batch_size = 20
+        vocab_size = 10000
+    elif FLAGS.model == "large":
+        init_scale = 0.04
+        learning_rate = 1.0
+        max_grad_norm = 10
+        # num_layers = 2
+        num_steps = 35
+        hidden_size = 1500
+        max_epoch = 14
+        max_max_epoch = 55
+        keep_prob = 0.35
+        lr_decay = 1 / 1.15
+        batch_size = 20
+        vocab_size = 10000
+    else:
+        raise ValueError("Invalid model: %s" % FLAGS.model)
+
+    # Load PTB dataset
+    train_data, valid_data, test_data, vocab_size = tl.files.load_ptb_dataset()
+    # train_data = train_data[0:int(100000/5)]  # for fast testing
+    print('len(train_data) {}'.format(len(train_data)))  # 929589 a list of int
+    print('len(valid_data) {}'.format(len(valid_data)))  # 73760 a list of int
+    print('len(test_data) {}'.format(len(test_data)))  # 82430 a list of int
+    print('vocab_size {}'.format(vocab_size))  # 10000
+
+    sess = tf.InteractiveSession()
+
+    # One int represents one word; the meaning of batch_size here is not the
+    # same as in the MNIST example, it is the number of concurrent processes
+    # used for computational reasons.
+
+    # Training and Validation
+    input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
+    targets = tf.placeholder(tf.int32, [batch_size, num_steps])
+    # Testing (Evaluation)
+    input_data_test = tf.placeholder(tf.int32, [1, 1])
+    targets_test = tf.placeholder(tf.int32, [1, 1])
+
+    def inference(x, is_training, num_steps, reuse=None):
+        """If reuse is True, the inference reuses the existing parameters,
+        so the different inference graphs share the same parameters.
+
+        Note:
+        - For DynamicRNNLayer, you can set dropout and the number of RNN layers internally.
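+
+        For example, the three inference graphs built further below share a single
+        set of weights (the calls are repeated here purely as an illustration):
+
+        >>> net, lstm1, lstm2 = inference(input_data, is_training=True, num_steps=num_steps, reuse=None)
+        >>> net_val, _, _ = inference(input_data, is_training=False, num_steps=num_steps, reuse=True)
+        >>> net_test, _, _ = inference(input_data_test, is_training=False, num_steps=1, reuse=True)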
+ """ + print("\nnum_steps : %d, is_training : %s, reuse : %s" % (num_steps, is_training, reuse)) + init = tf.random_uniform_initializer(-init_scale, init_scale) + with tf.variable_scope("model", reuse=reuse): + net = tl.layers.EmbeddingInputlayer(x, vocab_size, hidden_size, init, name='embedding') + net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_training, name='drop1') + net = tl.layers.RNNLayer( + net, + cell_fn=tf.contrib.rnn.BasicLSTMCell, # tf.nn.rnn_cell.BasicLSTMCell, + cell_init_args={ + 'forget_bias': 0.0, + 'state_is_tuple': True + }, + n_hidden=hidden_size, + initializer=init, + n_steps=num_steps, + return_last=False, + name='basic_lstm1' + ) + lstm1 = net + net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_training, name='drop2') + net = tl.layers.RNNLayer( + net, + cell_fn=tf.contrib.rnn.BasicLSTMCell, # tf.nn.rnn_cell.BasicLSTMCell, + cell_init_args={ + 'forget_bias': 0.0, + 'state_is_tuple': True + }, + n_hidden=hidden_size, + initializer=init, + n_steps=num_steps, + return_last=False, + return_seq_2d=True, + name='basic_lstm2' + ) + lstm2 = net + # Alternatively, if return_seq_2d=False, in the above RNN layer, + # you can reshape the outputs as follow: + # net = tl.layers.ReshapeLayer(net, + # shape=[-1, int(net.outputs._shape[-1])], name='reshape') + net = tl.layers.DropoutLayer(net, keep=keep_prob, is_fix=True, is_train=is_training, name='drop3') + net = tl.layers.DenseLayer(net, vocab_size, W_init=init, b_init=init, act=None, name='output') + return net, lstm1, lstm2 + + # Inference for Training + net, lstm1, lstm2 = inference(input_data, is_training=True, num_steps=num_steps, reuse=None) + # Inference for Validating + net_val, lstm1_val, lstm2_val = inference(input_data, is_training=False, num_steps=num_steps, reuse=True) + # Inference for Testing (Evaluation) + net_test, lstm1_test, lstm2_test = inference(input_data_test, is_training=False, num_steps=1, reuse=True) + + # sess.run(tf.global_variables_initializer()) + sess.run(tf.global_variables_initializer()) + + def loss_fn(outputs, targets, batch_size): + # See tl.cost.cross_entropy_seq() + # Returns the cost function of Cross-entropy of two sequences, implement + # softmax internally. + # outputs : 2D tensor [batch_size*num_steps, n_units of output layer] + # targets : 2D tensor [batch_size, num_steps], need to be reshaped. + # batch_size : RNN batch_size, number of concurrent processes. + # n_examples = batch_size * num_steps + # so + # cost is the averaged cost of each mini-batch (concurrent process). 
+ loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example( + [outputs], [tf.reshape(targets, [-1])], [tf.ones_like(tf.reshape(targets, [-1]), dtype=tf.float32)] + ) + # [tf.ones([batch_size * num_steps])]) + cost = tf.reduce_sum(loss) / batch_size + return cost + + # Cost for Training + cost = loss_fn(net.outputs, targets, batch_size) + # Cost for Validating + cost_val = loss_fn(net_val.outputs, targets, batch_size) + # Cost for Testing (Evaluation) + cost_test = loss_fn(net_test.outputs, targets_test, 1) + + # Truncated Backpropagation for training + with tf.variable_scope('learning_rate'): + lr = tf.Variable(0.0, trainable=False) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), max_grad_norm) + optimizer = tf.train.GradientDescentOptimizer(lr) + train_op = optimizer.apply_gradients(zip(grads, tvars)) + + # sess.run(tf.global_variables_initializer()) + sess.run(tf.global_variables_initializer()) + + net.print_params() + net.print_layers() + tl.layers.print_all_variables() + + print("nStart learning a language model by using PTB dataset") + for i in range(max_max_epoch): + # decreases the initial learning rate after several + # epoachs (defined by ``max_epoch``), by multipling a ``lr_decay``. + new_lr_decay = lr_decay**max(i - max_epoch, 0.0) + sess.run(tf.assign(lr, learning_rate * new_lr_decay)) + + # Training + print("Epoch: %d/%d Learning rate: %.3f" % (i + 1, max_max_epoch, sess.run(lr))) + epoch_size = ((len(train_data) // batch_size) - 1) // num_steps + start_time = time.time() + costs = 0.0 + iters = 0 + # reset all states at the begining of every epoch + state1 = tl.layers.initialize_rnn_state(lstm1.initial_state) + state2 = tl.layers.initialize_rnn_state(lstm2.initial_state) + for step, (x, y) in enumerate(tl.iterate.ptb_iterator(train_data, batch_size, num_steps)): + feed_dict = { + input_data: x, + targets: y, + lstm1.initial_state.c: state1[0], + lstm1.initial_state.h: state1[1], + lstm2.initial_state.c: state2[0], + lstm2.initial_state.h: state2[1], + } + # For training, enable dropout + feed_dict.update(net.all_drop) + _cost, state1_c, state1_h, state2_c, state2_h, _ = sess.run( + [cost, lstm1.final_state.c, lstm1.final_state.h, lstm2.final_state.c, lstm2.final_state.h, train_op], + feed_dict=feed_dict + ) + state1 = (state1_c, state1_h) + state2 = (state2_c, state2_h) + + costs += _cost + iters += num_steps + + if step % (epoch_size // 10) == 10: + print( + "%.3f perplexity: %.3f speed: %.0f wps" % + (step * 1.0 / epoch_size, np.exp(costs / iters), iters * batch_size / (time.time() - start_time)) + ) + train_perplexity = np.exp(costs / iters) + print("Epoch: %d/%d Train Perplexity: %.3f" % (i + 1, max_max_epoch, train_perplexity)) + + # Validation + start_time = time.time() + costs = 0.0 + iters = 0 + # reset all states at the begining of every epoch + state1 = tl.layers.initialize_rnn_state(lstm1_val.initial_state) + state2 = tl.layers.initialize_rnn_state(lstm2_val.initial_state) + for step, (x, y) in enumerate(tl.iterate.ptb_iterator(valid_data, batch_size, num_steps)): + feed_dict = { + input_data: x, + targets: y, + lstm1_val.initial_state.c: state1[0], + lstm1_val.initial_state.h: state1[1], + lstm2_val.initial_state.c: state2[0], + lstm2_val.initial_state.h: state2[1], + } + _cost, state1_c, state1_h, state2_c, state2_h, _ = sess.run( + [ + cost_val, lstm1_val.final_state.c, lstm1_val.final_state.h, lstm2_val.final_state.c, + lstm2_val.final_state.h, + tf.no_op() + ], feed_dict=feed_dict + ) + state1 = (state1_c, 
state1_h) + state2 = (state2_c, state2_h) + costs += _cost + iters += num_steps + valid_perplexity = np.exp(costs / iters) + print("Epoch: %d/%d Valid Perplexity: %.3f" % (i + 1, max_max_epoch, valid_perplexity)) + + print("Evaluation") + # Testing + # go through the test set step by step, it will take a while. + start_time = time.time() + costs = 0.0 + iters = 0 + # reset all states at the begining + state1 = tl.layers.initialize_rnn_state(lstm1_test.initial_state) + state2 = tl.layers.initialize_rnn_state(lstm2_test.initial_state) + for step, (x, y) in enumerate(tl.iterate.ptb_iterator(test_data, batch_size=1, num_steps=1)): + feed_dict = { + input_data_test: x, + targets_test: y, + lstm1_test.initial_state.c: state1[0], + lstm1_test.initial_state.h: state1[1], + lstm2_test.initial_state.c: state2[0], + lstm2_test.initial_state.h: state2[1], + } + _cost, state1_c, state1_h, state2_c, state2_h = sess.run( + [ + cost_test, + lstm1_test.final_state.c, + lstm1_test.final_state.h, + lstm2_test.final_state.c, + lstm2_test.final_state.h, + ], feed_dict=feed_dict + ) + state1 = (state1_c, state1_h) + state2 = (state2_c, state2_h) + costs += _cost + iters += 1 + test_perplexity = np.exp(costs / iters) + print("Test Perplexity: %.3f took %.2fs" % (test_perplexity, time.time() - start_time)) + + print( + "More example: Text generation using Trump's speech data: https://github.com/tensorlayer/tensorlayer/blob/master/example/tutorial_generate_text.py -- def main_lstm_generate_text():" + ) + + +if __name__ == "__main__": + tf.app.run() + +# log of SmallConfig +# Start learning a language model by using PTB dataset +# Epoch: 1 Learning rate: 1.000 +# 0.004 perplexity: 5512.735 speed: 4555 wps +# 0.104 perplexity: 841.289 speed: 8823 wps +# 0.204 perplexity: 626.273 speed: 9292 wps +# 0.304 perplexity: 505.628 speed: 9472 wps +# 0.404 perplexity: 435.580 speed: 9551 wps +# 0.504 perplexity: 390.108 speed: 9555 wps +# 0.604 perplexity: 351.379 speed: 9546 wps +# 0.703 perplexity: 324.846 speed: 9579 wps +# 0.803 perplexity: 303.824 speed: 9574 wps +# 0.903 perplexity: 284.468 speed: 9551 wps +# Epoch: 1 Train Perplexity: 269.981 +# Epoch: 1 Valid Perplexity: 178.561 +# Epoch: 2 Learning rate: 1.000 +# 0.004 perplexity: 211.632 speed: 7697 wps +# 0.104 perplexity: 151.509 speed: 9488 wps +# 0.204 perplexity: 158.947 speed: 9674 wps +# 0.304 perplexity: 153.963 speed: 9806 wps +# 0.404 perplexity: 150.938 speed: 9817 wps +# 0.504 perplexity: 148.413 speed: 9824 wps +# 0.604 perplexity: 143.763 speed: 9765 wps +# 0.703 perplexity: 141.616 speed: 9731 wps +# 0.803 perplexity: 139.618 speed: 9781 wps +# 0.903 perplexity: 135.880 speed: 9735 wps +# Epoch: 2 Train Perplexity: 133.771 +# Epoch: 2 Valid Perplexity: 142.595 +# Epoch: 3 Learning rate: 1.000 +# 0.004 perplexity: 146.902 speed: 8345 wps +# 0.104 perplexity: 105.647 speed: 9572 wps +# 0.204 perplexity: 114.261 speed: 9585 wps +# 0.304 perplexity: 111.237 speed: 9586 wps +# 0.404 perplexity: 110.181 speed: 9605 wps +# 0.504 perplexity: 109.383 speed: 9601 wps +# 0.604 perplexity: 106.722 speed: 9635 wps +# 0.703 perplexity: 106.075 speed: 9597 wps +# 0.803 perplexity: 105.481 speed: 9624 wps +# 0.903 perplexity: 103.262 speed: 9618 wps +# Epoch: 3 Train Perplexity: 102.272 +# Epoch: 3 Valid Perplexity: 131.884 +# Epoch: 4 Learning rate: 1.000 +# 0.004 perplexity: 118.127 speed: 7867 wps +# 0.104 perplexity: 85.530 speed: 9330 wps +# 0.204 perplexity: 93.559 speed: 9399 wps +# 0.304 perplexity: 91.141 speed: 9386 wps +# 0.404 perplexity: 90.668 speed: 
9462 wps +# 0.504 perplexity: 90.366 speed: 9516 wps +# 0.604 perplexity: 88.479 speed: 9477 wps +# 0.703 perplexity: 88.275 speed: 9533 wps +# 0.803 perplexity: 88.091 speed: 9560 wps +# 0.903 perplexity: 86.430 speed: 9516 wps +# Epoch: 4 Train Perplexity: 85.839 +# Epoch: 4 Valid Perplexity: 128.408 +# Epoch: 5 Learning rate: 1.000 +# 0.004 perplexity: 100.077 speed: 7682 wps +# 0.104 perplexity: 73.856 speed: 9197 wps +# 0.204 perplexity: 81.242 speed: 9266 wps +# 0.304 perplexity: 79.315 speed: 9375 wps +# 0.404 perplexity: 79.009 speed: 9439 wps +# 0.504 perplexity: 78.874 speed: 9377 wps +# 0.604 perplexity: 77.430 speed: 9436 wps +# 0.703 perplexity: 77.415 speed: 9417 wps +# 0.803 perplexity: 77.424 speed: 9407 wps +# 0.903 perplexity: 76.083 speed: 9407 wps +# Epoch: 5 Train Perplexity: 75.719 +# Epoch: 5 Valid Perplexity: 127.057 +# Epoch: 6 Learning rate: 0.500 +# 0.004 perplexity: 87.561 speed: 7130 wps +# 0.104 perplexity: 64.202 speed: 9753 wps +# 0.204 perplexity: 69.518 speed: 9537 wps +# 0.304 perplexity: 66.868 speed: 9647 wps +# 0.404 perplexity: 65.766 speed: 9538 wps +# 0.504 perplexity: 64.967 speed: 9537 wps +# 0.604 perplexity: 63.090 speed: 9565 wps +# 0.703 perplexity: 62.415 speed: 9544 wps +# 0.803 perplexity: 61.751 speed: 9504 wps +# 0.903 perplexity: 60.027 speed: 9482 wps +# Epoch: 6 Train Perplexity: 59.127 +# Epoch: 6 Valid Perplexity: 120.339 +# Epoch: 7 Learning rate: 0.250 +# 0.004 perplexity: 72.069 speed: 7683 wps +# 0.104 perplexity: 53.331 speed: 9526 wps +# 0.204 perplexity: 57.897 speed: 9572 wps +# 0.304 perplexity: 55.557 speed: 9491 wps +# 0.404 perplexity: 54.597 speed: 9483 wps +# 0.504 perplexity: 53.817 speed: 9471 wps +# 0.604 perplexity: 52.147 speed: 9511 wps +# 0.703 perplexity: 51.473 speed: 9497 wps +# 0.803 perplexity: 50.788 speed: 9521 wps +# 0.903 perplexity: 49.203 speed: 9515 wps +# Epoch: 7 Train Perplexity: 48.303 +# Epoch: 7 Valid Perplexity: 120.782 +# Epoch: 8 Learning rate: 0.125 +# 0.004 perplexity: 63.503 speed: 8425 wps +# 0.104 perplexity: 47.324 speed: 9433 wps +# 0.204 perplexity: 51.525 speed: 9653 wps +# 0.304 perplexity: 49.405 speed: 9520 wps +# 0.404 perplexity: 48.532 speed: 9487 wps +# 0.504 perplexity: 47.800 speed: 9610 wps +# 0.604 perplexity: 46.282 speed: 9554 wps +# 0.703 perplexity: 45.637 speed: 9536 wps +# 0.803 perplexity: 44.972 speed: 9493 wps +# 0.903 perplexity: 43.506 speed: 9496 wps +# Epoch: 8 Train Perplexity: 42.653 +# Epoch: 8 Valid Perplexity: 122.119 +# Epoch: 9 Learning rate: 0.062 +# 0.004 perplexity: 59.375 speed: 7158 wps +# 0.104 perplexity: 44.223 speed: 9275 wps +# 0.204 perplexity: 48.269 speed: 9459 wps +# 0.304 perplexity: 46.273 speed: 9564 wps +# 0.404 perplexity: 45.450 speed: 9604 wps +# 0.504 perplexity: 44.749 speed: 9604 wps +# 0.604 perplexity: 43.308 speed: 9619 wps +# 0.703 perplexity: 42.685 speed: 9647 wps +# 0.803 perplexity: 42.022 speed: 9673 wps +# 0.903 perplexity: 40.616 speed: 9678 wps +# Epoch: 9 Train Perplexity: 39.792 +# Epoch: 9 Valid Perplexity: 123.170 +# Epoch: 10 Learning rate: 0.031 +# 0.004 perplexity: 57.333 speed: 7183 wps +# 0.104 perplexity: 42.631 speed: 9592 wps +# 0.204 perplexity: 46.580 speed: 9518 wps +# 0.304 perplexity: 44.625 speed: 9569 wps +# 0.404 perplexity: 43.832 speed: 9576 wps +# 0.504 perplexity: 43.153 speed: 9571 wps +# 0.604 perplexity: 41.761 speed: 9557 wps +# 0.703 perplexity: 41.159 speed: 9524 wps +# 0.803 perplexity: 40.494 speed: 9527 wps +# 0.903 perplexity: 39.111 speed: 9558 wps +# Epoch: 10 Train Perplexity: 
38.298 +# Epoch: 10 Valid Perplexity: 123.658 +# Epoch: 11 Learning rate: 0.016 +# 0.004 perplexity: 56.238 speed: 7190 wps +# 0.104 perplexity: 41.771 speed: 9171 wps +# 0.204 perplexity: 45.656 speed: 9415 wps +# 0.304 perplexity: 43.719 speed: 9472 wps +# 0.404 perplexity: 42.941 speed: 9483 wps +# 0.504 perplexity: 42.269 speed: 9494 wps +# 0.604 perplexity: 40.903 speed: 9530 wps +# 0.703 perplexity: 40.314 speed: 9545 wps +# 0.803 perplexity: 39.654 speed: 9580 wps +# 0.903 perplexity: 38.287 speed: 9597 wps +# Epoch: 11 Train Perplexity: 37.477 +# Epoch: 11 Valid Perplexity: 123.523 +# Epoch: 12 Learning rate: 0.008 +# 0.004 perplexity: 55.552 speed: 7317 wps +# 0.104 perplexity: 41.267 speed: 9234 wps +# 0.204 perplexity: 45.119 speed: 9461 wps +# 0.304 perplexity: 43.204 speed: 9519 wps +# 0.404 perplexity: 42.441 speed: 9453 wps +# 0.504 perplexity: 41.773 speed: 9536 wps +# 0.604 perplexity: 40.423 speed: 9555 wps +# 0.703 perplexity: 39.836 speed: 9576 wps +# 0.803 perplexity: 39.181 speed: 9579 wps +# 0.903 perplexity: 37.827 speed: 9554 wps +# Epoch: 12 Train Perplexity: 37.020 +# Epoch: 12 Valid Perplexity: 123.192 +# Epoch: 13 Learning rate: 0.004 +# 0.004 perplexity: 55.124 speed: 8234 wps +# 0.104 perplexity: 40.970 speed: 9391 wps +# 0.204 perplexity: 44.804 speed: 9525 wps +# 0.304 perplexity: 42.912 speed: 9512 wps +# 0.404 perplexity: 42.162 speed: 9536 wps +# 0.504 perplexity: 41.500 speed: 9630 wps +# 0.604 perplexity: 40.159 speed: 9591 wps +# 0.703 perplexity: 39.574 speed: 9575 wps +# 0.803 perplexity: 38.921 speed: 9613 wps +# 0.903 perplexity: 37.575 speed: 9629 wps +# Epoch: 13 Train Perplexity: 36.771 +# Epoch: 13 Valid Perplexity: 122.917 +# Evaluation +# Test Perplexity: 116.723 took 124.06s + +# MediumConfig +# Epoch: 1 Learning rate: 1.000 +# 0.008 perplexity: 5173.547 speed: 6469 wps +# 0.107 perplexity: 1219.527 speed: 6453 wps +# 0.206 perplexity: 866.163 speed: 6441 wps +# 0.306 perplexity: 695.163 speed: 6428 wps +# 0.405 perplexity: 598.464 speed: 6420 wps +# 0.505 perplexity: 531.875 speed: 6422 wps +# 0.604 perplexity: 477.079 speed: 6425 wps +# 0.704 perplexity: 438.297 speed: 6428 wps +# 0.803 perplexity: 407.928 speed: 6425 wps +# 0.903 perplexity: 381.264 speed: 6429 wps +# Epoch: 1 Train Perplexity: 360.795 +# Epoch: 1 Valid Perplexity: 208.854 +# ... +# Epoch: 39 Learning rate: 0.001 +# 0.008 perplexity: 56.618 speed: 6357 wps +# 0.107 perplexity: 43.375 speed: 6341 wps +# 0.206 perplexity: 47.873 speed: 6336 wps +# 0.306 perplexity: 46.408 speed: 6337 wps +# 0.405 perplexity: 46.327 speed: 6337 wps +# 0.505 perplexity: 46.115 speed: 6335 wps +# 0.604 perplexity: 45.323 speed: 6336 wps +# 0.704 perplexity: 45.286 speed: 6337 wps +# 0.803 perplexity: 45.174 speed: 6336 wps +# 0.903 perplexity: 44.334 speed: 6336 wps +# Epoch: 39 Train Perplexity: 44.021 +# Epoch: 39 Valid Perplexity: 87.516 +# Evaluation +# Test Perplexity: 83.858 took 167.58s diff --git a/examples/text_word_embedding/tutorial_word2vec_basic.py b/examples/text_word_embedding/tutorial_word2vec_basic.py index 6310699ad..074bcb1fa 100644 --- a/examples/text_word_embedding/tutorial_word2vec_basic.py +++ b/examples/text_word_embedding/tutorial_word2vec_basic.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - """Vector Representations of Words. 
This is the minimalistic reimplementation of @@ -44,19 +43,17 @@ import time import numpy as np -import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin +import tensorflow as tf import tensorlayer as tl import wget parser = argparse.ArgumentParser() -parser.add_argument("--model", - default='one', - type=str, - required=False, - help="The model name. It can be 'one', 'two', 'three', 'four'.") +parser.add_argument( + "--model", default='one', type=str, required=False, help="The model name. It can be 'one', 'two', 'three', 'four'." +) FLAGS = parser.parse_args() @@ -158,12 +155,14 @@ def main_word2vec_basic(): print() batch, labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0) + data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0 + ) for i in range(8): print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) batch, labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0) + data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0 + ) for i in range(8): print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) @@ -193,7 +192,7 @@ def main_word2vec_basic(): vocabulary_size=vocabulary_size, embedding_size=embedding_size, num_sampled=num_sampled, - activate_nce_loss=True, # nce loss is activated + activate_nce_loss=True, # nce loss is activated nce_loss_args={}, E_init=tl.initializers.random_uniform(minval=-1.0, maxval=1.0), nce_W_init=tl.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_size))), @@ -230,9 +229,8 @@ def main_word2vec_basic(): while step < num_steps: start_time = time.time() batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch( - data=data, batch_size=batch_size, num_skips=num_skips, - skip_window=skip_window, data_index=data_index) - + data=data, batch_size=batch_size, num_skips=num_skips, skip_window=skip_window, data_index=data_index + ) # We perform one update step by evaluating the train_op (including it # in the list of returned values for sess.run() @@ -335,7 +333,6 @@ def predict(analogy): # Compute cosine distance between each pair of target and vocab. # dist has shape [N, vocab_size]. 
dist = tf.matmul(target, normalized_embeddings, transpose_b=True) - """Predict the top 4 answers for analogy questions.""" _, pred_idx = tf.nn.top_k(dist, n_answer) diff --git a/examples/tutorial_work_with_onnx.py b/examples/tutorial_work_with_onnx.py index 522f2ad8c..46fd0cb42 100644 --- a/examples/tutorial_work_with_onnx.py +++ b/examples/tutorial_work_with_onnx.py @@ -117,13 +117,13 @@ import time import numpy as np -import tensorflow as tf -from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph import onnx +import tensorflow as tf import tensorlayer as tl from onnx_tf.backend import prepare from onnx_tf.frontend import tensorflow_graph_to_onnx_model +from tensorflow.python.tools.freeze_graph import freeze_graph as _freeze_graph tf.logging.set_verbosity(tf.logging.DEBUG) tl.logging.set_verbosity(tl.logging.DEBUG) diff --git a/tensorlayer/__init__.py b/tensorlayer/__init__.py index 835a4935f..f89eebfff 100644 --- a/tensorlayer/__init__.py +++ b/tensorlayer/__init__.py @@ -5,19 +5,10 @@ import os from distutils.version import LooseVersion -from tensorlayer.package_info import VERSION -from tensorlayer.package_info import __shortversion__ -from tensorlayer.package_info import __version__ - -from tensorlayer.package_info import __package_name__ -from tensorlayer.package_info import __contact_names__ -from tensorlayer.package_info import __contact_emails__ -from tensorlayer.package_info import __homepage__ -from tensorlayer.package_info import __repository_url__ -from tensorlayer.package_info import __download_url__ -from tensorlayer.package_info import __description__ -from tensorlayer.package_info import __license__ -from tensorlayer.package_info import __keywords__ +from tensorlayer.package_info import ( + VERSION, __contact_emails__, __contact_names__, __description__, __download_url__, __homepage__, __keywords__, + __license__, __package_name__, __repository_url__, __shortversion__, __version__ +) if 'TENSORLAYER_PACKAGE_BUILDING' not in os.environ: diff --git a/tensorlayer/activation.py b/tensorlayer/activation.py index 7c7b833c3..4aef4a429 100644 --- a/tensorlayer/activation.py +++ b/tensorlayer/activation.py @@ -3,7 +3,6 @@ """A file containing various activation functions.""" import tensorflow as tf - from tensorlayer.decorators import deprecated __all__ = [ diff --git a/tensorlayer/cost.py b/tensorlayer/cost.py index 2cd29256f..8ae36920d 100644 --- a/tensorlayer/cost.py +++ b/tensorlayer/cost.py @@ -5,8 +5,7 @@ import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import standard_ops, math_ops, nn_ops, array_ops - +from tensorflow.python.ops import array_ops, math_ops, nn_ops, standard_ops from tensorlayer import logging __all__ = [ diff --git a/tensorlayer/db.py b/tensorlayer/db.py index 7ca3e7bb5..2d9a3f1ed 100644 --- a/tensorlayer/db.py +++ b/tensorlayer/db.py @@ -7,12 +7,13 @@ import time from datetime import datetime -import gridfs import numpy as np + +import gridfs import pymongo import tensorflow as tf - from tensorlayer import logging + from tensorlayer.files import static_graph2net, assign_weights from tensorlayer.files import save_weights_to_hdf5, load_hdf5_to_weights from tensorlayer.files import del_folder, exists_or_mkdir @@ -640,7 +641,7 @@ def run_top_task(self, task_name=None, sort=None, **kwargs): logging.info("[Database] Start Task: key: {} sort: {} push time: {}".format(task_name, sort, _datetime)) _script = _script.decode('utf-8') with tf.Graph().as_default(): # # as graph: # clear all TF graphs 
- exec(_script, globals()) + exec (_script, globals()) # set status to finished _ = self.db.Task.find_one_and_update({'_id': _id}, {'$set': {'status': 'finished'}}) diff --git a/tensorlayer/decorators/__init__.py b/tensorlayer/decorators/__init__.py index 9d4eeaa17..2a289862a 100644 --- a/tensorlayer/decorators/__init__.py +++ b/tensorlayer/decorators/__init__.py @@ -11,7 +11,6 @@ from .deprecated import deprecated from .deprecated_alias import deprecated_alias -from .method_decorator import private_method -from .method_decorator import protected_method +from .method_decorator import private_method, protected_method __all__ = ['deprecated', 'deprecated_alias', 'private_method', 'protected_method'] diff --git a/tensorlayer/distributed.py b/tensorlayer/distributed.py index 544aac87e..d3fbdd38f 100644 --- a/tensorlayer/distributed.py +++ b/tensorlayer/distributed.py @@ -6,7 +6,6 @@ import tensorflow as tf from tensorflow.python.training import session_run_hook - from tensorlayer import logging from tensorlayer.decorators import deprecated from tensorlayer.lazy_imports import LazyImport diff --git a/tensorlayer/files/__init__.py b/tensorlayer/files/__init__.py index e96fc663e..4d88fa35d 100644 --- a/tensorlayer/files/__init__.py +++ b/tensorlayer/files/__init__.py @@ -25,7 +25,6 @@ from .dataset_loaders.ptb_dataset import * from .dataset_loaders.voc_dataset import * from .dataset_loaders.wmt_en_fr_dataset import * - from .utils import * __all__ = [ diff --git a/tensorlayer/files/dataset_loaders/voc_dataset.py b/tensorlayer/files/dataset_loaders/voc_dataset.py index 5584864ae..e5124b4df 100644 --- a/tensorlayer/files/dataset_loaders/voc_dataset.py +++ b/tensorlayer/files/dataset_loaders/voc_dataset.py @@ -4,7 +4,6 @@ import os import tensorflow as tf - from tensorlayer import logging, utils from tensorlayer.files.utils import (del_file, del_folder, folder_exists, load_file_list, maybe_download_and_extract) diff --git a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py index 0261a8581..77c1f93f9 100644 --- a/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py +++ b/tensorlayer/files/dataset_loaders/wmt_en_fr_dataset.py @@ -6,7 +6,6 @@ import tarfile from tensorflow.python.platform import gfile - from tensorlayer import logging from tensorlayer.files.utils import maybe_download_and_extract diff --git a/tensorlayer/files/utils.py b/tensorlayer/files/utils.py index b80c17846..b9288209b 100644 --- a/tensorlayer/files/utils.py +++ b/tensorlayer/files/utils.py @@ -1,8 +1,9 @@ #! 
/usr/bin/python # -*- coding: utf-8 -*- +import base64 import gzip -import importlib +import json import math import os import pickle @@ -14,15 +15,19 @@ import time import zipfile +import cloudpickle import h5py import numpy as np -import progressbar import scipy.io as sio -import tensorflow as tf from six.moves import cPickle -from tensorflow.python.platform import gfile +import progressbar +import tensorflow as tf import tensorlayer as tl +from tensorflow.python.keras.saving import model_config as model_config_lib +from tensorflow.python.platform import gfile +from tensorflow.python.util import serialization +from tensorflow.python.util.tf_export import keras_export from tensorlayer import logging, nlp, utils, visualize import cloudpickle diff --git a/tensorlayer/initializers.py b/tensorlayer/initializers.py index aaf4f37ac..7db82f839 100644 --- a/tensorlayer/initializers.py +++ b/tensorlayer/initializers.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import numpy as np + import tensorflow as tf __all__ = [ diff --git a/tensorlayer/layers/__init__.py b/tensorlayer/layers/__init__.py index a312ec210..d67024381 100644 --- a/tensorlayer/layers/__init__.py +++ b/tensorlayer/layers/__init__.py @@ -5,11 +5,10 @@ from .convolution import * from .core import * from .dense import * -from .dropout import * from .deprecated import * +from .dropout import * from .embedding import * from .extend import * -# from .flow_control import * # remove for TF 2.0 from .image_resampling import * from .inputs import * from .lambda_layers import * @@ -19,11 +18,9 @@ from .padding import * from .pooling import * from .quantize import * -# from .reconstruction import * # remove for TF 2.0 from .recurrent import * from .scale import * from .shape import * from .spatial_transformer import * from .stack import * -# from .time_distribution import * # remove for TF 2.0 from .utils import * diff --git a/tensorlayer/layers/activation.py b/tensorlayer/layers/activation.py index 0d50b4a4a..6ac0fd756 100644 --- a/tensorlayer/layers/activation.py +++ b/tensorlayer/layers/activation.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - from tensorlayer import logging from tensorlayer.activation import leaky_relu6, leaky_twice_relu6 from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/__init__.py b/tensorlayer/layers/convolution/__init__.py index ba68797f2..8cf4bd74c 100644 --- a/tensorlayer/layers/convolution/__init__.py +++ b/tensorlayer/layers/convolution/__init__.py @@ -9,7 +9,6 @@ More functions can be found in `TensorFlow API `__. 
""" -# from .atrous_conv import * # remove for TF 2.0 from .binary_conv import * from .deformable_conv import * from .depthwise_conv import * @@ -17,13 +16,13 @@ from .expert_conv import * from .expert_deconv import * from .group_conv import * +from .quan_conv import * +from .quan_conv_bn import * from .separable_conv import * from .simplified_conv import * from .simplified_deconv import * from .super_resolution import * from .ternary_conv import * -from .quan_conv import * -from .quan_conv_bn import * __all__ = [ diff --git a/tensorlayer/layers/convolution/binary_conv.py b/tensorlayer/layers/convolution/binary_conv.py index cf55127d5..b54dbc762 100644 --- a/tensorlayer/layers/convolution/binary_conv.py +++ b/tensorlayer/layers/convolution/binary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/deformable_conv.py b/tensorlayer/layers/convolution/deformable_conv.py index 616803ba1..50701fec5 100644 --- a/tensorlayer/layers/convolution/deformable_conv.py +++ b/tensorlayer/layers/convolution/deformable_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/depthwise_conv.py b/tensorlayer/layers/convolution/depthwise_conv.py index b11233c27..c46e60cd4 100644 --- a/tensorlayer/layers/convolution/depthwise_conv.py +++ b/tensorlayer/layers/convolution/depthwise_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/dorefa_conv.py b/tensorlayer/layers/convolution/dorefa_conv.py index dc7979967..ece1bcaef 100644 --- a/tensorlayer/layers/convolution/dorefa_conv.py +++ b/tensorlayer/layers/convolution/dorefa_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_conv.py b/tensorlayer/layers/convolution/expert_conv.py index 50ea12cb9..eb3539eb3 100644 --- a/tensorlayer/layers/convolution/expert_conv.py +++ b/tensorlayer/layers/convolution/expert_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/expert_deconv.py b/tensorlayer/layers/convolution/expert_deconv.py index f23c752ad..a541b8a14 100644 --- a/tensorlayer/layers/convolution/expert_deconv.py +++ b/tensorlayer/layers/convolution/expert_deconv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/group_conv.py b/tensorlayer/layers/convolution/group_conv.py index bc35d4e00..262056ff9 100644 --- a/tensorlayer/layers/convolution/group_conv.py +++ b/tensorlayer/layers/convolution/group_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git 
a/tensorlayer/layers/convolution/quan_conv.py b/tensorlayer/layers/convolution/quan_conv.py index 75ee3943c..55112993e 100644 --- a/tensorlayer/layers/convolution/quan_conv.py +++ b/tensorlayer/layers/convolution/quan_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/quan_conv_bn.py b/tensorlayer/layers/convolution/quan_conv_bn.py index ef0f9bfda..bc2aec938 100644 --- a/tensorlayer/layers/convolution/quan_conv_bn.py +++ b/tensorlayer/layers/convolution/quan_conv_bn.py @@ -3,7 +3,6 @@ import tensorflow as tf from tensorflow.python.training import moving_averages - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/convolution/separable_conv.py b/tensorlayer/layers/convolution/separable_conv.py index ca1c66d49..462b639f5 100644 --- a/tensorlayer/layers/convolution/separable_conv.py +++ b/tensorlayer/layers/convolution/separable_conv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_conv.py b/tensorlayer/layers/convolution/simplified_conv.py index 536d4e52e..e0f76ae47 100644 --- a/tensorlayer/layers/convolution/simplified_conv.py +++ b/tensorlayer/layers/convolution/simplified_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/simplified_deconv.py b/tensorlayer/layers/convolution/simplified_deconv.py index 57beff0f4..13431b7bd 100644 --- a/tensorlayer/layers/convolution/simplified_deconv.py +++ b/tensorlayer/layers/convolution/simplified_deconv.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/convolution/super_resolution.py b/tensorlayer/layers/convolution/super_resolution.py index 21d765e03..95dc119ba 100644 --- a/tensorlayer/layers/convolution/super_resolution.py +++ b/tensorlayer/layers/convolution/super_resolution.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias, private_method diff --git a/tensorlayer/layers/convolution/ternary_conv.py b/tensorlayer/layers/convolution/ternary_conv.py index 33e01507c..421b46ff2 100644 --- a/tensorlayer/layers/convolution/ternary_conv.py +++ b/tensorlayer/layers/convolution/ternary_conv.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/core.py b/tensorlayer/layers/core.py index e360ccd04..7049e216c 100644 --- a/tensorlayer/layers/core.py +++ b/tensorlayer/layers/core.py @@ -1,17 +1,15 @@ #! 
/usr/bin/python # -*- coding: utf-8 -*- +import inspect from abc import abstractmethod import tensorflow as tf - import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import (deprecated_alias, private_method, protected_method) -from tensorlayer.layers.utils import (get_variable_with_initializer, list_remove_repeat) from tensorlayer.files import utils - -import inspect +from tensorlayer.layers.utils import (get_variable_with_initializer, list_remove_repeat) __all__ = ['Layer', 'ModelLayer', 'LayerList'] diff --git a/tensorlayer/layers/dense/__init__.py b/tensorlayer/layers/dense/__init__.py index 675559eaf..87b064f0c 100644 --- a/tensorlayer/layers/dense/__init__.py +++ b/tensorlayer/layers/dense/__init__.py @@ -13,9 +13,9 @@ from .binary_dense import * from .dorefa_dense import * from .dropconnect import * -from .ternary_dense import * from .quan_dense import * from .quan_dense_bn import * +from .ternary_dense import * __all__ = [ 'BinaryDense', diff --git a/tensorlayer/layers/dense/base_dense.py b/tensorlayer/layers/dense/base_dense.py index e4c96af50..59cb87ea8 100644 --- a/tensorlayer/layers/dense/base_dense.py +++ b/tensorlayer/layers/dense/base_dense.py @@ -2,8 +2,8 @@ # -*- coding: utf-8 -*- import numpy as np -import tensorflow as tf +import tensorflow as tf import tensorlayer as tl from tensorlayer import logging from tensorlayer.decorators import deprecated_alias diff --git a/tensorlayer/layers/dense/binary_dense.py b/tensorlayer/layers/dense/binary_dense.py index 77adde3b0..0803ea6c0 100644 --- a/tensorlayer/layers/dense/binary_dense.py +++ b/tensorlayer/layers/dense/binary_dense.py @@ -3,7 +3,6 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/dorefa_dense.py b/tensorlayer/layers/dense/dorefa_dense.py index 07e9c339e..515e32a57 100644 --- a/tensorlayer/layers/dense/dorefa_dense.py +++ b/tensorlayer/layers/dense/dorefa_dense.py @@ -3,7 +3,6 @@ import tensorflow as tf import tensorlayer as tl - from tensorlayer import logging from tensorlayer.decorators import deprecated_alias from tensorlayer.layers.core import Layer diff --git a/tensorlayer/layers/dense/dropconnect.py b/tensorlayer/layers/dense/dropconnect.py index 449221f42..ece4405f5 100644 --- a/tensorlayer/layers/dense/dropconnect.py +++ b/tensorlayer/layers/dense/dropconnect.py @@ -1,13 +1,13 @@ #! 
/usr/bin/python
 # -*- coding: utf-8 -*-
+import numbers
+
 import tensorflow as tf
 import tensorlayer as tl
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
-import numbers
 __all__ = [
     'DropconnectDense',
diff --git a/tensorlayer/layers/dense/quan_dense.py b/tensorlayer/layers/dense/quan_dense.py
index f5c42492f..9eabf201f 100644
--- a/tensorlayer/layers/dense/quan_dense.py
+++ b/tensorlayer/layers/dense/quan_dense.py
@@ -3,7 +3,6 @@
 import tensorflow as tf
 import tensorlayer as tl
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/dense/quan_dense_bn.py b/tensorlayer/layers/dense/quan_dense_bn.py
index e647a7e6e..9fef11c84 100644
--- a/tensorlayer/layers/dense/quan_dense_bn.py
+++ b/tensorlayer/layers/dense/quan_dense_bn.py
@@ -4,7 +4,6 @@
 import tensorflow as tf
 # from tensorlayer.layers.core import LayersConfig
 from tensorflow.python.training import moving_averages
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/dense/ternary_dense.py b/tensorlayer/layers/dense/ternary_dense.py
index 778636155..6469b40b5 100644
--- a/tensorlayer/layers/dense/ternary_dense.py
+++ b/tensorlayer/layers/dense/ternary_dense.py
@@ -3,7 +3,6 @@
 import tensorflow as tf
 import tensorlayer as tl
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/dropout.py b/tensorlayer/layers/dropout.py
index 3724d8b43..25fe80a36 100644
--- a/tensorlayer/layers/dropout.py
+++ b/tensorlayer/layers/dropout.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/embedding.py b/tensorlayer/layers/embedding.py
index 80c5cadfa..a82c1a93b 100644
--- a/tensorlayer/layers/embedding.py
+++ b/tensorlayer/layers/embedding.py
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-
 import numpy as np
-import tensorflow as tf
+import tensorflow as tf
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/extend.py b/tensorlayer/layers/extend.py
index 42395a537..09d5508db 100644
--- a/tensorlayer/layers/extend.py
+++ b/tensorlayer/layers/extend.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/image_resampling.py b/tensorlayer/layers/image_resampling.py
index 3b2a2825a..4713200d3 100644
--- a/tensorlayer/layers/image_resampling.py
+++ b/tensorlayer/layers/image_resampling.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/inputs.py b/tensorlayer/layers/inputs.py
index 0330347fe..4f2544b06 100644
--- a/tensorlayer/layers/inputs.py
+++ b/tensorlayer/layers/inputs.py
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-
 import numpy as np
-import tensorflow as tf
+import tensorflow as tf
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.layers.core import Layer, LayerNode
diff --git a/tensorlayer/layers/lambda_layers.py b/tensorlayer/layers/lambda_layers.py
index 13bc3ecbe..9b82ad603 100644
--- a/tensorlayer/layers/lambda_layers.py
+++ b/tensorlayer/layers/lambda_layers.py
@@ -2,11 +2,10 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
-from tensorlayer.layers.core import Layer
 from tensorlayer.files import utils
+from tensorlayer.layers.core import Layer
 # from tensorlayer.layers.core import TF_GRAPHKEYS_VARIABLES
diff --git a/tensorlayer/layers/merge.py b/tensorlayer/layers/merge.py
index 6f49374ca..6c9817406 100644
--- a/tensorlayer/layers/merge.py
+++ b/tensorlayer/layers/merge.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/noise.py b/tensorlayer/layers/noise.py
index bd9c2df9c..c658f8e19 100644
--- a/tensorlayer/layers/noise.py
+++ b/tensorlayer/layers/noise.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
diff --git a/tensorlayer/layers/normalization.py b/tensorlayer/layers/normalization.py
index 1f2b25f81..226795981 100644
--- a/tensorlayer/layers/normalization.py
+++ b/tensorlayer/layers/normalization.py
@@ -2,11 +2,10 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
+import tensorlayer as tl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import moving_averages
-
-import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/padding.py b/tensorlayer/layers/padding.py
index db1bbb304..edcb720a5 100644
--- a/tensorlayer/layers/padding.py
+++ b/tensorlayer/layers/padding.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
diff --git a/tensorlayer/layers/pooling.py b/tensorlayer/layers/pooling.py
index 2046de6c5..a22cea358 100644
--- a/tensorlayer/layers/pooling.py
+++ b/tensorlayer/layers/pooling.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
diff --git a/tensorlayer/layers/quantize.py b/tensorlayer/layers/quantize.py
index 3b5b19635..47ad2a088 100644
--- a/tensorlayer/layers/quantize.py
+++ b/tensorlayer/layers/quantize.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/recurrent.py b/tensorlayer/layers/recurrent.py
index bad4a5eb0..d91288dda 100644
--- a/tensorlayer/layers/recurrent.py
+++ b/tensorlayer/layers/recurrent.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
diff --git a/tensorlayer/layers/scale.py b/tensorlayer/layers/scale.py
index ac1800529..6546d70af 100644
--- a/tensorlayer/layers/scale.py
+++ b/tensorlayer/layers/scale.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.initializers import constant
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/shape.py b/tensorlayer/layers/shape.py
index f8e7b47db..e308eb0c4 100644
--- a/tensorlayer/layers/shape.py
+++ b/tensorlayer/layers/shape.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/spatial_transformer.py b/tensorlayer/layers/spatial_transformer.py
index e456625a7..262108a68 100644
--- a/tensorlayer/layers/spatial_transformer.py
+++ b/tensorlayer/layers/spatial_transformer.py
@@ -2,11 +2,11 @@
 # -*- coding: utf-8 -*-
 import numpy as np
+from six.moves import xrange
+
 import tensorflow as tf
 import tensorlayer as tl
-from six.moves import xrange
 from tensorflow.python.ops import array_ops
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/stack.py b/tensorlayer/layers/stack.py
index c31327989..c35e3837f 100644
--- a/tensorlayer/layers/stack.py
+++ b/tensorlayer/layers/stack.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated_alias
 from tensorlayer.layers.core import Layer
diff --git a/tensorlayer/layers/utils.py b/tensorlayer/layers/utils.py
index 10cc1fc18..6d411589f 100644
--- a/tensorlayer/layers/utils.py
+++ b/tensorlayer/layers/utils.py
@@ -2,10 +2,10 @@
 # -*- coding: utf-8 -*-
 import numpy as np
+
 import tensorflow as tf
 import tensorlayer as tl
 from tensorflow.python.ops.rnn_cell import LSTMStateTuple
-
 from tensorlayer import logging
 from tensorlayer.decorators import deprecated, deprecated_alias
diff --git a/tensorlayer/logging/contrib/hyperdash.py b/tensorlayer/logging/contrib/hyperdash.py
index 122a8c7e5..6e19c8e9b 100644
--- a/tensorlayer/logging/contrib/hyperdash.py
+++ b/tensorlayer/logging/contrib/hyperdash.py
@@ -4,7 +4,6 @@
 from __future__ import absolute_import
 import hyperdash as hd
-
 import tensorlayer as tl
 __all__ = ["HyperDashHandler", "monitor", "Experiment", "IPythonMagicsWrapper"]
diff --git a/tensorlayer/models/__init__.py b/tensorlayer/models/__init__.py
index 5375efcdd..ec4b021d2 100644
--- a/tensorlayer/models/__init__.py
+++ b/tensorlayer/models/__init__.py
@@ -4,6 +4,6 @@
 # """A collections of pre-defined well known models."""
 from .core import *
-from .squeezenetv1 import SqueezeNetV1
 from .mobilenetv1 import MobileNetV1
+from .squeezenetv1 import SqueezeNetV1
 from .vgg import *
diff --git a/tensorlayer/models/core.py b/tensorlayer/models/core.py
index a1a003d5d..6b9309c05 100644
--- a/tensorlayer/models/core.py
+++ b/tensorlayer/models/core.py
@@ -3,9 +3,8 @@
 from queue import Queue
 import tensorflow as tf
-from tensorflow.python.framework import ops as tf_ops
-
 import tensorlayer as tl
+from tensorflow.python.framework import ops as tf_ops
 from tensorlayer import logging
 from tensorlayer.files import utils
 from tensorlayer.layers import Layer, ModelLayer
diff --git a/tensorlayer/models/mobilenetv1.py b/tensorlayer/models/mobilenetv1.py
index 8065eeef3..4908b3d89 100644
--- a/tensorlayer/models/mobilenetv1.py
+++ b/tensorlayer/models/mobilenetv1.py
@@ -5,7 +5,6 @@
 import os
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract)
 from tensorlayer.layers import (BatchNorm, Conv2d, DepthwiseConv2d, Flatten, GlobalMeanPool2d, Input, Reshape)
diff --git a/tensorlayer/models/squeezenetv1.py b/tensorlayer/models/squeezenetv1.py
index 6d6a70535..a2d7e4304 100644
--- a/tensorlayer/models/squeezenetv1.py
+++ b/tensorlayer/models/squeezenetv1.py
@@ -5,10 +5,9 @@
 import os
 import tensorflow as tf
-
 from tensorlayer import logging
 from tensorlayer.files import (assign_weights, load_npz, maybe_download_and_extract)
-from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, MaxPool2d, Lambda)
+from tensorlayer.layers import (Concat, Conv2d, Dropout, GlobalMeanPool2d, Input, Lambda, MaxPool2d)
 from tensorlayer.models import Model
 __all__ = [
diff --git a/tensorlayer/models/vgg.py b/tensorlayer/models/vgg.py
index 391878c61..06648cb53 100644
--- a/tensorlayer/models/vgg.py
+++ b/tensorlayer/models/vgg.py
@@ -30,12 +30,12 @@
 import os
 import numpy as np
-import tensorflow as tf
+import tensorflow as tf
 import tensorlayer as tl
 from tensorlayer import logging
 from tensorlayer.files import assign_weights, maybe_download_and_extract
-from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, LayerList, MaxPool2d, Lambda)
+from tensorlayer.layers import (BatchNorm, Conv2d, Dense, Flatten, Input, Lambda, LayerList, MaxPool2d)
 from tensorlayer.models import Model
 __all__ = [
diff --git a/tensorlayer/nlp.py b/tensorlayer/nlp.py
index 21f7c2831..699eeb5fd 100755
--- a/tensorlayer/nlp.py
+++ b/tensorlayer/nlp.py
@@ -12,11 +12,11 @@
 import six as _six
 import numpy as np
-import tensorflow as tf
 from six.moves import urllib, xrange
-from tensorflow.python.platform import gfile
+import tensorflow as tf
 import tensorlayer as tl
+from tensorflow.python.platform import gfile
 from tensorlayer.lazy_imports import LazyImport
 nltk = LazyImport("nltk")
diff --git a/tensorlayer/rein.py b/tensorlayer/rein.py
index e5cbe6bd4..8ddce7316 100644
--- a/tensorlayer/rein.py
+++ b/tensorlayer/rein.py
@@ -2,9 +2,10 @@
 # -*- coding: utf-8 -*-
 import numpy as np
-import tensorflow as tf
 from six.moves import xrange
+import tensorflow as tf
+
 __all__ = [
     'discount_episode_rewards',
     'cross_entropy_reward_loss',
diff --git a/tensorlayer/utils.py b/tensorlayer/utils.py
index d6b8e6d78..35e054afb 100644
--- a/tensorlayer/utils.py
+++ b/tensorlayer/utils.py
@@ -11,9 +11,9 @@
 from sys import platform as _platform
 import numpy as np
-import tensorflow as tf
 from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
+import tensorflow as tf
 import tensorlayer as tl
 __all__ = [