From 58406fd8272cad14730dc7cbb5dfd576c48bc703 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 8 May 2020 07:29:41 +0000 Subject: [PATCH 1/8] add --- .github/issue_template.md | 2 +- .github/pull_request_template.md | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/issue_template.md b/.github/issue_template.md index b0ea00dc..8dea127e 100644 --- a/.github/issue_template.md +++ b/.github/issue_template.md @@ -22,7 +22,7 @@ about: 您可以提问训练中报错、应用、出core等问题。 You could u - 复现信息:如为报错,请给出复现环境、复现步骤 - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 -Thank you for contributing to PaddlePaddle. +Thank you for contributing to EDL. Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. If there is no solution,please make sure that this is a training issue including the following details: **System information** diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 515a4f04..08b6f28c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,12 +1,11 @@ -# What this PR does / why we need it: +## What this PR does / why we need it: -# Which issue(s) this PR fixes: +## Which issue(s) this PR fixes: -## Fixes # +### Fixes # -# Special notes for your reviewer: +## Special notes for your reviewer: -# Does this PR introduce a user-facing change?: +## Does this PR introduce a user-facing change? - -# Additional documentation? +## Additional documentation? From 78e7c8f479e90069aff24e0832c1d5c6aa223340 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 May 2020 09:12:13 +0000 Subject: [PATCH 2/8] merge --- .github/issue_template.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/issue_template.md b/.github/issue_template.md index dbeda3fa..871957cc 100644 --- a/.github/issue_template.md +++ b/.github/issue_template.md @@ -17,7 +17,6 @@ about: 您可以提问训练中报错、应用、出core等问题。 You could u - 复现信息:如为报错,请给出复现环境、复现步骤 - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 - Thank you for contributing to EDL. Before submitting the issue, you could search the issue in the GitHub in case that there was a similar issue submitted or resolved before. If there is no solution, please make sure that this is a training issue including the following details: From 59338c82d3b605569673bde1b7af68bce7b14378 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:14:53 +0000 Subject: [PATCH 3/8] add lstm --- example/distill/nlp/lstm.py | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 example/distill/nlp/lstm.py diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py new file mode 100644 index 00000000..43840cb8 --- /dev/null +++ b/example/distill/nlp/lstm.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os + +import numpy as np +import argparse +from sklearn.metrics import f1_score, accuracy_score +import paddle as P +import paddle.fluid as F +import paddle.fluid.layers as L +import paddle.fluid.dygraph as D +from reader import ChnSentiCorp, pad_batch_data +from paddle_edl.distill.distill_reader import DistillReader +import re + +import os +import sys +from paddle_serving_client import Client +from paddle_serving_app.reader import ChineseBertReader +from paddle.incubate.hapi.text import LSTM + + +class LSTM(D.layer): + def __init__(self, word_dict): + super().__init__() + + self.emb = D.Embedding(len(word_dict), 300) + self.lstm = LSTM(input_size=300, hidden_size=150) + self.fc = D.Linear(150, 2) + + def forward(self, ids, labels=None): + embbed = self.emb(ids) + lstm_out, self.hidden = self.lstm(embbed) + logits = self.fc(lstm_out[-1]) + + if labels is not None: + if len(labels.shape) == 1: + labels = L.reshape(labels, [-1, 1]) + loss = L.softmax_with_cross_entropy(logits, labels) + else: + loss = None + + return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-3 From 067dd4e8d68556064fc016750343c3f7b941e40e Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:19:42 +0000 Subject: [PATCH 4/8] add lstm --- example/distill/nlp/lstm.py | 126 +++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 37 deletions(-) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 43840cb8..181207c8 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -12,48 +12,100 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import os - +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding +from paddle.fluid.dygraph import GRUUnit +from paddle.fluid.dygraph.base import to_variable import numpy as np -import argparse -from sklearn.metrics import f1_score, accuracy_score -import paddle as P -import paddle.fluid as F -import paddle.fluid.layers as L -import paddle.fluid.dygraph as D -from reader import ChnSentiCorp, pad_batch_data -from paddle_edl.distill.distill_reader import DistillReader -import re -import os -import sys -from paddle_serving_client import Client -from paddle_serving_app.reader import ChineseBertReader -from paddle.incubate.hapi.text import LSTM +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse -class LSTM(D.layer): - def __init__(self, word_dict): - super().__init__() + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res - self.emb = 
D.Embedding(len(word_dict), 300) - self.lstm = LSTM(input_size=300, hidden_size=150) - self.fc = D.Linear(150, 2) - def forward(self, ids, labels=None): - embbed = self.emb(ids) - lstm_out, self.hidden = self.lstm(embbed) - logits = self.fc(lstm_out[-1]) +class GRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(GRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) - if labels is not None: - if len(labels.shape) == 1: - labels = L.reshape(labels, [-1, 1]) - loss = L.softmax_with_cross_entropy(logits, labels) + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = to_variable( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[self.batch_size, -1, self.hid_dim]) + fc_1 = self._fc1(emb) + gru_hidden = self._gru(fc_1) + gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) + tanh_1 = fluid.layers.tanh(gru_hidden) + fc_2 = self._fc2(tanh_1) + prediction = self._fc_prediction(fc_2) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction else: - loss = None - - return loss, logits - - def lr(self, steps_per_epoch=None): - return 1e-3 + return None, prediction From 5444d004a0e4da6264b405fa824652e6e9ea5ace Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:29:20 +0000 Subject: [PATCH 5/8] add --- example/distill/nlp/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 181207c8..8f6d9b87 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -62,9 +62,9 @@ def forward(self, inputs): class GRU(fluid.dygraph.Layer): - def __init__(self, dict_dim, batch_size, seq_len): + def __init__(self, word_dict, batch_size=16, seq_len=256): super(GRU, self).__init__() - self.dict_dim = dict_dim + self.dict_dim = len(word_dict) self.emb_dim = 128 self.hid_dim = 128 self.fc_hid_dim = 96 From 00da2739aa0d2480ce4f9d4c49f89f123eee614b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:37:11 +0000 Subject: [PATCH 6/8] add --- example/distill/nlp/distill.py | 13 +- example/distill/nlp/lstm.py | 126 ++-- example/distill/nlp/model.py | 22 + example/distill/nlp/test_train.sh | 2 +- example/distill/nlp/text_basic.py | 976 ++++++++++++++++++++++++++++++ example/distill/nlp/train.py | 10 +- 6 files changed, 1050 insertions(+), 99 deletions(-) create mode 100644 example/distill/nlp/text_basic.py diff --git a/example/distill/nlp/distill.py b/example/distill/nlp/distill.py index 8d4d3c2e..3b684241 100644 --- 
a/example/distill/nlp/distill.py +++ b/example/distill/nlp/distill.py @@ -30,7 +30,7 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from model import CNN, AdamW, evaluate_student, KL, BOW, KL_T +from model import CNN, AdamW, evaluate_student, KL, BOW, KL_T, model_factory parser = argparse.ArgumentParser(__doc__) parser.add_argument( @@ -54,6 +54,8 @@ "--use_data_au", type=int, default=1, help="use data augmentation") parser.add_argument( "--T", type=float, default=2.0, help="weight of student in loss") +parser.add_argument( + "--model", type=str, default="BOW", help="student model name") args = parser.parse_args() print("parsed args:", args) @@ -63,19 +65,16 @@ def train_with_distill(train_reader, dev_reader, word_dict, test_reader, epoch_num): - boundaries = [2250 * 2, 2250 * 4, 2250 * 6] - values = [1e-4, 1.5e-4, 2.5e-4, 4e-4] - lr = D.PiecewiseDecay(boundaries, values, 0) - model = BOW(word_dict) + model = model_factory(args.model, word_dict) if args.opt == "Adam": opt = F.optimizer.Adam( - learning_rate=lr, + learning_rate=model.lr(steps_per_epoch=2250), parameter_list=model.parameters(), regularization=F.regularizer.L2Decay( regularization_coeff=args.weight_decay)) else: opt = AdamW( - learning_rate=lr, + learning_rate=model.lr(steps_per_epoch=2250), parameter_list=model.parameters(), weight_decay=args.weight_decay) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 8f6d9b87..37704832 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -12,100 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding -from paddle.fluid.dygraph import GRUUnit -from paddle.fluid.dygraph.base import to_variable +import sys +import os + import numpy as np +import argparse +from sklearn.metrics import f1_score, accuracy_score +import paddle as P +import paddle.fluid as F +import paddle.fluid.layers as L +import paddle.fluid.dygraph as D +from reader import ChnSentiCorp, pad_batch_data +from paddle_edl.distill.distill_reader import DistillReader +import re +import os +import sys +from paddle_serving_client import Client +from paddle_serving_app.reader import ChineseBertReader +from text_basic import LSTM -class DynamicGRU(fluid.dygraph.Layer): - def __init__(self, - size, - param_attr=None, - bias_attr=None, - is_reverse=False, - gate_activation='sigmoid', - candidate_activation='tanh', - h_0=None, - origin_mode=False, - init_size=None): - super(DynamicGRU, self).__init__() - self.gru_unit = GRUUnit( - size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - self.size = size - self.h_0 = h_0 - self.is_reverse = is_reverse - def forward(self, inputs): - hidden = self.h_0 - res = [] - for i in range(inputs.shape[1]): - if self.is_reverse: - i = inputs.shape[1] - 1 - i - input_ = inputs[:, i:i + 1, :] - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) - hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) - res.append(hidden_) - if self.is_reverse: - res = res[::-1] - res = fluid.layers.concat(res, axis=1) - return res +class LSTM(D.layer): + def __init__(self, word_dict): + super().__init__() + self.emb = 
D.Embedding(len(word_dict), 300) + self.lstm = LSTM(input_size=300, hidden_size=150) + self.fc = D.Linear(150, 2) -class GRU(fluid.dygraph.Layer): - def __init__(self, word_dict, batch_size=16, seq_len=256): - super(GRU, self).__init__() - self.dict_dim = len(word_dict) - self.emb_dim = 128 - self.hid_dim = 128 - self.fc_hid_dim = 96 - self.class_dim = 2 - self.batch_size = batch_size - self.seq_len = seq_len - self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False) - h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") - h_0 = to_variable(h_0) - self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) - self._fc2 = Linear( - input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") - self._fc_prediction = Linear( - input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") - self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) + def forward(self, ids, labels=None): + embbed = self.emb(ids) + lstm_out, self.hidden = self.lstm(embbed) + logits = self.fc(lstm_out[-1]) - def forward(self, inputs, label=None): - emb = self.embedding(inputs) - o_np_mask = to_variable( - inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') - mask_emb = fluid.layers.expand( - to_variable(o_np_mask), [1, self.hid_dim]) - emb = emb * mask_emb - emb = fluid.layers.reshape( - emb, shape=[self.batch_size, -1, self.hid_dim]) - fc_1 = self._fc1(emb) - gru_hidden = self._gru(fc_1) - gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) - tanh_1 = fluid.layers.tanh(gru_hidden) - fc_2 = self._fc2(tanh_1) - prediction = self._fc_prediction(fc_2) - if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - #acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction + if labels is not None: + if len(labels.shape) == 1: + labels = L.reshape(labels, [-1, 1]) + loss = L.softmax_with_cross_entropy(logits, labels) else: - return None, prediction + loss = None + + return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-3 diff --git a/example/distill/nlp/model.py b/example/distill/nlp/model.py index 24209cb4..f2f0dab4 100644 --- a/example/distill/nlp/model.py +++ b/example/distill/nlp/model.py @@ -30,6 +30,7 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader +from lstm import GRU class AdamW(F.optimizer.AdamOptimizer): @@ -105,6 +106,13 @@ def forward(self, ids, labels=None): loss = None return loss, logits + def lr(self, steps_per_epoch): + values = [1e-4, 1.5e-4, 2.5e-4, 4e-4] + boundaries = [ + steps_per_epoch * 2, steps_per_epoch * 4, steps_per_epoch * 6 + ] + return D.PiecewiseDecay(boundaries, values, 0) + class CNN(D.Layer): def __init__(self, word_dict): @@ -133,3 +141,17 @@ def forward(self, ids, labels=None): else: loss = None return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-4 + + +def model_factory(model_name, word_dict): + if model_name == "BOW": + return BOW(word_dict) + elif model_name == "CNN": + return CNN(word_dict) + elif model_name == "LSTM": + return GRU(word_dict) + else: + assert False, "not supported model name:{}".format(model_name) diff --git a/example/distill/nlp/test_train.sh b/example/distill/nlp/test_train.sh index 218ab84a..34ac5590 100755 --- a/example/distill/nlp/test_train.sh +++ b/example/distill/nlp/test_train.sh @@ -1,4 +1,4 @@ #!/bin/bash export 
LD_LIBRARY_PATH=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/ export CUDA_VISIBLE_DEVICES=7 -nohup python3.6 -u train.py > train_with_test.log 2>&1 & +nohup python3.6 -u train.py --model CNN > train_with_test.log 2>&1 & diff --git a/example/distill/nlp/text_basic.py b/example/distill/nlp/text_basic.py new file mode 100644 index 00000000..a4204f22 --- /dev/null +++ b/example/distill/nlp/text_basic.py @@ -0,0 +1,976 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import collections +import six +import sys +from functools import partial, reduce + +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.utils as utils +from paddle.fluid import layers +from paddle.fluid.layers import BeamSearchDecoder +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D +from paddle.fluid.data_feeder import convert_dtype + + +class RNNCell(Layer): + """ + RNNCell is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0, + batch_dim_idx=0): + """ + Generate initialized states according to provided shape, data type and + value. + + Parameters: + batch_ref: A (possibly nested structure of) tensor variable[s]. + The first dimension of the tensor will be used as batch size to + initialize states. + shape: A (possibly nested structure of) shape[s], where a shape is + represented as a list/tuple of integer). -1(for batch size) will + beautomatically inserted if shape is not started with it. If None, + property `state_shape` will be used. The default value is None. + dtype: A (possibly nested structure of) data type[s]. The structure + must be same as that of `shape`, except when all tensors' in states + has the same data type, a single data type can be used. If None and + property `cell.state_shape` is not available, float32 will be used + as the data type. The default value is None. + init_value: A float value used to initialize states. + batch_dim_idx: An integer indicating which dimension of the tensor in + inputs represents batch size. The default value is 0. + + Returns: + Variable: tensor variable[s] packed in the same structure provided \ + by shape, representing the initialized states. 
+ """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce( + lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), + states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: # use fp32 as default + states_dtypes = "float32" + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is represented + as a list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). + Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class BasicLSTMCell(RNNCell): + """ + Long-Short Term Memory(LSTM) RNN cell. + + The formula used is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size in the LSTM cell. + hidden_size (int): The hidden size in the LSTM cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. 
+ bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias(float, optional): forget bias used when computing forget gate. + Default 1.0 + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + forget_bias=1.0, + dtype='float32'): + super(BasicLSTMCell, self).__init__() + + self._hidden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + # TODO(guosheng): find better way to resolve constants in __init__ + self._forget_bias = layers.create_global_var( + shape=[1], dtype=dtype, value=forget_bias, persistable=True) + # TODO(guosheng): refine this if recurrent_op removes gradient require + self._forget_bias.stop_gradient = False + self._dtype = dtype + self._input_size = input_size + + self._weight = self.create_parameter( + attr=self._param_attr, + shape=[ + self._input_size + self._hidden_size, 4 * self._hidden_size + ], + dtype=self._dtype) + + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=[4 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step LSTM calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A list of containing two tensors, each shaped + `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` + in the formula. The data type should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula; `new_states` is a list containing \ + two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}, c_{t}` in the formula. The data type of these \ + tensors all is same as that of `states`. 
+ """ + pre_hidden, pre_cell = states + concat_input_hidden = layers.concat([inputs, pre_hidden], 1) + gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + gate_input = layers.elementwise_add(gate_input, self._bias) + i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) + new_cell = layers.elementwise_add( + layers.elementwise_mul( + pre_cell, + self._gate_activation( + layers.elementwise_add(f, self._forget_bias))), + layers.elementwise_mul( + self._gate_activation(i), self._activation(j))) + new_hidden = self._activation(new_cell) * self._gate_activation(o) + + return new_hidden, [new_hidden, new_cell] + + @property + def state_shape(self): + """ + The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` + (-1 for batch size would be automatically inserted into shape). These two + shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + """ + return [[self._hidden_size], [self._hidden_size]] + + +class BasicGRUCell(RNNCell): + """ + Gated Recurrent Unit (GRU) RNN cell. + + The formula for GRU used is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicGRUCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + dtype='float32'): + super(BasicGRUCell, self).__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + self._dtype = dtype + + if self._param_attr is not None and self._param_attr.name is not None: + gate_param_attr = copy.deepcopy(self._param_attr) + candidate_param_attr = copy.deepcopy(self._param_attr) + gate_param_attr.name += "_gate" + candidate_param_attr.name += "_candidate" + else: + gate_param_attr = self._param_attr + candidate_param_attr = self._param_attr + + self._gate_weight = self.create_parameter( + attr=gate_param_attr, + shape=[ + self._input_size + self._hidden_size, 2 * self._hidden_size + ], + dtype=self._dtype) + + self._candidate_weight = self.create_parameter( + attr=candidate_param_attr, + shape=[self._input_size + self._hidden_size, self._hidden_size], + dtype=self._dtype) + + if self._bias_attr is not None and self._bias_attr.name is not None: + gate_bias_attr = copy.deepcopy(self._bias_attr) + candidate_bias_attr = copy.deepcopy(self._bias_attr) + gate_bias_attr.name += "_gate" + candidate_bias_attr.name += "_candidate" + else: + gate_bias_attr = self._bias_attr + candidate_bias_attr = self._bias_attr + + self._gate_bias = self.create_parameter( + attr=gate_bias_attr, + shape=[2 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + self._candidate_bias = self.create_parameter( + attr=candidate_bias_attr, + shape=[self._hidden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step GRU calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. The data type + should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ + `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ + corresponding to :math:`h_t` in the formula. The data type of the \ + tensor is same as that of `states`. 
+ """ + pre_hidden = states + concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1) + + gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) + + gate_input = layers.elementwise_add(gate_input, self._gate_bias) + + gate_input = self._gate_activation(gate_input) + r, u = layers.split(gate_input, num_or_sections=2, dim=1) + + r_hidden = r * pre_hidden + + candidate = layers.matmul( + layers.concat([inputs, r_hidden], 1), self._candidate_weight) + candidate = layers.elementwise_add(candidate, self._candidate_bias) + + c = self._activation(candidate) + new_hidden = u * pre_hidden + (1 - u) * c + + return new_hidden, new_hidden + + @property + def state_shape(self): + """ + The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to :math:`h_{t-1}`. + """ + return [self._hidden_size] + + +class RNN(Layer): + """ + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) if time_major else (0, + 1) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. 
+ + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = fluid.layers.elementwise_mul( + new_state, step_mask, + axis=0) - fluid.layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state + + flat_inputs = flatten(inputs) + batch_size, time_steps = ( + flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) + + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, batch_dim_idx=self.batch_index) + + if not self.time_major: + inputs = map_structure( + lambda x: fluid.layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) + + if sequence_length is not None: + mask = fluid.layers.sequence_mask( + sequence_length, + maxlen=time_steps, + dtype=flatten(initial_states)[0].dtype) + mask = fluid.layers.transpose(mask, [1, 0]) + + if self.is_reverse: + inputs = map_structure( + lambda x: fluid.layers.reverse(x, axis=[0]), inputs) + mask = fluid.layers.reverse( + mask, axis=[0]) if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, axis=self.time_step_index + ), outputs) + + if self.is_reverse: + final_outputs = map_structure( + lambda x: fluid.layers.reverse(x, axis=self.time_step_index + ), final_outputs) + + final_states = new_states + else: + final_outputs, final_states = fluid.layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class StackedRNNCell(RNNCell): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. + + Parameters: + cells (list|tuple): List of RNN cell instances. + + Examples: + + .. 
code-block:: python + + from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell + + cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) + """ + + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + + def forward(self, inputs, states, **kwargs): + """ + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. + """ + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state, **kwargs) + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % + n) + param_attrs = [ + fluid.ParamAttr._to_attr(attr) for attr in param_attr + ] + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + + Returns: + list: A list composed of each including cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedLSTMCell(RNNCell): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. 
+ gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + if forget_bias is True: + bias_attrs[ + i].initializer = fluid.initializer.NumpyArrayInitializer( + np.concatenate( + np.zeros(2 * hidden_size), + np.ones(hidden_size), np.zeros(hidden_size)) + .astype(dtype)) + forget_bias = 0.0 + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + forget_bias=forget_bias, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. 
+ **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + + Returns: + list: A list composed of each including LSTM cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. 
+ bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, is_reverse, time_major) + + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. + + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. 
+ """ + return self.lstm(inputs, initial_states, sequence_length) diff --git a/example/distill/nlp/train.py b/example/distill/nlp/train.py index 5f9a85cc..d99c1d91 100644 --- a/example/distill/nlp/train.py +++ b/example/distill/nlp/train.py @@ -29,15 +29,21 @@ import os import sys -from model import CNN, AdamW, evaluate_student, BOW +from model import CNN, AdamW, evaluate_student, BOW, model_factory g_max_dev_acc = [] g_max_test_acc = [] +parser = argparse.ArgumentParser(__doc__) +parser.add_argument( + "--model", type=str, default="BOW", help="student model name") +args = parser.parse_args() +print("parsed args:", args) + def train_without_distill(train_reader, dev_reader, test_reader, word_dict, epoch_num, lr): - model = BOW(word_dict) + model = model_factory(args.model, word_dict) opt = AdamW( learning_rate=lr, parameter_list=model.parameters(), weight_decay=0.01) model.train() From c058d3660b2aa7748653def3cb09221e17268f21 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 13:25:11 +0000 Subject: [PATCH 7/8] add --- example/distill/nlp/nets.py | 287 ++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 example/distill/nlp/nets.py diff --git a/example/distill/nlp/nets.py b/example/distill/nlp/nets.py new file mode 100644 index 00000000..717f6635 --- /dev/null +++ b/example/distill/nlp/nets.py @@ -0,0 +1,287 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding +from paddle.fluid.dygraph import GRUUnit +from paddle.fluid.dygraph.base import to_variable +import numpy as np + + +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + + +class SimpleConvPool(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + use_cudnn=False, + batch_size=None): + super(SimpleConvPool, self).__init__() + self.batch_size = batch_size + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + padding=[1, 1], + use_cudnn=use_cudnn, + act='tanh') + + def forward(self, inputs): + x = self._conv2d(inputs) + x = fluid.layers.reduce_max(x, dim=-1) + x = fluid.layers.reshape(x, shape=[self.batch_size, -1]) + return x + + +class CNN(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(CNN, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.channels = 1 + self.win_size = [3, self.hid_dim] + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + is_sparse=False) + self._simple_conv_pool_1 = SimpleConvPool( + self.channels, + self.hid_dim, + self.win_size, + batch_size=self.batch_size) + self._fc1 = Linear( + input_dim=self.hid_dim * self.seq_len, + output_dim=self.fc_hid_dim, + act="softmax") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = ( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[-1, self.channels, self.seq_len, self.hid_dim]) + conv_3 = self._simple_conv_pool_1(emb) + fc_1 = self._fc1(conv_3) + prediction = self._fc_prediction(fc_1) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class BOW(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(BOW, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + 
self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + is_sparse=False) + self._fc1 = Linear( + input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh") + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = ( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim]) + bow_1 = fluid.layers.reduce_sum(emb, dim=1) + bow_1 = fluid.layers.tanh(bow_1) + fc_1 = self._fc1(bow_1) + fc_2 = self._fc2(fc_1) + prediction = self._fc_prediction(fc_2) + if label is not None: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class GRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(GRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = to_variable( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[self.batch_size, -1, self.hid_dim]) + fc_1 = self._fc1(emb) + gru_hidden = self._gru(fc_1) + gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) + tanh_1 = fluid.layers.tanh(gru_hidden) + fc_2 = self._fc2(tanh_1) + prediction = self._fc_prediction(fc_2) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class BiGRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(BiGRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim * 2, 
output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
+        self._gru_forward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=False)
+        self._gru_backward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=True)
+
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = to_variable(
+            inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32')
+        mask_emb = fluid.layers.expand(
+            to_variable(o_np_mask), [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
+        fc_1 = self._fc1(emb)
+        gru_forward = self._gru_forward(fc_1)
+        gru_backward = self._gru_backward(fc_1)
+        gru_forward_tanh = fluid.layers.tanh(gru_forward)
+        gru_backward_tanh = fluid.layers.tanh(gru_backward)
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward_tanh, gru_backward_tanh], axis=2)
+        encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1)
+        fc_2 = self._fc2(encoded_vector)
+        prediction = self._fc_prediction(fc_2)
+        if label:
+            cost = fluid.layers.cross_entropy(input=prediction, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc = fluid.layers.accuracy(input=prediction, label=label)
+            return avg_cost, prediction, acc
+        else:
+            return prediction

From 2d5ac20e522c1ba6c0caf62cb414a9229bb99f99 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 29 Jun 2020 06:04:21 +0000
Subject: [PATCH 8/8] fix some test=develop

---
 example/distill/nlp/distill.py | 48 ++++++++++++++++++--------------
 example/distill/nlp/lstm.py    | 18 ++++++++----
 example/distill/nlp/model.py   | 12 ++++++--
 example/distill/nlp/nets.py    | 41 ++++++++++++++++------------
 example/distill/nlp/train.py   | 50 ++++++++++++++++++++--------------
 5 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/example/distill/nlp/distill.py b/example/distill/nlp/distill.py
index 3b684241..d2872aca 100644
--- a/example/distill/nlp/distill.py
+++ b/example/distill/nlp/distill.py
@@ -53,7 +53,7 @@ parser.add_argument(
 parser.add_argument(
     "--use_data_au", type=int, default=1, help="use data augmentation")
 parser.add_argument(
-    "--T", type=float, default=2.0, help="weight of student in loss")
+    "--T", type=float, default=None, help="temperature of the distillation loss")
 parser.add_argument(
     "--model", type=str, default="BOW", help="student model name")
 args = parser.parse_args()
@@ -100,30 +100,34 @@ def train_with_distill(train_reader, dev_reader, word_dict, test_reader,
             ) * loss_kd
         else:
             loss_kd = KL_T(logits_s, logits_t, args.T)
-        loss = args.T * args.T * (args.s_weight * loss_ce +
-                                  (1.0 - args.s_weight) * loss_kd)
+        loss = args.T * args.T * (loss_ce + loss_kd)
+        #loss_kd = KL(logits_s, logits_t)
+        #loss = loss_ce + loss_kd
 
         loss = L.reduce_mean(loss)
         loss.backward()
-        if step % 10 == 0:
+        if step % 100 == 0:
+            print("student logits:", logits_s)
+            print("teacher logits:", logits_t)
             print('[step %03d] distill train loss %.5f lr %.3e' %
                   (step, loss.numpy(), opt.current_step_lr()))
         opt.minimize(loss)
         model.clear_gradients()
     f1, acc = evaluate_student(model, dev_reader)
-    print('student on dev f1 %.5f acc %.5f' % (f1, acc))
+    print('student on dev f1 %.5f acc %.5f epoch_no %d' % (f1, acc, epoch))
 
     if max_dev_acc < acc:
         max_dev_acc = acc
 
     f1, acc = evaluate_student(model, test_reader)
-    print('student on test f1 %.5f acc %.5f' % (f1, acc))
+    print('student on test f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_test_acc < acc:
         max_test_acc = acc
 
-    
g_max_dev_acc.append(g_max_dev_acc) - g_max_test_acc.append(g_max_test_acc) + g_max_dev_acc.append(max_dev_acc) + g_max_test_acc.append(max_test_acc) def ernie_reader(s_reader, key_list): @@ -154,10 +158,7 @@ def reader(): return reader -if __name__ == "__main__": - place = F.CUDAPlace(0) - D.guard(place).__enter__() - +def train(): ds = ChnSentiCorp() word_dict = ds.student_word_dict("./data/vocab.bow.txt") batch_size = 16 @@ -194,14 +195,21 @@ def reader(): input_files, word_dict, batch_size=batch_size) dr_t = dr.set_batch_generator(ernie_reader(dr_train_reader, feed_keys)) + train_with_distill( + dr_t, dev_reader, word_dict, test_reader, epoch_num=args.epoch_num) + + +if __name__ == "__main__": + place = F.CUDAPlace(0) + D.guard(place).__enter__() + for i in range(args.train_range): - train_with_distill( - dr_t, dev_reader, word_dict, test_reader, epoch_num=args.epoch_num) + train() - arr = np.array(g_max_dev_acc) - print("max_dev_acc:", arr, "average:", np.average(arr), "train_args:", - args) + arr = np.array(g_max_dev_acc) + print("max_dev_acc:", arr, "average:", np.average(arr), "train_args:", + args) - arr = np.array(g_max_test_acc) - print("max_test_acc:", arr, "average:", np.average(arr), "train_args:", - args) + arr = np.array(g_max_test_acc) + print("max_test_acc:", arr, "average:", np.average(arr), "train_args:", + args) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 37704832..bae8b253 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -30,21 +30,27 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from text_basic import LSTM +from text_basic import LSTM as basic_lstm -class LSTM(D.layer): +class LSTM(D.Layer): def __init__(self, word_dict): super().__init__() - self.emb = D.Embedding(len(word_dict), 300) - self.lstm = LSTM(input_size=300, hidden_size=150) + self.emb = D.Embedding([len(word_dict), 300]) + self.lstm = basic_lstm(input_size=300, hidden_size=150) self.fc = D.Linear(150, 2) def forward(self, ids, labels=None): embbed = self.emb(ids) - lstm_out, self.hidden = self.lstm(embbed) - logits = self.fc(lstm_out[-1]) + #print("embed shape:", embbed.shape) + + lstm_out, hidden = self.lstm(embbed) + #print("lstm_out shape:", lstm_out.shape) + #print("hiden list len:", len(hidden)) + + logits = self.fc(lstm_out[:, -1]) + #print("logits shape:", logits.shape) if labels is not None: if len(labels.shape) == 1: diff --git a/example/distill/nlp/model.py b/example/distill/nlp/model.py index f2f0dab4..c288f65b 100644 --- a/example/distill/nlp/model.py +++ b/example/distill/nlp/model.py @@ -30,7 +30,8 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from lstm import GRU +from lstm import LSTM +from nets import GRU class AdamW(F.optimizer.AdamOptimizer): @@ -67,12 +68,15 @@ def KL_T(logits_s, logits_t, T=2.0): return loss -def evaluate_student(model, test_reader): +def evaluate_student(model, test_reader, batch_size=None): all_pred, all_label = [], [] with D.base._switch_tracer_mode_guard_(is_train=False): model.eval() for step, (ids_student, labels, _) in enumerate(test_reader()): - _, logits = model(ids_student) + if batch_size is not None: + _, logits = model(ids_student, batch_size=batch_size) + else: + _, logits = model(ids_student) pred = L.argmax(logits, -1) all_pred.extend(pred.numpy()) all_label.extend(labels.numpy()) @@ -152,6 +156,8 @@ def model_factory(model_name, word_dict): elif model_name == 
"CNN": return CNN(word_dict) elif model_name == "LSTM": + return LSTM(word_dict) + elif model_name == "GRU": return GRU(word_dict) else: assert False, "not supported model name:{}".format(model_name) diff --git a/example/distill/nlp/nets.py b/example/distill/nlp/nets.py index 717f6635..dc66ee96 100644 --- a/example/distill/nlp/nets.py +++ b/example/distill/nlp/nets.py @@ -15,6 +15,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding from paddle.fluid.dygraph import GRUUnit from paddle.fluid.dygraph.base import to_variable +import paddle.fluid.layers as L import numpy as np @@ -181,9 +182,9 @@ def forward(self, inputs, label=None): class GRU(fluid.dygraph.Layer): - def __init__(self, dict_dim, batch_size, seq_len): + def __init__(self, word_dict, batch_size=16, seq_len=256): super(GRU, self).__init__() - self.dict_dim = dict_dim + self.dict_dim = len(word_dict) self.emb_dim = 128 self.hid_dim = 128 self.fc_hid_dim = 96 @@ -206,13 +207,17 @@ def __init__(self, dict_dim, batch_size, seq_len): act="softmax") self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) - def forward(self, inputs, label=None): + def forward(self, inputs, labels=None): emb = self.embedding(inputs) + """ o_np_mask = to_variable( inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') mask_emb = fluid.layers.expand( to_variable(o_np_mask), [1, self.hid_dim]) emb = emb * mask_emb + """ + pad_mask = L.unsqueeze(L.cast(inputs != 0, 'float32'), [-1]) + emb = emb * pad_mask emb = fluid.layers.reshape( emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) @@ -221,13 +226,16 @@ def forward(self, inputs, label=None): tanh_1 = fluid.layers.tanh(gru_hidden) fc_2 = self._fc2(tanh_1) prediction = self._fc_prediction(fc_2) - if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction, acc + if labels is not None: + cost = fluid.layers.cross_entropy(input=prediction, label=labels) + #avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return cost, prediction else: - return prediction + return None, prediction + + def lr(self, steps_per_epoch=None): + return 1e-3 class BiGRU(fluid.dygraph.Layer): @@ -251,9 +259,7 @@ def __init__(self, dict_dim, batch_size, seq_len): self._fc2 = Linear( input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh") self._fc_prediction = Linear( - input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") + input_dim=self.fc_hid_dim, output_dim=self.class_dim) self._gru_forward = DynamicGRU( size=self.hid_dim, h_0=h_0, is_reverse=False) self._gru_backward = DynamicGRU( @@ -279,9 +285,10 @@ def forward(self, inputs, label=None): fc_2 = self._fc2(encoded_vector) prediction = self._fc_prediction(fc_2) if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction, acc + cost = fluid.layers.softmax_cross_entropy( + input=prediction, label=label) + #avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction else: - return prediction + return None, prediction diff --git a/example/distill/nlp/train.py b/example/distill/nlp/train.py index d99c1d91..4326f9b6 100644 --- a/example/distill/nlp/train.py +++ b/example/distill/nlp/train.py @@ -37,15 
+37,20 @@
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument(
     "--model", type=str, default="BOW", help="student model name")
+parser.add_argument(
+    "--epoch_num", type=int, default=10, help="number of training epochs")
+parser.add_argument("--train_range", type=int, default=10, help="number of training runs")
 args = parser.parse_args()
 print("parsed args:", args)
 
 
 def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
-                          epoch_num, lr):
+                          epoch_num):
     model = model_factory(args.model, word_dict)
     opt = AdamW(
-        learning_rate=lr, parameter_list=model.parameters(), weight_decay=0.01)
+        learning_rate=model.lr(),
+        parameter_list=model.parameters(),
+        weight_decay=0.01)
     model.train()
 
     max_dev_acc = 0.0
@@ -62,13 +67,15 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
         opt.minimize(loss)
         model.clear_gradients()
     f1, acc = evaluate_student(model, dev_reader)
-    print('train_without_distill on dev f1 %.5f acc %.5f' % (f1, acc))
+    print('train_without_distill on dev f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_dev_acc < acc:
         max_dev_acc = acc
 
     f1, acc = evaluate_student(model, test_reader)
-    print('train_without_distill on test f1 %.5f acc %.5f' % (f1, acc))
+    print('train_without_distill on test f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_test_acc < acc:
         max_test_acc = acc
@@ -77,10 +84,7 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
     g_max_test_acc.append(max_test_acc)
 
 
-if __name__ == "__main__":
-    place = F.CUDAPlace(0)
-    D.guard(place).__enter__()
-
+def train():
     ds = ChnSentiCorp()
     word_dict = ds.student_word_dict("./data/vocab.bow.txt")
     batch_size = 16
@@ -93,17 +97,23 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
     test_reader = ds.pad_batch_reader(
         "./data/test.part.0", word_dict, batch_size=batch_size)
 
-    for i in range(10):
-        train_without_distill(
-            train_reader,
-            dev_reader,
-            test_reader,
-            word_dict,
-            epoch_num=10,
-            lr=1e-4)
+    train_without_distill(
+        train_reader,
+        dev_reader,
+        test_reader,
+        word_dict,
+        epoch_num=args.epoch_num)
+
+
+if __name__ == "__main__":
+    place = F.CUDAPlace(0)
+    D.guard(place).__enter__()
+
+    for i in range(args.train_range):
+        train()
 
-    arr = np.array(g_max_dev_acc)
-    print("max_dev_acc:", arr, "average:", np.average(arr))
+    arr = np.array(g_max_dev_acc)
+    print("max_dev_acc:", arr, "average:", np.average(arr))
 
-    arr = np.array(g_max_test_acc)
-    print("max_test_acc:", arr, "average:", np.average(arr))
+    arr = np.array(g_max_test_acc)
+    print("max_test_acc:", arr, "average:", np.average(arr))
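
Note on the distillation loss that PATCH 8/8 switches to: distill.py now computes loss_kd = KL_T(logits_s, logits_t, args.T) and scales the combined loss by args.T * args.T before reduce_mean. As a minimal illustrative sketch only (it assumes a standard temperature-scaled KL formulation; the helper name kl_with_temperature and its internals below are not taken from the repo's KL_T in model.py), such a soft-target term is typically built like this:

    # Sketch of a temperature-scaled KL distillation term (assumed formulation,
    # not the repo's KL_T implementation).
    import paddle.fluid.layers as L

    def kl_with_temperature(logits_s, logits_t, T):
        # Soften teacher and student distributions with temperature T.
        p_t = L.softmax(logits_t / T)
        log_p_s = L.log(L.softmax(logits_s / T) + 1e-8)
        log_p_t = L.log(p_t + 1e-8)
        # Per-sample KL(teacher || student), summed over classes.
        return L.reduce_sum(p_t * (log_p_t - log_p_s), dim=-1)

In the classic formulation the outer T * T factor compensates for the 1/T^2 scaling that the temperature puts on the soft-target gradients; here the patch applies that factor to the whole (loss_ce + loss_kd) sum.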