From 58406fd8272cad14730dc7cbb5dfd576c48bc703 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 8 May 2020 07:29:41 +0000 Subject: [PATCH 1/8] add --- .github/issue_template.md | 2 +- .github/pull_request_template.md | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/issue_template.md b/.github/issue_template.md index b0ea00dc..8dea127e 100644 --- a/.github/issue_template.md +++ b/.github/issue_template.md @@ -22,7 +22,7 @@ about: 您可以提问训练中报错、应用、出core等问题。 You could u - 复现信息:如为报错,请给出复现环境、复现步骤 - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 -Thank you for contributing to PaddlePaddle. +Thank you for contributing to EDL. Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. If there is no solution,please make sure that this is a training issue including the following details: **System information** diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 515a4f04..08b6f28c 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,12 +1,11 @@ -# What this PR does / why we need it: +## What this PR does / why we need it: -# Which issue(s) this PR fixes: +## Which issue(s) this PR fixes: -## Fixes # +### Fixes # -# Special notes for your reviewer: +## Special notes for your reviewer: -# Does this PR introduce a user-facing change?: +## Does this PR introduce a user-facing change? - -# Additional documentation? +## Additional documentation? From 78e7c8f479e90069aff24e0832c1d5c6aa223340 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Mon, 11 May 2020 09:12:13 +0000 Subject: [PATCH 2/8] merge --- .github/issue_template.md | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/issue_template.md b/.github/issue_template.md index dbeda3fa..871957cc 100644 --- a/.github/issue_template.md +++ b/.github/issue_template.md @@ -17,7 +17,6 @@ about: 您可以提问训练中报错、应用、出core等问题。 You could u - 复现信息:如为报错,请给出复现环境、复现步骤 - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 - Thank you for contributing to EDL. Before submitting the issue, you could search the issue in the GitHub in case that there was a similar issue submitted or resolved before. If there is no solution, please make sure that this is a training issue including the following details: From 59338c82d3b605569673bde1b7af68bce7b14378 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:14:53 +0000 Subject: [PATCH 3/8] add lstm --- example/distill/nlp/lstm.py | 59 +++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 example/distill/nlp/lstm.py diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py new file mode 100644 index 00000000..43840cb8 --- /dev/null +++ b/example/distill/nlp/lstm.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os + +import numpy as np +import argparse +from sklearn.metrics import f1_score, accuracy_score +import paddle as P +import paddle.fluid as F +import paddle.fluid.layers as L +import paddle.fluid.dygraph as D +from reader import ChnSentiCorp, pad_batch_data +from paddle_edl.distill.distill_reader import DistillReader +import re + +import os +import sys +from paddle_serving_client import Client +from paddle_serving_app.reader import ChineseBertReader +from paddle.incubate.hapi.text import LSTM + + +class LSTM(D.layer): + def __init__(self, word_dict): + super().__init__() + + self.emb = D.Embedding(len(word_dict), 300) + self.lstm = LSTM(input_size=300, hidden_size=150) + self.fc = D.Linear(150, 2) + + def forward(self, ids, labels=None): + embbed = self.emb(ids) + lstm_out, self.hidden = self.lstm(embbed) + logits = self.fc(lstm_out[-1]) + + if labels is not None: + if len(labels.shape) == 1: + labels = L.reshape(labels, [-1, 1]) + loss = L.softmax_with_cross_entropy(logits, labels) + else: + loss = None + + return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-3 From 067dd4e8d68556064fc016750343c3f7b941e40e Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:19:42 +0000 Subject: [PATCH 4/8] add lstm --- example/distill/nlp/lstm.py | 126 +++++++++++++++++++++++++----------- 1 file changed, 89 insertions(+), 37 deletions(-) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 43840cb8..181207c8 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -12,48 +12,100 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import os - +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding +from paddle.fluid.dygraph import GRUUnit +from paddle.fluid.dygraph.base import to_variable import numpy as np -import argparse -from sklearn.metrics import f1_score, accuracy_score -import paddle as P -import paddle.fluid as F -import paddle.fluid.layers as L -import paddle.fluid.dygraph as D -from reader import ChnSentiCorp, pad_batch_data -from paddle_edl.distill.distill_reader import DistillReader -import re -import os -import sys -from paddle_serving_client import Client -from paddle_serving_app.reader import ChineseBertReader -from paddle.incubate.hapi.text import LSTM +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse -class LSTM(D.layer): - def __init__(self, word_dict): - super().__init__() + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res - self.emb = 
D.Embedding(len(word_dict), 300) - self.lstm = LSTM(input_size=300, hidden_size=150) - self.fc = D.Linear(150, 2) - def forward(self, ids, labels=None): - embbed = self.emb(ids) - lstm_out, self.hidden = self.lstm(embbed) - logits = self.fc(lstm_out[-1]) +class GRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(GRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) - if labels is not None: - if len(labels.shape) == 1: - labels = L.reshape(labels, [-1, 1]) - loss = L.softmax_with_cross_entropy(logits, labels) + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = to_variable( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[self.batch_size, -1, self.hid_dim]) + fc_1 = self._fc1(emb) + gru_hidden = self._gru(fc_1) + gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) + tanh_1 = fluid.layers.tanh(gru_hidden) + fc_2 = self._fc2(tanh_1) + prediction = self._fc_prediction(fc_2) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction else: - loss = None - - return loss, logits - - def lr(self, steps_per_epoch=None): - return 1e-3 + return None, prediction From 5444d004a0e4da6264b405fa824652e6e9ea5ace Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:29:20 +0000 Subject: [PATCH 5/8] add --- example/distill/nlp/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 181207c8..8f6d9b87 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -62,9 +62,9 @@ def forward(self, inputs): class GRU(fluid.dygraph.Layer): - def __init__(self, dict_dim, batch_size, seq_len): + def __init__(self, word_dict, batch_size=16, seq_len=256): super(GRU, self).__init__() - self.dict_dim = dict_dim + self.dict_dim = len(word_dict) self.emb_dim = 128 self.hid_dim = 128 self.fc_hid_dim = 96 From 00da2739aa0d2480ce4f9d4c49f89f123eee614b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 08:37:11 +0000 Subject: [PATCH 6/8] add --- example/distill/nlp/distill.py | 13 +- example/distill/nlp/lstm.py | 126 ++-- example/distill/nlp/model.py | 22 + example/distill/nlp/test_train.sh | 2 +- example/distill/nlp/text_basic.py | 976 ++++++++++++++++++++++++++++++ example/distill/nlp/train.py | 10 +- 6 files changed, 1050 insertions(+), 99 deletions(-) create mode 100644 example/distill/nlp/text_basic.py diff --git a/example/distill/nlp/distill.py b/example/distill/nlp/distill.py index 8d4d3c2e..3b684241 100644 --- 
a/example/distill/nlp/distill.py +++ b/example/distill/nlp/distill.py @@ -30,7 +30,7 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from model import CNN, AdamW, evaluate_student, KL, BOW, KL_T +from model import CNN, AdamW, evaluate_student, KL, BOW, KL_T, model_factory parser = argparse.ArgumentParser(__doc__) parser.add_argument( @@ -54,6 +54,8 @@ "--use_data_au", type=int, default=1, help="use data augmentation") parser.add_argument( "--T", type=float, default=2.0, help="weight of student in loss") +parser.add_argument( + "--model", type=str, default="BOW", help="student model name") args = parser.parse_args() print("parsed args:", args) @@ -63,19 +65,16 @@ def train_with_distill(train_reader, dev_reader, word_dict, test_reader, epoch_num): - boundaries = [2250 * 2, 2250 * 4, 2250 * 6] - values = [1e-4, 1.5e-4, 2.5e-4, 4e-4] - lr = D.PiecewiseDecay(boundaries, values, 0) - model = BOW(word_dict) + model = model_factory(args.model, word_dict) if args.opt == "Adam": opt = F.optimizer.Adam( - learning_rate=lr, + learning_rate=model.lr(steps_per_epoch=2250), parameter_list=model.parameters(), regularization=F.regularizer.L2Decay( regularization_coeff=args.weight_decay)) else: opt = AdamW( - learning_rate=lr, + learning_rate=model.lr(steps_per_epoch=2250), parameter_list=model.parameters(), weight_decay=args.weight_decay) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 8f6d9b87..37704832 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -12,100 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding -from paddle.fluid.dygraph import GRUUnit -from paddle.fluid.dygraph.base import to_variable +import sys +import os + import numpy as np +import argparse +from sklearn.metrics import f1_score, accuracy_score +import paddle as P +import paddle.fluid as F +import paddle.fluid.layers as L +import paddle.fluid.dygraph as D +from reader import ChnSentiCorp, pad_batch_data +from paddle_edl.distill.distill_reader import DistillReader +import re +import os +import sys +from paddle_serving_client import Client +from paddle_serving_app.reader import ChineseBertReader +from text_basic import LSTM -class DynamicGRU(fluid.dygraph.Layer): - def __init__(self, - size, - param_attr=None, - bias_attr=None, - is_reverse=False, - gate_activation='sigmoid', - candidate_activation='tanh', - h_0=None, - origin_mode=False, - init_size=None): - super(DynamicGRU, self).__init__() - self.gru_unit = GRUUnit( - size * 3, - param_attr=param_attr, - bias_attr=bias_attr, - activation=candidate_activation, - gate_activation=gate_activation, - origin_mode=origin_mode) - self.size = size - self.h_0 = h_0 - self.is_reverse = is_reverse - def forward(self, inputs): - hidden = self.h_0 - res = [] - for i in range(inputs.shape[1]): - if self.is_reverse: - i = inputs.shape[1] - 1 - i - input_ = inputs[:, i:i + 1, :] - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False) - hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False) - res.append(hidden_) - if self.is_reverse: - res = res[::-1] - res = fluid.layers.concat(res, axis=1) - return res +class LSTM(D.layer): + def __init__(self, word_dict): + super().__init__() + self.emb = 
D.Embedding(len(word_dict), 300) + self.lstm = LSTM(input_size=300, hidden_size=150) + self.fc = D.Linear(150, 2) -class GRU(fluid.dygraph.Layer): - def __init__(self, word_dict, batch_size=16, seq_len=256): - super(GRU, self).__init__() - self.dict_dim = len(word_dict) - self.emb_dim = 128 - self.hid_dim = 128 - self.fc_hid_dim = 96 - self.class_dim = 2 - self.batch_size = batch_size - self.seq_len = seq_len - self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False) - h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") - h_0 = to_variable(h_0) - self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) - self._fc2 = Linear( - input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") - self._fc_prediction = Linear( - input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") - self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) + def forward(self, ids, labels=None): + embbed = self.emb(ids) + lstm_out, self.hidden = self.lstm(embbed) + logits = self.fc(lstm_out[-1]) - def forward(self, inputs, label=None): - emb = self.embedding(inputs) - o_np_mask = to_variable( - inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') - mask_emb = fluid.layers.expand( - to_variable(o_np_mask), [1, self.hid_dim]) - emb = emb * mask_emb - emb = fluid.layers.reshape( - emb, shape=[self.batch_size, -1, self.hid_dim]) - fc_1 = self._fc1(emb) - gru_hidden = self._gru(fc_1) - gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) - tanh_1 = fluid.layers.tanh(gru_hidden) - fc_2 = self._fc2(tanh_1) - prediction = self._fc_prediction(fc_2) - if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - #acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction + if labels is not None: + if len(labels.shape) == 1: + labels = L.reshape(labels, [-1, 1]) + loss = L.softmax_with_cross_entropy(logits, labels) else: - return None, prediction + loss = None + + return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-3 diff --git a/example/distill/nlp/model.py b/example/distill/nlp/model.py index 24209cb4..f2f0dab4 100644 --- a/example/distill/nlp/model.py +++ b/example/distill/nlp/model.py @@ -30,6 +30,7 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader +from lstm import GRU class AdamW(F.optimizer.AdamOptimizer): @@ -105,6 +106,13 @@ def forward(self, ids, labels=None): loss = None return loss, logits + def lr(self, steps_per_epoch): + values = [1e-4, 1.5e-4, 2.5e-4, 4e-4] + boundaries = [ + steps_per_epoch * 2, steps_per_epoch * 4, steps_per_epoch * 6 + ] + return D.PiecewiseDecay(boundaries, values, 0) + class CNN(D.Layer): def __init__(self, word_dict): @@ -133,3 +141,17 @@ def forward(self, ids, labels=None): else: loss = None return loss, logits + + def lr(self, steps_per_epoch=None): + return 1e-4 + + +def model_factory(model_name, word_dict): + if model_name == "BOW": + return BOW(word_dict) + elif model_name == "CNN": + return CNN(word_dict) + elif model_name == "LSTM": + return GRU(word_dict) + else: + assert False, "not supported model name:{}".format(model_name) diff --git a/example/distill/nlp/test_train.sh b/example/distill/nlp/test_train.sh index 218ab84a..34ac5590 100755 --- a/example/distill/nlp/test_train.sh +++ b/example/distill/nlp/test_train.sh @@ -1,4 +1,4 @@ #!/bin/bash export 
LD_LIBRARY_PATH=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/ export CUDA_VISIBLE_DEVICES=7 -nohup python3.6 -u train.py > train_with_test.log 2>&1 & +nohup python3.6 -u train.py --model CNN > train_with_test.log 2>&1 & diff --git a/example/distill/nlp/text_basic.py b/example/distill/nlp/text_basic.py new file mode 100644 index 00000000..a4204f22 --- /dev/null +++ b/example/distill/nlp/text_basic.py @@ -0,0 +1,976 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import collections +import six +import sys +from functools import partial, reduce + +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers.utils as utils +from paddle.fluid import layers +from paddle.fluid.layers import BeamSearchDecoder +from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as +from paddle.fluid.dygraph import Layer, Embedding, Linear, LayerNorm, GRUUnit, Conv2D, Pool2D +from paddle.fluid.data_feeder import convert_dtype + + +class RNNCell(Layer): + """ + RNNCell is the base class for abstraction representing the calculations + mapping the input and state to the output and new state. It is suitable to + and mostly used in RNN. + """ + + def get_initial_states(self, + batch_ref, + shape=None, + dtype=None, + init_value=0, + batch_dim_idx=0): + """ + Generate initialized states according to provided shape, data type and + value. + + Parameters: + batch_ref: A (possibly nested structure of) tensor variable[s]. + The first dimension of the tensor will be used as batch size to + initialize states. + shape: A (possibly nested structure of) shape[s], where a shape is + represented as a list/tuple of integer). -1(for batch size) will + beautomatically inserted if shape is not started with it. If None, + property `state_shape` will be used. The default value is None. + dtype: A (possibly nested structure of) data type[s]. The structure + must be same as that of `shape`, except when all tensors' in states + has the same data type, a single data type can be used. If None and + property `cell.state_shape` is not available, float32 will be used + as the data type. The default value is None. + init_value: A float value used to initialize states. + batch_dim_idx: An integer indicating which dimension of the tensor in + inputs represents batch size. The default value is 0. + + Returns: + Variable: tensor variable[s] packed in the same structure provided \ + by shape, representing the initialized states. 
+ """ + # TODO: use inputs and batch_size + batch_ref = flatten(batch_ref)[0] + + def _is_shape_sequence(seq): + if sys.version_info < (3, ): + integer_types = ( + int, + long, ) + else: + integer_types = (int, ) + """For shape, list/tuple of integer is the finest-grained objection""" + if (isinstance(seq, list) or isinstance(seq, tuple)): + if reduce( + lambda flag, x: isinstance(x, integer_types) and flag, + seq, True): + return False + # TODO: Add check for the illegal + if isinstance(seq, dict): + return True + return (isinstance(seq, collections.Sequence) and + not isinstance(seq, six.string_types)) + + class Shape(object): + def __init__(self, shape): + self.shape = shape if shape[0] == -1 else ([-1] + list(shape)) + + # nested structure of shapes + states_shapes = self.state_shape if shape is None else shape + is_sequence_ori = utils.is_sequence + utils.is_sequence = _is_shape_sequence + states_shapes = map_structure(lambda shape: Shape(shape), + states_shapes) + utils.is_sequence = is_sequence_ori + + # nested structure of dtypes + try: + states_dtypes = self.state_dtype if dtype is None else dtype + except NotImplementedError: # use fp32 as default + states_dtypes = "float32" + if len(flatten(states_dtypes)) == 1: + dtype = flatten(states_dtypes)[0] + states_dtypes = map_structure(lambda shape: dtype, states_shapes) + + init_states = map_structure( + lambda shape, dtype: fluid.layers.fill_constant_batch_size_like( + input=batch_ref, + shape=shape.shape, + dtype=dtype, + value=init_value, + input_dim_idx=batch_dim_idx), states_shapes, states_dtypes) + return init_states + + @property + def state_shape(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) shape[s], where a shape is represented + as a list/tuple of integers (-1 for batch size would be automatically + inserted into a shape if shape is not started with it). + Not necessary to be implemented if states are not initialized by + `get_initial_states` or the `shape` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_shape` in the used cell.") + + @property + def state_dtype(self): + """ + Abstract method (property). + Used to initialize states. + A (possiblely nested structure of) data types[s]. The structure must be + same as that of `shape`, except when all tensors' in states has the same + data type, a signle data type can be used. + Not necessary to be implemented if states are not initialized + by `get_initial_states` or the `dtype` argument is provided when using + `get_initial_states`. + """ + raise NotImplementedError( + "Please add implementaion for `state_dtype` in the used cell.") + + +class BasicLSTMCell(RNNCell): + """ + Long-Short Term Memory(LSTM) RNN cell. + + The formula used is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size in the LSTM cell. + hidden_size (int): The hidden size in the LSTM cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. 
+ bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias(float, optional): forget bias used when computing forget gate. + Default 1.0 + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + forget_bias=1.0, + dtype='float32'): + super(BasicLSTMCell, self).__init__() + + self._hidden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + # TODO(guosheng): find better way to resolve constants in __init__ + self._forget_bias = layers.create_global_var( + shape=[1], dtype=dtype, value=forget_bias, persistable=True) + # TODO(guosheng): refine this if recurrent_op removes gradient require + self._forget_bias.stop_gradient = False + self._dtype = dtype + self._input_size = input_size + + self._weight = self.create_parameter( + attr=self._param_attr, + shape=[ + self._input_size + self._hidden_size, 4 * self._hidden_size + ], + dtype=self._dtype) + + self._bias = self.create_parameter( + attr=self._bias_attr, + shape=[4 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step LSTM calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A list of containing two tensors, each shaped + `[batch_size, hidden_size]`, corresponding to :math:`h_{t-1}, c_{t-1}` + in the formula. The data type should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula; `new_states` is a list containing \ + two tenser variables shaped `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}, c_{t}` in the formula. The data type of these \ + tensors all is same as that of `states`. 
+ """ + pre_hidden, pre_cell = states + concat_input_hidden = layers.concat([inputs, pre_hidden], 1) + gate_input = layers.matmul(x=concat_input_hidden, y=self._weight) + gate_input = layers.elementwise_add(gate_input, self._bias) + i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) + new_cell = layers.elementwise_add( + layers.elementwise_mul( + pre_cell, + self._gate_activation( + layers.elementwise_add(f, self._forget_bias))), + layers.elementwise_mul( + self._gate_activation(i), self._activation(j))) + new_hidden = self._activation(new_cell) * self._gate_activation(o) + + return new_hidden, [new_hidden, new_cell] + + @property + def state_shape(self): + """ + The `state_shape` of BasicLSTMCell is a list with two shapes: `[[hidden_size], [hidden_size]]` + (-1 for batch size would be automatically inserted into shape). These two + shapes correspond to :math:`h_{t-1}` and :math:`c_{t-1}` separately. + """ + return [[self._hidden_size], [self._hidden_size]] + + +class BasicGRUCell(RNNCell): + """ + Gated Recurrent Unit (GRU) RNN cell. + + The formula for GRU used is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = u_t \odot h_{t-1} + (1-u_t) \odot \\tilde{h_t} + + Please refer to `An Empirical Exploration of Recurrent Network Architectures + `_ for more details. + + Parameters: + input_size (int): The input size for the first GRU cell. + hidden_size (int): The hidden size for every GRU cell. + param_attr(ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Default: None. + bias_attr (ParamAttr, optional): The parameter attribute for the bias + of LSTM. Default: None. + gate_activation (function, optional): The activation function for gates + of GRU, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + GRU, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + dtype(string, optional): The data type used in this cell. Default float32. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import BasicGRUCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = BasicGRUCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + param_attr=None, + bias_attr=None, + gate_activation=None, + activation=None, + dtype='float32'): + super(BasicGRUCell, self).__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._param_attr = param_attr + self._bias_attr = bias_attr + self._gate_activation = gate_activation or layers.sigmoid + self._activation = activation or layers.tanh + self._dtype = dtype + + if self._param_attr is not None and self._param_attr.name is not None: + gate_param_attr = copy.deepcopy(self._param_attr) + candidate_param_attr = copy.deepcopy(self._param_attr) + gate_param_attr.name += "_gate" + candidate_param_attr.name += "_candidate" + else: + gate_param_attr = self._param_attr + candidate_param_attr = self._param_attr + + self._gate_weight = self.create_parameter( + attr=gate_param_attr, + shape=[ + self._input_size + self._hidden_size, 2 * self._hidden_size + ], + dtype=self._dtype) + + self._candidate_weight = self.create_parameter( + attr=candidate_param_attr, + shape=[self._input_size + self._hidden_size, self._hidden_size], + dtype=self._dtype) + + if self._bias_attr is not None and self._bias_attr.name is not None: + gate_bias_attr = copy.deepcopy(self._bias_attr) + candidate_bias_attr = copy.deepcopy(self._bias_attr) + gate_bias_attr.name += "_gate" + candidate_bias_attr.name += "_candidate" + else: + gate_bias_attr = self._bias_attr + candidate_bias_attr = self._bias_attr + + self._gate_bias = self.create_parameter( + attr=gate_bias_attr, + shape=[2 * self._hidden_size], + dtype=self._dtype, + is_bias=True) + self._candidate_bias = self.create_parameter( + attr=candidate_bias_attr, + shape=[self._hidden_size], + dtype=self._dtype, + is_bias=True) + + def forward(self, inputs, states): + """ + Performs single step GRU calculations. + + Parameters: + inputs (Variable): A tensor with shape `[batch_size, input_size]`, + corresponding to :math:`x_t` in the formula. The data type + should be float32 or float64. + states (Variable): A tensor with shape `[batch_size, hidden_size]`. + corresponding to :math:`h_{t-1}` in the formula. The data type + should be float32 or float64. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` and \ + `new_states` is the same tensor shaped `[batch_size, hidden_size]`, \ + corresponding to :math:`h_t` in the formula. The data type of the \ + tensor is same as that of `states`. 
+ """ + pre_hidden = states + concat_input_hidden = layers.concat([inputs, pre_hidden], axis=1) + + gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight) + + gate_input = layers.elementwise_add(gate_input, self._gate_bias) + + gate_input = self._gate_activation(gate_input) + r, u = layers.split(gate_input, num_or_sections=2, dim=1) + + r_hidden = r * pre_hidden + + candidate = layers.matmul( + layers.concat([inputs, r_hidden], 1), self._candidate_weight) + candidate = layers.elementwise_add(candidate, self._candidate_bias) + + c = self._activation(candidate) + new_hidden = u * pre_hidden + (1 - u) * c + + return new_hidden, new_hidden + + @property + def state_shape(self): + """ + The `state_shape` of BasicGRUCell is a shape `[hidden_size]` (-1 for batch + size would be automatically inserted into shape). The shape corresponds + to :math:`h_{t-1}`. + """ + return [self._hidden_size] + + +class RNN(Layer): + """ + RNN creates a recurrent neural network specified by RNNCell `cell`, which + performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + cell(RNNCell): An instance of `RNNCell`. + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + self.batch_index, self.time_step_index = (1, 0) if time_major else (0, + 1) + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + """ + Performs :code:`cell.forward()` repeatedly until reaches to the maximum + length of `inputs`. + + Parameters: + inputs (Variable): A (possibly nested structure of) tensor variable[s]. + The shape of tensor should be `[batch_size, sequence_length, ...]` + for `time_major == False` or `[sequence_length, batch_size, ...]` + for `time_major == True`. It represents the inputs to be unrolled + in RNN. + initial_states (Variable, optional): A (possibly nested structure of) + tensor variable[s], representing the initial state for RNN. + If not provided, `cell.get_initial_states` would be used to produce + the initial state. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + **kwargs: Additional keyword arguments. Arguments passed to `cell.forward`. 
+ + Returns: + tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \ + outputs and states, both are Tensor or nested structure of Tensor. \ + `final_outputs` has the same structure and data types as \ + the returned `outputs` of :code:`cell.forward` , and each Tenser in `final_outputs` \ + stacks all time steps' counterpart in `outputs` thus has shape `[batch_size, sequence_length, ...]` \ + for `time_major == False` or `[sequence_length, batch_size, ...]` for `time_major == True`. \ + `final_states` is the counterpart at last time step of initial states, \ + thus has the same structure with it and has tensors with same shapes \ + and data types. + """ + if fluid.in_dygraph_mode(): + + class ArrayWrapper(object): + def __init__(self, x): + self.array = [x] + + def append(self, x): + self.array.append(x) + return self + + def _maybe_copy(state, new_state, step_mask): + # TODO: use where_op + new_state = fluid.layers.elementwise_mul( + new_state, step_mask, + axis=0) - fluid.layers.elementwise_mul( + state, (step_mask - 1), axis=0) + return new_state + + flat_inputs = flatten(inputs) + batch_size, time_steps = ( + flat_inputs[0].shape[self.batch_index], + flat_inputs[0].shape[self.time_step_index]) + + if initial_states is None: + initial_states = self.cell.get_initial_states( + batch_ref=inputs, batch_dim_idx=self.batch_index) + + if not self.time_major: + inputs = map_structure( + lambda x: fluid.layers.transpose(x, [1, 0] + list( + range(2, len(x.shape)))), inputs) + + if sequence_length is not None: + mask = fluid.layers.sequence_mask( + sequence_length, + maxlen=time_steps, + dtype=flatten(initial_states)[0].dtype) + mask = fluid.layers.transpose(mask, [1, 0]) + + if self.is_reverse: + inputs = map_structure( + lambda x: fluid.layers.reverse(x, axis=[0]), inputs) + mask = fluid.layers.reverse( + mask, axis=[0]) if sequence_length is not None else None + + states = initial_states + outputs = [] + for i in range(time_steps): + step_inputs = map_structure(lambda x: x[i], inputs) + step_outputs, new_states = self.cell(step_inputs, states, + **kwargs) + if sequence_length is not None: + new_states = map_structure( + partial( + _maybe_copy, step_mask=mask[i]), + states, + new_states) + states = new_states + outputs = map_structure( + lambda x: ArrayWrapper(x), + step_outputs) if i == 0 else map_structure( + lambda x, x_array: x_array.append(x), step_outputs, + outputs) + + final_outputs = map_structure( + lambda x: fluid.layers.stack(x.array, axis=self.time_step_index + ), outputs) + + if self.is_reverse: + final_outputs = map_structure( + lambda x: fluid.layers.reverse(x, axis=self.time_step_index + ), final_outputs) + + final_states = new_states + else: + final_outputs, final_states = fluid.layers.rnn( + self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse, + **kwargs) + return final_outputs, final_states + + +class StackedRNNCell(RNNCell): + """ + Wrapper allowing a stack of RNN cells to behave as a single cell. It is used + to implement stacked RNNs. + + Parameters: + cells (list|tuple): List of RNN cell instances. + + Examples: + + .. 
code-block:: python + + from paddle.incubate.hapi.text import BasicLSTMCell, StackedRNNCell + + cells = [BasicLSTMCell(32, 32), BasicLSTMCell(32, 32)] + stack_rnn = StackedRNNCell(cells) + """ + + def __init__(self, cells): + super(StackedRNNCell, self).__init__() + self.cells = [] + for i, cell in enumerate(cells): + self.cells.append(self.add_sublayer("cell_%d" % i, cell)) + + def forward(self, inputs, states, **kwargs): + """ + Performs :code:`cell.forward` for all including cells sequentially. + Each cell's `inputs` is the `outputs` of the previous cell. And each + cell's `states` is the corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. Mostly it is a + float32 or float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. + **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ). `outputs` is the \ + `outputs` of the last cell. `new_states` is a list composed \ + of all cells' `new_states`, and its structure and data type is \ + same as that of `states` argument. + """ + new_states = [] + for cell, state in zip(self.cells, states): + outputs, new_state = cell(inputs, state, **kwargs) + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @staticmethod + def stack_param_attr(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. + n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % + n) + param_attrs = [ + fluid.ParamAttr._to_attr(attr) for attr in param_attr + ] + else: + param_attrs = [] + attr = fluid.ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + @property + def state_shape(self): + """ + The `state_shape` of StackedRNNCell is a list composed of each including + cell's `state_shape`. + + Returns: + list: A list composed of each including cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class StackedLSTMCell(RNNCell): + """ + Wrapper allowing a stack of LSTM cells to behave as a single cell. It is used + to implement stacked LSTM. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input size for the first LSTM cell. + hidden_size (int): The hidden size for every LSTM cell. 
+ gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. + bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import StackedLSTMCell, RNN + + inputs = paddle.rand((2, 4, 32)) + cell = StackedLSTMCell(input_size=32, hidden_size=64) + rnn = RNN(cell=cell) + outputs, _ = rnn(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + param_attr=None, + bias_attr=None, + dtype="float32"): + super(StackedLSTMCell, self).__init__() + self.dropout = utils.convert_to_list(dropout, num_layers, "dropout", + float) + param_attrs = StackedRNNCell.stack_param_attr(param_attr, num_layers) + bias_attrs = StackedRNNCell.stack_param_attr(bias_attr, num_layers) + + self.cells = [] + for i in range(num_layers): + if forget_bias is True: + bias_attrs[ + i].initializer = fluid.initializer.NumpyArrayInitializer( + np.concatenate( + np.zeros(2 * hidden_size), + np.ones(hidden_size), np.zeros(hidden_size)) + .astype(dtype)) + forget_bias = 0.0 + self.cells.append( + self.add_sublayer( + "lstm_%d" % i, + BasicLSTMCell( + input_size=input_size if i == 0 else hidden_size, + hidden_size=hidden_size, + gate_activation=gate_activation, + activation=activation, + forget_bias=forget_bias, + param_attr=param_attrs[i], + bias_attr=bias_attrs[i], + dtype=dtype))) + + def forward(self, inputs, states): + """ + Performs the stacked LSTM cells sequentially. Each cell's `inputs` is + the `outputs` of the previous cell. And each cell's `states` is the + corresponding one in `states`. + + Parameters: + inputs (Variable): The inputs for the first cell. It is a float32 or + float64 tensor with shape `[batch_size, input_size]`. + states (list): A list containing states for all cells orderly. 
+ **kwargs: Additional keyword arguments, which passed to `cell.forward` + for all including cells. + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` is \ + a tensor with shape `[batch_size, hidden_size]`, corresponding \ + to :math:`h_{t}` in the formula of the last LSTM; `new_states` \ + is a list composed of every LSTM `new_states` which is a pair \ + of tensors standing for :math:`h_{t}, c_{t}` in the formula, \ + and the data type and structure of these tensors all is same \ + as that of `states`. + """ + new_states = [] + for i, cell in enumerate(self.cells): + outputs, new_state = cell(inputs, states[i]) + outputs = layers.dropout( + outputs, + self.dropout[i], + dropout_implementation='upscale_in_train') if self.dropout[ + i] > 0 else outputs + inputs = outputs + new_states.append(new_state) + return outputs, new_states + + @property + def state_shape(self): + """ + The `state_shape` of StackedLSTMCell is a list composed of each including + LSTM cell's `state_shape`. + + Returns: + list: A list composed of each including LSTM cell's `state_shape`. + """ + return [cell.state_shape for cell in self.cells] + + +class LSTM(Layer): + """ + Applies a stacked multi-layer long short-term memory (LSTM) RNN to an input + sequence. + + The formula for LSTM used here is as follows: + + .. math:: + + i_{t} & = act_g(W_{x_{i}}x_{t} + W_{h_{i}}h_{t-1} + b_{i}) + + f_{t} & = act_g(W_{x_{f}}x_{t} + W_{h_{f}}h_{t-1} + b_{f} + forget\\_bias) + + c_{t} & = f_{t}c_{t-1} + i_{t} act_c (W_{x_{c}}x_{t} + W_{h_{c}}h_{t-1} + b_{c}) + + o_{t} & = act_g(W_{x_{o}}x_{t} + W_{h_{o}}h_{t-1} + b_{o}) + + h_{t} & = o_{t} act_c (c_{t}) + + + Parameters: + input_size (int): The input feature size for the first LSTM. + hidden_size (int): The hidden size for every LSTM. + gate_activation (function, optional): The activation function for gates + of LSTM, that is :math:`act_g` in the formula. Default: None, + representing for `fluid.layers.sigmoid`. + activation (function, optional): The non-gate activation function of + LSTM, that is :math:`act_c` in the formula. Default: None, + representing for 'fluid.layers.tanh'. + forget_bias (float, optional): forget bias used when computing forget + gate. It also can accept a boolean value `True`, which would set + :math:`forget\\_bias` as 0 but initialize :math:`b_{f}` as 1 and + :math:`b_{i}, b_{f}, b_{c}, b_{0}` as 0. This is recommended in + http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf . + Default 1.0. + num_layers(int, optional): The number of LSTM to be stacked. Default 1. + dropout(float|list|tuple, optional): The dropout probability after each + LSTM. It also can be a list or tuple, including dropout probabilities + for the corresponding LSTM. Default 0.0 + is_reverse (bool, optional): Indicate whether to calculate in the reverse + order of input sequences. Default: `False`. + time_major (bool, optional): Indicate the data layout of Tensor included + in `input` and `output` tensors. If `False`, the data layout would + be batch major with shape `[batch_size, sequence_length, ...]`. If + `True`, the data layout would be time major with shape + `[sequence_length, batch_size, ...]`. Default: `False`. + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(param_attr, num_layers)`. + Default None. 
+ bias_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. If it is + a list or tuple, it's length must equal to `num_layers`. Otherwise, + construct a list by `StackedRNNCell.stack_param_attr(bias_attr, num_layers)`. + Default None. + dtype(string, optional): The data type used in this cell. It can be + float32 or float64. Default float32. + + Examples: + + .. code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import LSTM + + inputs = paddle.rand((2, 4, 32)) + lstm = LSTM(input_size=32, hidden_size=64, num_layers=2) + outputs, _ = lstm(inputs) # [2, 4, 64] + """ + + def __init__(self, + input_size, + hidden_size, + gate_activation=None, + activation=None, + forget_bias=1.0, + num_layers=1, + dropout=0.0, + is_reverse=False, + time_major=False, + param_attr=None, + bias_attr=None, + dtype='float32'): + super(LSTM, self).__init__() + lstm_cell = StackedLSTMCell(input_size, hidden_size, gate_activation, + activation, forget_bias, num_layers, + dropout, param_attr, bias_attr, dtype) + self.lstm = RNN(lstm_cell, is_reverse, time_major) + + def forward(self, inputs, initial_states=None, sequence_length=None): + """ + Performs the stacked multi-layer LSTM layer by layer. Each LSTM's `outputs` + is the `inputs` of the subsequent one. + + Parameters: + inputs (Variable): The inputs for the first LSTM. It is a float32 + or float64 tensor shaped `[batch_size, sequence_length, input_size]`. + initial_states (list|None, optional): A list containing initial states + of all stacked LSTM, and the initial states of each LSTM is a pair + of tensors shaped `[batch_size, hidden_size]`. If not provided, + use 0 as initial states. Default None. + sequence_length (Variable, optional): A tensor with shape `[batch_size]`. + It stores real length of each instance, thus enables users to extract + the last valid state when past a batch element's sequence length for + correctness. If not provided, the paddings would be treated same as + non-padding inputs. Default None. + + Returns: + tuple: A tuple( :code:`(outputs, final_states)` ), where `outputs` \ + is the output of last LSTM and it is a tensor with shape \ + `[batch_size, sequence_length, hidden_size]` and has the same \ + data type as `inputs`, `final_states` is the counterpart of \ + `initial_states` at last time step, thus has the same structure \ + with it and has tensors with same shapes data types. 
+ """ + return self.lstm(inputs, initial_states, sequence_length) diff --git a/example/distill/nlp/train.py b/example/distill/nlp/train.py index 5f9a85cc..d99c1d91 100644 --- a/example/distill/nlp/train.py +++ b/example/distill/nlp/train.py @@ -29,15 +29,21 @@ import os import sys -from model import CNN, AdamW, evaluate_student, BOW +from model import CNN, AdamW, evaluate_student, BOW, model_factory g_max_dev_acc = [] g_max_test_acc = [] +parser = argparse.ArgumentParser(__doc__) +parser.add_argument( + "--model", type=str, default="BOW", help="student model name") +args = parser.parse_args() +print("parsed args:", args) + def train_without_distill(train_reader, dev_reader, test_reader, word_dict, epoch_num, lr): - model = BOW(word_dict) + model = model_factory(args.model, word_dict) opt = AdamW( learning_rate=lr, parameter_list=model.parameters(), weight_decay=0.01) model.train() From c058d3660b2aa7748653def3cb09221e17268f21 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sun, 28 Jun 2020 13:25:11 +0000 Subject: [PATCH 7/8] add --- example/distill/nlp/nets.py | 287 ++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 example/distill/nlp/nets.py diff --git a/example/distill/nlp/nets.py b/example/distill/nlp/nets.py new file mode 100644 index 00000000..717f6635 --- /dev/null +++ b/example/distill/nlp/nets.py @@ -0,0 +1,287 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding +from paddle.fluid.dygraph import GRUUnit +from paddle.fluid.dygraph.base import to_variable +import numpy as np + + +class DynamicGRU(fluid.dygraph.Layer): + def __init__(self, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None, + origin_mode=False, + init_size=None): + super(DynamicGRU, self).__init__() + self.gru_unit = GRUUnit( + size * 3, + param_attr=param_attr, + bias_attr=bias_attr, + activation=candidate_activation, + gate_activation=gate_activation, + origin_mode=origin_mode) + self.size = size + self.h_0 = h_0 + self.is_reverse = is_reverse + + def forward(self, inputs): + hidden = self.h_0 + res = [] + for i in range(inputs.shape[1]): + if self.is_reverse: + i = inputs.shape[1] - 1 - i + input_ = inputs[:, i:i + 1, :] + input_ = fluid.layers.reshape( + input_, [-1, input_.shape[2]], inplace=False) + hidden, reset, gate = self.gru_unit(input_, hidden) + hidden_ = fluid.layers.reshape( + hidden, [-1, 1, hidden.shape[1]], inplace=False) + res.append(hidden_) + if self.is_reverse: + res = res[::-1] + res = fluid.layers.concat(res, axis=1) + return res + + +class SimpleConvPool(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + use_cudnn=False, + batch_size=None): + super(SimpleConvPool, self).__init__() + self.batch_size = batch_size + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + padding=[1, 1], + use_cudnn=use_cudnn, + act='tanh') + + def forward(self, inputs): + x = self._conv2d(inputs) + x = fluid.layers.reduce_max(x, dim=-1) + x = fluid.layers.reshape(x, shape=[self.batch_size, -1]) + return x + + +class CNN(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(CNN, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.channels = 1 + self.win_size = [3, self.hid_dim] + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + is_sparse=False) + self._simple_conv_pool_1 = SimpleConvPool( + self.channels, + self.hid_dim, + self.win_size, + batch_size=self.batch_size) + self._fc1 = Linear( + input_dim=self.hid_dim * self.seq_len, + output_dim=self.fc_hid_dim, + act="softmax") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = ( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[-1, self.channels, self.seq_len, self.hid_dim]) + conv_3 = self._simple_conv_pool_1(emb) + fc_1 = self._fc1(conv_3) + prediction = self._fc_prediction(fc_1) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class BOW(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(BOW, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + 
self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + is_sparse=False) + self._fc1 = Linear( + input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh") + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = ( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim]) + bow_1 = fluid.layers.reduce_sum(emb, dim=1) + bow_1 = fluid.layers.tanh(bow_1) + fc_1 = self._fc1(bow_1) + fc_2 = self._fc2(fc_1) + prediction = self._fc_prediction(fc_2) + if label is not None: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class GRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(GRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh") + self._fc_prediction = Linear( + input_dim=self.fc_hid_dim, + output_dim=self.class_dim, + act="softmax") + self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) + + def forward(self, inputs, label=None): + emb = self.embedding(inputs) + o_np_mask = to_variable( + inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') + mask_emb = fluid.layers.expand( + to_variable(o_np_mask), [1, self.hid_dim]) + emb = emb * mask_emb + emb = fluid.layers.reshape( + emb, shape=[self.batch_size, -1, self.hid_dim]) + fc_1 = self._fc1(emb) + gru_hidden = self._gru(fc_1) + gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) + tanh_1 = fluid.layers.tanh(gru_hidden) + fc_2 = self._fc2(tanh_1) + prediction = self._fc_prediction(fc_2) + if label: + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction, acc + else: + return prediction + + +class BiGRU(fluid.dygraph.Layer): + def __init__(self, dict_dim, batch_size, seq_len): + super(BiGRU, self).__init__() + self.dict_dim = dict_dim + self.emb_dim = 128 + self.hid_dim = 128 + self.fc_hid_dim = 96 + self.class_dim = 2 + self.batch_size = batch_size + self.seq_len = seq_len + self.embedding = Embedding( + size=[self.dict_dim + 1, self.emb_dim], + dtype='float32', + param_attr=fluid.ParamAttr(learning_rate=30), + is_sparse=False) + h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") + h_0 = to_variable(h_0) + self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3) + self._fc2 = Linear( + input_dim=self.hid_dim * 2, 
output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim,
+            output_dim=self.class_dim,
+            act="softmax")
+        self._gru_forward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=False)
+        self._gru_backward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=True)
+
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = to_variable(
+            inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32')
+        mask_emb = fluid.layers.expand(
+            to_variable(o_np_mask), [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
+        fc_1 = self._fc1(emb)
+        gru_forward = self._gru_forward(fc_1)
+        gru_backward = self._gru_backward(fc_1)
+        gru_forward_tanh = fluid.layers.tanh(gru_forward)
+        gru_backward_tanh = fluid.layers.tanh(gru_backward)
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward_tanh, gru_backward_tanh], axis=2)
+        encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1)
+        fc_2 = self._fc2(encoded_vector)
+        prediction = self._fc_prediction(fc_2)
+        if label:
+            cost = fluid.layers.cross_entropy(input=prediction, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc = fluid.layers.accuracy(input=prediction, label=label)
+            return avg_cost, prediction, acc
+        else:
+            return prediction

From 2d5ac20e522c1ba6c0caf62cb414a9229bb99f99 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Mon, 29 Jun 2020 06:04:21 +0000
Subject: [PATCH 8/8] fix some test=develop

---
 example/distill/nlp/distill.py | 48 ++++++++++++++++++--------------
 example/distill/nlp/lstm.py    | 18 ++++++++----
 example/distill/nlp/model.py   | 12 ++++++--
 example/distill/nlp/nets.py    | 41 ++++++++++++++++------------
 example/distill/nlp/train.py   | 50 ++++++++++++++++++++--------------
 5 files changed, 103 insertions(+), 66 deletions(-)

diff --git a/example/distill/nlp/distill.py b/example/distill/nlp/distill.py
index 3b684241..d2872aca 100644
--- a/example/distill/nlp/distill.py
+++ b/example/distill/nlp/distill.py
@@ -53,7 +53,7 @@ parser.add_argument(
 parser.add_argument(
     "--use_data_au", type=int, default=1, help="use data augmentation")
 parser.add_argument(
-    "--T", type=float, default=2.0, help="weight of student in loss")
+    "--T", type=float, default=None, help="temperature of the distillation loss")
 parser.add_argument(
     "--model", type=str, default="BOW", help="student model name")
 args = parser.parse_args()
@@ -100,30 +100,34 @@ def train_with_distill(train_reader, dev_reader, word_dict, test_reader,
             ) * loss_kd
         else:
             loss_kd = KL_T(logits_s, logits_t, args.T)
-        loss = args.T * args.T * (args.s_weight * loss_ce +
-                                  (1.0 - args.s_weight) * loss_kd)
+        loss = args.T * args.T * (loss_ce + loss_kd)
+        #loss_kd = KL(logits_s, logits_t)
+        #loss = loss_ce + loss_kd
 
         loss = L.reduce_mean(loss)
         loss.backward()
-        if step % 10 == 0:
+        if step % 100 == 0:
+            print("student logits:", logits_s)
+            print("teacher logits:", logits_t)
             print('[step %03d] distill train loss %.5f lr %.3e' %
                   (step, loss.numpy(), opt.current_step_lr()))
         opt.minimize(loss)
         model.clear_gradients()
     f1, acc = evaluate_student(model, dev_reader)
-    print('student on dev f1 %.5f acc %.5f' % (f1, acc))
+    print('student on dev f1 %.5f acc %.5f epoch_no %d' % (f1, acc, epoch))
 
     if max_dev_acc < acc:
         max_dev_acc = acc
 
     f1, acc = evaluate_student(model, test_reader)
-    print('student on test f1 %.5f acc %.5f' % (f1, acc))
+    print('student on test f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_test_acc < acc:
         max_test_acc = acc
 
-    
g_max_dev_acc.append(g_max_dev_acc) - g_max_test_acc.append(g_max_test_acc) + g_max_dev_acc.append(max_dev_acc) + g_max_test_acc.append(max_test_acc) def ernie_reader(s_reader, key_list): @@ -154,10 +158,7 @@ def reader(): return reader -if __name__ == "__main__": - place = F.CUDAPlace(0) - D.guard(place).__enter__() - +def train(): ds = ChnSentiCorp() word_dict = ds.student_word_dict("./data/vocab.bow.txt") batch_size = 16 @@ -194,14 +195,21 @@ def reader(): input_files, word_dict, batch_size=batch_size) dr_t = dr.set_batch_generator(ernie_reader(dr_train_reader, feed_keys)) + train_with_distill( + dr_t, dev_reader, word_dict, test_reader, epoch_num=args.epoch_num) + + +if __name__ == "__main__": + place = F.CUDAPlace(0) + D.guard(place).__enter__() + for i in range(args.train_range): - train_with_distill( - dr_t, dev_reader, word_dict, test_reader, epoch_num=args.epoch_num) + train() - arr = np.array(g_max_dev_acc) - print("max_dev_acc:", arr, "average:", np.average(arr), "train_args:", - args) + arr = np.array(g_max_dev_acc) + print("max_dev_acc:", arr, "average:", np.average(arr), "train_args:", + args) - arr = np.array(g_max_test_acc) - print("max_test_acc:", arr, "average:", np.average(arr), "train_args:", - args) + arr = np.array(g_max_test_acc) + print("max_test_acc:", arr, "average:", np.average(arr), "train_args:", + args) diff --git a/example/distill/nlp/lstm.py b/example/distill/nlp/lstm.py index 37704832..bae8b253 100644 --- a/example/distill/nlp/lstm.py +++ b/example/distill/nlp/lstm.py @@ -30,21 +30,27 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from text_basic import LSTM +from text_basic import LSTM as basic_lstm -class LSTM(D.layer): +class LSTM(D.Layer): def __init__(self, word_dict): super().__init__() - self.emb = D.Embedding(len(word_dict), 300) - self.lstm = LSTM(input_size=300, hidden_size=150) + self.emb = D.Embedding([len(word_dict), 300]) + self.lstm = basic_lstm(input_size=300, hidden_size=150) self.fc = D.Linear(150, 2) def forward(self, ids, labels=None): embbed = self.emb(ids) - lstm_out, self.hidden = self.lstm(embbed) - logits = self.fc(lstm_out[-1]) + #print("embed shape:", embbed.shape) + + lstm_out, hidden = self.lstm(embbed) + #print("lstm_out shape:", lstm_out.shape) + #print("hiden list len:", len(hidden)) + + logits = self.fc(lstm_out[:, -1]) + #print("logits shape:", logits.shape) if labels is not None: if len(labels.shape) == 1: diff --git a/example/distill/nlp/model.py b/example/distill/nlp/model.py index f2f0dab4..c288f65b 100644 --- a/example/distill/nlp/model.py +++ b/example/distill/nlp/model.py @@ -30,7 +30,8 @@ import sys from paddle_serving_client import Client from paddle_serving_app.reader import ChineseBertReader -from lstm import GRU +from lstm import LSTM +from nets import GRU class AdamW(F.optimizer.AdamOptimizer): @@ -67,12 +68,15 @@ def KL_T(logits_s, logits_t, T=2.0): return loss -def evaluate_student(model, test_reader): +def evaluate_student(model, test_reader, batch_size=None): all_pred, all_label = [], [] with D.base._switch_tracer_mode_guard_(is_train=False): model.eval() for step, (ids_student, labels, _) in enumerate(test_reader()): - _, logits = model(ids_student) + if batch_size is not None: + _, logits = model(ids_student, batch_size=batch_size) + else: + _, logits = model(ids_student) pred = L.argmax(logits, -1) all_pred.extend(pred.numpy()) all_label.extend(labels.numpy()) @@ -152,6 +156,8 @@ def model_factory(model_name, word_dict): elif model_name == 
"CNN": return CNN(word_dict) elif model_name == "LSTM": + return LSTM(word_dict) + elif model_name == "GRU": return GRU(word_dict) else: assert False, "not supported model name:{}".format(model_name) diff --git a/example/distill/nlp/nets.py b/example/distill/nlp/nets.py index 717f6635..dc66ee96 100644 --- a/example/distill/nlp/nets.py +++ b/example/distill/nlp/nets.py @@ -15,6 +15,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, Embedding from paddle.fluid.dygraph import GRUUnit from paddle.fluid.dygraph.base import to_variable +import paddle.fluid.layers as L import numpy as np @@ -181,9 +182,9 @@ def forward(self, inputs, label=None): class GRU(fluid.dygraph.Layer): - def __init__(self, dict_dim, batch_size, seq_len): + def __init__(self, word_dict, batch_size=16, seq_len=256): super(GRU, self).__init__() - self.dict_dim = dict_dim + self.dict_dim = len(word_dict) self.emb_dim = 128 self.hid_dim = 128 self.fc_hid_dim = 96 @@ -206,13 +207,17 @@ def __init__(self, dict_dim, batch_size, seq_len): act="softmax") self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0) - def forward(self, inputs, label=None): + def forward(self, inputs, labels=None): emb = self.embedding(inputs) + """ o_np_mask = to_variable( inputs.numpy().reshape(-1, 1) != self.dict_dim).astype('float32') mask_emb = fluid.layers.expand( to_variable(o_np_mask), [1, self.hid_dim]) emb = emb * mask_emb + """ + pad_mask = L.unsqueeze(L.cast(inputs != 0, 'float32'), [-1]) + emb = emb * pad_mask emb = fluid.layers.reshape( emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) @@ -221,13 +226,16 @@ def forward(self, inputs, label=None): tanh_1 = fluid.layers.tanh(gru_hidden) fc_2 = self._fc2(tanh_1) prediction = self._fc_prediction(fc_2) - if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction, acc + if labels is not None: + cost = fluid.layers.cross_entropy(input=prediction, label=labels) + #avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return cost, prediction else: - return prediction + return None, prediction + + def lr(self, steps_per_epoch=None): + return 1e-3 class BiGRU(fluid.dygraph.Layer): @@ -251,9 +259,7 @@ def __init__(self, dict_dim, batch_size, seq_len): self._fc2 = Linear( input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh") self._fc_prediction = Linear( - input_dim=self.fc_hid_dim, - output_dim=self.class_dim, - act="softmax") + input_dim=self.fc_hid_dim, output_dim=self.class_dim) self._gru_forward = DynamicGRU( size=self.hid_dim, h_0=h_0, is_reverse=False) self._gru_backward = DynamicGRU( @@ -279,9 +285,10 @@ def forward(self, inputs, label=None): fc_2 = self._fc2(encoded_vector) prediction = self._fc_prediction(fc_2) if label: - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - return avg_cost, prediction, acc + cost = fluid.layers.softmax_cross_entropy( + input=prediction, label=label) + #avg_cost = fluid.layers.mean(x=cost) + #acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, prediction else: - return prediction + return None, prediction diff --git a/example/distill/nlp/train.py b/example/distill/nlp/train.py index d99c1d91..4326f9b6 100644 --- a/example/distill/nlp/train.py +++ b/example/distill/nlp/train.py @@ -37,15 
+37,20 @@
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument(
     "--model", type=str, default="BOW", help="student model name")
+parser.add_argument(
+    "--epoch_num", type=int, default=10, help="number of training epochs")
+parser.add_argument("--train_range", type=int, default=10, help="number of training runs")
 args = parser.parse_args()
 print("parsed args:", args)
 
 
 def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
-                          epoch_num, lr):
+                          epoch_num):
     model = model_factory(args.model, word_dict)
     opt = AdamW(
-        learning_rate=lr, parameter_list=model.parameters(), weight_decay=0.01)
+        learning_rate=model.lr(),
+        parameter_list=model.parameters(),
+        weight_decay=0.01)
     model.train()
 
     max_dev_acc = 0.0
@@ -62,13 +67,15 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
         opt.minimize(loss)
         model.clear_gradients()
     f1, acc = evaluate_student(model, dev_reader)
-    print('train_without_distill on dev f1 %.5f acc %.5f' % (f1, acc))
+    print('train_without_distill on dev f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_dev_acc < acc:
         max_dev_acc = acc
 
     f1, acc = evaluate_student(model, test_reader)
-    print('train_without_distill on test f1 %.5f acc %.5f' % (f1, acc))
+    print('train_without_distill on test f1 %.5f acc %.5f epoch_no %d' %
+          (f1, acc, epoch))
 
     if max_test_acc < acc:
         max_test_acc = acc
@@ -77,10 +84,7 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
     g_max_test_acc.append(max_test_acc)
 
 
-if __name__ == "__main__":
-    place = F.CUDAPlace(0)
-    D.guard(place).__enter__()
-
+def train():
     ds = ChnSentiCorp()
     word_dict = ds.student_word_dict("./data/vocab.bow.txt")
     batch_size = 16
@@ -93,17 +97,23 @@ def train_without_distill(train_reader, dev_reader, test_reader, word_dict,
     test_reader = ds.pad_batch_reader(
         "./data/test.part.0", word_dict, batch_size=batch_size)
 
-    for i in range(10):
-        train_without_distill(
-            train_reader,
-            dev_reader,
-            test_reader,
-            word_dict,
-            epoch_num=10,
-            lr=1e-4)
+    train_without_distill(
+        train_reader,
+        dev_reader,
+        test_reader,
+        word_dict,
+        epoch_num=args.epoch_num)
+
+
+if __name__ == "__main__":
+    place = F.CUDAPlace(0)
+    D.guard(place).__enter__()
+
+    for i in range(args.train_range):
+        train()
 
-    arr = np.array(g_max_dev_acc)
-    print("max_dev_acc:", arr, "average:", np.average(arr))
+    arr = np.array(g_max_dev_acc)
+    print("max_dev_acc:", arr, "average:", np.average(arr))
 
-    arr = np.array(g_max_test_acc)
-    print("max_test_acc:", arr, "average:", np.average(arr))
+    arr = np.array(g_max_test_acc)
+    print("max_test_acc:", arr, "average:", np.average(arr))
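
Note on the distillation loss that PATCH 8/8 switches to: distill.py now computes loss_kd = KL_T(logits_s, logits_t, args.T) and scales the combined loss by args.T * args.T before reduce_mean. As a minimal illustrative sketch only (it assumes a standard temperature-scaled KL formulation; the helper name kl_with_temperature and its internals below are not taken from the repo's KL_T in model.py), such a soft-target term is typically built like this:

    # Sketch of a temperature-scaled KL distillation term (assumed formulation,
    # not the repo's KL_T implementation).
    import paddle.fluid.layers as L

    def kl_with_temperature(logits_s, logits_t, T):
        # Soften teacher and student distributions with temperature T.
        p_t = L.softmax(logits_t / T)
        log_p_s = L.log(L.softmax(logits_s / T) + 1e-8)
        log_p_t = L.log(p_t + 1e-8)
        # Per-sample KL(teacher || student), summed over classes.
        return L.reduce_sum(p_t * (log_p_t - log_p_s), dim=-1)

In the classic formulation the outer T * T factor compensates for the 1/T^2 scaling that the temperature puts on the soft-target gradients; here the patch applies that factor to the whole (loss_ce + loss_kd) sum.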