【Hackathon 5th No.116】LSTM/RNNBase usability improvements #56460

Closed
wants to merge 15 commits into from
71 changes: 53 additions & 18 deletions python/paddle/nn/layer/rnn.py
@@ -554,7 +554,12 @@ class RNNCellBase(Layer):
"""

def get_initial_states(
self, batch_ref, shape=None, dtype=None, init_value=0.0, batch_dim_idx=0
self,
batch_ref,
shape=None,
dtype=None,
init_value=0.0,
batch_dim_idx=0,
):
r"""
Generate initialized states according to provided shape, data type and
@@ -865,6 +870,8 @@ class LSTMCell(RNNCellBase):
Parameters:
input_size (int): The input size.
hidden_size (int): The hidden size.
proj_size (int, optional): If specified, the output hidden state :math:`h_{t}`
will be projected to size `proj_size`. Default: None.
weight_ih_attr(ParamAttr, optional): The parameter attribute for
`weight_ih`. Default: None.
weight_hh_attr(ParamAttr, optional): The parameter attribute for
@@ -879,6 +886,7 @@ class LSTMCell(RNNCellBase):
Variables:
- **weight_ih** (Parameter): shape (4 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
- **weight_hh** (Parameter): shape (4 * hidden_size, hidden_size), or (4 * hidden_size, proj_size) when `proj_size` is specified; hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
- **weight_ho** (Parameter, optional): shape (proj_size, hidden_size), the projection weight applied to the hidden state; only created when `proj_size` is specified.
- **bias_ih** (Parameter): shape (4 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
- **bias_hh** (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.

@@ -888,7 +896,8 @@ class LSTMCell(RNNCellBase):

Returns:
- **outputs** (Tensor): shape `[batch_size, hidden_size]` (or `[batch_size, proj_size]` when `proj_size` is specified), the output, corresponding to :math:`h_{t}` in the formula.
- **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
- **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
If `proj_size` is specified, the shape of the first element (:math:`h_{t}`) will be `[batch_size, proj_size]`.
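When `proj_size` is specified, the emitted hidden state is additionally projected before being returned. With the default :math:`\tanh` activation this amounts to :math:`h_{t} = W_{ho}\,(o_{t} \odot \tanh(c_{t}))`, where :math:`W_{ho}` is the `weight_ho` parameter of shape `(proj_size, hidden_size)`.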

Notes:
All the weights and bias are initialized with `Uniform(-std, std)` by
@@ -921,6 +930,7 @@ def __init__(
self,
input_size,
hidden_size,
proj_size=None,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
@@ -941,7 +951,7 @@ def __init__(
default_initializer=I.Uniform(-std, std),
)
self.weight_hh = self.create_parameter(
(4 * hidden_size, hidden_size),
(4 * hidden_size, proj_size or hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std),
)
@@ -957,6 +967,13 @@ def __init__(
is_bias=True,
default_initializer=I.Uniform(-std, std),
)
self.proj_size = proj_size
if proj_size:
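# weight_ho projects the emitted hidden state from hidden_size to proj_size;
# it has shape (proj_size, hidden_size) and is applied with transpose_y=True in forward().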
self.weight_ho = self.create_parameter(
(proj_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std),
)

self.hidden_size = hidden_size
self.input_size = input_size
@@ -966,6 +983,7 @@ def __init__(
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)

pre_hidden, pre_cell = states
gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
if self.bias_ih is not None:
@@ -981,6 +999,8 @@ def forward(self, inputs, states=None):
o = self._gate_activation(chunked_gates[3])
c = f * pre_cell + i * self._activation(chunked_gates[2])
h = o * self._activation(c)
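# When proj_size is set, project h_t from [batch_size, hidden_size] to [batch_size, proj_size] before emitting it.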
if self.proj_size:
h = paddle.matmul(h, self.weight_ho, transpose_y=True)

return h, (h, c)
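A minimal usage sketch of the projected cell (sizes are illustrative and assume this branch's `proj_size` support):

```python
import paddle

# LSTMCell with a projected hidden state: c_t keeps hidden_size,
# while the emitted h_t (and hence the cell output) has proj_size
# as its last dimension.
cell = paddle.nn.LSTMCell(input_size=8, hidden_size=32, proj_size=16)
x = paddle.randn([4, 8])      # [batch_size, input_size]
y, (h, c) = cell(x)           # default (zero) initial states
print(y.shape)                # [4, 16] -> batch_size x proj_size
print(h.shape)                # [4, 16]
print(c.shape)                # [4, 32] -> batch_size x hidden_size
```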

@@ -992,7 +1012,7 @@ def state_shape(self):
automatically inserted into shape). These two shapes correspond
to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
"""
return ((self.hidden_size,), (self.hidden_size,))
return ((self.proj_size or self.hidden_size,), (self.hidden_size,))

def extra_repr(self):
return '{input_size}, {hidden_size}'.format(**self.__dict__)
@@ -1329,6 +1349,7 @@ def __init__(
hidden_size,
num_layers=1,
direction="forward",
proj_size=None,
time_major=False,
dropout=0.0,
weight_ih_attr=None,
@@ -1354,28 +1375,37 @@ def __init__(
"bias_hh_attr": bias_hh_attr,
}

self.proj_size = proj_size
if proj_size:
assert mode == 'LSTM'

if mode == "LSTM":
rnn_cls = LSTMCell
kwargs["proj_size"] = proj_size
elif mode == "GRU":
rnn_cls = GRUCell
else:
elif mode == "RNN_RELU":
rnn_cls = SimpleRNNCell
kwargs["activation"] = self.activation
kwargs["activation"] = 'relu'
elif mode == "RNN_TANH":
rnn_cls = SimpleRNNCell
kwargs["activation"] = 'tanh'

in_size = proj_size or hidden_size
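# Stacked layers consume the previous layer's emitted hidden state, so their per-direction
# input size is in_size (proj_size when projection is enabled), doubled for bidirectional stacks.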
if direction in ["forward"]:
is_reverse = False
cell = rnn_cls(input_size, hidden_size, **kwargs)
self.append(RNN(cell, is_reverse, time_major))
for i in range(1, num_layers):
cell = rnn_cls(hidden_size, hidden_size, **kwargs)
cell = rnn_cls(in_size, hidden_size, **kwargs)
self.append(RNN(cell, is_reverse, time_major))
elif direction in bidirectional_list:
cell_fw = rnn_cls(input_size, hidden_size, **kwargs)
cell_bw = rnn_cls(input_size, hidden_size, **kwargs)
self.append(BiRNN(cell_fw, cell_bw, time_major))
for i in range(1, num_layers):
cell_fw = rnn_cls(2 * hidden_size, hidden_size, **kwargs)
cell_bw = rnn_cls(2 * hidden_size, hidden_size, **kwargs)
cell_fw = rnn_cls(2 * in_size, hidden_size, **kwargs)
cell_bw = rnn_cls(2 * in_size, hidden_size, **kwargs)
self.append(BiRNN(cell_fw, cell_bw, time_major))
else:
raise ValueError(
@@ -1569,21 +1599,19 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
batch_index = 1 if self.time_major else 0
dtype = inputs.dtype
if initial_states is None:
state_shape = (
self.num_layers * self.num_directions,
-1,
self.hidden_size,
)

state_shape = (self.num_layers * self.num_directions, -1)
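# The two state components may differ in their last dimension: h uses proj_size (when specified) while c keeps hidden_size.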
dims = ([self.proj_size or self.hidden_size], [self.hidden_size])
fill_shape = list(state_shape)
if inputs.shape[batch_index] > 0:
fill_shape[1] = inputs.shape[batch_index]
else:
fill_shape[1] = paddle.shape(inputs)[batch_index].item()
initial_states = tuple(
[
paddle.full(shape=fill_shape, fill_value=0, dtype=dtype)
for _ in range(self.state_components)
paddle.full(
shape=fill_shape + dims[i], fill_value=0, dtype=dtype
)
for i in range(self.state_components)
]
)
else:
@@ -1745,6 +1773,7 @@ def __init__(
hidden_size,
num_layers,
direction,
None,
time_major,
dropout,
weight_ih_attr,
@@ -1793,6 +1822,8 @@ class LSTM(RNNBase):
direction (str, optional): The direction of the network. It can be "forward"
or "bidirect"(or "bidirectional"). When "bidirect", the way to merge
outputs of forward and backward is concatenating. Defaults to "forward".
proj_size (int, optional): If specified, the output hidden state of each layer
will be projected to size `proj_size`. Default: None.
time_major (bool, optional): Whether the first dimension of the input
means the time steps. If time_major is True, the shape of Tensor is
[time_steps,batch_size,input_size], otherwise [batch_size, time_steps,input_size].
@@ -1820,7 +1851,8 @@ class LSTM(RNNBase):

- **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, If `time_major` is False, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence.

- **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.
- **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If `proj_size` is specified, the last dimension of h will be `proj_size`.
Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.

Variables:
- **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
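A hedged end-to-end sketch of the layer-level API (shapes are illustrative and assume this branch's `proj_size` support):

```python
import paddle
import paddle.nn as nn

# Multi-layer bidirectional LSTM with projected hidden states.
lstm = nn.LSTM(input_size=12, hidden_size=32, num_layers=2,
               direction='bidirectional', proj_size=16)
x = paddle.zeros((2, 10, 12))   # [batch_size, time_steps, input_size]
y, (h, c) = lstm(x)
print(y.shape)                  # [2, 10, 32] -> num_directions * proj_size
print(h.shape)                  # [4, 2, 16]  -> last dim is proj_size
print(c.shape)                  # [4, 2, 32]  -> last dim stays hidden_size
```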
@@ -1857,6 +1889,7 @@ def __init__(
hidden_size,
num_layers=1,
direction="forward",
proj_size=None,
time_major=False,
dropout=0.0,
weight_ih_attr=None,
@@ -1871,6 +1904,7 @@ def __init__(
hidden_size,
num_layers,
direction,
proj_size,
time_major,
dropout,
weight_ih_attr,
@@ -1990,6 +2024,7 @@ def __init__(
hidden_size,
num_layers,
direction,
None,
time_major,
dropout,
weight_ih_attr,
25 changes: 16 additions & 9 deletions test/dygraph_to_static/test_lstm.py
@@ -24,7 +24,7 @@


class LSTMLayer(nn.Layer):
def __init__(self, in_channels, hidden_size):
def __init__(self, in_channels, hidden_size, proj_size=None):
super().__init__()
self.cell = nn.LSTM(
in_channels, hidden_size, direction='bidirectional', num_layers=2, proj_size=proj_size
@@ -36,9 +36,9 @@ def forward(self, x):


class Net(nn.Layer):
def __init__(self, in_channels, hidden_size):
def __init__(self, in_channels, hidden_size, proj_size=None):
super().__init__()
self.lstm = LSTMLayer(in_channels, hidden_size)
self.lstm = LSTMLayer(in_channels, hidden_size, proj_size=proj_size)

def forward(self, x):
x = self.lstm(x)
@@ -49,6 +49,8 @@ def forward(self, x):
class TestLstm(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
self.net = Net(12, 2)
self.inputs = paddle.zeros((2, 10, 12))

def tearDown(self):
self.temp_dir.cleanup()
@@ -60,10 +62,8 @@ def run_lstm(self, to_static):
paddle.static.default_main_program().random_seed = 1001
paddle.static.default_startup_program().random_seed = 1001

net = Net(12, 2)
net = paddle.jit.to_static(net)
x = paddle.zeros((2, 10, 12))
y = net(paddle.to_tensor(x))
net = paddle.jit.to_static(self.net)
y = net(paddle.to_tensor(self.inputs))
return y.numpy()

def test_lstm_to_static(self):
@@ -74,8 +74,8 @@
@ast_only_test
def test_save_in_eval(self, with_training=True):
paddle.jit.enable_to_static(True)
net = Net(12, 2)
x = paddle.randn((2, 10, 12))
net = self.net
x = self.inputs
if with_training:
x.stop_gradient = False
dygraph_out = net(x)
@@ -123,6 +123,13 @@ def test_save_without_training(self):
self.test_save_in_eval(with_training=False)


class TestLstmWithProjsize(TestLstm):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
self.net = Net(12, 2, 4)
self.inputs = paddle.zeros((2, 10, 12))
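
For reference, a minimal companion check one might add (illustrative only, not part of this PR): with `proj_size` left unset, output shapes should match the pre-existing behaviour.

```python
import unittest

import paddle
import paddle.nn as nn


class TestLstmDefaultProjSize(unittest.TestCase):
    # Illustrative sketch: leaving proj_size unset should keep the
    # original output shapes (sizes below are arbitrary).
    def test_shapes_unchanged(self):
        lstm = nn.LSTM(12, 32, num_layers=2, direction='bidirectional')
        y, (h, c) = lstm(paddle.zeros((2, 10, 12)))
        self.assertEqual(y.shape, [2, 10, 64])  # num_directions * hidden_size
        self.assertEqual(h.shape, [4, 2, 32])
        self.assertEqual(c.shape, [4, 2, 32])
```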


class LinearNet(nn.Layer):
def __init__(self):
super().__init__()