【Hackathon 5th No.116】LSTM/RNNBase usability improvements #56460

Closed
wants to merge 15 commits into from
71 changes: 53 additions & 18 deletions python/paddle/nn/layer/rnn.py
@@ -554,7 +554,12 @@ class RNNCellBase(Layer):
"""

def get_initial_states(
self, batch_ref, shape=None, dtype=None, init_value=0.0, batch_dim_idx=0
self,
batch_ref,
shape=None,
dtype=None,
init_value=0.0,
batch_dim_idx=0,
):
r"""
Generate initialized states according to provided shape, data type and
@@ -865,6 +870,8 @@ class LSTMCell(RNNCellBase):
Parameters:
input_size (int): The input size.
hidden_size (int): The hidden size.
proj_size (int, optional): If specified, the output hidden state :math:`h_{t}`
will be projected to size `proj_size`. Default: None.
weight_ih_attr(ParamAttr, optional): The parameter attribute for
`weight_ih`. Default: None.
weight_hh_attr(ParamAttr, optional): The parameter attribute for
@@ -879,6 +886,7 @@ class LSTMCell(RNNCellBase):
Variables:
- **weight_ih** (Parameter): shape (4 * hidden_size, input_size), input to hidden weight, which corresponds to the concatenation of :math:`W_{ii}, W_{if}, W_{ig}, W_{io}` in the formula.
- **weight_hh** (Parameter): shape (4 * hidden_size, hidden_size), or (4 * hidden_size, proj_size) when `proj_size` is specified; hidden to hidden weight, which corresponds to the concatenation of :math:`W_{hi}, W_{hf}, W_{hg}, W_{ho}` in the formula.
- **weight_ho** (Parameter, optional): shape (proj_size, hidden_size), the projection weight applied to the hidden state; only created when `proj_size` is specified.
- **bias_ih** (Parameter): shape (4 * hidden_size, ), input to hidden bias, which corresponds to the concatenation of :math:`b_{ii}, b_{if}, b_{ig}, b_{io}` in the formula.
- **bias_hh** (Parameter): shape (4 * hidden_size, ), hidden to hidden bias, which corresponds to the concatenation of :math:`b_{hi}, b_{hf}, b_{hg}, b_{ho}` in the formula.

@@ -888,7 +896,8 @@ class LSTMCell(RNNCellBase):

Returns:
- **outputs** (Tensor): shape `[batch_size, hidden_size]` (or `[batch_size, proj_size]` when `proj_size` is specified), the output, corresponding to :math:`h_{t}` in the formula.
- **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
- **states** (tuple): a tuple of two tensors, each of shape `[batch_size, hidden_size]`, the new hidden states, corresponding to :math:`h_{t}, c_{t}` in the formula.
If `proj_size` is specified, the shape of the first element (:math:`h_{t}`) will be `[batch_size, proj_size]`.
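When `proj_size` is specified, the emitted hidden state is additionally projected before being returned. With the default :math:`\tanh` activation this amounts to :math:`h_{t} = W_{ho}\,(o_{t} \odot \tanh(c_{t}))`, where :math:`W_{ho}` is the `weight_ho` parameter of shape `(proj_size, hidden_size)`.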

Notes:
All the weights and bias are initialized with `Uniform(-std, std)` by
@@ -921,6 +930,7 @@ def __init__(
self,
input_size,
hidden_size,
proj_size=None,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
@@ -941,7 +951,7 @@ def __init__(
default_initializer=I.Uniform(-std, std),
)
self.weight_hh = self.create_parameter(
(4 * hidden_size, hidden_size),
(4 * hidden_size, proj_size or hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std),
)
@@ -957,6 +967,13 @@ def __init__(
is_bias=True,
default_initializer=I.Uniform(-std, std),
)
self.proj_size = proj_size
if proj_size:
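# weight_ho projects the emitted hidden state from hidden_size to proj_size;
# it has shape (proj_size, hidden_size) and is applied with transpose_y=True in forward().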
self.weight_ho = self.create_parameter(
(proj_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std),
)

self.hidden_size = hidden_size
self.input_size = input_size
@@ -966,6 +983,7 @@ def __init__(
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)

pre_hidden, pre_cell = states
gates = paddle.matmul(inputs, self.weight_ih, transpose_y=True)
if self.bias_ih is not None:
@@ -981,6 +999,8 @@ def forward(self, inputs, states=None):
o = self._gate_activation(chunked_gates[3])
c = f * pre_cell + i * self._activation(chunked_gates[2])
h = o * self._activation(c)
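# When proj_size is set, project h_t from [batch_size, hidden_size] to [batch_size, proj_size] before emitting it.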
if self.proj_size:
h = paddle.matmul(h, self.weight_ho, transpose_y=True)

return h, (h, c)
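A minimal usage sketch of the projected cell (sizes are illustrative and assume this branch's `proj_size` support):

```python
import paddle

# LSTMCell with a projected hidden state: c_t keeps hidden_size,
# while the emitted h_t (and hence the cell output) has proj_size
# as its last dimension.
cell = paddle.nn.LSTMCell(input_size=8, hidden_size=32, proj_size=16)
x = paddle.randn([4, 8])      # [batch_size, input_size]
y, (h, c) = cell(x)           # default (zero) initial states
print(y.shape)                # [4, 16] -> batch_size x proj_size
print(h.shape)                # [4, 16]
print(c.shape)                # [4, 32] -> batch_size x hidden_size
```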

@@ -992,7 +1012,7 @@ def state_shape(self):
automatically inserted into shape). These two shapes correspond
to :math:`h_{t-1}` and :math:`c_{t-1}` separately.
"""
return ((self.hidden_size,), (self.hidden_size,))
return ((self.proj_size or self.hidden_size,), (self.hidden_size,))

def extra_repr(self):
return '{input_size}, {hidden_size}'.format(**self.__dict__)
@@ -1329,6 +1349,7 @@ def __init__(
hidden_size,
num_layers=1,
direction="forward",
proj_size=None,
time_major=False,
dropout=0.0,
weight_ih_attr=None,
@@ -1354,28 +1375,37 @@ def __init__(
"bias_hh_attr": bias_hh_attr,
}

self.proj_size = proj_size
if proj_size:
assert mode == 'LSTM'

if mode == "LSTM":
rnn_cls = LSTMCell
kwargs["proj_size"] = proj_size
elif mode == "GRU":
rnn_cls = GRUCell
else:
elif mode == "RNN_RELU":
rnn_cls = SimpleRNNCell
kwargs["activation"] = self.activation
kwargs["activation"] = 'relu'
elif mode == "RNN_TANH":
rnn_cls = SimpleRNNCell
kwargs["activation"] = 'tanh'

in_size = proj_size or hidden_size
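# Stacked layers consume the previous layer's emitted hidden state, so their per-direction
# input size is in_size (proj_size when projection is enabled), doubled for bidirectional stacks.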
if direction in ["forward"]:
is_reverse = False
cell = rnn_cls(input_size, hidden_size, **kwargs)
self.append(RNN(cell, is_reverse, time_major))
for i in range(1, num_layers):
cell = rnn_cls(hidden_size, hidden_size, **kwargs)
cell = rnn_cls(in_size, hidden_size, **kwargs)
self.append(RNN(cell, is_reverse, time_major))
elif direction in bidirectional_list:
cell_fw = rnn_cls(input_size, hidden_size, **kwargs)
cell_bw = rnn_cls(input_size, hidden_size, **kwargs)
self.append(BiRNN(cell_fw, cell_bw, time_major))
for i in range(1, num_layers):
cell_fw = rnn_cls(2 * hidden_size, hidden_size, **kwargs)
cell_bw = rnn_cls(2 * hidden_size, hidden_size, **kwargs)
cell_fw = rnn_cls(2 * in_size, hidden_size, **kwargs)
cell_bw = rnn_cls(2 * in_size, hidden_size, **kwargs)
self.append(BiRNN(cell_fw, cell_bw, time_major))
else:
raise ValueError(
@@ -1569,21 +1599,19 @@ def forward(self, inputs, initial_states=None, sequence_length=None):
batch_index = 1 if self.time_major else 0
dtype = inputs.dtype
if initial_states is None:
state_shape = (
self.num_layers * self.num_directions,
-1,
self.hidden_size,
)

state_shape = (self.num_layers * self.num_directions, -1)
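# The two state components may differ in their last dimension: h uses proj_size (when specified) while c keeps hidden_size.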
dims = ([self.proj_size or self.hidden_size], [self.hidden_size])
fill_shape = list(state_shape)
if inputs.shape[batch_index] > 0:
fill_shape[1] = inputs.shape[batch_index]
else:
fill_shape[1] = paddle.shape(inputs)[batch_index].item()
initial_states = tuple(
[
paddle.full(shape=fill_shape, fill_value=0, dtype=dtype)
for _ in range(self.state_components)
paddle.full(
shape=fill_shape + dims[i], fill_value=0, dtype=dtype
)
for i in range(self.state_components)
]
)
else:
@@ -1745,6 +1773,7 @@ def __init__(
hidden_size,
num_layers,
direction,
None,
time_major,
dropout,
weight_ih_attr,
@@ -1793,6 +1822,8 @@ class LSTM(RNNBase):
direction (str, optional): The direction of the network. It can be "forward"
or "bidirect"(or "bidirectional"). When "bidirect", the way to merge
outputs of forward and backward is concatenating. Defaults to "forward".
proj_size (int, optional): If specified, the output hidden state of each layer
will be projected to size `proj_size`. Default: None.
time_major (bool, optional): Whether the first dimension of the input
means the time steps. If time_major is True, the shape of Tensor is
[time_steps,batch_size,input_size], otherwise [batch_size, time_steps,input_size].
@@ -1820,7 +1851,8 @@ class LSTM(RNNBase):

- **outputs** (Tensor): the output sequence. If `time_major` is True, the shape is `[time_steps, batch_size, num_directions * hidden_size]`, If `time_major` is False, the shape is `[batch_size, time_steps, num_directions * hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" else 1. `time_steps` means the length of the output sequence.

- **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.
- **final_states** (tuple): the final state, a tuple of two tensors, h and c. The shape of each is `[num_layers * num_directions, batch_size, hidden_size]`. If `proj_size` is specified, the last dimension of h will be `proj_size`.
Note that `num_directions` is 2 if direction is "bidirectional" (the index of forward states are 0, 2, 4, 6... and the index of backward states are 1, 3, 5, 7...), else 1.

Variables:
- **weight_ih_l[k]**: the learnable input-hidden weights of the k-th layer. If `k = 0`, the shape is `[hidden_size, input_size]`. Otherwise, the shape is `[hidden_size, num_directions * hidden_size]`.
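A hedged end-to-end sketch of the layer-level API (shapes are illustrative and assume this branch's `proj_size` support):

```python
import paddle
import paddle.nn as nn

# Multi-layer bidirectional LSTM with projected hidden states.
lstm = nn.LSTM(input_size=12, hidden_size=32, num_layers=2,
               direction='bidirectional', proj_size=16)
x = paddle.zeros((2, 10, 12))   # [batch_size, time_steps, input_size]
y, (h, c) = lstm(x)
print(y.shape)                  # [2, 10, 32] -> num_directions * proj_size
print(h.shape)                  # [4, 2, 16]  -> last dim is proj_size
print(c.shape)                  # [4, 2, 32]  -> last dim stays hidden_size
```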
@@ -1857,6 +1889,7 @@ def __init__(
hidden_size,
num_layers=1,
direction="forward",
proj_size=None,
time_major=False,
dropout=0.0,
weight_ih_attr=None,
@@ -1871,6 +1904,7 @@ def __init__(
hidden_size,
num_layers,
direction,
proj_size,
time_major,
dropout,
weight_ih_attr,
@@ -1990,6 +2024,7 @@ def __init__(
hidden_size,
num_layers,
direction,
None,
time_major,
dropout,
weight_ih_attr,
25 changes: 16 additions & 9 deletions test/dygraph_to_static/test_lstm.py
@@ -24,7 +24,7 @@


class LSTMLayer(nn.Layer):
def __init__(self, in_channels, hidden_size):
def __init__(self, in_channels, hidden_size, proj_size=None):
super().__init__()
self.cell = nn.LSTM(
in_channels, hidden_size, direction='bidirectional', num_layers=2, proj_size=proj_size
@@ -36,9 +36,9 @@ def forward(self, x):


class Net(nn.Layer):
def __init__(self, in_channels, hidden_size):
def __init__(self, in_channels, hidden_size, proj_size=None):
super().__init__()
self.lstm = LSTMLayer(in_channels, hidden_size)
self.lstm = LSTMLayer(in_channels, hidden_size, proj_size=proj_size)

def forward(self, x):
x = self.lstm(x)
@@ -49,6 +49,8 @@ def forward(self, x):
class TestLstm(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
self.net = Net(12, 2)
self.inputs = paddle.zeros((2, 10, 12))

def tearDown(self):
self.temp_dir.cleanup()
@@ -60,10 +62,8 @@ def run_lstm(self, to_static):
paddle.static.default_main_program().random_seed = 1001
paddle.static.default_startup_program().random_seed = 1001

net = Net(12, 2)
net = paddle.jit.to_static(net)
x = paddle.zeros((2, 10, 12))
y = net(paddle.to_tensor(x))
net = paddle.jit.to_static(self.net)
y = net(paddle.to_tensor(self.inputs))
return y.numpy()

def test_lstm_to_static(self):
@@ -74,8 +74,8 @@
@ast_only_test
def test_save_in_eval(self, with_training=True):
paddle.jit.enable_to_static(True)
net = Net(12, 2)
x = paddle.randn((2, 10, 12))
net = self.net
x = self.inputs
if with_training:
x.stop_gradient = False
dygraph_out = net(x)
@@ -123,6 +123,13 @@ def test_save_without_training(self):
self.test_save_in_eval(with_training=False)


class TestLstmWithProjsize(TestLstm):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
self.net = Net(12, 2, 4)
self.inputs = paddle.zeros((2, 10, 12))
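
For reference, a minimal companion check one might add (illustrative only, not part of this PR): with `proj_size` left unset, output shapes should match the pre-existing behaviour.

```python
import unittest

import paddle
import paddle.nn as nn


class TestLstmDefaultProjSize(unittest.TestCase):
    # Illustrative sketch: leaving proj_size unset should keep the
    # original output shapes (sizes below are arbitrary).
    def test_shapes_unchanged(self):
        lstm = nn.LSTM(12, 32, num_layers=2, direction='bidirectional')
        y, (h, c) = lstm(paddle.zeros((2, 10, 12)))
        self.assertEqual(y.shape, [2, 10, 64])  # num_directions * hidden_size
        self.assertEqual(h.shape, [4, 2, 32])
        self.assertEqual(c.shape, [4, 2, 32])
```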


class LinearNet(nn.Layer):
def __init__(self):
super().__init__()