From 84d8e49de82cd26362d356b6ac523c9f6c44e08d Mon Sep 17 00:00:00 2001
From: MRXLT
Date: Mon, 12 Oct 2020 18:54:20 +0800
Subject: [PATCH] refine adam/strided_slice && fix doc for rmsprop/unstack
 (#27740)

* refine parameters order && doc
* update rmsprop doc
* refine adam/transpose/unstack/stride_slice
* fix bug && doc
* fix doc
* bug fix
* bug fix
* fix doc
* fix doc
* fix doc
* fix doc
* depercate old strided_slice
* update doc
* set default value for name
* update doc
---
 python/paddle/fluid/layers/nn.py              | 10 ++-
 .../tests/unittests/test_strided_slice_op.py  | 13 +++
 .../incubate/complex/tensor/manipulation.py   | 15 ++--
 python/paddle/optimizer/adam.py               | 38 ++++----
 python/paddle/optimizer/adamw.py              | 27 +++---
 python/paddle/optimizer/rmsprop.py            | 24 ++---
 python/paddle/tensor/manipulation.py          | 87 ++++++++++++++++++-
 7 files changed, 147 insertions(+), 67 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 8cb0404c18cad..a6402a2852c2a 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -10241,9 +10241,9 @@ def unstack(x, axis=0, num=None):
     Examples:
         .. code-block:: python

-            import paddle.fluid as fluid
-            x = fluid.data(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
-            y = fluid.layers.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]
+            import paddle
+            x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32')  # create a tensor with shape=[2, 3, 5]
+            y = paddle.unstack(x, axis=1)  # unstack with second axis, which results 3 tensors with shape=[2, 5]

     """
     helper = LayerHelper('unstack', **locals())
@@ -11017,7 +11017,7 @@ def slice(input, axes, starts, ends):
     return out


-@templatedoc()
+@deprecated(since='2.0.0', update_to="paddle.strided_slice")
 def strided_slice(input, axes, starts, ends, strides):
     """
     :alias_main: paddle.strided_slice
@@ -11095,7 +11095,9 @@ def strided_slice(input, axes, starts, ends, strides):
         .. code-block:: python

             import paddle.fluid as fluid
+            import paddle

+            paddle.enable_static()
             input = fluid.data(
                 name="input", shape=[3, 4, 5, 6], dtype='float32')

diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
index 37f11c449d21f..0fe6cd5e7e753 100644
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
@@ -16,6 +16,9 @@
 import numpy as np
 import unittest
 import paddle.fluid as fluid
+import paddle
+
+paddle.enable_static()


 def strided_slice_native_forward(input, axes, starts, ends, strides):
@@ -498,6 +501,16 @@ def test_1(self):
         assert np.array_equal(res_6, input[-3:3, 0:100:2, :, -1:2:-1])
         assert np.array_equal(res_7, input[-1, 0:100:2, :, -1:2:-1])

+    def test_dygraph_op(self):
+        x = paddle.zeros(shape=[3, 4, 5, 6], dtype="float32")
+        axes = [1, 2, 3]
+        starts = [-3, 0, 2]
+        ends = [3, 2, 4]
+        strides_1 = [1, 1, 1]
+        sliced_1 = paddle.strided_slice(
+            x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+        assert sliced_1.shape == (3, 2, 2, 2)
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/incubate/complex/tensor/manipulation.py b/python/paddle/incubate/complex/tensor/manipulation.py
index 7852260a31e3c..d1e0cbed82e99 100644
--- a/python/paddle/incubate/complex/tensor/manipulation.py
+++ b/python/paddle/incubate/complex/tensor/manipulation.py
@@ -128,16 +128,13 @@ def transpose(x, perm, name=None):
         .. code-block:: python

             import paddle
-            import numpy as np
-            import paddle.fluid.dygraph as dg
-            with dg.guard():
-                a = np.array([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j]])
-                x = dg.to_variable(a)
-                y = paddle.complex.transpose(x, [1, 0])
-                print(y.numpy())
-                # [[1.+1.j 3.+1.j]
-                #  [2.+1.j 4.+1.j]]
+            x = paddle.to_tensor([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j], [5.0+1.0j, 6.0+1.0j]])
+            x_transposed = paddle.complex.transpose(x, [1, 0])
+            print(x_transposed.numpy())
+            #[[1.+1.j 3.+1.j 5.+1.j]
+            # [2.+1.j 4.+1.j 6.+1.j]]
+
     """
     complex_variable_exists([x], "transpose")
     real = layers.transpose(x.real, perm, name)
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 9cbb45ce60d14..366d8b953e3d4 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -29,7 +29,7 @@ class Adam(Optimizer):
     of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
     it can dynamically adjusts the learning rate of each parameter using
     the 1st moment estimates and the 2nd moment estimates of the gradient.
-    
+
     The parameter ``param_out`` update rule with gradient ``grad``:

     .. math::
@@ -68,13 +68,10 @@ class Adam(Optimizer):
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
@@ -82,17 +79,17 @@ class Adam(Optimizer):
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.

     Examples:
         .. code-block:: python

             import paddle
-            import numpy as np

-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)
             adam = paddle.optimizer.Adam(learning_rate=0.1,
@@ -105,12 +102,9 @@ class Adam(Optimizer):

             # Adam with beta1/beta2 as Tensor and weight_decay as float
             import paddle
-            import numpy as np

-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)

@@ -140,8 +134,8 @@ def __init__(self,
                  parameters=None,
                  weight_decay=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -258,7 +252,7 @@ def _append_optimize_op(self, block, param_and_grad):
     def step(self):
         """
         Execute the optimizer and update parameters once.
-        
+
         Returns:
             None

@@ -266,13 +260,11 @@ def step(self):
             .. code-block:: python

                 import paddle
-                import numpy as np
-                paddle.disable_static()
-                value = np.arange(26).reshape(2, 13).astype("float32")
-                a = paddle.to_tensor(value)
+
+                a = paddle.rand([2,13], dtype="float32")
                 linear = paddle.nn.Linear(13, 5)
                 # This can be any optimizer supported by dygraph.
-                adam = paddle.optimizer.Adam(learning_rate = 0.01,
+                adam = paddle.optimizer.Adam(learning_rate = 0.01,
                                              parameters = linear.parameters())
                 out = linear(a)
                 out.backward()
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 0b04f03eb14da..00c197a58b3dd 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -23,7 +23,7 @@

 class AdamW(Adam):
     """
-    The AdamW optimizer is implemented based on the AdamW Optimization
+    The AdamW optimizer is implemented based on the AdamW Optimization
     in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/abs/1711.05101>`_.
     it can resolves the problem of L2 regularization failure in the Adam optimizer.

@@ -32,7 +32,7 @@ class AdamW(Adam):
         t & = t + 1

         moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
-        
+
         moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad

         learning\_rate & = learning\_rate * \\
@@ -57,16 +57,13 @@ class AdamW(Adam):
             The default value is 1e-08.
         weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
         apply_decay_param_fun (function|None, optional): If it is not None,
-            only tensors that makes apply_decay_param_fun(Tensor)==True
+            only tensors that makes apply_decay_param_fun(Tensor)==True
             will be updated. It only works when we want to specify tensors.
             Default: None.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
@@ -74,18 +71,18 @@
             gradient in current mini-batch, so it will be much more faster. But this mode has
             different semantics with the original Adam algorithm and may lead to different result.
             The default value is False.
+        name (str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name`.
+            The default value is None.
     **Notes**:
         **Currently, AdamW doesn't support sparse parameter optimization.**

     Examples:
         .. code-block:: python

             import paddle
-            import numpy as np

-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
+            inp = paddle.rand([10,10], dtype="float32")
             out = linear(inp)
             loss = paddle.mean(out)

@@ -112,8 +109,8 @@ def __init__(self,
                  weight_decay=0.01,
                  apply_decay_param_fun=None,
                  grad_clip=None,
-                 name=None,
-                 lazy_mode=False):
+                 lazy_mode=False,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 2609972d85ccd..5e17ca34ff218 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -90,9 +90,9 @@ class RMSProp(Optimizer):
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
+        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
+            some derived class of ``GradientClipBase`` . There are three cliping strategies
+            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
             :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
@@ -104,24 +104,18 @@ class RMSProp(Optimizer):

         .. code-block:: python

             import paddle
-            import numpy as np

-            paddle.disable_static()
-            inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            inp = paddle.rand([10,10], dtype="float32")
             linear = paddle.nn.Linear(10, 10)
-            inp = paddle.to_tensor(inp)
             out = linear(inp)
             loss = paddle.mean(out)

-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-
-            adam = paddle.optimizer.RMSProp(learning_rate=0.1,
-                    parameters=linear.parameters(),
-                    weight_decay=0.01)
+            rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1,
+                    parameters=linear.parameters(),
+                    weight_decay=0.01)
             out.backward()
-            adam.step()
-            adam.clear_grad()
+            rmsprop.step()
+            rmsprop.clear_grad()

     """
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 86bf9b31f9a9e..531629c573fb6 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -25,7 +25,6 @@
 # TODO: define functions to manipulate a tensor
 from ..fluid.layers import cast  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
-from ..fluid.layers import strided_slice  #DEFINE_ALIAS
 from ..fluid.layers import transpose  #DEFINE_ALIAS
 from ..fluid.layers import unstack  #DEFINE_ALIAS

@@ -1461,3 +1460,89 @@ def gather_nd(x, index, name=None):
     """
     return paddle.fluid.layers.gather_nd(input=x, index=index, name=name)
+
+
+def strided_slice(x, axes, starts, ends, strides, name=None):
+    """
+    This operator produces a slice of ``x`` along multiple axes. Similar to numpy:
+    https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+    Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and
+    end dimension for each axis in the list of axes and Slice uses this information
+    to slice the input data tensor. If a negative value is passed to
+    ``starts`` or ``ends`` such as :math:`-i`, it represents the reverse position of the
+    axis :math:`i-1` th (here 0 is the initial position). The ``strides`` represents steps of
+    slicing and if the ``strides`` is negative, slice operation is in the opposite direction.
+    If the value passed to ``starts`` or ``ends`` is greater than n
+    (the number of elements in this dimension), it represents n.
+    For slicing to the end of a dimension with unknown size, it is recommended
+    to pass in INT_MAX. The size of ``axes`` must be equal to ``starts`` , ``ends`` and ``strides``.
+    Following examples will explain how strided_slice works:
+
+    .. code-block:: text
+
+        Case1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+                strides = [1, 1]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Case2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [0, 1]
+                ends = [2, 0]
+                strides = [1, -1]
+            Then:
+                result = [ [8, 7, 6], ]
+        Case3:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [0, 1]
+                ends = [-1, 1000]
+                strides = [1, 3]
+            Then:
+                result = [ [2], ]
+    Args:
+        x (Tensor): An N-D ``Tensor``. The data type is ``float32``, ``float64``, ``int32`` or ``int64``.
+        axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to.
+            It's optional. If it is not provided, it will be treated as :math:`[0,1,...,len(starts)-1]`.
+        starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is a Tensor, it should be a 1-D Tensor. It represents starting indices of corresponding axis in ``axes``.
+        ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of
+            it should be integers or Tensors with shape [1]. If ``ends`` is a Tensor, it should be a 1-D Tensor. It represents ending indices of corresponding axis in ``axes``.
+        strides (list|tuple|Tensor): The data type is ``int32`` . If ``strides`` is a list or tuple, the elements of
+            it should be integers or Tensors with shape [1]. If ``strides`` is a Tensor, it should be a 1-D Tensor. It represents slice step of corresponding axis in ``axes``.
+        name(str, optional): The default value is None. Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+
+    Returns:
+        Tensor: A ``Tensor`` with the same dimension as ``x``. The data type is same as ``x``.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.zeros(shape=[3,4,5,6], dtype="float32")
+            # example 1:
+            # attr starts is a list which doesn't contain Tensor.
+            axes = [1, 2, 3]
+            starts = [-3, 0, 2]
+            ends = [3, 2, 4]
+            strides_1 = [1, 1, 1]
+            strides_2 = [1, 1, 2]
+            sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1)
+            # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1].
+            # example 2:
+            # attr starts is a list which contains a Tensor.
+            minus_3 = paddle.fill_constant([1], "int32", -3)
+            sliced_2 = paddle.strided_slice(x, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2)
+            # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2].
+    """
+
+    return paddle.fluid.layers.strided_slice(
+        input=x, axes=axes, starts=starts, ends=ends, strides=strides)
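
The snippet below is not part of the patch itself; it is a minimal usage sketch of the two API changes the patch makes -- the 2.0-style ``paddle.strided_slice`` (tensor-first signature with ``name`` defaulting to None) and the reordered ``Adam``/``AdamW`` constructor (``lazy_mode`` now precedes ``name``) -- assuming Paddle 2.0 dynamic-graph (dygraph) mode and using only calls that already appear in the patch's own examples and tests.

.. code-block:: python

    import paddle

    # strided_slice with the 2.0 signature: x is the first positional argument.
    x = paddle.zeros(shape=[3, 4, 5, 6], dtype="float32")
    sliced = paddle.strided_slice(
        x, axes=[1, 2, 3], starts=[-3, 0, 2], ends=[3, 2, 4], strides=[1, 1, 1])
    # sliced is x[:, 1:3:1, 0:2:1, 2:4:1], i.e. shape [3, 2, 2, 2],
    # matching the assertion in the new test_dygraph_op test case.

    # One optimizer step with the reordered Adam signature (lazy_mode before name).
    linear = paddle.nn.Linear(10, 10)
    inp = paddle.rand([10, 10], dtype="float32")
    loss = paddle.mean(linear(inp))
    adam = paddle.optimizer.Adam(
        learning_rate=0.1, parameters=linear.parameters(), lazy_mode=False)
    loss.backward()
    adam.step()
    adam.clear_grad()

The reordering keeps ``name`` as the last optional argument of both ``Adam`` and ``AdamW``, which is the convention the rest of the docstrings in this patch follow.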