[Typing][A-39,A-47,A-51] Add type annotations for paddle/optimizer/optimizer.py #65076

Merged
merged 25 commits on Jun 17, 2024
Changes from 24 commits
1 change: 0 additions & 1 deletion python/paddle/_typing/shape.py
@@ -33,7 +33,6 @@

ShapeLike: TypeAlias = Union[_DynamicShapeLike, _StaticShapeLike]


# for size parameters, eg, kernel_size, stride ...
Size1: TypeAlias = Union[int, Tuple[int], List[int]]
Size2: TypeAlias = Union[int, Tuple[int, int], List[int]]
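For context, a minimal sketch of how these `Size*` aliases are consumed in signatures; `make_pool2d` below is a hypothetical helper used only for illustration, not a Paddle API:

```python
from paddle._typing.shape import Size2

# Size2 accepts a bare int, a 2-tuple of ints, or a list of ints.
def make_pool2d(kernel_size: Size2, stride: Size2 = 1) -> None:
    ...

make_pool2d(3)                      # int form
make_pool2d((3, 3), stride=[2, 2])  # tuple / list forms
```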
21 changes: 19 additions & 2 deletions python/paddle/base/dygraph/base.py
@@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import inspect
import sys
import warnings
from typing import Callable, TypeVar
from typing import Callable, ContextManager, TypeVar, overload

import decorator
from typing_extensions import ParamSpec
@@ -271,6 +274,16 @@ def _switch_tracer_mode_guard_(is_train=True):
yield


@overload
def no_grad(func: None = ...) -> ContextManager:
...


@overload
def no_grad(func: Callable[_InputT, _RetT]) -> Callable[_InputT, _RetT]:
...


def no_grad(func=None):
"""
:api_attr: imperative
Expand Down Expand Up @@ -327,7 +340,11 @@ def no_grad(func=None):
else:

@decorator.decorator
def __impl__(func, *args, **kwargs):
def __impl__(
func: Callable[_InputT, _RetT],
*args: _InputT.args,
**kwargs: _InputT.kwargs,
) -> _RetT:
with _switch_tracer_mode_guard_(is_train=False):
return func(*args, **kwargs)

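The two overloads above mirror the two ways this `no_grad` is used; a rough usage sketch follows (the toy function `evaluate` is an assumption, and the `imperative_base` alias is borrowed from how the optimizer modules import this module):

```python
from paddle.base.dygraph import base as imperative_base


# Second overload: used as a decorator, the ParamSpec-based annotation
# lets type checkers keep seeing `evaluate(x: float) -> float`.
@imperative_base.no_grad
def evaluate(x: float) -> float:
    return x * 2.0


# First overload: called with no argument, it returns a context manager
# that disables gradient tracking for the enclosed block.
with imperative_base.no_grad():
    result = evaluate(3.0)
```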
2 changes: 1 addition & 1 deletion python/paddle/nn/layer/layers.py
@@ -940,7 +940,7 @@ def parameters(self, include_sublayers: bool = True) -> list[Tensor]:
Default: True.

Returns:
list of Tensor, a list of Parameters.
list, list of Tensor, a list of Parameters.

Examples:
.. code-block:: python
73 changes: 47 additions & 26 deletions python/paddle/optimizer/adam.py
@@ -12,8 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Sequence

from typing_extensions import NotRequired

import paddle
from paddle import _C_ops, pir
@@ -28,7 +33,21 @@
in_dynamic_or_pir_mode,
in_pir_mode,
)
from .optimizer import Optimizer
from .optimizer import Optimizer, _ParameterConfig


class _AdamParameterConfig(_ParameterConfig):
beta1: NotRequired[float | Tensor]
beta2: NotRequired[float | Tensor]


if TYPE_CHECKING:
from paddle import Tensor
from paddle.nn.clip import GradientClipBase
from paddle.regularizer import WeightDecayRegularizer

from .lr import LRScheduler


__all__ = []
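The new `_AdamParameterConfig` extends the `_ParameterConfig` TypedDict from optimizer.py so that Adam-specific per-group keys (`beta1`, `beta2`) type-check. A hedged sketch of what this enables; the keys shown follow the docstring example further down, and `_AdamParameterConfig` is private, imported here only for illustration:

```python
import paddle
from paddle.optimizer.adam import Adam, _AdamParameterConfig

linear = paddle.nn.Linear(4, 4)

# A parameter group whose optional, Adam-specific entries are now checked.
group: _AdamParameterConfig = {
    'params': linear.parameters(),
    'learning_rate': 0.1,
    'beta1': 0.8,   # NotRequired[float | Tensor]
}

opt = Adam(learning_rate=0.001, parameters=[group])
```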

@@ -69,20 +88,20 @@ class Adam(Optimizer):
epsilon (float|Tensor, optional): A small float value for numerical stability.
It should be a float number or a 0-D Tensor with shape [] and data type as float32.
The default value is 1e-08.
parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``.
This parameter is required in dygraph mode. And you can specify different options for
different parameter groups such as the learning rate, weight decay, etc,
then the parameters are list of dict. Note that the learning_rate in parameter groups
represents the scale of base learning_rate.
The default value is None in static graph mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization.
weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization.
It can be a float value as coeff of L2 regularization or
:ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already,
the regularization setting here in optimizer will be ignored for this parameter.
Otherwise, the regularization setting here in optimizer will take effect.
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
@@ -95,7 +114,7 @@ class Adam(Optimizer):
The default value is False.
multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
use_multi_tensor (bool, optional): Whether to use multi-tensor strategy to update all parameters at once . Default is false.
name (str, optional): Normally there is no need for user to set this property.
name (str|None, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.

@@ -145,14 +164,14 @@ class Adam(Optimizer):
>>> loss = paddle.mean(out)
>>> adam = paddle.optimizer.Adam(
... learning_rate=0.1,
... parameters=[{
... 'params': linear_1.parameters()
... }, {
... 'params': linear_2.parameters(),
... 'weight_decay': 0.001,
... 'learning_rate': 0.1,
... 'beta1': 0.8
... }],
... parameters=[{ # type: ignore
... 'params': linear_1.parameters() # type: ignore
... }, { # type: ignore
... 'params': linear_2.parameters(), # type: ignore
... 'weight_decay': 0.001, # type: ignore
... 'learning_rate': 0.1, # type: ignore
... 'beta1': 0.8 # type: ignore
... }], # type: ignore
... weight_decay=0.01,
... beta1=0.9)
>>> loss.backward()
@@ -167,18 +186,20 @@

def __init__(
self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-8,
parameters=None,
weight_decay=None,
grad_clip=None,
lazy_mode=False,
multi_precision=False,
use_multi_tensor=False,
name=None,
):
learning_rate: float | LRScheduler = 0.001,
beta1: float | Tensor = 0.9,
beta2: float | Tensor = 0.999,
epsilon: float | Tensor = 1e-8,
parameters: Sequence[Tensor]
| Sequence[_AdamParameterConfig]
| None = None,
weight_decay: float | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
lazy_mode: bool = False,
multi_precision: bool = False,
use_multi_tensor: bool = False,
name: str | None = None,
) -> None:
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
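With the annotated signature, the accepted argument forms are explicit in the types; a short sketch of the scheduler form, assuming a standard dygraph setup:

```python
import paddle

linear = paddle.nn.Linear(10, 10)
scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=10)

adam = paddle.optimizer.Adam(
    learning_rate=scheduler,         # float | LRScheduler
    epsilon=1e-8,                    # float | Tensor
    weight_decay=0.01,               # float | WeightDecayRegularizer | None
    parameters=linear.parameters(),  # plain Sequence[Tensor] form
)
```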
@@ -408,7 +429,7 @@ def _append_optimize_op(self, block, param_and_grad):

@imperative_base.no_grad
Review comment (Member):
This decorator doesn't have type hints, so it will likely lose the type information; you can refer to #64954 and add them.

This implementation is a bit different from the others, so it needs a closer look on its own.
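Roughly the pattern the comment is asking for, presumably similar to the ParamSpec-based annotation added in base.py above; `typed_no_grad` is purely illustrative, not the actual change in this PR or in #64954:

```python
from typing import Callable, TypeVar

from typing_extensions import ParamSpec

_InputT = ParamSpec("_InputT")
_RetT = TypeVar("_RetT")


def typed_no_grad(func: Callable[_InputT, _RetT]) -> Callable[_InputT, _RetT]:
    """Illustrative decorator whose annotations preserve the wrapped signature."""

    def wrapper(*args: _InputT.args, **kwargs: _InputT.kwargs) -> _RetT:
        # ... disable gradient tracking around the call here ...
        return func(*args, **kwargs)

    return wrapper
```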

@framework.non_static_only
def step(self):
def step(self) -> None:
"""
Execute the optimizer and update parameters once.
