[Typing][A-39,A-47,A-51] Add type annotations for paddle/optimizer/optimizer.py #65076

Merged
merged 25 commits into from Jun 17, 2024

Changes from 3 commits
134 changes: 80 additions & 54 deletions python/paddle/optimizer/optimizer.py
@@ -12,9 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import logging
Member:

Not using PEP 563?

Contributor Author:

Isn't PEP 563 about annotations inside a class that reference not-yet-defined types not having to be written as strings? It doesn't look like PEP 563 is being used here.

Member:

Not just that; | also requires PEP 563 to be usable on 3.8.
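For context, a minimal sketch (not from this PR) of why the __future__ import matters on Python 3.8:

    # With PEP 563, annotations are stored as strings and not evaluated at
    # runtime, so the `X | Y` union syntax in a signature is accepted on 3.8.
    from __future__ import annotations


    def set_name(name: str | None = None) -> None:
        # Without the __future__ import, `str | None` would raise TypeError at
        # import time on Python 3.8/3.9, because `|` between types needs 3.10+.
        print(name)


    set_name("adam")

(Runtime introspection such as typing.get_type_hints() would still evaluate the string and fail on 3.8; the import only defers evaluation.)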

import os
from collections import defaultdict
from typing import TYPE_CHECKING, Sequence

import numpy as np

@@ -46,6 +49,14 @@
from ..base.layer_helper import LayerHelper
from .lr import LRScheduler

if TYPE_CHECKING:
from paddle import Tensor
from paddle.callbacks import Callback
from paddle.nn.clip import GradientClipBase
from paddle.regularizer import WeightDecayRegularizer

from ..base.framework import Operator, Program

__all__ = []

g_shard_bypass_dygraph_optimizer = int(
@@ -111,24 +122,24 @@ class Optimizer:
Args:
learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
It can be a float value or any subclass of ``LRScheduler`` .
parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
parameters (list[Tensor]|tuple[Tensor,...]|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
Contributor:

Just writing list|tuple|None is enough here, since parameters can also take something like list[dict]; let's keep it concise and not spell everything out.

You can refer to https://github.com/cattidea/paddlepaddle-stubs/tree/main/paddle-stubs/optimizer to see how parameters is written in the signatures there.

Contributor Author:

OK, will do.

This parameter is required in dygraph mode. And you can specify different options for \
different parameter groups such as the learning rate, weight decay, etc, \
then the parameters are list of dict. Note that the learning_rate in parameter groups \
represents the scale of base learning_rate. \
The default value is None in static graph mode, at this time all parameters will be updated.
weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
It can be a float value as coeff of L2 regularization or \
:ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
the regularization setting here in optimizer will be ignored for this parameter. \
Otherwise, the regularization setting here in optimizer will take effect. \
Default None, meaning there is no regularization.
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
some derived class of ``GradientClipBase`` . There are three clipping strategies \
( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
:ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
name (str, optional): Normally there is no need for user to set this property.
name (str|None, optional): Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`.
The default value is None.

@@ -178,12 +189,12 @@ class Optimizer:
@imperative_base.no_grad()
def __init__(
self,
learning_rate,
parameters=None,
weight_decay=None,
grad_clip=None,
name=None,
):
learning_rate: float | LRScheduler,
parameters: Sequence[Tensor] | None = None,
Member:

This should also support grouped parameters, right?

https://github.com/cattidea/paddlepaddle-stubs/blob/e19bc56e035e56d5d6490c95d94570eb8eeb861f/paddle-stubs/optimizer/sgd.pyi#L17

Let's also cover the optimizers used in the example code, such as SGD and Adam, in this PR; otherwise the problem won't be caught.
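As an illustration, a hedged sketch of the grouped-parameters usage being referred to (the layer names are made up; the "params" and per-group "learning_rate" keys follow the parameter-group convention described in the docstring above):

    import paddle

    linear_1 = paddle.nn.Linear(10, 10)
    linear_2 = paddle.nn.Linear(10, 10)

    # `parameters` passed as a list of dicts (parameter groups) rather than a
    # plain Sequence[Tensor]; each group can carry its own options.
    sgd = paddle.optimizer.SGD(
        learning_rate=0.1,
        parameters=[
            {"params": linear_1.parameters()},
            {"params": linear_2.parameters(), "learning_rate": 0.01},
        ],
        weight_decay=0.01,
    )

An annotation that only covers Sequence[Tensor] would reject this form, which is why the grouped case needs to be reflected in the signature.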

weight_decay: float | WeightDecayRegularizer | None = None,
grad_clip: GradientClipBase | None = None,
name: str | None = None,
) -> None:
if parameters is not None:
# paddle.Tensor is also iterable, so here we don't check whether
# the input is iterable, if the input is paddle.Tensor, the
@@ -311,7 +322,7 @@ def _get_auxiliary_var(self, key):
return self._auxiliary_vars.get(key, None)

@framework.dygraph_only
def state_dict(self):
def state_dict(self) -> dict[str, Tensor]:
'''
Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict.
If the optimizer never be called(minimize function), the state_dict is empty.
@@ -320,7 +331,7 @@ def state_dict(self):
None

Returns:
state_dict(dict) : dict contains all the Tensor used by optimizer
dict[str,Tensor], dict contains all the Tensor used by optimizer

Examples:
.. code-block:: python
@@ -355,12 +366,13 @@ def state_dict(self):
return state_dict

@framework.dygraph_only
def set_state_dict(self, state_dict):
def set_state_dict(self, state_dict: dict[str, Tensor]) -> None:
'''
Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed.

Args:
state_dict(dict) : Dict contains all the Tensor needed by optimizer
state_dict(dict): Dict contains all the Tensor needed by optimizer

Return:
None

@@ -414,7 +426,7 @@ def set_state_dict(self, state_dict):
)
var.set_value(state_dict[var_tmp.name])

def get_opti_var_name_list(self):
def get_opti_var_name_list(self) -> list[str]:
return self._opti_name_list

def _create_global_learning_rate(self):
@@ -549,15 +561,15 @@ def do_create():
do_create()

@framework.dygraph_only
def set_lr(self, value):
def set_lr(self, value: float) -> None:
"""
:api_attr: imperative

Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler,
this API cannot be invoked, because it will lead to conflict.

Args:
value (float): the value of learning rate
value (float): the value of learning rate.

Returns:
None
@@ -618,7 +630,7 @@ def set_lr(self, value):
)

@framework.dygraph_only
def set_lr_scheduler(self, scheduler):
def set_lr_scheduler(self, scheduler: LRScheduler) -> None:
"""
:api_attr: imperative

@@ -663,14 +675,14 @@ def set_lr_scheduler(self, scheduler):
)
self._learning_rate = scheduler

def get_lr(self):
def get_lr(self) -> float:
"""
Get current learning rate of optimizer.
If 'LRScheduler' is not used, the return value is all the same.
If 'LRScheduler' is used, the return value is the current scheduled learning rate.

Args:
None.

Returns:
float: The current learning rate of optimizer.
float, The current learning rate of optimizer.

Examples:
.. code-block:: python
@@ -1343,31 +1358,31 @@ def _pir_create_optimization_pass(

def backward(
self,
loss,
startup_program=None,
parameters=None,
no_grad_set=None,
callbacks=None,
):
loss: Tensor,
startup_program: Program | None = None,
parameters: list[Tensor] | list[str] | None = None,
no_grad_set: set[Tensor] | set[str] | None = None,
callbacks: list[Callback] | None = None,
Contributor:

Could this be expanded here:

callbacks: list[Callback[[Block, dict[str, Tensor | paddle.core.OpDesc], None]] | None = None

def append_backward

        callbacks(list[callable object]|tuple[callable object], optional): List/Tuple of callback functions.
            The callbacks are used for doing some custom jobs during backward part building. All callable
            objects in it will be invoked once each time a new gradient operator is added into the program.
            The callable object must have two input parameters: ``block`` and ``context`` . The ``block`` is
            the :ref:`api_guide_Block_en` which the new gradient operator will be added to. The ``context``
            is a map, whose keys are gradient Tensor names and values are corresponding original
            :ref:`api_guide_tensor_en` . In addition to this, the ``context`` has another special key-value
            pair: the key is string ``__current_op_desc__`` and the value is the op_desc of the gradient
            operator who has just triggered the callable object. Default: None.

For reference:

def error_clip_callback(block, context):

Also, only a list can be used here; there is an assert isinstance(callbacks, list) check below.

@SigureMo could you take a look at whether this is needed?

Member:

callbacks: list[Callback[[Block, dict[str, Tensor | paddle.core.OpDesc], None]] | None = None

Callback is not generic; it would have to be made generic to write it like this.

Member:

Also, only a list can be used here; there is an assert isinstance(callbacks, list) check below.

This parameter needs to have a default value here, right?

Contributor Author:

callbacks: list[Callback[[Block, dict[str, Tensor | paddle.core.OpDesc], None]] | None = None

Callback is not generic; it would have to be made generic to write it like this.

Changing it to Callable should do it; I mislabeled it at first.

Member:

Should this be Callable or Callback?

Also, Block and OpDesc here are mostly old-IR concepts; has the related logic been adapted for PIR?

Contributor Author (@ooooo-create, Jun 14, 2024):

Should this be Callable or Callback?

It should be Callable; Callback is what model training and the like invoke.
[screenshot]

Contributor:

It is Callable!

When I was tracking this annotation I did say it should be changed to Callable, but then forgot. Sorry!

Also, only a list can be used here; there is an assert isinstance(callbacks, list) check below.

This parameter needs to have a default value here, right?

            if callbacks is None:
                callbacks = [paddle.nn.clip.error_clip_callback]
            else:
                assert isinstance(callbacks, list)

This is how the code handles it: the default is None, and the list is created manually.

Member:

It should be Callable; Callback is what model training and the like invoke.

So is it Callable or Callback here? If it is Callable, it looks like this hasn't been changed yet?
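For reference, a hedged sketch of the callable shape under discussion (the callback name is hypothetical; the (block, context) signature mirrors error_clip_callback quoted above):

    # Hypothetical callback: invoked once for each gradient operator that
    # backward() appends to the program.
    def log_grad_op(block, context):
        # `context` maps gradient Tensor names to the original Tensors and also
        # holds the special "__current_op_desc__" entry for the op just added.
        op_desc = context["__current_op_desc__"]
        print("appended gradient op:", op_desc)

    # Assumed usage, passed as a plain list per the assert noted above:
    # params_grads = optimizer.backward(loss, callbacks=[log_grad_op])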

) -> list[tuple[Tensor, Tensor]]:
"""
The first part of ``minimize``, do auto-diff to append backward operations for
the current program.

Args:
loss (Tensor): ``loss`` tensor to run optimizations.
startup_program (Program, optional): :ref:`api_paddle_static_Program` for
startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
initializing parameters in ``parameters``. The default value
is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
to be updated. The default value is None.
callbacks (list, optional): list of callable objects to run when appending backward
callbacks (list[Callback]|None, optional): list of callable objects to run when appending backward
Member:

If it is Callable, then there is a problem here.

We don't insist on changing the docstring, since by rights this content should be retired eventually, and changing it now would leave several odd notations coexisting, which would only confuse users more.

operator for one parameter. The default value is None.

Return:
list: list of (param, grad) tensor pairs, param is ``Parameter``,
list[tuple[Tensor, Tensor]], list of (param, grad) tensor pairs, param is ``Parameter``,
grad is the gradient value corresponding to the parameter.

Examples:
@@ -1448,13 +1463,15 @@ def backward(
)
return params_grads

def apply_gradients(self, params_grads):
def apply_gradients(
self, params_grads: list[tuple[Tensor, Tensor]]
) -> list[Operator]:
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.

Args:
params_grads (list): list of (param, grad) pair to do optimization.
params_grads (list[tuple[Tensor, Tensor]]): list of (param, grad) pair to do optimization.

Returns:
list: A list of operators appended to the current program.
@@ -1610,8 +1627,10 @@ def get_target_param(param, grad):
return new_grad

def append_regularization_ops(
self, parameters_and_grads, regularization=None
):
self,
parameters_and_grads: list[tuple[Tensor, Tensor]],
regularization: WeightDecayRegularizer | None = None,
) -> list[tuple[Tensor, Tensor]]:
r"""Create and add backward regularization Operators

Creates and adds backward regularization operators in the BlockDesc.
@@ -1620,14 +1639,14 @@ def append_regularization_ops(
same as implementing weight decay in optimizers for regularization.

Args:
parameters_and_grads: A list of (parameters, gradients) pairs
that need to be regularized.
regularization: A global regularizer. If the parameter is not
set. It will be applied with regularizer.
parameters_and_grads (list[tuple[Tensor,Tensor]]): A list of (parameters, gradients) pairs
that need to be regularized.
regularization (WeightDecayRegularizer|None, optional): A global regularizer. If the parameter is not
set. It will be applied with regularizer.

Returns:
list[(Variable, Variable)]: list of (parameters, gradients) \
pair with the regularized gradient
list[tuple[Tensor,Tensor]]: list of (parameters, gradients) \
pair with the regularized gradient

Raises:
Exception: Unknown regularization type
@@ -1684,7 +1703,7 @@ def _get_no_grad_set(self, loss, no_grad_set=None):
return no_grad_set

@framework.non_static_only
def clear_grad(self, set_to_zero=True):
def clear_grad(self, set_to_zero: bool = True) -> None:
"""
Clear the gradients of all optimized parameters for model.

@@ -1732,29 +1751,33 @@ def clear_grad(self, set_to_zero=True):

@imperative_base.no_grad()
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
self,
loss: Tensor,
startup_program: Program | None = None,
parameters: list[Tensor] | list[str] | None = None,
no_grad_set: set[Tensor] | set[str] | None = None,
) -> tuple[list[Operator], list[tuple[Tensor, Tensor]]]:
"""
Add operations to minimize ``loss`` by updating ``parameters``.

Args:
loss (Tensor): A ``Tensor`` containing the value to minimize.
startup_program (Program, optional): :ref:`api_paddle_static_Program` for
startup_program (Program|None, optional): :ref:`api_paddle_static_Program` for
initializing parameters in ``parameters``. The default value
is None, at this time :ref:`api_paddle_static_default_startup_program` will be used.
parameters (list, optional): List of ``Tensor`` or ``Tensor.name`` to update
parameters (list[Tensor]|list[str]|None, optional): List of ``Tensor`` or ``Tensor.name`` to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
no_grad_set (set[Tensor]|set[str]|None, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
to be updated. The default value is None.

Returns:
tuple: tuple (optimize_ops, params_grads), A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
tuple[list[Operator],list[tuple[Tensor, Tensor]]], A list of operators appended
by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.

Examples:
.. code-block:: python
@@ -1818,10 +1841,13 @@ def _declarative_step(self):

@imperative_base.no_grad()
@framework.non_static_only
def step(self):
def step(self) -> None:
"""
Execute the optimizer and update parameters once.

Args:
None
Member:

The rendered preview needs to be checked here.

Contributor Author:

It is indeed a bit off; when there are no parameters, we just shouldn't write an Args: section.
[screenshot]


Returns:
None
