diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py
index 89c0daf6ae593..a0198048ecfea 100644
--- a/python/paddle/optimizer/lbfgs.py
+++ b/python/paddle/optimizer/lbfgs.py
@@ -12,17 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from collections import defaultdict
 from functools import reduce
+from typing import TYPE_CHECKING, NoReturn, Sequence, TypedDict
+
+from typing_extensions import NotRequired
 
 import paddle
 
 from ..base import framework
 from .optimizer import Optimizer
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+    from paddle.nn.clip import GradientClipBase
+    from paddle.regularizer import WeightDecayRegularizer
+
+    from .optimizer import _ParameterConfig
+
 __all__ = []
 
 
+class _LbfgsState(TypedDict):
+    func_evals: int
+    n_iter: int
+    d: Tensor
+    alpha: Tensor
+    old_yk: list[Tensor]
+    old_sk: list[Tensor]
+    ro: list[Tensor]
+    H_diag: Tensor
+    prev_flat_grad: Tensor
+    prev_loss: float
+    al: NotRequired[list[Tensor]]
+
+
+class _LbfgsStateDict(TypedDict):
+    state: _LbfgsState
+
+
 def dot(x, y):
     r"""
     NOTE: This is a temporary workaround for unstable result computed by `paddle.dot`,
@@ -333,28 +363,28 @@ class LBFGS(Optimizer):
         learning_rate (float, optional): learning rate. The default value is 1.0.
         max_iter (int, optional): maximal number of iterations per optimization
             step. The default value is 20.
-        max_eval (int, optional): maximal number of function evaluations per optimization
+        max_eval (int|None, optional): maximal number of function evaluations per optimization
             step. The default value is max_iter * 1.25.
         tolerance_grad (float, optional): termination tolerance on first order optimality.
            The default value is 1e-7.
         tolerance_change (float, optional): termination tolerance on function
             value/parameter changes. The default value is 1e-9.
         history_size (int, optional): update history size. The default value is 100.
-        line_search_fn (string, optional): either 'strong_wolfe' or None. The default value is strong_wolfe.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
+        line_search_fn (string|None, optional): either 'strong_wolfe' or None. The default value is None.
+        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. The default value is None.
-        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
             It can be a float value as coeff of L2 regularization or \
             :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
             If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of \
+        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
             some derived class of ``GradientClipBase`` . There are three clipping strategies \
             ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
             :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
+        name (str|None, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
 
@@ -369,10 +399,10 @@ class LBFGS(Optimizer):
             >>> paddle.disable_static()
             >>> np.random.seed(0)
-            >>> np_w = np.random.rand(1).astype(np.float32)
-            >>> np_x = np.random.rand(1).astype(np.float32)
+            >>> np_w = np.random.rand(1).astype(np.float32) # type: ignore
+            >>> np_x = np.random.rand(1).astype(np.float32) # type: ignore
 
-            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
+            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] # type: ignore
             >>> # y = 2x
             >>> targets = [2 * x for x in inputs]
@@ -397,26 +427,26 @@ class LBFGS(Optimizer):
             ...         return loss
             ...     opt.step(closure)
             ...
-            >>> for input, target in zip(inputs, targets):
-            ...     input = paddle.to_tensor(input)
-            ...     target = paddle.to_tensor(target)
+            >>> for input_np, target_np in zip(inputs, targets):
+            ...     input = paddle.to_tensor(input_np)
+            ...     target = paddle.to_tensor(target_np)
             ...     train_step(input, target)
     """
 
     def __init__(
         self,
-        learning_rate=1.0,
-        max_iter=20,
-        max_eval=None,
-        tolerance_grad=1e-7,
-        tolerance_change=1e-9,
-        history_size=100,
-        line_search_fn=None,
-        parameters=None,
-        weight_decay=None,
-        grad_clip=None,
-        name=None,
-    ):
+        learning_rate: float = 1.0,
+        max_iter: int = 20,
+        max_eval: int | None = None,
+        tolerance_grad: float = 1e-7,
+        tolerance_change: float = 1e-9,
+        history_size: int = 100,
+        line_search_fn: str | None = None,
+        parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
+        weight_decay: float | WeightDecayRegularizer | None = None,
+        grad_clip: GradientClipBase | None = None,
+        name: str | None = None,
+    ) -> None:
         if max_eval is None:
             max_eval = max_iter * 5 // 4
@@ -452,7 +482,7 @@ def __init__(
 
         self._numel_cache = None
 
-    def state_dict(self):
+    def state_dict(self) -> _LbfgsStateDict:
         r"""Returns the state of the optimizer as a :class:`dict`.
 
         Return:
@@ -496,7 +526,6 @@ def state_dict(self):
             ...     loss = train_step(inputs, targets)
             ...     n_iter = opt.state_dict()["state"]["func_evals"]
             ...     print("n_iter:", n_iter)
-            ...
 
         """
         packed_state = {}
@@ -505,7 +534,7 @@ def state_dict(self):
 
         return {'state': packed_state}
 
-    def _numel(self):
+    def _numel(self) -> int:
         # compute the number of all parameters
         if self._numel_cache is None:
             self._numel_cache = reduce(
@@ -553,7 +582,7 @@ def _directional_evaluate(self, closure, x, alpha, d):
         return loss, flat_grad
 
     @framework.non_static_only
-    def step(self, closure):
+    def step(self, closure) -> Tensor:
         """Performs a single optimization step.
 
         Args:
@@ -778,7 +807,7 @@ def obj_func(x, alpha, d):
     def minimize(
         self, loss, startup_program=None, parameters=None, no_grad_set=None
-    ):
+    ) -> NoReturn:
         """Empty method. LBFGS optimizer does not use this way to minimize ``loss``.
         Please refer 'Examples' of LBFGS() above for usage."""
         raise NotImplementedError(
             "LBFGS optimizer does not use this way to minimize loss. Please refer 'Examples' of LBFGS() for usage."
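
Not part of the patch, but a minimal sketch of what the new annotations buy a caller. Only `LBFGS`, `state_dict()`, and the `_LbfgsState` keys come from this diff; the toy linear-regression closure is illustrative:

```python
import numpy as np
import paddle
from paddle.optimizer import LBFGS

paddle.disable_static()

# Illustrative toy problem: fit y = 2x with one linear layer.
net = paddle.nn.Linear(1, 1)
opt = LBFGS(
    learning_rate=1.0,
    line_search_fn='strong_wolfe',
    parameters=net.parameters(),
)

def closure():
    opt.clear_grad()
    x = paddle.to_tensor(np.random.rand(8, 1).astype(np.float32))
    loss = paddle.mean((net(x) - 2 * x) ** 2)
    loss.backward()
    return loss

opt.step(closure)

# With _LbfgsStateDict in place, a checker validates these key accesses
# and types the values instead of treating the dict as Any.
sd = opt.state_dict()
n_evals: int = sd['state']['func_evals']  # declared int in _LbfgsState
direction = sd['state']['d']              # declared Tensor
al = sd['state'].get('al')                # NotRequired: may be absent
```

Marking `al` as `NotRequired` matches the behavior of `step`: the key only appears once the optimizer has populated it, so a checker will push callers toward a guarded `.get()` rather than a bare subscript.
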
diff --git a/python/paddle/tensor/tensor.prototype.pyi b/python/paddle/tensor/tensor.prototype.pyi
index 54eb541ad33ef..77122814afc23 100644
--- a/python/paddle/tensor/tensor.prototype.pyi
+++ b/python/paddle/tensor/tensor.prototype.pyi
@@ -24,7 +24,7 @@
 from paddle._typing import *  # noqa: F403
 
 # isort: on
 
-from typing import Any, Literal, overload
+from typing import Any, Iterator, Literal, overload
 
 import numpy.typing as npt
@@ -263,5 +263,8 @@ class Tensor:
     @property
     def type(self) -> Any: ...
 
+    # virtual methods
+    def __iter__(self) -> Iterator[Tensor]: ...  # For iterating over the tensor
+
     # annotation: ${tensor_alias}
     __qualname__: Literal["Tensor"]
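
The `# virtual methods` grouping suggests the concrete `__iter__` is provided at runtime rather than defined on the class itself, so this stub is the only place a static checker can see it. A short sketch of the effect, assuming dygraph mode (where iterating a tensor yields slices along the first axis):

```python
import paddle

x = paddle.arange(6, dtype='float32').reshape([2, 3])

# With the stub, a checker infers `row` as paddle.Tensor; at runtime,
# iteration walks the first axis of the tensor.
for row in x:
    print(row.shape)  # [3]
```
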