diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py
index 89c0daf6ae593..a0198048ecfea 100644
--- a/python/paddle/optimizer/lbfgs.py
+++ b/python/paddle/optimizer/lbfgs.py
@@ -12,17 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 from collections import defaultdict
 from functools import reduce
+from typing import TYPE_CHECKING, NoReturn, Sequence, TypedDict
+
+from typing_extensions import NotRequired
 
 import paddle
 
 from ..base import framework
 from .optimizer import Optimizer
 
+if TYPE_CHECKING:
+    from paddle import Tensor
+    from paddle.nn.clip import GradientClipBase
+    from paddle.regularizer import WeightDecayRegularizer
+
+    from .optimizer import _ParameterConfig
+
 __all__ = []
 
 
+class _LbfgsState(TypedDict):
+    func_evals: int
+    n_iter: int
+    d: Tensor
+    alpha: Tensor
+    old_yk: list[Tensor]
+    old_sk: list[Tensor]
+    ro: list[Tensor]
+    H_diag: Tensor
+    prev_flat_grad: Tensor
+    prev_loss: float
+    al: NotRequired[list[Tensor]]
+
+
+class _LbfgsStateDict(TypedDict):
+    state: _LbfgsState
+
+
 def dot(x, y):
     r"""
     NOTE: This is a temporary workaround for unstable result computed by `paddle.dot`,
@@ -333,28 +363,28 @@ class LBFGS(Optimizer):
         learning_rate (float, optional): learning rate. The default value is 1.0.
         max_iter (int, optional): maximal number of iterations per optimization
             step. The default value is 20.
-        max_eval (int, optional): maximal number of function evaluations per optimization
+        max_eval (int|None, optional): maximal number of function evaluations per optimization
             step. The default value is max_iter * 1.25.
         tolerance_grad (float, optional): termination tolerance on first order optimality.
            The default value is 1e-7.
         tolerance_change (float, optional): termination tolerance on function
             value/parameter changes. The default value is 1e-9.
         history_size (int, optional): update history size. The default value is 100.
-        line_search_fn (string, optional): either 'strong_wolfe' or None. The default value is strong_wolfe.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
+        line_search_fn (string|None, optional): either 'strong_wolfe' or None. The default value is None.
+        parameters (list|tuple|None, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. The default value is None.
-        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+        weight_decay (float|WeightDecayRegularizer|None, optional): The strategy of regularization. \
             It can be a float value as coeff of L2 regularization or \
             :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
             If a parameter has set regularizer using :ref:`api_paddle_ParamAttr` already, \
             the regularization setting here in optimizer will be ignored for this parameter. \
             Otherwise, the regularization setting here in optimizer will take effect. \
             Default None, meaning there is no regularization.
-        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of \
+        grad_clip (GradientClipBase|None, optional): Gradient clipping strategy, it's an instance of \
             some derived class of ``GradientClipBase`` . There are three clipping strategies \
             ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
             :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
-        name (str, optional): Normally there is no need for user to set this property.
+        name (str|None, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
 
@@ -369,10 +399,10 @@ class LBFGS(Optimizer):
             >>> paddle.disable_static()
             >>> np.random.seed(0)
-            >>> np_w = np.random.rand(1).astype(np.float32)
-            >>> np_x = np.random.rand(1).astype(np.float32)
+            >>> np_w = np.random.rand(1).astype(np.float32) # type: ignore
+            >>> np_x = np.random.rand(1).astype(np.float32) # type: ignore
 
-            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)]
+            >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] # type: ignore
             >>> # y = 2x
             >>> targets = [2 * x for x in inputs]
@@ -397,26 +427,26 @@ class LBFGS(Optimizer):
             ...         return loss
             ...     opt.step(closure)
             ...
-            >>> for input, target in zip(inputs, targets):
-            ...     input = paddle.to_tensor(input)
-            ...     target = paddle.to_tensor(target)
+            >>> for input_np, target_np in zip(inputs, targets):
+            ...     input = paddle.to_tensor(input_np)
+            ...     target = paddle.to_tensor(target_np)
             ...     train_step(input, target)
     """
 
     def __init__(
         self,
-        learning_rate=1.0,
-        max_iter=20,
-        max_eval=None,
-        tolerance_grad=1e-7,
-        tolerance_change=1e-9,
-        history_size=100,
-        line_search_fn=None,
-        parameters=None,
-        weight_decay=None,
-        grad_clip=None,
-        name=None,
-    ):
+        learning_rate: float = 1.0,
+        max_iter: int = 20,
+        max_eval: int | None = None,
+        tolerance_grad: float = 1e-7,
+        tolerance_change: float = 1e-9,
+        history_size: int = 100,
+        line_search_fn: str | None = None,
+        parameters: Sequence[Tensor] | Sequence[_ParameterConfig] | None = None,
+        weight_decay: float | WeightDecayRegularizer | None = None,
+        grad_clip: GradientClipBase | None = None,
+        name: str | None = None,
+    ) -> None:
         if max_eval is None:
             max_eval = max_iter * 5 // 4
@@ -452,7 +482,7 @@ def __init__(
 
         self._numel_cache = None
 
-    def state_dict(self):
+    def state_dict(self) -> _LbfgsStateDict:
         r"""Returns the state of the optimizer as a :class:`dict`.
 
         Return:
@@ -496,7 +526,6 @@ def state_dict(self):
             ...     loss = train_step(inputs, targets)
             ...     n_iter = opt.state_dict()["state"]["func_evals"]
             ...     print("n_iter:", n_iter)
-            ...
 
         """
         packed_state = {}
@@ -505,7 +534,7 @@ def state_dict(self):
 
         return {'state': packed_state}
 
-    def _numel(self):
+    def _numel(self) -> int:
         # compute the number of all parameters
         if self._numel_cache is None:
             self._numel_cache = reduce(
@@ -553,7 +582,7 @@ def _directional_evaluate(self, closure, x, alpha, d):
         return loss, flat_grad
 
     @framework.non_static_only
-    def step(self, closure):
+    def step(self, closure) -> Tensor:
         """Performs a single optimization step.
 
         Args:
@@ -778,7 +807,7 @@ def obj_func(x, alpha, d):
     def minimize(
         self, loss, startup_program=None, parameters=None, no_grad_set=None
-    ):
+    ) -> NoReturn:
         """Empty method. LBFGS optimizer does not use this way to minimize ``loss``.
         Please refer 'Examples' of LBFGS() above for usage."""
         raise NotImplementedError(
             "LBFGS optimizer does not use this way to minimize loss. Please refer 'Examples' of LBFGS() for usage."
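
Not part of the patch, but a minimal sketch of what the new annotations buy a caller. Only `LBFGS`, `state_dict()`, and the `_LbfgsState` keys come from this diff; the toy linear-regression closure is illustrative:

```python
import numpy as np
import paddle
from paddle.optimizer import LBFGS

paddle.disable_static()

# Illustrative toy problem: fit y = 2x with one linear layer.
net = paddle.nn.Linear(1, 1)
opt = LBFGS(
    learning_rate=1.0,
    line_search_fn='strong_wolfe',
    parameters=net.parameters(),
)

def closure():
    opt.clear_grad()
    x = paddle.to_tensor(np.random.rand(8, 1).astype(np.float32))
    loss = paddle.mean((net(x) - 2 * x) ** 2)
    loss.backward()
    return loss

opt.step(closure)

# With _LbfgsStateDict in place, a checker validates these key accesses
# and types the values instead of treating the dict as Any.
sd = opt.state_dict()
n_evals: int = sd['state']['func_evals']  # declared int in _LbfgsState
direction = sd['state']['d']              # declared Tensor
al = sd['state'].get('al')                # NotRequired: may be absent
```

Marking `al` as `NotRequired` matches the behavior of `step`: the key only appears once the optimizer has populated it, so a checker will push callers toward a guarded `.get()` rather than a bare subscript.
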
diff --git a/python/paddle/tensor/tensor.prototype.pyi b/python/paddle/tensor/tensor.prototype.pyi
index 54eb541ad33ef..77122814afc23 100644
--- a/python/paddle/tensor/tensor.prototype.pyi
+++ b/python/paddle/tensor/tensor.prototype.pyi
@@ -24,7 +24,7 @@
 from paddle._typing import *  # noqa: F403
 
 # isort: on
 
-from typing import Any, Literal, overload
+from typing import Any, Iterator, Literal, overload
 
 import numpy.typing as npt
@@ -263,5 +263,8 @@ class Tensor:
     @property
     def type(self) -> Any: ...
 
+    # virtual methods
+    def __iter__(self) -> Iterator[Tensor]: ...  # For iterating over the tensor
+
     # annotation: ${tensor_alias}
     __qualname__: Literal["Tensor"]
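
The `# virtual methods` grouping suggests the concrete `__iter__` is provided at runtime rather than defined on the class itself, so this stub is the only place a static checker can see it. A short sketch of the effect, assuming dygraph mode (where iterating a tensor yields slices along the first axis):

```python
import paddle

x = paddle.arange(6, dtype='float32').reshape([2, 3])

# With the stub, a checker infers `row` as paddle.Tensor; at runtime,
# iteration walks the first axis of the tensor.
for row in x:
    print(row.shape)  # [3]
```
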