diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py
index c3c60c7088995..89b3369b59817 100644
--- a/python/paddle/nn/initializer/assign.py
+++ b/python/paddle/nn/initializer/assign.py
@@ -11,6 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Sequence
+
 import paddle
 from paddle import _C_ops
 
@@ -23,6 +28,9 @@
 )
 from .initializer import Initializer
 
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
 __all__ = []
 
 
@@ -38,19 +46,21 @@ class NumpyArrayInitializer(Initializer):
 
     """
 
-    def __init__(self, value):
+    def __init__(self, value: npt.NDArray[Any]) -> None:
         import numpy
 
         assert isinstance(value, numpy.ndarray)
         super().__init__()
         self._value = value
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: paddle.pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with Numpy array.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
@@ -172,7 +182,7 @@ class Assign(NumpyArrayInitializer):
     Args:
         value (Tensor|numpy.ndarray|list|tuple): numpy array, list, tuple, or
             tensor to initialize the parameter.
-        name(str, optional): Normally there is no need for user to set this
+        name(str|None, optional): Normally there is no need for user to set this
            property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
@@ -239,7 +249,11 @@ class Assign(NumpyArrayInitializer):
             [6.]
     """
 
-    def __init__(self, value, name=None):
+    def __init__(
+        self,
+        value: npt.NDArray[Any] | Sequence[int] | paddle.Tensor,
+        name: str | None = None,
+    ) -> None:
         import numpy
 
         check_type(
diff --git a/python/paddle/nn/initializer/bilinear.py b/python/paddle/nn/initializer/bilinear.py
index 05ac3641caff2..9659ea42b2a15 100644
--- a/python/paddle/nn/initializer/bilinear.py
+++ b/python/paddle/nn/initializer/bilinear.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import numpy as np
 
+import paddle
 from paddle import _C_ops, pir
 
 from ...base import core, framework, unique_name
@@ -74,16 +77,18 @@ class Bilinear(Initializer):
 
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         """Constructor for BilinearInitializer."""
         super().__init__()
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with Bilinear initialization.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py
index e02d167770749..7c85d29356945 100644
--- a/python/paddle/nn/initializer/constant.py
+++ b/python/paddle/nn/initializer/constant.py
@@ -47,7 +47,7 @@ def forward(
         self,
         var: paddle.Tensor,
         block: paddle.pir.Block | None = None,
-    ):
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with constant.
 
         Args:
diff --git a/python/paddle/nn/initializer/initializer.py b/python/paddle/nn/initializer/initializer.py
index 6eb339cff33a1..e09619a69b440 100644
--- a/python/paddle/nn/initializer/initializer.py
+++ b/python/paddle/nn/initializer/initializer.py
@@ -16,8 +16,10 @@
 
 import functools
 import math
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
+from typing_extensions import TypeAlias
 
 import paddle
 
@@ -28,6 +30,22 @@
 )
 from .lazy_init import lazy_init_helper
 
+if TYPE_CHECKING:
+    _NonLinearity: TypeAlias = Literal[  # noqa: PYI047
+        "sigmoid",
+        "linear",
+        "conv1d",
+        "conv2d",
+        "conv3d",
+        "conv1d_transpose",
+        "conv2d_transpose",
+        "conv3d_transpose",
+        "tanh",
+        "relu",
+        "leaky_relu",
+        "selu",
+    ]
+
 __all__ = []
 
 
@@ -40,7 +58,7 @@ class Initializer:
     directly, but need to use one of its implementations.
     """
 
-    def __init__(self):
+    def __init__(self) -> None:
         pass
 
     def __call__(
@@ -53,7 +71,7 @@ def __call__(
 
     def forward(
         self, param: paddle.Tensor, block: paddle.pir.Block | None = None
-    ):
+    ) -> paddle.Tensor | None:
         """Add corresponding initialization operations to the network."""
         raise NotImplementedError()
diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py
index efb1fc486d059..dbb943d209f1a 100644
--- a/python/paddle/nn/initializer/kaiming.py
+++ b/python/paddle/nn/initializer/kaiming.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 # TODO: define the initializers of Kaiming functions in neural network
 import math
+from typing import TYPE_CHECKING
 
 import paddle
 from paddle import _C_ops
@@ -26,6 +29,9 @@
 )
 from .initializer import Initializer, calculate_gain
 
+if TYPE_CHECKING:
+    from .initializer import _NonLinearity
+
 __all__ = []
 
 
@@ -64,12 +70,12 @@ class MSRAInitializer(Initializer):
 
     def __init__(
         self,
-        uniform=True,
-        fan_in=None,
-        seed=0,
-        negative_slope=0,
-        nonlinearity='relu',
-    ):
+        uniform: bool = True,
+        fan_in: float | None = None,
+        seed: int = 0,
+        negative_slope: float = 0,
+        nonlinearity: _NonLinearity = 'relu',
+    ) -> None:
         """Constructor for MSRAInitializer"""
         assert uniform is not None
         assert seed is not None
@@ -80,12 +86,14 @@ def __init__(
         self._negative_slope = negative_slope
         self._nonlinearity = nonlinearity
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: paddle.pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with MSRA initialization.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
@@ -271,7 +279,12 @@ class KaimingNormal(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+    def __init__(
+        self,
+        fan_in: float | None = None,
+        negative_slope: float = 0.0,
+        nonlinearity: str = 'relu',
+    ) -> None:
         super().__init__(
             uniform=False,
             fan_in=fan_in,
@@ -317,7 +330,12 @@ class KaimingUniform(MSRAInitializer):
 
     """
 
-    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+    def __init__(
+        self,
+        fan_in: float | None = None,
+        negative_slope: float = 0.0,
+        nonlinearity: str = 'relu',
+    ) -> None:
         super().__init__(
             uniform=True,
             fan_in=fan_in,
diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py
index 04139af51cf5b..21450b6505fa3 100644
--- a/python/paddle/nn/initializer/normal.py
+++ b/python/paddle/nn/initializer/normal.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+import paddle
 from paddle import _C_ops, pir
 
 from ...base import core, framework, unique_name
@@ -36,7 +39,9 @@ class NormalInitializer(Initializer):
 
     """
 
-    def __init__(self, loc=0.0, scale=1.0, seed=0):
+    def __init__(
+        self, loc: float = 0.0, scale: float = 1.0, seed: int = 0
+    ) -> None:
         assert loc is not None
         assert scale is not None
         assert seed is not None
@@ -45,12 +50,14 @@ def __init__(self, loc=0.0, scale=1.0, seed=0):
         self._std_dev = scale
         self._seed = seed
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with Normal distribution.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
@@ -119,7 +126,7 @@ class Normal(NormalInitializer):
     Args:
         mean (float, optional): mean of the normal distribution. Default is 0.0.
         std (float, optional): standard deviation of the normal distribution. Default is 1.0.
-        name(str, optional): The default value is None. Normally there is no need for user to set this
+        name(str|None, optional): The default value is None. Normally there is no need for user to set this
            property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
@@ -156,7 +163,9 @@ class Normal(NormalInitializer):
             [[ 1.0754838 -4.071067 ]]])
     """
 
-    def __init__(self, mean=0.0, std=1.0, name=None):
+    def __init__(
+        self, mean: float = 0.0, std: float = 1.0, name: str | None = None
+    ) -> None:
         assert mean is not None, 'mean should not be None'
         assert std is not None, 'std should not be None'
         super().__init__(loc=mean, scale=std, seed=0)
@@ -178,7 +187,14 @@ class TruncatedNormalInitializer(Initializer):
 
     """
 
-    def __init__(self, loc=0.0, scale=1.0, seed=0, a=-2.0, b=2.0):
+    def __init__(
+        self,
+        loc: float = 0.0,
+        scale: float = 1.0,
+        seed: int = 0,
+        a: float = -2.0,
+        b: float = 2.0,
+    ) -> None:
         assert loc is not None
         assert scale is not None
         assert seed is not None
@@ -191,12 +207,14 @@ def __init__(self, loc=0.0, scale=1.0, seed=0, a=-2.0, b=2.0):
         self._a = a
         self._b = b
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with TruncatedNormal distribution.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
@@ -289,7 +307,7 @@ class TruncatedNormal(TruncatedNormalInitializer):
         std (float, optional): Standard deviation of the normal distribution. Default is :math:`1.0`.
         a (float, optional): The minimum cutoff value. Default is -2.0.
         b (float, optional): The maximum cutoff value. Default is 2.0.
-        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+        name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
         A parameter initialized by truncated normal distribution (Gaussian distribution).
@@ -325,7 +343,14 @@ class TruncatedNormal(TruncatedNormalInitializer):
             [[-0.11380529 -3.0696259 ]]])
     """
 
-    def __init__(self, mean=0.0, std=1.0, a=-2.0, b=2.0, name=None):
+    def __init__(
+        self,
+        mean: float = 0.0,
+        std: float = 1.0,
+        a: float = -2.0,
+        b: float = 2.0,
+        name: str | None = None,
+    ) -> None:
         assert mean is not None, 'mean should not be None'
         assert std is not None, 'std should not be None'
         assert a is not None, 'a should not be None'
diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py
index f30ef1b38402d..0ce639e54df5c 100644
--- a/python/paddle/nn/initializer/uniform.py
+++ b/python/paddle/nn/initializer/uniform.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
+import paddle
 from paddle import _C_ops, pir
 
 from ...base import core, framework, unique_name
@@ -43,8 +46,14 @@ class UniformInitializer(Initializer):
     """
 
     def __init__(
-        self, low=-1.0, high=1.0, seed=0, diag_num=0, diag_step=0, diag_val=1.0
-    ):
+        self,
+        low: float = -1.0,
+        high: float = 1.0,
+        seed: int = 0,
+        diag_num: int = 0,
+        diag_step: int = 0,
+        diag_val: float = 1.0,
+    ) -> None:
         assert low is not None
         assert high is not None
         assert high >= low
@@ -62,12 +71,14 @@ def __init__(
         self._diag_step = diag_step
         self._diag_val = diag_val
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with Uniform distribution.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
@@ -176,7 +187,7 @@ class Uniform(UniformInitializer):
     Args:
         low (float, optional): Lower boundary of the uniform distribution. Default is :math:`-1.0`.
         high (float, optional): Upper boundary of the uniform distribution. Default is :math:`1.0`.
-        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+        name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
         A parameter initialized by uniform distribution.
@@ -213,7 +224,9 @@ class Uniform(UniformInitializer):
             [[-0.41843393,  0.27575102]]])
     """
 
-    def __init__(self, low=-1.0, high=1.0, name=None):
+    def __init__(
+        self, low: float = -1.0, high: float = 1.0, name: str | None = None
+    ) -> None:
         assert low is not None, 'low should not be None'
         assert high is not None, 'high should not be None'
         assert high >= low, 'high should greater or equal than low'
diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py
index 0a4c414aa274c..0b7675b38bee2 100644
--- a/python/paddle/nn/initializer/xavier.py
+++ b/python/paddle/nn/initializer/xavier.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import math
 
+import paddle
 from paddle import _C_ops
 
 from ...base import core, framework, unique_name
@@ -53,12 +56,12 @@ class XavierInitializer(Initializer):
 
     Args:
         uniform (bool, optional): whether to use uniform ,if False use normal distribution. Default is True.
-        fan_in (float, optional): fan_in for Xavier initialization. If None, it is
+        fan_in (float|None, optional): fan_in for Xavier initialization. If None, it is
            inferred from the variable. Default is None.
-        fan_out (float, optional): fan_out for Xavier initialization. If None, it is
+        fan_out (float|None, optional): fan_out for Xavier initialization. If None, it is
            inferred from the variable. Default is None.
-        gain (float, optional): Scaling Tensor. Default is 1.0.
         seed (int, optional): Random seed. Default is 0.
+        gain (float, optional): Scaling Tensor. Default is 1.0.
 
     Note:
         It is recommended to set fan_in and fan_out to None for most cases.
@@ -66,8 +69,13 @@ class XavierInitializer(Initializer):
     """
 
     def __init__(
-        self, uniform=True, fan_in=None, fan_out=None, seed=0, gain=1.0
-    ):
+        self,
+        uniform: bool = True,
+        fan_in: float | None = None,
+        fan_out: float | None = None,
+        seed: int = 0,
+        gain: float = 1.0,
+    ) -> None:
         assert uniform is not None
         assert seed is not None
         super().__init__()
@@ -77,18 +85,19 @@ def __init__(
         self._seed = seed
         self._gain = gain
 
-    def forward(self, var, block=None):
+    def forward(
+        self, var: paddle.Tensor, block: paddle.pir.Block | None = None
+    ) -> paddle.Tensor | None:
         """Initialize the input tensor with Xavier initialization.
 
         Args:
             var(Tensor): Tensor that needs to be initialized.
-            block(Block, optional): The block in which initialization ops
+            block(Block|None, optional): The block in which initialization ops
                 should be added. Used in static graph only, default None.
 
         Returns:
             The initialization op
         """
-        import paddle
 
         block = self._check_block(block)
         assert isinstance(block, (framework.Block, paddle.pir.Block))
@@ -262,12 +271,12 @@ class XavierNormal(XavierInitializer):
 
 
     Args:
-        fan_in (float, optional): fan_in for Xavier initialization, which is
+        fan_in (float|None, optional): fan_in for Xavier initialization, which is
            inferred from the Tensor. Default is None.
-        fan_out (float, optional): fan_out for Xavier initialization, which is
+        fan_out (float|None, optional): fan_out for Xavier initialization, which is
            inferred from the Tensor. Default is None.
         gain (float, optional): Scaling Tensor. Default is 1.0.
-        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+        name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
         A parameter initialized by Xavier weight, using a normal distribution.
@@ -304,7 +313,13 @@ class XavierNormal(XavierInitializer):
             [[1.13615966, 0.89018601]]])
     """
 
-    def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
+    def __init__(
+        self,
+        fan_in: float | None = None,
+        fan_out: float | None = None,
+        gain: float = 1.0,
+        name: str | None = None,
+    ) -> None:
         super().__init__(
             uniform=False, fan_in=fan_in, fan_out=fan_out, seed=0, gain=gain
         )
@@ -326,12 +341,12 @@ class XavierUniform(XavierInitializer):
 
         x = gain \times \sqrt{\frac{6.0}{fan\_in + fan\_out}}.
 
     Args:
-        fan_in (float, optional): fan_in for Xavier initialization, which is
+        fan_in (float|None, optional): fan_in for Xavier initialization, which is
            inferred from the Tensor. Default is None.
-        fan_out (float, optional): fan_out for Xavier initialization, which is
+        fan_out (float|None, optional): fan_out for Xavier initialization, which is
            inferred from the Tensor. Default is None.
         gain (float, optional): Scaling Tensor. Default is 1.0.
-        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+        name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
     Returns:
         A parameter initialized by Xavier weight, using a uniform distribution.
@@ -367,7 +382,13 @@ class XavierUniform(XavierInitializer):
             [[-1.02494967,  0.67544925]]])
     """
 
-    def __init__(self, fan_in=None, fan_out=None, gain=1.0, name=None):
+    def __init__(
+        self,
+        fan_in: float | None = None,
+        fan_out: float | None = None,
+        gain: float = 1.0,
+        name: str | None = None,
+    ) -> None:
         super().__init__(
             uniform=True, fan_in=fan_in, fan_out=fan_out, seed=0, gain=gain
        )
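
A note on the typing pattern used throughout this patch: every touched module now begins with `from __future__ import annotations` (PEP 563), so annotations are stored as strings and never evaluated at runtime. That is what makes the `if TYPE_CHECKING:` imports (`numpy.typing as npt`, `_NonLinearity`) and the `X | Y` union syntax safe on Python versions older than 3.10. A minimal, self-contained sketch of the mechanism (the `double` function below is hypothetical, not Paddle code):

```python
from __future__ import annotations  # PEP 563: annotations are kept as strings

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    # Seen only by static checkers (mypy/pyright); never imported at runtime.
    import numpy.typing as npt


def double(value: npt.NDArray[Any] | None = None) -> npt.NDArray[Any] | None:
    # `npt` does not exist at runtime, yet defining and calling this function
    # works: the annotations above stay unevaluated strings unless something
    # calls typing.get_type_hints() on them.
    return None if value is None else value * 2


print(double.__annotations__["value"])  # "npt.NDArray[Any] | None" (a str)
```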
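The `_NonLinearity` alias added to `initializer.py` is a `Literal` type: it narrows `MSRAInitializer.nonlinearity` from an arbitrary `str` to a fixed vocabulary at type-check time while leaving runtime behavior unchanged (note that the public `KaimingNormal`/`KaimingUniform` constructors still annotate the parameter as plain `str`). A sketch of the idea with a shortened, made-up vocabulary (`_Activation` and `recommended_gain` are illustrations, not Paddle APIs):

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Literal

from typing_extensions import TypeAlias

if TYPE_CHECKING:
    # Shortened stand-in for the patch's _NonLinearity alias.
    _Activation: TypeAlias = Literal["relu", "tanh", "leaky_relu"]


def recommended_gain(nonlinearity: _Activation = "relu") -> float:
    # Runtime behavior is a plain dict lookup; only checkers enforce the Literal.
    return {"relu": 2.0**0.5, "tanh": 5.0 / 3.0, "leaky_relu": 1.0}[nonlinearity]


recommended_gain("tanh")      # accepted
# recommended_gain("relu6")   # rejected by mypy/pyright: not in the Literal
```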
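For context, none of these signatures change how the initializers are used; a typical construction against the public `paddle.nn` API (shown here as an illustrative usage sketch) still looks like:

```python
import paddle
from paddle.nn.initializer import KaimingNormal, XavierUniform

# ParamAttr carries the initializer to the layer's parameters; the annotated
# __init__ signatures now document the accepted argument types.
linear = paddle.nn.Linear(
    4,
    8,
    weight_attr=paddle.ParamAttr(initializer=KaimingNormal()),
    bias_attr=paddle.ParamAttr(initializer=XavierUniform()),
)
```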