From e548e67b7b85a34c2292cf0cbce6b00754cc70e7 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 13:57:09 +0800 Subject: [PATCH 1/3] fix&refine extformer code and docs --- docs/zh/examples/extformer_moe.md | 9 +- .../extformer_moe/extformer_moe_enso_train.py | 2 +- ppsci/arch/extformer_moe_cuboid.py | 56 +++++----- ppsci/arch/extformer_moe_cuboid_decoder.py | 80 ++++++-------- ppsci/arch/extformer_moe_cuboid_encoder.py | 102 +++++++++--------- ppsci/arch/extformer_moe_cuboid_utils.py | 32 +++--- ppsci/arch/extformer_moe_utils.py | 37 +++---- 7 files changed, 152 insertions(+), 166 deletions(-) diff --git a/docs/zh/examples/extformer_moe.md b/docs/zh/examples/extformer_moe.md index 9c955399ff..6c4286b8a8 100644 --- a/docs/zh/examples/extformer_moe.md +++ b/docs/zh/examples/extformer_moe.md @@ -1,14 +1,18 @@ # Extformer-MoE -开始训练、评估前,请先下载,并对应修改 yaml 配置文件中的 FILE_PATH +!!! note -[ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942) + 开始训练、评估前,请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942),并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。 + 若训练时显存不足,可指定 `MODEL.checkpoint_level` 为 0、1 或 2,此时使用 recompute 模式运行,以训练时间换取显存。 === "模型训练命令" ``` sh # ICAR-ENSO 数据预训练模型: Extformer-MoE python extformer_moe_enso_train.py + # python extformer_moe_enso_train.py MODEL.checkpoint_level=0 # using recompute to run in device with small GPU memory + # python extformer_moe_enso_train.py MODEL.checkpoint_level=1 # using recompute to run in device with small GPU memory + # python extformer_moe_enso_train.py MODEL.checkpoint_level=2 # using recompute to run in device with small GPU memory ``` === "模型评估命令" @@ -46,7 +50,6 @@ Earthformer,一种用于地球系统预测的时空转换器。为了更好地 Rank-N-Contrast(RNC)是一种表征学习方法,旨在学习一种回归感知的样本表征,该表征以连续标签空间中的距离为依据,对嵌入空间中的样本间距离进行排序,然后利用它来预测最终连续的标签。在地球系统极端预测问题中,RNC 可以对气象数据的表征进行规范,使其满足嵌入空间的连续性,和标签空间对齐,最终缓解极端事件的预测结果的过平滑问题。 - ## 2. 
模型原理 ### 2.1 Earthformer diff --git a/examples/extformer_moe/extformer_moe_enso_train.py b/examples/extformer_moe/extformer_moe_enso_train.py index 9c1bc461ed..e0e570fb95 100644 --- a/examples/extformer_moe/extformer_moe_enso_train.py +++ b/examples/extformer_moe/extformer_moe_enso_train.py @@ -1,10 +1,10 @@ +import enso_metric import hydra import paddle from omegaconf import DictConfig from omegaconf import OmegaConf from paddle import nn -import examples.extformer_moe.enso_metric as enso_metric import ppsci diff --git a/ppsci/arch/extformer_moe_cuboid.py b/ppsci/arch/extformer_moe_cuboid.py index 53f8b6a6d2..bdd6311e2b 100644 --- a/ppsci/arch/extformer_moe_cuboid.py +++ b/ppsci/arch/extformer_moe_cuboid.py @@ -17,7 +17,7 @@ """A space-time Transformer with Cuboid Attention""" -class InitialEncoder(paddle.nn.Layer): +class InitialEncoder(nn.Layer): def __init__( self, dim, @@ -40,16 +40,14 @@ def __init__( for i in range(num_conv_layers): if i == 0: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" @@ -57,22 +55,20 @@ def __init__( ) else: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=out_dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) if isinstance(downsample_scale, int): patch_merge_downsample = (1, downsample_scale, downsample_scale) elif len(downsample_scale) == 2: @@ -123,7 +119,7 @@ def forward(self, x): return x -class FinalDecoder(paddle.nn.Layer): +class FinalDecoder(nn.Layer): def __init__( self, target_thw: Tuple[int, ...], @@ -145,20 +141,20 @@ def __init__( conv_block = [] for i in range(num_conv_layers): conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=dim, ) ) - conv_block.append(paddle.nn.GroupNorm(num_groups=16, num_channels=dim)) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) self.upsample = cuboid_decoder.Upsample3DLayer( dim=dim, out_dim=dim, @@ -199,7 +195,7 @@ def forward(self, x): return x -class InitialStackPatchMergingEncoder(paddle.nn.Layer): +class InitialStackPatchMergingEncoder(nn.Layer): def __init__( self, num_merge: int, @@ -224,8 +220,8 @@ def __init__( self.downsample_scale_list = downsample_scale_list[:num_merge] self.num_conv_per_merge_list = num_conv_per_merge_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.patch_merge_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.patch_merge_list = nn.LayerList() for i in range(num_merge): if i == 0: in_dim = in_dim @@ -240,7 +236,7 @@ def __init__( else: conv_in_dim = out_dim 
conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -248,7 +244,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -257,7 +253,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) patch_merge = cuboid_encoder.PatchMerging3D( dim=out_dim, @@ -307,7 +303,7 @@ def forward(self, x): return x -class FinalStackUpsamplingDecoder(paddle.nn.Layer): +class FinalStackUpsamplingDecoder(nn.Layer): def __init__( self, target_shape_list: Tuple[Tuple[int, ...]], @@ -331,8 +327,8 @@ def __init__( self.in_dim = in_dim self.num_conv_per_up_list = num_conv_per_up_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.upsample_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.upsample_list = nn.LayerList() for i in range(self.num_upsample): if i == 0: in_dim = in_dim @@ -354,7 +350,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -362,7 +358,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -371,7 +367,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) self.reset_parameters() @@ -694,7 +690,7 @@ def __init__( embed_dim=base_units, typ=pos_embed_type, maxH=H_in, maxW=W_in, maxT=T_in ) mem_shapes = self.encoder.get_mem_shapes() - self.z_proj = paddle.nn.Linear( + self.z_proj = nn.Linear( in_features=mem_shapes[-1][-1], out_features=mem_shapes[-1][-1] ) self.dec_pos_embed = cuboid_decoder.PosEmbed( @@ -812,7 +808,7 @@ def get_initial_encoder_final_decoder( new_input_shape = self.initial_encoder.patch_merge.get_out_shape( self.input_shape ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=self.base_units, out_features=C_out ) elif self.initial_downsample_type == "stack_conv": @@ -852,7 +848,7 @@ def get_initial_encoder_final_decoder( linear_init_mode=self.down_up_linear_init_mode, norm_init_mode=self.norm_init_mode, ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=dec_target_shape_list[-1][-1], out_features=C_out ) new_input_shape = self.initial_encoder.get_out_shape_list(self.input_shape)[ @@ -905,7 +901,7 @@ def get_initial_z(self, final_mem, T_out): shape=[B, -1, -1, -1, -1] ) elif self.z_init_method == "nearest_interp": - initial_z = paddle.nn.functional.interpolate( + initial_z = nn.functional.interpolate( x=final_mem.transpose(perm=[0, 4, 1, 2, 3]), size=(T_out, final_mem.shape[2], final_mem.shape[3]), ).transpose(perm=[0, 2, 3, 4, 1]) diff --git a/ppsci/arch/extformer_moe_cuboid_decoder.py b/ppsci/arch/extformer_moe_cuboid_decoder.py index d1192a6861..aee77f7a8a 100644 --- a/ppsci/arch/extformer_moe_cuboid_decoder.py +++ b/ppsci/arch/extformer_moe_cuboid_decoder.py @@ -13,7 +13,7 @@ from ppsci.utils import initializer -class PosEmbed(paddle.nn.Layer): +class PosEmbed(nn.Layer): """pose embeding Args: @@ -46,20 +46,12 @@ def __init__( 
self.maxW = maxW self.embed_dim = embed_dim if self.typ == "t+h+w": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.H_embed = paddle.nn.Embedding( - num_embeddings=maxH, embedding_dim=embed_dim - ) - self.W_embed = paddle.nn.Embedding( - num_embeddings=maxW, embedding_dim=embed_dim - ) + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.H_embed = nn.Embedding(num_embeddings=maxH, embedding_dim=embed_dim) + self.W_embed = nn.Embedding(num_embeddings=maxW, embedding_dim=embed_dim) elif self.typ == "t+hw": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.HW_embed = paddle.nn.Embedding( + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.HW_embed = nn.Embedding( num_embeddings=maxH * maxW, embedding_dim=embed_dim ) else: @@ -177,7 +169,7 @@ def compute_cuboid_cross_attention_mask( return attn_mask -class CuboidCrossAttentionLayer(paddle.nn.Layer): +class CuboidCrossAttentionLayer(nn.Layer): """Implements the cuboid cross attention. The idea of Cuboid Cross Attention is to extend the idea of cuboid self attention to work for the @@ -317,21 +309,19 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.q_proj = paddle.nn.Linear( - in_features=dim, out_features=dim, bias_attr=qkv_bias - ) - self.kv_proj = paddle.nn.Linear( + self.q_proj = nn.Linear(in_features=dim, out_features=dim, bias_attr=qkv_bias) + self.kv_proj = nn.Linear( in_features=dim, out_features=dim * 2, bias_attr=qkv_bias ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, @@ -556,7 +546,7 @@ def forward(self, x, mem, mem_global_vectors=None): return x -class StackCuboidCrossAttentionBlock(paddle.nn.Layer): +class StackCuboidCrossAttentionBlock(nn.Layer): """A stack of cuboid cross attention layers. 
The advantage of cuboid attention is that we can combine cuboid attention building blocks with different @@ -661,7 +651,7 @@ def __init__( self.use_global_vector = use_global_vector if self.use_inter_ffn: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.MixtureFFN( units=dim, @@ -681,7 +671,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -702,7 +692,7 @@ def __init__( ) else: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.MixtureFFN( units=dim, @@ -721,7 +711,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -741,7 +731,7 @@ def __init__( ) if moe_config["use_attn_moe"]: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ MixtureCrossAttention( dim=dim, @@ -778,7 +768,7 @@ def __init__( ] ) else: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidCrossAttentionLayer( dim=dim, @@ -854,7 +844,7 @@ def forward(self, x, mem, mem_global_vector=None): return x -class Upsample3DLayer(paddle.nn.Layer): +class Upsample3DLayer(nn.Layer): """Upsampling based on nn.UpSampling and Conv3x3. If the temporal dimension remains the same: @@ -889,12 +879,10 @@ def __init__( self.out_dim = out_dim self.temporal_upsample = temporal_upsample if temporal_upsample: - self.up = paddle.nn.Upsample(size=target_size, mode="nearest") + self.up = nn.Upsample(size=target_size, mode="nearest") else: - self.up = paddle.nn.Upsample( - size=(target_size[1], target_size[2]), mode="nearest" - ) - self.conv = paddle.nn.Conv2D( + self.up = nn.Upsample(size=(target_size[1], target_size[2]), mode="nearest") + self.conv = nn.Conv2D( in_channels=dim, out_channels=out_dim, kernel_size=(kernel_size, kernel_size), @@ -955,7 +943,7 @@ def forward(self, x): ) -class CuboidTransformerDecoder(paddle.nn.Layer): +class CuboidTransformerDecoder(nn.Layer): """Decoder of the CuboidTransformer. For each block, we first apply the StackCuboidSelfAttention and then apply the StackCuboidCrossAttention @@ -1169,8 +1157,8 @@ def __init__( ) for _ in range(ele_depth) ] - self_blocks.append(paddle.nn.LayerList(sublayers=stack_cuboid_blocks)) - self.self_blocks = paddle.nn.LayerList(sublayers=self_blocks) + self_blocks.append(nn.LayerList(sublayers=stack_cuboid_blocks)) + self.self_blocks = nn.LayerList(sublayers=self_blocks) if block_cross_attn_patterns is not None: if isinstance(block_cross_attn_patterns, (tuple, list)): @@ -1229,10 +1217,10 @@ def __init__( assert ( len(block_cross_n_temporal) == self.num_blocks ), f"Incorrect input format! 
Received block_cross_n_temporal={block_cross_n_temporal}" - self.cross_blocks = paddle.nn.LayerList() + self.cross_blocks = nn.LayerList() assert self.cross_start == 0 for i in range(self.cross_start, self.num_blocks): - cross_block = paddle.nn.LayerList( + cross_block = nn.LayerList( sublayers=[ StackCuboidCrossAttentionBlock( dim=self.mem_shapes[i][-1], @@ -1268,7 +1256,7 @@ def __init__( self.cross_blocks.append(cross_block) if self.num_blocks > 1: if self.upsample_type == "upsample": - self.upsample_layers = paddle.nn.LayerList( + self.upsample_layers = nn.LayerList( sublayers=[ Upsample3DLayer( dim=self.mem_shapes[i + 1][-1], @@ -1285,7 +1273,7 @@ def __init__( else: raise NotImplementedError(f"{self.upsample_type} is invalid.") if self.hierarchical_pos_embed: - self.hierarchical_pos_embed_l = paddle.nn.LayerList( + self.hierarchical_pos_embed_l = nn.LayerList( sublayers=[ PosEmbed( embed_dim=self.mem_shapes[i][-1], @@ -1368,7 +1356,7 @@ def forward(self, x, mem_l, mem_global_vector_l=None): return x -class MixtureCrossAttention(paddle.nn.Layer): +class MixtureCrossAttention(nn.Layer): def __init__( self, dim, @@ -1424,7 +1412,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ CuboidCrossAttentionLayer( dim=dim, diff --git a/ppsci/arch/extformer_moe_cuboid_encoder.py b/ppsci/arch/extformer_moe_cuboid_encoder.py index a21b126c0a..c26b3837a5 100644 --- a/ppsci/arch/extformer_moe_cuboid_encoder.py +++ b/ppsci/arch/extformer_moe_cuboid_encoder.py @@ -16,7 +16,7 @@ NEGATIVE_SLOPE = 0.1 -class PatchMerging3D(paddle.nn.Layer): +class PatchMerging3D(nn.Layer): """Patch Merging Layer Args: @@ -49,7 +49,7 @@ def __init__( self.out_dim = out_dim self.downsample = downsample self.padding_type = padding_type - self.reduction = paddle.nn.Linear( + self.reduction = nn.Linear( in_features=downsample[0] * downsample[1] * downsample[2] * dim, out_features=out_dim, bias_attr=False, @@ -127,7 +127,7 @@ def forward(self, x): return x -class PositionwiseFFN(paddle.nn.Layer): +class PositionwiseFFN(nn.Layer): """The Position-wise FFN layer used in Transformer-like architectures If pre_norm is True: @@ -187,8 +187,8 @@ def __init__( ("pre_norm", pre_norm), ] ) - self.dropout_layer = paddle.nn.Dropout(p=dropout) - self.activation_dropout_layer = paddle.nn.Dropout(p=activation_dropout) + self.dropout_layer = nn.Dropout(p=dropout) + self.activation_dropout_layer = nn.Dropout(p=activation_dropout) if moe_config["use_linear_moe"]: self.ffn_1 = MixtureLinear( @@ -199,11 +199,11 @@ def __init__( moe_config=moe_config, ) else: - self.ffn_1 = paddle.nn.Linear( + self.ffn_1 = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if self._gated_proj: - self.ffn_1_gate = paddle.nn.Linear( + self.ffn_1_gate = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if activation == "leaky_relu": @@ -220,7 +220,7 @@ def __init__( moe_config=moe_config, ) else: - self.ffn_2 = paddle.nn.Linear( + self.ffn_2 = nn.Linear( in_features=hidden_size, out_features=units, bias_attr=True ) self.layer_norm = cuboid_utils.get_norm_layer( @@ -424,9 +424,11 @@ def masked_softmax(att_score, mask, axis: int = -1): att_score = att_score.masked_fill(paddle.logical_not(mask), -1e4) else: att_score = att_score.masked_fill(paddle.logical_not(mask), -1e18) - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) * mask + att_weights = nn.functional.softmax(x=att_score, axis=axis) * mask.astype( + att_score.dtype + ) else: - 
att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) + att_weights = nn.functional.softmax(x=att_score, axis=axis) return att_weights @@ -475,7 +477,7 @@ def cuboid_reorder_reverse(data, cuboid_size, strategy, orig_data_shape): return data -class CuboidSelfAttentionLayer(paddle.nn.Layer): +class CuboidSelfAttentionLayer(nn.Layer): """Implements the cuboid self attention. The idea of Cuboid Self Attention is to divide the input tensor (T, H, W) into several non-overlapping cuboids. @@ -613,49 +615,47 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias - ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) + self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, ) - self.g2l_global_q_net = paddle.nn.Linear( + self.g2l_global_q_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim, bias_attr=qkv_bias, ) - self.g2l_k_net = paddle.nn.Linear( + self.g2l_k_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.g2l_v_net = paddle.nn.Linear( + self.g2l_v_net = nn.Linear( in_features=dim, out_features=global_dim_ratio * dim, bias_attr=qkv_bias, ) if self.use_global_self_attn: - self.g2g_global_qkv_net = paddle.nn.Linear( + self.g2g_global_qkv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim * 3, bias_attr=qkv_bias, ) else: - self.global_qkv = paddle.nn.Linear( + self.global_qkv = nn.Linear( in_features=dim, out_features=dim * 3, bias_attr=qkv_bias ) - self.global_attn_drop = paddle.nn.Dropout(p=attn_drop) + self.global_attn_drop = nn.Dropout(p=attn_drop) if use_final_proj: - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: - self.global_proj = paddle.nn.Linear( + self.global_proj = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim, ) @@ -980,7 +980,7 @@ def forward(self, x, global_vectors=None): return x -class StackCuboidSelfAttentionBlock(paddle.nn.Layer): +class StackCuboidSelfAttentionBlock(nn.Layer): """ - "use_inter_ffn" is True x --> attn1 -----+-------> ffn1 ---+---> attn2 --> ... 
--> ffn_k --> out @@ -1083,7 +1083,7 @@ def __init__( self.global_dim_ratio = global_dim_ratio if self.use_inter_ffn: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=dim, @@ -1103,7 +1103,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1124,7 +1124,7 @@ def __init__( ) if self.use_global_vector_ffn and self.use_global_vector: if moe_config["use_ffn_moe"]: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=global_dim_ratio * dim, @@ -1144,7 +1144,7 @@ def __init__( ] ) else: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1165,7 +1165,7 @@ def __init__( ) else: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=dim, @@ -1184,7 +1184,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1204,7 +1204,7 @@ def __init__( ) if self.use_global_vector_ffn and self.use_global_vector: if moe_config["use_ffn_moe"]: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=global_dim_ratio * dim, @@ -1223,7 +1223,7 @@ def __init__( ] ) else: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1243,7 +1243,7 @@ def __init__( ) if moe_config["use_attn_moe"]: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ MixtureSelfAttention( dim=dim, @@ -1276,7 +1276,7 @@ def __init__( ] ) else: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidSelfAttentionLayer( dim=dim, @@ -1385,7 +1385,7 @@ def forward(self, x, global_vectors=None): return x -class CuboidTransformerEncoder(paddle.nn.Layer): +class CuboidTransformerEncoder(nn.Layer): """Encoder of the CuboidTransformer x --> attn_block --> patch_merge --> attn_block --> patch_merge --> ... 
--> out @@ -1500,7 +1500,7 @@ def __init__( self.block_units = block_units if self.num_blocks > 1: if downsample_type == "patch_merge": - self.down_layers = paddle.nn.LayerList( + self.down_layers = nn.LayerList( sublayers=[ PatchMerging3D( dim=self.block_units[i], @@ -1516,9 +1516,9 @@ def __init__( else: raise NotImplementedError(f"{downsample_type} is invalid.") if self.use_global_vector: - self.down_layer_global_proj = paddle.nn.LayerList( + self.down_layer_global_proj = nn.LayerList( sublayers=[ - paddle.nn.Linear( + nn.Linear( in_features=global_dim_ratio * self.block_units[i], out_features=global_dim_ratio * self.block_units[i + 1], ) @@ -1566,9 +1566,9 @@ def __init__( self.block_shift_size = block_shift_size expert_shape_list = self.get_mem_shapes() - self.blocks = paddle.nn.LayerList( + self.blocks = nn.LayerList( sublayers=[ - paddle.nn.Sequential( + nn.Sequential( *[ StackCuboidSelfAttentionBlock( dim=self.block_units[i], @@ -1675,7 +1675,7 @@ def forward(self, x, global_vectors=None): return out -class MixtureLinear(paddle.nn.Layer): +class MixtureLinear(nn.Layer): def __init__(self, in_dim, out_dim, expert_shape, moe_config, bias_attr=True): super().__init__() @@ -1709,11 +1709,9 @@ def __init__(self, in_dim, out_dim, expert_shape, moe_config, bias_attr=True): else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ - paddle.nn.Linear( - in_features=in_dim, out_features=out_dim, bias_attr=bias_attr - ) + nn.Linear(in_features=in_dim, out_features=out_dim, bias_attr=bias_attr) for _ in range(self.num_experts) ] ) @@ -1763,7 +1761,7 @@ def forward(self, x): return y -class MixtureFFN(paddle.nn.Layer): +class MixtureFFN(nn.Layer): def __init__( self, units, @@ -1808,7 +1806,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ PositionwiseFFN( units=units, @@ -1878,7 +1876,7 @@ def reset_parameters(self): self.experts[i].reset_parameters() -class MixtureSelfAttention(paddle.nn.Layer): +class MixtureSelfAttention(nn.Layer): def __init__( self, dim, @@ -1933,7 +1931,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ CuboidSelfAttentionLayer( dim=dim, diff --git a/ppsci/arch/extformer_moe_cuboid_utils.py b/ppsci/arch/extformer_moe_cuboid_utils.py index 49bde7c2ab..8b90626e3b 100644 --- a/ppsci/arch/extformer_moe_cuboid_utils.py +++ b/ppsci/arch/extformer_moe_cuboid_utils.py @@ -12,7 +12,7 @@ def round_to(dat, c): return dat + (dat - dat % c) % c -class RMSNorm(paddle.nn.Layer): +class RMSNorm(nn.Layer): """Root Mean Square Layer Normalization proposed in "[NeurIPS2019] Root Mean Square Layer Normalization" Args: @@ -94,7 +94,7 @@ def get_norm_layer( if normalization == "layer_norm": assert in_channels > 0 assert axis == -1 - norm_layer = paddle.nn.LayerNorm( + norm_layer = nn.LayerNorm( normalized_shape=in_channels, epsilon=epsilon, **kwargs ) elif normalization == "rms_norm": @@ -104,7 +104,7 @@ def get_norm_layer( raise NotImplementedError(f"normalization={normalization} is not supported") return norm_layer elif normalization is None: - return paddle.nn.Identity() + return nn.Identity() else: raise NotImplementedError("The type of normalization must be str") @@ -115,7 +115,7 @@ def generalize_padding(x, pad_t, pad_h, pad_w, padding_type, t_pad_left=False): assert padding_type in ["zeros", "ignore", "nearest"] B, T, H, W, C = x.shape if padding_type == "nearest": - return 
paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T + pad_t, H + pad_h, W + pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) elif t_pad_left: @@ -136,7 +136,7 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): if pad_t == 0 and pad_h == 0 and pad_w == 0: return x if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T - pad_t, H - pad_h, W - pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) else: @@ -144,13 +144,13 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): def apply_initialization( - m: paddle.nn.Layer, + m: nn.Layer, linear_mode: str = "0", conv_mode: str = "0", norm_mode: str = "0", embed_mode: str = "0", ): - if isinstance(m, paddle.nn.Linear): + if isinstance(m, nn.Linear): if linear_mode in ("0",): m.weight = initializer.kaiming_normal_(m.weight, nonlinearity="linear") elif linear_mode in ("1",): @@ -164,10 +164,10 @@ def apply_initialization( elif isinstance( m, ( - paddle.nn.Conv2D, - paddle.nn.Conv3D, - paddle.nn.Conv2DTranspose, - paddle.nn.Conv3DTranspose, + nn.Conv2D, + nn.Conv3D, + nn.Conv2DTranspose, + nn.Conv3DTranspose, ), ): if conv_mode in ("0",): @@ -178,26 +178,26 @@ def apply_initialization( raise NotImplementedError(f"{conv_mode} is invalid.") if hasattr(m, "bias") and m.bias is not None: m.bias = initializer.zeros_(m.bias) - elif isinstance(m, paddle.nn.LayerNorm): + elif isinstance(m, nn.LayerNorm): if norm_mode in ("0",): m.weight = initializer.zeros_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.GroupNorm): + elif isinstance(m, nn.GroupNorm): if norm_mode in ("0",): m.weight = initializer.ones_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.Embedding): + elif isinstance(m, nn.Embedding): if embed_mode in ("0",): m.weight.data = initializer.trunc_normal_(m.weight.data, std=0.02) else: raise NotImplementedError(f"{embed_mode} is invalid.") - elif isinstance(m, paddle.nn.Layer) and hasattr(m, "experts"): + elif isinstance(m, nn.Layer) and hasattr(m, "experts"): for lin in m.experts: - assert isinstance(lin, paddle.nn.Linear) + assert isinstance(lin, nn.Linear) apply_initialization(lin, linear_mode=linear_mode) else: pass diff --git a/ppsci/arch/extformer_moe_utils.py b/ppsci/arch/extformer_moe_utils.py index 0c57f96066..3332b356c8 100644 --- a/ppsci/arch/extformer_moe_utils.py +++ b/ppsci/arch/extformer_moe_utils.py @@ -1,11 +1,12 @@ import math import paddle +from paddle import nn # MoE Gating -class GatingNet(paddle.nn.Layer): +class GatingNet(nn.Layer): def __init__(self, moe_config, input_shape, in_channels): super().__init__() @@ -16,12 +17,12 @@ def __init__(self, moe_config, input_shape, in_channels): assert len(input_shape) == 4 self.input_shape = input_shape - self.noise_lin = paddle.nn.Linear( + self.noise_lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.noise_eps = 1e-2 - self.softplus = paddle.nn.Softplus() - self.softmax = paddle.nn.Softmax(axis=-1) + self.softplus = nn.Softplus() + self.softmax = nn.Softmax(axis=-1) self.importance_weight = moe_config["importance_weight"] self.load_weight = moe_config["load_weight"] @@ -178,7 +179,7 @@ def __init__(self, moe_config, input_shape, in_channels): assert len(input_shape) == 4 T, H, W, C = input_shape - 
self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) @@ -199,7 +200,7 @@ def __init__(self, moe_config, input_shape, in_channels): self.routing_weights = paddle.create_parameter( shape=[H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -222,16 +223,16 @@ def __init__(self, moe_config, input_shape, in_channels): self.spatial_routing_weights = paddle.create_parameter( shape=[H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) - self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.combine_weight = paddle.create_parameter( shape=[H, W, self.num_experts, 2], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -262,7 +263,7 @@ def __init__(self, moe_config, input_shape, in_channels): self.routing_weights = paddle.create_parameter( shape=[T, H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -285,17 +286,17 @@ def __init__(self, moe_config, input_shape, in_channels): self.cuboid_routing_weights = paddle.create_parameter( shape=[T, H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) - self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.combine_weight = paddle.create_parameter( shape=[T, H, W, self.num_experts, 2], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -418,7 +419,7 @@ def combine(self, expert_out): # RNC -class LabelDifference(paddle.nn.Layer): +class LabelDifference(nn.Layer): def __init__(self, distance_type="l1"): super().__init__() self.distance_type = distance_type @@ -435,7 +436,7 @@ def forward(self, labels): raise ValueError(self.distance_type) -class FeatureSimilarity(paddle.nn.Layer): +class FeatureSimilarity(nn.Layer): def __init__(self, similarity_type="l2", temperature=2): super().__init__() self.similarity_type = similarity_type @@ -454,7 +455,7 @@ def forward(self, features): logits -= logits_max.detach() return logits elif self.similarity_type == "cosine": - cos_func = paddle.nn.CosineSimilarity(axis=-1) + cos_func = nn.CosineSimilarity(axis=-1) logits = cos_func(features[:, :, None, :], features[:, None, :, :]) logits /= self.t return logits @@ -462,7 +463,7 @@ def forward(self, features): raise ValueError(self.similarity_type) -class RnCLoss(paddle.nn.Layer): +class RnCLoss(nn.Layer): def __init__(self, rnc_config): super().__init__() @@ -474,7 +475,7 @@ def __init__(self, rnc_config): ) self.rnc_weight = rnc_config["rank_reg_coeff"] self.loss_cal_mode = rnc_config["loss_cal_style"] - self.softmax_cri = paddle.nn.Softmax(axis=-1) + self.softmax_cri = nn.Softmax(axis=-1) def cal_loss(self, features, labels): From 
618529ad3177209a2682aa53f5f2da0896c70331 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 13:57:26 +0800 Subject: [PATCH 2/3] replace paddle.nn. with nn. --- ppsci/arch/activation.py | 2 +- ppsci/arch/amgnet.py | 28 ++--- ppsci/arch/cuboid_transformer.py | 56 ++++----- ppsci/arch/cuboid_transformer_decoder.py | 70 +++++------ ppsci/arch/cuboid_transformer_encoder.py | 74 ++++++------ ppsci/arch/cuboid_transformer_utils.py | 28 ++--- ppsci/arch/mlp.py | 2 +- ppsci/arch/nowcastnet.py | 147 +++++++++++------------ ppsci/arch/phycrnet.py | 22 ++-- 9 files changed, 204 insertions(+), 225 deletions(-) diff --git a/ppsci/arch/activation.py b/ppsci/arch/activation.py index f73b51581b..bfba786853 100644 --- a/ppsci/arch/activation.py +++ b/ppsci/arch/activation.py @@ -51,7 +51,7 @@ def __init__(self, beta: float = 1.0): super().__init__() self.beta = self.create_parameter( shape=[], - default_initializer=paddle.nn.initializer.Constant(beta), + default_initializer=nn.initializer.Constant(beta), ) def forward(self, x): diff --git a/ppsci/arch/amgnet.py b/ppsci/arch/amgnet.py index b0e4b89929..ce728317d6 100644 --- a/ppsci/arch/amgnet.py +++ b/ppsci/arch/amgnet.py @@ -238,21 +238,21 @@ def faster_graph_connectivity(perm, edge_index, edge_weight, score, pos, N, norm value_A = edge_weight.clone() value_A = paddle.squeeze(value_A) - model_1 = paddle.nn.Sequential( - ("l1", paddle.nn.Linear(128, 256)), - ("act1", paddle.nn.ReLU()), - ("l2", paddle.nn.Linear(256, 256)), - ("act2", paddle.nn.ReLU()), - ("l4", paddle.nn.Linear(256, 128)), - ("act4", paddle.nn.ReLU()), - ("l5", paddle.nn.Linear(128, 1)), + model_1 = nn.Sequential( + ("l1", nn.Linear(128, 256)), + ("act1", nn.ReLU()), + ("l2", nn.Linear(256, 256)), + ("act2", nn.ReLU()), + ("l4", nn.Linear(256, 128)), + ("act4", nn.ReLU()), + ("l5", nn.Linear(128, 1)), ) - model_2 = paddle.nn.Sequential( - ("l1", paddle.nn.Linear(1, 64)), - ("act1", paddle.nn.ReLU()), - ("l2", paddle.nn.Linear(64, 128)), - ("act2", paddle.nn.ReLU()), - ("l4", paddle.nn.Linear(128, 128)), + model_2 = nn.Sequential( + ("l1", nn.Linear(1, 64)), + ("act1", nn.ReLU()), + ("l2", nn.Linear(64, 128)), + ("act2", nn.ReLU()), + ("l4", nn.Linear(128, 128)), ) val_A = model_1(value_A) diff --git a/ppsci/arch/cuboid_transformer.py b/ppsci/arch/cuboid_transformer.py index 0b74caf52b..e0e6cbded6 100644 --- a/ppsci/arch/cuboid_transformer.py +++ b/ppsci/arch/cuboid_transformer.py @@ -16,7 +16,7 @@ """A space-time Transformer with Cuboid Attention""" -class InitialEncoder(paddle.nn.Layer): +class InitialEncoder(nn.Layer): def __init__( self, dim, @@ -38,16 +38,14 @@ def __init__( for i in range(num_conv_layers): if i == 0: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" @@ -55,22 +53,20 @@ def __init__( ) else: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=out_dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = 
paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) if isinstance(downsample_scale, int): patch_merge_downsample = (1, downsample_scale, downsample_scale) elif len(downsample_scale) == 2: @@ -121,7 +117,7 @@ def forward(self, x): return x -class FinalDecoder(paddle.nn.Layer): +class FinalDecoder(nn.Layer): def __init__( self, target_thw: Tuple[int, ...], @@ -142,20 +138,20 @@ def __init__( conv_block = [] for i in range(num_conv_layers): conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=dim, ) ) - conv_block.append(paddle.nn.GroupNorm(num_groups=16, num_channels=dim)) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) self.upsample = cuboid_decoder.Upsample3DLayer( dim=dim, out_dim=dim, @@ -196,7 +192,7 @@ def forward(self, x): return x -class InitialStackPatchMergingEncoder(paddle.nn.Layer): +class InitialStackPatchMergingEncoder(nn.Layer): def __init__( self, num_merge: int, @@ -220,8 +216,8 @@ def __init__( self.downsample_scale_list = downsample_scale_list[:num_merge] self.num_conv_per_merge_list = num_conv_per_merge_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.patch_merge_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.patch_merge_list = nn.LayerList() for i in range(num_merge): if i == 0: in_dim = in_dim @@ -236,7 +232,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -244,7 +240,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -253,7 +249,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) patch_merge = cuboid_encoder.PatchMerging3D( dim=out_dim, @@ -303,7 +299,7 @@ def forward(self, x): return x -class FinalStackUpsamplingDecoder(paddle.nn.Layer): +class FinalStackUpsamplingDecoder(nn.Layer): def __init__( self, target_shape_list: Tuple[Tuple[int, ...]], @@ -326,8 +322,8 @@ def __init__( self.in_dim = in_dim self.num_conv_per_up_list = num_conv_per_up_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.upsample_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.upsample_list = nn.LayerList() for i in range(self.num_upsample): if i == 0: in_dim = in_dim @@ -349,7 +345,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -357,7 +353,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -366,7 +362,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) self.reset_parameters() @@ -686,7 +682,7 @@ def __init__( 
embed_dim=base_units, typ=pos_embed_type, maxH=H_in, maxW=W_in, maxT=T_in ) mem_shapes = self.encoder.get_mem_shapes() - self.z_proj = paddle.nn.Linear( + self.z_proj = nn.Linear( in_features=mem_shapes[-1][-1], out_features=mem_shapes[-1][-1] ) self.dec_pos_embed = cuboid_decoder.PosEmbed( @@ -799,7 +795,7 @@ def get_initial_encoder_final_decoder( new_input_shape = self.initial_encoder.patch_merge.get_out_shape( self.input_shape ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=self.base_units, out_features=C_out ) elif self.initial_downsample_type == "stack_conv": @@ -839,7 +835,7 @@ def get_initial_encoder_final_decoder( linear_init_mode=self.down_up_linear_init_mode, norm_init_mode=self.norm_init_mode, ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=dec_target_shape_list[-1][-1], out_features=C_out ) new_input_shape = self.initial_encoder.get_out_shape_list(self.input_shape)[ @@ -892,7 +888,7 @@ def get_initial_z(self, final_mem, T_out): shape=[B, -1, -1, -1, -1] ) elif self.z_init_method == "nearest_interp": - initial_z = paddle.nn.functional.interpolate( + initial_z = nn.functional.interpolate( x=final_mem.transpose(perm=[0, 4, 1, 2, 3]), size=(T_out, final_mem.shape[2], final_mem.shape[3]), ).transpose(perm=[0, 2, 3, 4, 1]) diff --git a/ppsci/arch/cuboid_transformer_decoder.py b/ppsci/arch/cuboid_transformer_decoder.py index 6504717042..894363b1a8 100644 --- a/ppsci/arch/cuboid_transformer_decoder.py +++ b/ppsci/arch/cuboid_transformer_decoder.py @@ -12,7 +12,7 @@ from ppsci.utils import initializer -class PosEmbed(paddle.nn.Layer): +class PosEmbed(nn.Layer): """Pose embeding Args: @@ -37,20 +37,12 @@ def __init__(self, embed_dim, maxT, maxH, maxW, typ: str = "t+h+w"): self.maxW = maxW self.embed_dim = embed_dim if self.typ == "t+h+w": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.H_embed = paddle.nn.Embedding( - num_embeddings=maxH, embedding_dim=embed_dim - ) - self.W_embed = paddle.nn.Embedding( - num_embeddings=maxW, embedding_dim=embed_dim - ) + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.H_embed = nn.Embedding(num_embeddings=maxH, embedding_dim=embed_dim) + self.W_embed = nn.Embedding(num_embeddings=maxW, embedding_dim=embed_dim) elif self.typ == "t+hw": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.HW_embed = paddle.nn.Embedding( + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.HW_embed = nn.Embedding( num_embeddings=maxH * maxW, embedding_dim=embed_dim ) else: @@ -168,7 +160,7 @@ def compute_cuboid_cross_attention_mask( return attn_mask -class CuboidCrossAttentionLayer(paddle.nn.Layer): +class CuboidCrossAttentionLayer(nn.Layer): """Implements the cuboid cross attention. 
The idea of Cuboid Cross Attention is to extend the idea of cuboid self attention to work for the @@ -307,21 +299,19 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.q_proj = paddle.nn.Linear( - in_features=dim, out_features=dim, bias_attr=qkv_bias - ) - self.kv_proj = paddle.nn.Linear( + self.q_proj = nn.Linear(in_features=dim, out_features=dim, bias_attr=qkv_bias) + self.kv_proj = nn.Linear( in_features=dim, out_features=dim * 2, bias_attr=qkv_bias ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, @@ -546,7 +536,7 @@ def forward(self, x, mem, mem_global_vectors=None): return x -class StackCuboidCrossAttentionBlock(paddle.nn.Layer): +class StackCuboidCrossAttentionBlock(nn.Layer): """A stack of cuboid cross attention layers. The advantage of cuboid attention is that we can combine cuboid attention building blocks with different @@ -648,7 +638,7 @@ def __init__( self.use_inter_ffn = use_inter_ffn self.use_global_vector = use_global_vector if self.use_inter_ffn: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -666,7 +656,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -682,7 +672,7 @@ def __init__( ) ] ) - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidCrossAttentionLayer( dim=dim, @@ -755,7 +745,7 @@ def forward(self, x, mem, mem_global_vector=None): return x -class Upsample3DLayer(paddle.nn.Layer): +class Upsample3DLayer(nn.Layer): """Upsampling based on nn.UpSampling and Conv3x3. If the temporal dimension remains the same: @@ -789,12 +779,10 @@ def __init__( self.out_dim = out_dim self.temporal_upsample = temporal_upsample if temporal_upsample: - self.up = paddle.nn.Upsample(size=target_size, mode="nearest") + self.up = nn.Upsample(size=target_size, mode="nearest") else: - self.up = paddle.nn.Upsample( - size=(target_size[1], target_size[2]), mode="nearest" - ) - self.conv = paddle.nn.Conv2D( + self.up = nn.Upsample(size=(target_size[1], target_size[2]), mode="nearest") + self.conv = nn.Conv2D( in_channels=dim, out_channels=out_dim, kernel_size=(kernel_size, kernel_size), @@ -855,7 +843,7 @@ def forward(self, x): ) -class CuboidTransformerDecoder(paddle.nn.Layer): +class CuboidTransformerDecoder(nn.Layer): """Decoder of the CuboidTransformer. 
For each block, we first apply the StackCuboidSelfAttention and then apply the StackCuboidCrossAttention @@ -1062,8 +1050,8 @@ def __init__( ) for _ in range(ele_depth) ] - self_blocks.append(paddle.nn.LayerList(sublayers=stack_cuboid_blocks)) - self.self_blocks = paddle.nn.LayerList(sublayers=self_blocks) + self_blocks.append(nn.LayerList(sublayers=stack_cuboid_blocks)) + self.self_blocks = nn.LayerList(sublayers=self_blocks) if block_cross_attn_patterns is not None: if isinstance(block_cross_attn_patterns, (tuple, list)): assert len(block_cross_attn_patterns) == self.num_blocks @@ -1121,9 +1109,9 @@ def __init__( assert ( len(block_cross_n_temporal) == self.num_blocks ), f"Incorrect input format! Received block_cross_n_temporal={block_cross_n_temporal}" - self.cross_blocks = paddle.nn.LayerList() + self.cross_blocks = nn.LayerList() for i in range(self.cross_start, self.num_blocks): - cross_block = paddle.nn.LayerList( + cross_block = nn.LayerList( sublayers=[ StackCuboidCrossAttentionBlock( dim=self.mem_shapes[i][-1], @@ -1157,7 +1145,7 @@ def __init__( self.cross_blocks.append(cross_block) if self.num_blocks > 1: if self.upsample_type == "upsample": - self.upsample_layers = paddle.nn.LayerList( + self.upsample_layers = nn.LayerList( sublayers=[ Upsample3DLayer( dim=self.mem_shapes[i + 1][-1], @@ -1174,7 +1162,7 @@ def __init__( else: raise NotImplementedError(f"{self.upsample_type} is invalid.") if self.hierarchical_pos_embed: - self.hierarchical_pos_embed_l = paddle.nn.LayerList( + self.hierarchical_pos_embed_l = nn.LayerList( sublayers=[ PosEmbed( embed_dim=self.mem_shapes[i][-1], diff --git a/ppsci/arch/cuboid_transformer_encoder.py b/ppsci/arch/cuboid_transformer_encoder.py index 34ec3efa53..79b2e6fd1d 100644 --- a/ppsci/arch/cuboid_transformer_encoder.py +++ b/ppsci/arch/cuboid_transformer_encoder.py @@ -15,7 +15,7 @@ NEGATIVE_SLOPE = 0.1 -class PatchMerging3D(paddle.nn.Layer): +class PatchMerging3D(nn.Layer): """Patch Merging Layer Args: @@ -47,7 +47,7 @@ def __init__( self.out_dim = out_dim self.downsample = downsample self.padding_type = padding_type - self.reduction = paddle.nn.Linear( + self.reduction = nn.Linear( in_features=downsample[0] * downsample[1] * downsample[2] * dim, out_features=out_dim, bias_attr=False, @@ -125,7 +125,7 @@ def forward(self, x): return x -class PositionwiseFFN(paddle.nn.Layer): +class PositionwiseFFN(nn.Layer): """The Position-wise FFN layer used in Transformer-like architectures If pre_norm is True: @@ -183,20 +183,20 @@ def __init__( ("pre_norm", pre_norm), ] ) - self.dropout_layer = paddle.nn.Dropout(p=dropout) - self.activation_dropout_layer = paddle.nn.Dropout(p=activation_dropout) - self.ffn_1 = paddle.nn.Linear( + self.dropout_layer = nn.Dropout(p=dropout) + self.activation_dropout_layer = nn.Dropout(p=activation_dropout) + self.ffn_1 = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if self._gated_proj: - self.ffn_1_gate = paddle.nn.Linear( + self.ffn_1_gate = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if activation == "leaky_relu": self.activation = nn.LeakyReLU(NEGATIVE_SLOPE) else: self.activation = act_mod.get_activation(activation) - self.ffn_2 = paddle.nn.Linear( + self.ffn_2 = nn.Linear( in_features=hidden_size, out_features=units, bias_attr=True ) self.layer_norm = cuboid_utils.get_norm_layer( @@ -400,9 +400,9 @@ def masked_softmax(att_score, mask, axis: int = -1): att_score = att_score.masked_fill(paddle.logical_not(mask), -1e4) else: att_score = 
att_score.masked_fill(paddle.logical_not(mask), -1e18) - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) * mask + att_weights = nn.functional.softmax(x=att_score, axis=axis) * mask else: - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) + att_weights = nn.functional.softmax(x=att_score, axis=axis) return att_weights @@ -451,7 +451,7 @@ def cuboid_reorder_reverse(data, cuboid_size, strategy, orig_data_shape): return data -class CuboidSelfAttentionLayer(paddle.nn.Layer): +class CuboidSelfAttentionLayer(nn.Layer): """Implements the cuboid self attention. The idea of Cuboid Self Attention is to divide the input tensor (T, H, W) into several non-overlapping cuboids. @@ -588,49 +588,47 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias - ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) + self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, ) - self.g2l_global_q_net = paddle.nn.Linear( + self.g2l_global_q_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim, bias_attr=qkv_bias, ) - self.g2l_k_net = paddle.nn.Linear( + self.g2l_k_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.g2l_v_net = paddle.nn.Linear( + self.g2l_v_net = nn.Linear( in_features=dim, out_features=global_dim_ratio * dim, bias_attr=qkv_bias, ) if self.use_global_self_attn: - self.g2g_global_qkv_net = paddle.nn.Linear( + self.g2g_global_qkv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim * 3, bias_attr=qkv_bias, ) else: - self.global_qkv = paddle.nn.Linear( + self.global_qkv = nn.Linear( in_features=dim, out_features=dim * 3, bias_attr=qkv_bias ) - self.global_attn_drop = paddle.nn.Dropout(p=attn_drop) + self.global_attn_drop = nn.Dropout(p=attn_drop) if use_final_proj: - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: - self.global_proj = paddle.nn.Linear( + self.global_proj = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim, ) @@ -955,7 +953,7 @@ def forward(self, x, global_vectors=None): return x -class StackCuboidSelfAttentionBlock(paddle.nn.Layer): +class StackCuboidSelfAttentionBlock(nn.Layer): """ - "use_inter_ffn" is True x --> attn1 -----+-------> ffn1 ---+---> attn2 --> ... 
--> ffn_k --> out @@ -1055,7 +1053,7 @@ def __init__( self.use_global_self_attn = use_global_self_attn self.global_dim_ratio = global_dim_ratio if self.use_inter_ffn: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1073,7 +1071,7 @@ def __init__( ] ) if self.use_global_vector_ffn and self.use_global_vector: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1091,7 +1089,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1108,7 +1106,7 @@ def __init__( ] ) if self.use_global_vector_ffn and self.use_global_vector: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1124,7 +1122,7 @@ def __init__( ) ] ) - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidSelfAttentionLayer( dim=dim, @@ -1233,7 +1231,7 @@ def forward(self, x, global_vectors=None): return x -class CuboidTransformerEncoder(paddle.nn.Layer): +class CuboidTransformerEncoder(nn.Layer): """Encoder of the CuboidTransformer x --> attn_block --> patch_merge --> attn_block --> patch_merge --> ... --> out @@ -1346,7 +1344,7 @@ def __init__( self.block_units = block_units if self.num_blocks > 1: if downsample_type == "patch_merge": - self.down_layers = paddle.nn.LayerList( + self.down_layers = nn.LayerList( sublayers=[ PatchMerging3D( dim=self.block_units[i], @@ -1362,9 +1360,9 @@ def __init__( else: raise NotImplementedError(f"{downsample_type} is invalid.") if self.use_global_vector: - self.down_layer_global_proj = paddle.nn.LayerList( + self.down_layer_global_proj = nn.LayerList( sublayers=[ - paddle.nn.Linear( + nn.Linear( in_features=global_dim_ratio * self.block_units[i], out_features=global_dim_ratio * self.block_units[i + 1], ) @@ -1410,9 +1408,9 @@ def __init__( self.block_cuboid_size = block_cuboid_size self.block_strategy = block_strategy self.block_shift_size = block_shift_size - self.blocks = paddle.nn.LayerList( + self.blocks = nn.LayerList( sublayers=[ - paddle.nn.Sequential( + nn.Sequential( *[ StackCuboidSelfAttentionBlock( dim=self.block_units[i], diff --git a/ppsci/arch/cuboid_transformer_utils.py b/ppsci/arch/cuboid_transformer_utils.py index 456e975cfd..02ef060002 100644 --- a/ppsci/arch/cuboid_transformer_utils.py +++ b/ppsci/arch/cuboid_transformer_utils.py @@ -12,7 +12,7 @@ def round_to(dat, c): return dat + (dat - dat % c) % c -class RMSNorm(paddle.nn.Layer): +class RMSNorm(nn.Layer): """Root Mean Square Layer Normalization proposed in "[NeurIPS2019] Root Mean Square Layer Normalization" Args: @@ -94,7 +94,7 @@ def get_norm_layer( if normalization == "layer_norm": assert in_channels > 0 assert axis == -1 - norm_layer = paddle.nn.LayerNorm( + norm_layer = nn.LayerNorm( normalized_shape=in_channels, epsilon=epsilon, **kwargs ) elif normalization == "rms_norm": @@ -106,7 +106,7 @@ def get_norm_layer( ) return norm_layer elif normalization is None: - return paddle.nn.Identity() + return nn.Identity() else: raise NotImplementedError("The type of normalization must be str") @@ -117,7 +117,7 @@ def generalize_padding(x, pad_t, pad_h, pad_w, padding_type, t_pad_left=False): assert padding_type in ["zeros", "ignore", "nearest"] B, T, H, W, C = x.shape if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( 
x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T + pad_t, H + pad_h, W + pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) elif t_pad_left: @@ -138,7 +138,7 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): if pad_t == 0 and pad_h == 0 and pad_w == 0: return x if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T - pad_t, H - pad_h, W - pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) else: @@ -146,13 +146,13 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): def apply_initialization( - m: paddle.nn.Layer, + m: nn.Layer, linear_mode: str = "0", conv_mode: str = "0", norm_mode: str = "0", embed_mode: str = "0", ): - if isinstance(m, paddle.nn.Linear): + if isinstance(m, nn.Linear): if linear_mode in ("0",): m.weight = initializer.kaiming_normal_(m.weight, nonlinearity="linear") elif linear_mode in ("1",): @@ -166,10 +166,10 @@ def apply_initialization( elif isinstance( m, ( - paddle.nn.Conv2D, - paddle.nn.Conv3D, - paddle.nn.Conv2DTranspose, - paddle.nn.Conv3DTranspose, + nn.Conv2D, + nn.Conv3D, + nn.Conv2DTranspose, + nn.Conv3DTranspose, ), ): if conv_mode in ("0",): @@ -180,19 +180,19 @@ def apply_initialization( raise NotImplementedError(f"{conv_mode} is invalid.") if hasattr(m, "bias") and m.bias is not None: m.bias = initializer.zeros_(m.bias) - elif isinstance(m, paddle.nn.LayerNorm): + elif isinstance(m, nn.LayerNorm): if norm_mode in ("0",): m.weight = initializer.zeros_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.GroupNorm): + elif isinstance(m, nn.GroupNorm): if norm_mode in ("0",): m.weight = initializer.ones_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.Embedding): + elif isinstance(m, nn.Embedding): if embed_mode in ("0",): m.weight.data = initializer.trunc_normal_(m.weight.data, std=0.02) else: diff --git a/ppsci/arch/mlp.py b/ppsci/arch/mlp.py index fea0ea79d2..9a9d142a8d 100644 --- a/ppsci/arch/mlp.py +++ b/ppsci/arch/mlp.py @@ -103,7 +103,7 @@ def __init__(self, periods: Dict[str, Tuple[float, bool]]): ) # mu = 2*pi / period for sin/cos function for k, (p, trainable) in periods.items() } - self.freqs = paddle.nn.ParameterList(list(self.freqs_dict.values())) + self.freqs = nn.ParameterList(list(self.freqs_dict.values())) def forward(self, x: Dict[str, paddle.Tensor]): y = {k: v for k, v in x.items()} # shallow copy to avoid modifying input dict diff --git a/ppsci/arch/nowcastnet.py b/ppsci/arch/nowcastnet.py index 38d5209616..bc7538ad91 100644 --- a/ppsci/arch/nowcastnet.py +++ b/ppsci/arch/nowcastnet.py @@ -16,6 +16,7 @@ from typing import Tuple import paddle +from paddle import nn from ppsci.arch import base @@ -139,7 +140,7 @@ def forward_tensor(self, x): return gen_result.unsqueeze(axis=-1) -class Evolution_Network(paddle.nn.Layer): +class Evolution_Network(nn.Layer): def __init__(self, n_channels, n_classes, base_c=64, bilinear=True): super().__init__() self.n_channels = n_channels @@ -161,7 +162,7 @@ def __init__(self, n_channels, n_classes, base_c=64, bilinear=True): gamma = self.create_parameter( shape=param1.shape, dtype=param1.dtype, - default_initializer=paddle.nn.initializer.Assign(param1), + default_initializer=nn.initializer.Assign(param1), ) gamma.stop_gradient = False self.gamma = gamma @@ -190,26 +191,26 @@ def forward(self, x): return x, v -class 
DoubleConv(paddle.nn.Layer): +class DoubleConv(nn.Layer): def __init__(self, in_channels, out_channels, kernel=3, mid_channels=None): super().__init__() if not mid_channels: mid_channels = out_channels - self.double_conv = paddle.nn.Sequential( - paddle.nn.BatchNorm2D(num_features=in_channels), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + self.double_conv = nn.Sequential( + nn.BatchNorm2D(num_features=in_channels), + nn.ReLU(), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=in_channels, out_channels=mid_channels, kernel_size=kernel, padding=kernel // 2, ) ), - paddle.nn.BatchNorm2D(num_features=mid_channels), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + nn.BatchNorm2D(num_features=mid_channels), + nn.ReLU(), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel, @@ -217,10 +218,10 @@ def __init__(self, in_channels, out_channels, kernel=3, mid_channels=None): ) ), ) - self.single_conv = paddle.nn.Sequential( - paddle.nn.BatchNorm2D(num_features=in_channels), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + self.single_conv = nn.Sequential( + nn.BatchNorm2D(num_features=in_channels), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel, @@ -236,11 +237,11 @@ def forward(self, x): return x -class Down(paddle.nn.Layer): +class Down(nn.Layer): def __init__(self, in_channels, out_channels, kernel=3): super().__init__() - self.maxpool_conv = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2), + self.maxpool_conv = nn.Sequential( + nn.MaxPool2D(kernel_size=2), DoubleConv(in_channels, out_channels, kernel), ) @@ -249,18 +250,16 @@ def forward(self, x): return x -class Up(paddle.nn.Layer): +class Up(nn.Layer): def __init__(self, in_channels, out_channels, bilinear=True, kernel=3): super().__init__() if bilinear: - self.up = paddle.nn.Upsample( - scale_factor=2, mode="bilinear", align_corners=True - ) + self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) self.conv = DoubleConv( in_channels, out_channels, kernel=kernel, mid_channels=in_channels // 2 ) else: - self.up = paddle.nn.Conv2DTranspose( + self.up = nn.Conv2DTranspose( in_channels=in_channels, out_channels=in_channels // 2, kernel_size=2, @@ -273,25 +272,23 @@ def forward(self, x1, x2): # input is CHW diffY = x2.shape[2] - x1.shape[2] diffX = x2.shape[3] - x1.shape[3] - x1 = paddle.nn.functional.pad( + x1 = nn.functional.pad( x1, [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2] ) x = paddle.concat(x=[x2, x1], axis=1) return self.conv(x) -class Up_S(paddle.nn.Layer): +class Up_S(nn.Layer): def __init__(self, in_channels, out_channels, bilinear=True, kernel=3): super().__init__() if bilinear: - self.up = paddle.nn.Upsample( - scale_factor=2, mode="bilinear", align_corners=True - ) + self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) self.conv = DoubleConv( in_channels, out_channels, kernel=kernel, mid_channels=in_channels ) else: - self.up = paddle.nn.Conv2DTranspose( + self.up = nn.Conv2DTranspose( in_channels=in_channels, out_channels=in_channels, kernel_size=2, @@ -304,10 +301,10 @@ def forward(self, x): return self.conv(x) -class OutConv(paddle.nn.Layer): +class OutConv(nn.Layer): def __init__(self, in_channels, out_channels): super().__init__() - self.conv = paddle.nn.Conv2D( + self.conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, 
kernel_size=1 ) @@ -315,7 +312,7 @@ def forward(self, x): return self.conv(x) -class Generative_Encoder(paddle.nn.Layer): +class Generative_Encoder(nn.Layer): def __init__(self, n_channels, base_c=64): super().__init__() base_c = base_c @@ -332,13 +329,13 @@ def forward(self, x): return x -class Generative_Decoder(paddle.nn.Layer): +class Generative_Decoder(nn.Layer): def __init__(self, opt): super().__init__() self.opt = opt nf = opt.ngf ic = opt.ic_feature - self.fc = paddle.nn.Conv2D( + self.fc = nn.Conv2D( in_channels=ic, out_channels=8 * nf, kernel_size=3, padding=1 ) self.head_0 = GenBlock(8 * nf, 8 * nf, opt) @@ -348,10 +345,10 @@ def __init__(self, opt): self.up_1 = GenBlock(2 * nf, 1 * nf, opt, double_conv=True) self.up_2 = GenBlock(1 * nf, 1 * nf, opt, double_conv=True) final_nc = nf * 1 - self.conv_img = paddle.nn.Conv2D( + self.conv_img = nn.Conv2D( in_channels=final_nc, out_channels=self.opt.gen_oc, kernel_size=3, padding=1 ) - self.up = paddle.nn.Upsample(scale_factor=2) + self.up = nn.Upsample(scale_factor=2) def forward(self, x, evo): x = self.fc(x) @@ -364,26 +361,26 @@ def forward(self, x, evo): x = self.up(x) x = self.up_1(x, evo) x = self.up_2(x, evo) - x = self.conv_img(paddle.nn.functional.leaky_relu(x=x, negative_slope=0.2)) + x = self.conv_img(nn.functional.leaky_relu(x=x, negative_slope=0.2)) return x -class GenBlock(paddle.nn.Layer): +class GenBlock(nn.Layer): def __init__(self, fin, fout, opt, use_se=False, dilation=1, double_conv=False): super().__init__() self.learned_shortcut = fin != fout fmiddle = min(fin, fout) self.opt = opt self.double_conv = double_conv - self.pad = paddle.nn.Pad2D(padding=dilation, mode="reflect") - self.conv_0 = paddle.nn.Conv2D( + self.pad = nn.Pad2D(padding=dilation, mode="reflect") + self.conv_0 = nn.Conv2D( in_channels=fin, out_channels=fmiddle, kernel_size=3, padding=0, dilation=dilation, ) - self.conv_1 = paddle.nn.Conv2D( + self.conv_1 = nn.Conv2D( in_channels=fmiddle, out_channels=fout, kernel_size=3, @@ -391,13 +388,13 @@ def __init__(self, fin, fout, opt, use_se=False, dilation=1, double_conv=False): dilation=dilation, ) if self.learned_shortcut: - self.conv_s = paddle.nn.Conv2D( + self.conv_s = nn.Conv2D( in_channels=fin, out_channels=fout, kernel_size=1, bias_attr=False ) - self.conv_0 = paddle.nn.utils.spectral_norm(layer=self.conv_0) - self.conv_1 = paddle.nn.utils.spectral_norm(layer=self.conv_1) + self.conv_0 = nn.utils.spectral_norm(layer=self.conv_0) + self.conv_1 = nn.utils.spectral_norm(layer=self.conv_1) if self.learned_shortcut: - self.conv_s = paddle.nn.utils.spectral_norm(layer=self.conv_s) + self.conv_s = nn.utils.spectral_norm(layer=self.conv_s) ic = opt.evo_ic self.norm_0 = SPADE(fin, ic) self.norm_1 = SPADE(fmiddle, ic) @@ -420,37 +417,37 @@ def shortcut(self, x, evo): return x_s def actvn(self, x): - return paddle.nn.functional.leaky_relu(x=x, negative_slope=0.2) + return nn.functional.leaky_relu(x=x, negative_slope=0.2) -class SPADE(paddle.nn.Layer): +class SPADE(nn.Layer): def __init__(self, norm_nc, label_nc): super().__init__() ks = 3 - self.param_free_norm = paddle.nn.InstanceNorm2D( + self.param_free_norm = nn.InstanceNorm2D( num_features=norm_nc, weight_attr=False, bias_attr=False, momentum=1 - 0.1 ) nhidden = 64 ks = 3 pw = ks // 2 - self.mlp_shared = paddle.nn.Sequential( - paddle.nn.Pad2D(padding=pw, mode="reflect"), - paddle.nn.Conv2D( + self.mlp_shared = nn.Sequential( + nn.Pad2D(padding=pw, mode="reflect"), + nn.Conv2D( in_channels=label_nc, out_channels=nhidden, kernel_size=ks, padding=0 ), - 
paddle.nn.ReLU(), + nn.ReLU(), ) - self.pad = paddle.nn.Pad2D(padding=pw, mode="reflect") - self.mlp_gamma = paddle.nn.Conv2D( + self.pad = nn.Pad2D(padding=pw, mode="reflect") + self.mlp_gamma = nn.Conv2D( in_channels=nhidden, out_channels=norm_nc, kernel_size=ks, padding=0 ) - self.mlp_beta = paddle.nn.Conv2D( + self.mlp_beta = nn.Conv2D( in_channels=nhidden, out_channels=norm_nc, kernel_size=ks, padding=0 ) def forward(self, x, evo): normalized = self.param_free_norm(x) - evo = paddle.nn.functional.adaptive_avg_pool2d(x=evo, output_size=x.shape[2:]) + evo = nn.functional.adaptive_avg_pool2d(x=evo, output_size=x.shape[2:]) actv = self.mlp_shared(evo) gamma = self.mlp_gamma(self.pad(actv)) beta = self.mlp_beta(self.pad(actv)) @@ -458,12 +455,12 @@ def forward(self, x, evo): return out -class Noise_Projector(paddle.nn.Layer): +class Noise_Projector(nn.Layer): def __init__(self, input_length): super().__init__() self.input_length = input_length - self.conv_first = paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.conv_first = nn.utils.spectral_norm( + nn.Conv2D( in_channels=self.input_length, out_channels=self.input_length * 2, kernel_size=3, @@ -484,29 +481,29 @@ def forward(self, x): return x -class ProjBlock(paddle.nn.Layer): +class ProjBlock(nn.Layer): def __init__(self, in_channel, out_channel): super().__init__() - self.one_conv = paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.one_conv = nn.utils.spectral_norm( + nn.Conv2D( in_channels=in_channel, out_channels=out_channel - in_channel, kernel_size=1, padding=0, ) ) - self.double_conv = paddle.nn.Sequential( - paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.double_conv = nn.Sequential( + nn.utils.spectral_norm( + nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=3, padding=1, ) ), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + nn.ReLU(), + nn.utils.spectral_norm( + nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, @@ -538,7 +535,7 @@ def warp(input, flow, grid, mode="bilinear", padding_mode="zeros"): vgrid[:, 0, :, :] = 2.0 * vgrid[:, 0, :, :].clone() / max(W - 1, 1) - 1.0 vgrid[:, 1, :, :] = 2.0 * vgrid[:, 1, :, :].clone() / max(H - 1, 1) - 1.0 vgrid = vgrid.transpose(perm=[0, 2, 3, 1]) - output = paddle.nn.functional.grid_sample( + output = nn.functional.grid_sample( x=input.cpu(), grid=vgrid.cpu(), padding_mode=padding_mode, @@ -552,7 +549,7 @@ def l2normalize(v, eps=1e-12): return v / (v.norm() + eps) -class spectral_norm(paddle.nn.Layer): +class spectral_norm(nn.Layer): def __init__(self, module, name="weight", power_iterations=1): super().__init__() self.module = module @@ -596,7 +593,7 @@ def _make_params(self): out_0 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_0.stop_gradient = True u = out_0 @@ -605,7 +602,7 @@ def _make_params(self): out_1 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_1.stop_gradient = True v = out_1 @@ -615,7 +612,7 @@ def _make_params(self): out_2 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_2.stop_gradient = False w_bar = out_2 @@ -636,7 +633,7 @@ def create_param(x): param 
= paddle.create_parameter( shape=x.shape, dtype=x.dtype, - default_initializer=paddle.nn.initializer.Assign(x), + default_initializer=nn.initializer.Assign(x), ) param.stop_gradient = x.stop_gradient return param diff --git a/ppsci/arch/phycrnet.py b/ppsci/arch/phycrnet.py index f020607be8..c72583ebf9 100644 --- a/ppsci/arch/phycrnet.py +++ b/ppsci/arch/phycrnet.py @@ -133,7 +133,7 @@ def __init__( self.num_convlstm = num_layers[1] # encoder - downsampling - self.encoder = paddle.nn.LayerList( + self.encoder = nn.LayerList( [ encoder_block( input_channels=self.input_channels[i], @@ -147,7 +147,7 @@ def __init__( ) # ConvLSTM - self.convlstm = paddle.nn.LayerList( + self.convlstm = nn.LayerList( [ ConvLSTMCell( input_channels=self.input_channels[i], @@ -170,7 +170,7 @@ def __init__( # initialize weights self.apply(_initialize_weights) - initializer_0 = paddle.nn.initializer.Constant(0.0) + initializer_0 = nn.initializer.Constant(0.0) initializer_0(self.output_layer.bias) self.enable_transform = True @@ -334,8 +334,8 @@ def __init__( padding_mode="circular", ) - initializer_0 = paddle.nn.initializer.Constant(0.0) - initializer_1 = paddle.nn.initializer.Constant(1.0) + initializer_0 = nn.initializer.Constant(0.0) + initializer_1 = nn.initializer.Constant(1.0) initializer_0(self.Wxi.bias) initializer_0(self.Wxf.bias) @@ -343,10 +343,10 @@ def __init__( initializer_1(self.Wxo.bias) def forward(self, x, h, c): - ci = paddle.nn.functional.sigmoid(self.Wxi(x) + self.Whi(h)) - cf = paddle.nn.functional.sigmoid(self.Wxf(x) + self.Whf(h)) + ci = nn.functional.sigmoid(self.Wxi(x) + self.Whi(h)) + cf = nn.functional.sigmoid(self.Wxf(x) + self.Whf(h)) cc = cf * c + ci * paddle.tanh(self.Wxc(x) + self.Whc(h)) - co = paddle.nn.functional.sigmoid(self.Wxo(x) + self.Who(h)) + co = nn.functional.sigmoid(self.Wxo(x) + self.Who(h)) ch = co * paddle.tanh(cc) return ch, cc @@ -387,7 +387,7 @@ def __init__( self.act = nn.ReLU() - initializer_0 = paddle.nn.initializer.Constant(0.0) + initializer_0 = nn.initializer.Constant(0.0) initializer_0(self.conv.bias) def forward(self, x): @@ -418,7 +418,7 @@ def __init__(self, der_filter, resol, kernel_size=3, name=""): self.filter.weight = self.create_parameter( shape=self.filter.weight.shape, dtype=self.filter.weight.dtype, - default_initializer=paddle.nn.initializer.Assign( + default_initializer=nn.initializer.Assign( paddle.to_tensor( der_filter, dtype=paddle.get_default_dtype(), stop_gradient=True ) @@ -455,7 +455,7 @@ def __init__(self, der_filter, resol, kernel_size=3, name=""): self.filter.weight = self.create_parameter( shape=self.filter.weight.shape, dtype=self.filter.weight.dtype, - default_initializer=paddle.nn.initializer.Assign( + default_initializer=nn.initializer.Assign( paddle.to_tensor( der_filter, dtype=paddle.get_default_dtype(), stop_gradient=True ) From a18e02889e7c0535336ae78fc26ef2e902459729 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 14:01:57 +0800 Subject: [PATCH 3/3] update Extformer-MoE in docs --- README.md | 1 + docs/index.md | 1 + docs/zh/examples/extformer_moe.md | 6 +++--- examples/extformer_moe/requirements.txt | 2 ++ mkdocs.yml | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 examples/extformer_moe/requirements.txt diff --git a/README.md b/README.md index b7763837a4..0f1b7dfab0 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ PaddleScience 是一个基于深度学习框架 PaddlePaddle 开发的科学计 | 问题类型 | 案例名称 | 优化算法 | 模型类型 | 训练方式 | 数据集 | 参考资料 | 
|-----|---------|-----|---------|----|---------|---------|
+| 天气预报 | [Extformer-MoE 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/extformer_moe) | 数据驱动 | Extformer-MoE | 监督学习 | [ICAR-ENSO](https://tianchi.aliyun.com/dataset/98942) | - |
 | 天气预报 | [FourCastNet 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/fourcastnet) | 数据驱动 | FourCastNet | 监督学习 | [ERA5](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://arxiv.org/pdf/2202.11214.pdf) |
 | 天气预报 | [NowCastNet 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/nowcastnet) | 数据驱动 | NowCastNet | 监督学习 | [MRMS](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://www.nature.com/articles/s41586-023-06184-4) |
 | 天气预报 | [GraphCast 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/graphcast) | 数据驱动 | GraphCastNet | 监督学习 | - | [Paper](https://arxiv.org/abs/2212.12794) |
diff --git a/docs/index.md b/docs/index.md
index 195da54656..d5257fd1b9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -137,6 +137,7 @@
 | 问题类型 | 案例名称 | 优化算法 | 模型类型 | 训练方式 | 数据集 | 参考资料 |
 |-----|---------|-----|---------|----|---------|---------|
+| 天气预报 | [Extformer-MoE 气象预报](./zh/examples/extformer_moe.md) | 数据驱动 | Extformer-MoE | 监督学习 | [ICAR-ENSO](https://tianchi.aliyun.com/dataset/98942) | - |
 | 天气预报 | [FourCastNet 气象预报](./zh/examples/fourcastnet.md) | 数据驱动 | FourCastNet | 监督学习 | [ERA5](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://arxiv.org/pdf/2202.11214.pdf) |
 | 天气预报 | [NowCastNet 气象预报](./zh/examples/nowcastnet.md) | 数据驱动 | NowCastNet | 监督学习 | [MRMS](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://www.nature.com/articles/s41586-023-06184-4) |
 | 天气预报 | [GraphCast 气象预报](./zh/examples/graphcast.md) | 数据驱动 | GraphCastNet | 监督学习 | - | [Paper](https://arxiv.org/abs/2212.12794) |
diff --git a/docs/zh/examples/extformer_moe.md b/docs/zh/examples/extformer_moe.md
index 6c4286b8a8..25bb101da4 100644
--- a/docs/zh/examples/extformer_moe.md
+++ b/docs/zh/examples/extformer_moe.md
@@ -2,15 +2,15 @@
 
 !!! note
 
-    开始训练、评估前，请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942)，并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。
-    若训练时显存不足，可指定 `MODEL.checkpoint_level` 为 0、1 或 2，此时使用 recompute 模式运行，以训练时间换取显存。
+    1. 开始训练、评估前，请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942)，并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。
+    2. 开始训练、评估前，请安装 `xarray` 和 `h5netcdf`：`pip install -r requirements.txt`
+    3. 
若训练时显存不足,可指定 `MODEL.checkpoint_level` 为 `1` 或 `2`,此时使用 recompute 模式运行,以训练时间换取显存。 === "模型训练命令" ``` sh # ICAR-ENSO 数据预训练模型: Extformer-MoE python extformer_moe_enso_train.py - # python extformer_moe_enso_train.py MODEL.checkpoint_level=0 # using recompute to run in device with small GPU memory # python extformer_moe_enso_train.py MODEL.checkpoint_level=1 # using recompute to run in device with small GPU memory # python extformer_moe_enso_train.py MODEL.checkpoint_level=2 # using recompute to run in device with small GPU memory ``` diff --git a/examples/extformer_moe/requirements.txt b/examples/extformer_moe/requirements.txt new file mode 100644 index 0000000000..c0f424a290 --- /dev/null +++ b/examples/extformer_moe/requirements.txt @@ -0,0 +1,2 @@ +h5netcdf +xarray==2024.2.0 diff --git a/mkdocs.yml b/mkdocs.yml index e024b2a460..74af8b1b12 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -84,6 +84,7 @@ nav: - 材料科学(AI for Material): - hPINNs: zh/examples/hpinns.md - 地球科学(AI for Earth Science): + - Extformer-MoE: zh/examples/extformer_moe.md - FourCastNet: zh/examples/fourcastnet.md - NowcastNet: zh/examples/nowcastnet.md - DGMR: zh/examples/dgmr.md
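
Note on the `MODEL.checkpoint_level` option documented above: it trades extra training time for lower peak GPU memory via activation recomputation (recompute). The sketch below illustrates that general trade-off with Paddle's `paddle.distributed.fleet.utils.recompute`; it is a toy example under assumed names and shapes (`ToyBlock`, a 128-dim FFN), not the actual Extformer-MoE wiring patched in this series.

```python
# Illustrative only: shows how recompute lowers peak memory by re-running a
# sub-network in the backward pass instead of storing its activations.
# ToyBlock, the tensor shapes, and the hidden sizes are invented for this example.
import paddle
from paddle import nn
from paddle.distributed.fleet.utils import recompute


class ToyBlock(nn.Layer):
    def __init__(self, dim: int, use_recompute: bool = False):
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(dim, 4 * dim),
            nn.GELU(),
            nn.Linear(4 * dim, dim),
        )
        self.use_recompute = use_recompute

    def forward(self, x):
        if self.use_recompute and self.training:
            # Intermediate activations inside self.ffn are not kept; they are
            # recomputed during backward, reducing peak GPU memory usage.
            return x + recompute(self.ffn, x)
        return x + self.ffn(x)


x = paddle.randn([2, 16, 128])
x.stop_gradient = False  # ensure the recomputed subgraph participates in backward
y = ToyBlock(128, use_recompute=True)(x)
y.mean().backward()
```

Wrapping a sub-network in `recompute` is why higher `checkpoint_level` settings can run on smaller GPUs: memory is saved at the cost of re-executing parts of the forward pass during backpropagation.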