From e548e67b7b85a34c2292cf0cbce6b00754cc70e7 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 13:57:09 +0800 Subject: [PATCH 1/3] fix&refine extformer code and docs --- docs/zh/examples/extformer_moe.md | 9 +- .../extformer_moe/extformer_moe_enso_train.py | 2 +- ppsci/arch/extformer_moe_cuboid.py | 56 +++++----- ppsci/arch/extformer_moe_cuboid_decoder.py | 80 ++++++-------- ppsci/arch/extformer_moe_cuboid_encoder.py | 102 +++++++++--------- ppsci/arch/extformer_moe_cuboid_utils.py | 32 +++--- ppsci/arch/extformer_moe_utils.py | 37 +++---- 7 files changed, 152 insertions(+), 166 deletions(-) diff --git a/docs/zh/examples/extformer_moe.md b/docs/zh/examples/extformer_moe.md index 9c955399ff..6c4286b8a8 100644 --- a/docs/zh/examples/extformer_moe.md +++ b/docs/zh/examples/extformer_moe.md @@ -1,14 +1,18 @@ # Extformer-MoE -开始训练、评估前,请先下载,并对应修改 yaml 配置文件中的 FILE_PATH +!!! note -[ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942) + 开始训练、评估前,请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942),并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。 + 若训练时显存不足,可指定 `MODEL.checkpoint_level` 为 0、1 或 2,此时使用 recompute 模式运行,以训练时间换取显存。 === "模型训练命令" ``` sh # ICAR-ENSO 数据预训练模型: Extformer-MoE python extformer_moe_enso_train.py + # python extformer_moe_enso_train.py MODEL.checkpoint_level=0 # using recompute to run in device with small GPU memory + # python extformer_moe_enso_train.py MODEL.checkpoint_level=1 # using recompute to run in device with small GPU memory + # python extformer_moe_enso_train.py MODEL.checkpoint_level=2 # using recompute to run in device with small GPU memory ``` === "模型评估命令" @@ -46,7 +50,6 @@ Earthformer,一种用于地球系统预测的时空转换器。为了更好地 Rank-N-Contrast(RNC)是一种表征学习方法,旨在学习一种回归感知的样本表征,该表征以连续标签空间中的距离为依据,对嵌入空间中的样本间距离进行排序,然后利用它来预测最终连续的标签。在地球系统极端预测问题中,RNC 可以对气象数据的表征进行规范,使其满足嵌入空间的连续性,和标签空间对齐,最终缓解极端事件的预测结果的过平滑问题。 - ## 2. 
模型原理 ### 2.1 Earthformer diff --git a/examples/extformer_moe/extformer_moe_enso_train.py b/examples/extformer_moe/extformer_moe_enso_train.py index 9c1bc461ed..e0e570fb95 100644 --- a/examples/extformer_moe/extformer_moe_enso_train.py +++ b/examples/extformer_moe/extformer_moe_enso_train.py @@ -1,10 +1,10 @@ +import enso_metric import hydra import paddle from omegaconf import DictConfig from omegaconf import OmegaConf from paddle import nn -import examples.extformer_moe.enso_metric as enso_metric import ppsci diff --git a/ppsci/arch/extformer_moe_cuboid.py b/ppsci/arch/extformer_moe_cuboid.py index 53f8b6a6d2..bdd6311e2b 100644 --- a/ppsci/arch/extformer_moe_cuboid.py +++ b/ppsci/arch/extformer_moe_cuboid.py @@ -17,7 +17,7 @@ """A space-time Transformer with Cuboid Attention""" -class InitialEncoder(paddle.nn.Layer): +class InitialEncoder(nn.Layer): def __init__( self, dim, @@ -40,16 +40,14 @@ def __init__( for i in range(num_conv_layers): if i == 0: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" @@ -57,22 +55,20 @@ def __init__( ) else: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=out_dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) if isinstance(downsample_scale, int): patch_merge_downsample = (1, downsample_scale, downsample_scale) elif len(downsample_scale) == 2: @@ -123,7 +119,7 @@ def forward(self, x): return x -class FinalDecoder(paddle.nn.Layer): +class FinalDecoder(nn.Layer): def __init__( self, target_thw: Tuple[int, ...], @@ -145,20 +141,20 @@ def __init__( conv_block = [] for i in range(num_conv_layers): conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=dim, ) ) - conv_block.append(paddle.nn.GroupNorm(num_groups=16, num_channels=dim)) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) self.upsample = cuboid_decoder.Upsample3DLayer( dim=dim, out_dim=dim, @@ -199,7 +195,7 @@ def forward(self, x): return x -class InitialStackPatchMergingEncoder(paddle.nn.Layer): +class InitialStackPatchMergingEncoder(nn.Layer): def __init__( self, num_merge: int, @@ -224,8 +220,8 @@ def __init__( self.downsample_scale_list = downsample_scale_list[:num_merge] self.num_conv_per_merge_list = num_conv_per_merge_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.patch_merge_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.patch_merge_list = nn.LayerList() for i in range(num_merge): if i == 0: in_dim = in_dim @@ -240,7 +236,7 @@ def __init__( else: conv_in_dim = out_dim 
conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -248,7 +244,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -257,7 +253,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) patch_merge = cuboid_encoder.PatchMerging3D( dim=out_dim, @@ -307,7 +303,7 @@ def forward(self, x): return x -class FinalStackUpsamplingDecoder(paddle.nn.Layer): +class FinalStackUpsamplingDecoder(nn.Layer): def __init__( self, target_shape_list: Tuple[Tuple[int, ...]], @@ -331,8 +327,8 @@ def __init__( self.in_dim = in_dim self.num_conv_per_up_list = num_conv_per_up_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.upsample_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.upsample_list = nn.LayerList() for i in range(self.num_upsample): if i == 0: in_dim = in_dim @@ -354,7 +350,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -362,7 +358,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -371,7 +367,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) self.reset_parameters() @@ -694,7 +690,7 @@ def __init__( embed_dim=base_units, typ=pos_embed_type, maxH=H_in, maxW=W_in, maxT=T_in ) mem_shapes = self.encoder.get_mem_shapes() - self.z_proj = paddle.nn.Linear( + self.z_proj = nn.Linear( in_features=mem_shapes[-1][-1], out_features=mem_shapes[-1][-1] ) self.dec_pos_embed = cuboid_decoder.PosEmbed( @@ -812,7 +808,7 @@ def get_initial_encoder_final_decoder( new_input_shape = self.initial_encoder.patch_merge.get_out_shape( self.input_shape ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=self.base_units, out_features=C_out ) elif self.initial_downsample_type == "stack_conv": @@ -852,7 +848,7 @@ def get_initial_encoder_final_decoder( linear_init_mode=self.down_up_linear_init_mode, norm_init_mode=self.norm_init_mode, ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=dec_target_shape_list[-1][-1], out_features=C_out ) new_input_shape = self.initial_encoder.get_out_shape_list(self.input_shape)[ @@ -905,7 +901,7 @@ def get_initial_z(self, final_mem, T_out): shape=[B, -1, -1, -1, -1] ) elif self.z_init_method == "nearest_interp": - initial_z = paddle.nn.functional.interpolate( + initial_z = nn.functional.interpolate( x=final_mem.transpose(perm=[0, 4, 1, 2, 3]), size=(T_out, final_mem.shape[2], final_mem.shape[3]), ).transpose(perm=[0, 2, 3, 4, 1]) diff --git a/ppsci/arch/extformer_moe_cuboid_decoder.py b/ppsci/arch/extformer_moe_cuboid_decoder.py index d1192a6861..aee77f7a8a 100644 --- a/ppsci/arch/extformer_moe_cuboid_decoder.py +++ b/ppsci/arch/extformer_moe_cuboid_decoder.py @@ -13,7 +13,7 @@ from ppsci.utils import initializer -class PosEmbed(paddle.nn.Layer): +class PosEmbed(nn.Layer): """pose embeding Args: @@ -46,20 +46,12 @@ def __init__( 
self.maxW = maxW self.embed_dim = embed_dim if self.typ == "t+h+w": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.H_embed = paddle.nn.Embedding( - num_embeddings=maxH, embedding_dim=embed_dim - ) - self.W_embed = paddle.nn.Embedding( - num_embeddings=maxW, embedding_dim=embed_dim - ) + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.H_embed = nn.Embedding(num_embeddings=maxH, embedding_dim=embed_dim) + self.W_embed = nn.Embedding(num_embeddings=maxW, embedding_dim=embed_dim) elif self.typ == "t+hw": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.HW_embed = paddle.nn.Embedding( + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.HW_embed = nn.Embedding( num_embeddings=maxH * maxW, embedding_dim=embed_dim ) else: @@ -177,7 +169,7 @@ def compute_cuboid_cross_attention_mask( return attn_mask -class CuboidCrossAttentionLayer(paddle.nn.Layer): +class CuboidCrossAttentionLayer(nn.Layer): """Implements the cuboid cross attention. The idea of Cuboid Cross Attention is to extend the idea of cuboid self attention to work for the @@ -317,21 +309,19 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.q_proj = paddle.nn.Linear( - in_features=dim, out_features=dim, bias_attr=qkv_bias - ) - self.kv_proj = paddle.nn.Linear( + self.q_proj = nn.Linear(in_features=dim, out_features=dim, bias_attr=qkv_bias) + self.kv_proj = nn.Linear( in_features=dim, out_features=dim * 2, bias_attr=qkv_bias ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, @@ -556,7 +546,7 @@ def forward(self, x, mem, mem_global_vectors=None): return x -class StackCuboidCrossAttentionBlock(paddle.nn.Layer): +class StackCuboidCrossAttentionBlock(nn.Layer): """A stack of cuboid cross attention layers. 
The advantage of cuboid attention is that we can combine cuboid attention building blocks with different @@ -661,7 +651,7 @@ def __init__( self.use_global_vector = use_global_vector if self.use_inter_ffn: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.MixtureFFN( units=dim, @@ -681,7 +671,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -702,7 +692,7 @@ def __init__( ) else: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.MixtureFFN( units=dim, @@ -721,7 +711,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -741,7 +731,7 @@ def __init__( ) if moe_config["use_attn_moe"]: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ MixtureCrossAttention( dim=dim, @@ -778,7 +768,7 @@ def __init__( ] ) else: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidCrossAttentionLayer( dim=dim, @@ -854,7 +844,7 @@ def forward(self, x, mem, mem_global_vector=None): return x -class Upsample3DLayer(paddle.nn.Layer): +class Upsample3DLayer(nn.Layer): """Upsampling based on nn.UpSampling and Conv3x3. If the temporal dimension remains the same: @@ -889,12 +879,10 @@ def __init__( self.out_dim = out_dim self.temporal_upsample = temporal_upsample if temporal_upsample: - self.up = paddle.nn.Upsample(size=target_size, mode="nearest") + self.up = nn.Upsample(size=target_size, mode="nearest") else: - self.up = paddle.nn.Upsample( - size=(target_size[1], target_size[2]), mode="nearest" - ) - self.conv = paddle.nn.Conv2D( + self.up = nn.Upsample(size=(target_size[1], target_size[2]), mode="nearest") + self.conv = nn.Conv2D( in_channels=dim, out_channels=out_dim, kernel_size=(kernel_size, kernel_size), @@ -955,7 +943,7 @@ def forward(self, x): ) -class CuboidTransformerDecoder(paddle.nn.Layer): +class CuboidTransformerDecoder(nn.Layer): """Decoder of the CuboidTransformer. For each block, we first apply the StackCuboidSelfAttention and then apply the StackCuboidCrossAttention @@ -1169,8 +1157,8 @@ def __init__( ) for _ in range(ele_depth) ] - self_blocks.append(paddle.nn.LayerList(sublayers=stack_cuboid_blocks)) - self.self_blocks = paddle.nn.LayerList(sublayers=self_blocks) + self_blocks.append(nn.LayerList(sublayers=stack_cuboid_blocks)) + self.self_blocks = nn.LayerList(sublayers=self_blocks) if block_cross_attn_patterns is not None: if isinstance(block_cross_attn_patterns, (tuple, list)): @@ -1229,10 +1217,10 @@ def __init__( assert ( len(block_cross_n_temporal) == self.num_blocks ), f"Incorrect input format! 
Received block_cross_n_temporal={block_cross_n_temporal}" - self.cross_blocks = paddle.nn.LayerList() + self.cross_blocks = nn.LayerList() assert self.cross_start == 0 for i in range(self.cross_start, self.num_blocks): - cross_block = paddle.nn.LayerList( + cross_block = nn.LayerList( sublayers=[ StackCuboidCrossAttentionBlock( dim=self.mem_shapes[i][-1], @@ -1268,7 +1256,7 @@ def __init__( self.cross_blocks.append(cross_block) if self.num_blocks > 1: if self.upsample_type == "upsample": - self.upsample_layers = paddle.nn.LayerList( + self.upsample_layers = nn.LayerList( sublayers=[ Upsample3DLayer( dim=self.mem_shapes[i + 1][-1], @@ -1285,7 +1273,7 @@ def __init__( else: raise NotImplementedError(f"{self.upsample_type} is invalid.") if self.hierarchical_pos_embed: - self.hierarchical_pos_embed_l = paddle.nn.LayerList( + self.hierarchical_pos_embed_l = nn.LayerList( sublayers=[ PosEmbed( embed_dim=self.mem_shapes[i][-1], @@ -1368,7 +1356,7 @@ def forward(self, x, mem_l, mem_global_vector_l=None): return x -class MixtureCrossAttention(paddle.nn.Layer): +class MixtureCrossAttention(nn.Layer): def __init__( self, dim, @@ -1424,7 +1412,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ CuboidCrossAttentionLayer( dim=dim, diff --git a/ppsci/arch/extformer_moe_cuboid_encoder.py b/ppsci/arch/extformer_moe_cuboid_encoder.py index a21b126c0a..c26b3837a5 100644 --- a/ppsci/arch/extformer_moe_cuboid_encoder.py +++ b/ppsci/arch/extformer_moe_cuboid_encoder.py @@ -16,7 +16,7 @@ NEGATIVE_SLOPE = 0.1 -class PatchMerging3D(paddle.nn.Layer): +class PatchMerging3D(nn.Layer): """Patch Merging Layer Args: @@ -49,7 +49,7 @@ def __init__( self.out_dim = out_dim self.downsample = downsample self.padding_type = padding_type - self.reduction = paddle.nn.Linear( + self.reduction = nn.Linear( in_features=downsample[0] * downsample[1] * downsample[2] * dim, out_features=out_dim, bias_attr=False, @@ -127,7 +127,7 @@ def forward(self, x): return x -class PositionwiseFFN(paddle.nn.Layer): +class PositionwiseFFN(nn.Layer): """The Position-wise FFN layer used in Transformer-like architectures If pre_norm is True: @@ -187,8 +187,8 @@ def __init__( ("pre_norm", pre_norm), ] ) - self.dropout_layer = paddle.nn.Dropout(p=dropout) - self.activation_dropout_layer = paddle.nn.Dropout(p=activation_dropout) + self.dropout_layer = nn.Dropout(p=dropout) + self.activation_dropout_layer = nn.Dropout(p=activation_dropout) if moe_config["use_linear_moe"]: self.ffn_1 = MixtureLinear( @@ -199,11 +199,11 @@ def __init__( moe_config=moe_config, ) else: - self.ffn_1 = paddle.nn.Linear( + self.ffn_1 = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if self._gated_proj: - self.ffn_1_gate = paddle.nn.Linear( + self.ffn_1_gate = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if activation == "leaky_relu": @@ -220,7 +220,7 @@ def __init__( moe_config=moe_config, ) else: - self.ffn_2 = paddle.nn.Linear( + self.ffn_2 = nn.Linear( in_features=hidden_size, out_features=units, bias_attr=True ) self.layer_norm = cuboid_utils.get_norm_layer( @@ -424,9 +424,11 @@ def masked_softmax(att_score, mask, axis: int = -1): att_score = att_score.masked_fill(paddle.logical_not(mask), -1e4) else: att_score = att_score.masked_fill(paddle.logical_not(mask), -1e18) - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) * mask + att_weights = nn.functional.softmax(x=att_score, axis=axis) * mask.astype( + att_score.dtype + ) else: - 
att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) + att_weights = nn.functional.softmax(x=att_score, axis=axis) return att_weights @@ -475,7 +477,7 @@ def cuboid_reorder_reverse(data, cuboid_size, strategy, orig_data_shape): return data -class CuboidSelfAttentionLayer(paddle.nn.Layer): +class CuboidSelfAttentionLayer(nn.Layer): """Implements the cuboid self attention. The idea of Cuboid Self Attention is to divide the input tensor (T, H, W) into several non-overlapping cuboids. @@ -613,49 +615,47 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias - ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) + self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, ) - self.g2l_global_q_net = paddle.nn.Linear( + self.g2l_global_q_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim, bias_attr=qkv_bias, ) - self.g2l_k_net = paddle.nn.Linear( + self.g2l_k_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.g2l_v_net = paddle.nn.Linear( + self.g2l_v_net = nn.Linear( in_features=dim, out_features=global_dim_ratio * dim, bias_attr=qkv_bias, ) if self.use_global_self_attn: - self.g2g_global_qkv_net = paddle.nn.Linear( + self.g2g_global_qkv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim * 3, bias_attr=qkv_bias, ) else: - self.global_qkv = paddle.nn.Linear( + self.global_qkv = nn.Linear( in_features=dim, out_features=dim * 3, bias_attr=qkv_bias ) - self.global_attn_drop = paddle.nn.Dropout(p=attn_drop) + self.global_attn_drop = nn.Dropout(p=attn_drop) if use_final_proj: - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: - self.global_proj = paddle.nn.Linear( + self.global_proj = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim, ) @@ -980,7 +980,7 @@ def forward(self, x, global_vectors=None): return x -class StackCuboidSelfAttentionBlock(paddle.nn.Layer): +class StackCuboidSelfAttentionBlock(nn.Layer): """ - "use_inter_ffn" is True x --> attn1 -----+-------> ffn1 ---+---> attn2 --> ... 
--> ffn_k --> out @@ -1083,7 +1083,7 @@ def __init__( self.global_dim_ratio = global_dim_ratio if self.use_inter_ffn: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=dim, @@ -1103,7 +1103,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1124,7 +1124,7 @@ def __init__( ) if self.use_global_vector_ffn and self.use_global_vector: if moe_config["use_ffn_moe"]: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=global_dim_ratio * dim, @@ -1144,7 +1144,7 @@ def __init__( ] ) else: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1165,7 +1165,7 @@ def __init__( ) else: if moe_config["use_ffn_moe"]: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=dim, @@ -1184,7 +1184,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1204,7 +1204,7 @@ def __init__( ) if self.use_global_vector_ffn and self.use_global_vector: if moe_config["use_ffn_moe"]: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ MixtureFFN( units=global_dim_ratio * dim, @@ -1223,7 +1223,7 @@ def __init__( ] ) else: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1243,7 +1243,7 @@ def __init__( ) if moe_config["use_attn_moe"]: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ MixtureSelfAttention( dim=dim, @@ -1276,7 +1276,7 @@ def __init__( ] ) else: - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidSelfAttentionLayer( dim=dim, @@ -1385,7 +1385,7 @@ def forward(self, x, global_vectors=None): return x -class CuboidTransformerEncoder(paddle.nn.Layer): +class CuboidTransformerEncoder(nn.Layer): """Encoder of the CuboidTransformer x --> attn_block --> patch_merge --> attn_block --> patch_merge --> ... 
--> out @@ -1500,7 +1500,7 @@ def __init__( self.block_units = block_units if self.num_blocks > 1: if downsample_type == "patch_merge": - self.down_layers = paddle.nn.LayerList( + self.down_layers = nn.LayerList( sublayers=[ PatchMerging3D( dim=self.block_units[i], @@ -1516,9 +1516,9 @@ def __init__( else: raise NotImplementedError(f"{downsample_type} is invalid.") if self.use_global_vector: - self.down_layer_global_proj = paddle.nn.LayerList( + self.down_layer_global_proj = nn.LayerList( sublayers=[ - paddle.nn.Linear( + nn.Linear( in_features=global_dim_ratio * self.block_units[i], out_features=global_dim_ratio * self.block_units[i + 1], ) @@ -1566,9 +1566,9 @@ def __init__( self.block_shift_size = block_shift_size expert_shape_list = self.get_mem_shapes() - self.blocks = paddle.nn.LayerList( + self.blocks = nn.LayerList( sublayers=[ - paddle.nn.Sequential( + nn.Sequential( *[ StackCuboidSelfAttentionBlock( dim=self.block_units[i], @@ -1675,7 +1675,7 @@ def forward(self, x, global_vectors=None): return out -class MixtureLinear(paddle.nn.Layer): +class MixtureLinear(nn.Layer): def __init__(self, in_dim, out_dim, expert_shape, moe_config, bias_attr=True): super().__init__() @@ -1709,11 +1709,9 @@ def __init__(self, in_dim, out_dim, expert_shape, moe_config, bias_attr=True): else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ - paddle.nn.Linear( - in_features=in_dim, out_features=out_dim, bias_attr=bias_attr - ) + nn.Linear(in_features=in_dim, out_features=out_dim, bias_attr=bias_attr) for _ in range(self.num_experts) ] ) @@ -1763,7 +1761,7 @@ def forward(self, x): return y -class MixtureFFN(paddle.nn.Layer): +class MixtureFFN(nn.Layer): def __init__( self, units, @@ -1808,7 +1806,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ PositionwiseFFN( units=units, @@ -1878,7 +1876,7 @@ def reset_parameters(self): self.experts[i].reset_parameters() -class MixtureSelfAttention(paddle.nn.Layer): +class MixtureSelfAttention(nn.Layer): def __init__( self, dim, @@ -1933,7 +1931,7 @@ def __init__( else: raise NotImplementedError - self.experts = paddle.nn.LayerList( + self.experts = nn.LayerList( [ CuboidSelfAttentionLayer( dim=dim, diff --git a/ppsci/arch/extformer_moe_cuboid_utils.py b/ppsci/arch/extformer_moe_cuboid_utils.py index 49bde7c2ab..8b90626e3b 100644 --- a/ppsci/arch/extformer_moe_cuboid_utils.py +++ b/ppsci/arch/extformer_moe_cuboid_utils.py @@ -12,7 +12,7 @@ def round_to(dat, c): return dat + (dat - dat % c) % c -class RMSNorm(paddle.nn.Layer): +class RMSNorm(nn.Layer): """Root Mean Square Layer Normalization proposed in "[NeurIPS2019] Root Mean Square Layer Normalization" Args: @@ -94,7 +94,7 @@ def get_norm_layer( if normalization == "layer_norm": assert in_channels > 0 assert axis == -1 - norm_layer = paddle.nn.LayerNorm( + norm_layer = nn.LayerNorm( normalized_shape=in_channels, epsilon=epsilon, **kwargs ) elif normalization == "rms_norm": @@ -104,7 +104,7 @@ def get_norm_layer( raise NotImplementedError(f"normalization={normalization} is not supported") return norm_layer elif normalization is None: - return paddle.nn.Identity() + return nn.Identity() else: raise NotImplementedError("The type of normalization must be str") @@ -115,7 +115,7 @@ def generalize_padding(x, pad_t, pad_h, pad_w, padding_type, t_pad_left=False): assert padding_type in ["zeros", "ignore", "nearest"] B, T, H, W, C = x.shape if padding_type == "nearest": - return 
paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T + pad_t, H + pad_h, W + pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) elif t_pad_left: @@ -136,7 +136,7 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): if pad_t == 0 and pad_h == 0 and pad_w == 0: return x if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T - pad_t, H - pad_h, W - pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) else: @@ -144,13 +144,13 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): def apply_initialization( - m: paddle.nn.Layer, + m: nn.Layer, linear_mode: str = "0", conv_mode: str = "0", norm_mode: str = "0", embed_mode: str = "0", ): - if isinstance(m, paddle.nn.Linear): + if isinstance(m, nn.Linear): if linear_mode in ("0",): m.weight = initializer.kaiming_normal_(m.weight, nonlinearity="linear") elif linear_mode in ("1",): @@ -164,10 +164,10 @@ def apply_initialization( elif isinstance( m, ( - paddle.nn.Conv2D, - paddle.nn.Conv3D, - paddle.nn.Conv2DTranspose, - paddle.nn.Conv3DTranspose, + nn.Conv2D, + nn.Conv3D, + nn.Conv2DTranspose, + nn.Conv3DTranspose, ), ): if conv_mode in ("0",): @@ -178,26 +178,26 @@ def apply_initialization( raise NotImplementedError(f"{conv_mode} is invalid.") if hasattr(m, "bias") and m.bias is not None: m.bias = initializer.zeros_(m.bias) - elif isinstance(m, paddle.nn.LayerNorm): + elif isinstance(m, nn.LayerNorm): if norm_mode in ("0",): m.weight = initializer.zeros_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.GroupNorm): + elif isinstance(m, nn.GroupNorm): if norm_mode in ("0",): m.weight = initializer.ones_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.Embedding): + elif isinstance(m, nn.Embedding): if embed_mode in ("0",): m.weight.data = initializer.trunc_normal_(m.weight.data, std=0.02) else: raise NotImplementedError(f"{embed_mode} is invalid.") - elif isinstance(m, paddle.nn.Layer) and hasattr(m, "experts"): + elif isinstance(m, nn.Layer) and hasattr(m, "experts"): for lin in m.experts: - assert isinstance(lin, paddle.nn.Linear) + assert isinstance(lin, nn.Linear) apply_initialization(lin, linear_mode=linear_mode) else: pass diff --git a/ppsci/arch/extformer_moe_utils.py b/ppsci/arch/extformer_moe_utils.py index 0c57f96066..3332b356c8 100644 --- a/ppsci/arch/extformer_moe_utils.py +++ b/ppsci/arch/extformer_moe_utils.py @@ -1,11 +1,12 @@ import math import paddle +from paddle import nn # MoE Gating -class GatingNet(paddle.nn.Layer): +class GatingNet(nn.Layer): def __init__(self, moe_config, input_shape, in_channels): super().__init__() @@ -16,12 +17,12 @@ def __init__(self, moe_config, input_shape, in_channels): assert len(input_shape) == 4 self.input_shape = input_shape - self.noise_lin = paddle.nn.Linear( + self.noise_lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.noise_eps = 1e-2 - self.softplus = paddle.nn.Softplus() - self.softmax = paddle.nn.Softmax(axis=-1) + self.softplus = nn.Softplus() + self.softmax = nn.Softmax(axis=-1) self.importance_weight = moe_config["importance_weight"] self.load_weight = moe_config["load_weight"] @@ -178,7 +179,7 @@ def __init__(self, moe_config, input_shape, in_channels): assert len(input_shape) == 4 T, H, W, C = input_shape - 
self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) @@ -199,7 +200,7 @@ def __init__(self, moe_config, input_shape, in_channels): self.routing_weights = paddle.create_parameter( shape=[H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -222,16 +223,16 @@ def __init__(self, moe_config, input_shape, in_channels): self.spatial_routing_weights = paddle.create_parameter( shape=[H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) - self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.combine_weight = paddle.create_parameter( shape=[H, W, self.num_experts, 2], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -262,7 +263,7 @@ def __init__(self, moe_config, input_shape, in_channels): self.routing_weights = paddle.create_parameter( shape=[T, H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -285,17 +286,17 @@ def __init__(self, moe_config, input_shape, in_channels): self.cuboid_routing_weights = paddle.create_parameter( shape=[T, H, W, self.num_experts], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) - self.lin = paddle.nn.Linear( + self.lin = nn.Linear( in_features=in_channels, out_features=self.num_experts, bias_attr=False ) self.combine_weight = paddle.create_parameter( shape=[T, H, W, self.num_experts, 2], dtype="float32", - default_initializer=paddle.nn.initializer.Uniform(-bound, bound), + default_initializer=nn.initializer.Uniform(-bound, bound), ) def gating(self, x, t_map=None): @@ -418,7 +419,7 @@ def combine(self, expert_out): # RNC -class LabelDifference(paddle.nn.Layer): +class LabelDifference(nn.Layer): def __init__(self, distance_type="l1"): super().__init__() self.distance_type = distance_type @@ -435,7 +436,7 @@ def forward(self, labels): raise ValueError(self.distance_type) -class FeatureSimilarity(paddle.nn.Layer): +class FeatureSimilarity(nn.Layer): def __init__(self, similarity_type="l2", temperature=2): super().__init__() self.similarity_type = similarity_type @@ -454,7 +455,7 @@ def forward(self, features): logits -= logits_max.detach() return logits elif self.similarity_type == "cosine": - cos_func = paddle.nn.CosineSimilarity(axis=-1) + cos_func = nn.CosineSimilarity(axis=-1) logits = cos_func(features[:, :, None, :], features[:, None, :, :]) logits /= self.t return logits @@ -462,7 +463,7 @@ def forward(self, features): raise ValueError(self.similarity_type) -class RnCLoss(paddle.nn.Layer): +class RnCLoss(nn.Layer): def __init__(self, rnc_config): super().__init__() @@ -474,7 +475,7 @@ def __init__(self, rnc_config): ) self.rnc_weight = rnc_config["rank_reg_coeff"] self.loss_cal_mode = rnc_config["loss_cal_style"] - self.softmax_cri = paddle.nn.Softmax(axis=-1) + self.softmax_cri = nn.Softmax(axis=-1) def cal_loss(self, features, labels): From 
618529ad3177209a2682aa53f5f2da0896c70331 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 13:57:26 +0800 Subject: [PATCH 2/3] replace paddle.nn. with nn. --- ppsci/arch/activation.py | 2 +- ppsci/arch/amgnet.py | 28 ++--- ppsci/arch/cuboid_transformer.py | 56 ++++----- ppsci/arch/cuboid_transformer_decoder.py | 70 +++++------ ppsci/arch/cuboid_transformer_encoder.py | 74 ++++++------ ppsci/arch/cuboid_transformer_utils.py | 28 ++--- ppsci/arch/mlp.py | 2 +- ppsci/arch/nowcastnet.py | 147 +++++++++++------------ ppsci/arch/phycrnet.py | 22 ++-- 9 files changed, 204 insertions(+), 225 deletions(-) diff --git a/ppsci/arch/activation.py b/ppsci/arch/activation.py index f73b51581b..bfba786853 100644 --- a/ppsci/arch/activation.py +++ b/ppsci/arch/activation.py @@ -51,7 +51,7 @@ def __init__(self, beta: float = 1.0): super().__init__() self.beta = self.create_parameter( shape=[], - default_initializer=paddle.nn.initializer.Constant(beta), + default_initializer=nn.initializer.Constant(beta), ) def forward(self, x): diff --git a/ppsci/arch/amgnet.py b/ppsci/arch/amgnet.py index b0e4b89929..ce728317d6 100644 --- a/ppsci/arch/amgnet.py +++ b/ppsci/arch/amgnet.py @@ -238,21 +238,21 @@ def faster_graph_connectivity(perm, edge_index, edge_weight, score, pos, N, norm value_A = edge_weight.clone() value_A = paddle.squeeze(value_A) - model_1 = paddle.nn.Sequential( - ("l1", paddle.nn.Linear(128, 256)), - ("act1", paddle.nn.ReLU()), - ("l2", paddle.nn.Linear(256, 256)), - ("act2", paddle.nn.ReLU()), - ("l4", paddle.nn.Linear(256, 128)), - ("act4", paddle.nn.ReLU()), - ("l5", paddle.nn.Linear(128, 1)), + model_1 = nn.Sequential( + ("l1", nn.Linear(128, 256)), + ("act1", nn.ReLU()), + ("l2", nn.Linear(256, 256)), + ("act2", nn.ReLU()), + ("l4", nn.Linear(256, 128)), + ("act4", nn.ReLU()), + ("l5", nn.Linear(128, 1)), ) - model_2 = paddle.nn.Sequential( - ("l1", paddle.nn.Linear(1, 64)), - ("act1", paddle.nn.ReLU()), - ("l2", paddle.nn.Linear(64, 128)), - ("act2", paddle.nn.ReLU()), - ("l4", paddle.nn.Linear(128, 128)), + model_2 = nn.Sequential( + ("l1", nn.Linear(1, 64)), + ("act1", nn.ReLU()), + ("l2", nn.Linear(64, 128)), + ("act2", nn.ReLU()), + ("l4", nn.Linear(128, 128)), ) val_A = model_1(value_A) diff --git a/ppsci/arch/cuboid_transformer.py b/ppsci/arch/cuboid_transformer.py index 0b74caf52b..e0e6cbded6 100644 --- a/ppsci/arch/cuboid_transformer.py +++ b/ppsci/arch/cuboid_transformer.py @@ -16,7 +16,7 @@ """A space-time Transformer with Cuboid Attention""" -class InitialEncoder(paddle.nn.Layer): +class InitialEncoder(nn.Layer): def __init__( self, dim, @@ -38,16 +38,14 @@ def __init__( for i in range(num_conv_layers): if i == 0: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" @@ -55,22 +53,20 @@ def __init__( ) else: conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=out_dim, out_channels=out_dim, ) ) - conv_block.append( - paddle.nn.GroupNorm(num_groups=16, num_channels=out_dim) - ) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=out_dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = 
paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) if isinstance(downsample_scale, int): patch_merge_downsample = (1, downsample_scale, downsample_scale) elif len(downsample_scale) == 2: @@ -121,7 +117,7 @@ def forward(self, x): return x -class FinalDecoder(paddle.nn.Layer): +class FinalDecoder(nn.Layer): def __init__( self, target_thw: Tuple[int, ...], @@ -142,20 +138,20 @@ def __init__( conv_block = [] for i in range(num_conv_layers): conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=dim, out_channels=dim, ) ) - conv_block.append(paddle.nn.GroupNorm(num_groups=16, num_channels=dim)) + conv_block.append(nn.GroupNorm(num_groups=16, num_channels=dim)) conv_block.append( act_mod.get_activation(activation) if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - self.conv_block = paddle.nn.Sequential(*conv_block) + self.conv_block = nn.Sequential(*conv_block) self.upsample = cuboid_decoder.Upsample3DLayer( dim=dim, out_dim=dim, @@ -196,7 +192,7 @@ def forward(self, x): return x -class InitialStackPatchMergingEncoder(paddle.nn.Layer): +class InitialStackPatchMergingEncoder(nn.Layer): def __init__( self, num_merge: int, @@ -220,8 +216,8 @@ def __init__( self.downsample_scale_list = downsample_scale_list[:num_merge] self.num_conv_per_merge_list = num_conv_per_merge_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.patch_merge_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.patch_merge_list = nn.LayerList() for i in range(num_merge): if i == 0: in_dim = in_dim @@ -236,7 +232,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -244,7 +240,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -253,7 +249,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) patch_merge = cuboid_encoder.PatchMerging3D( dim=out_dim, @@ -303,7 +299,7 @@ def forward(self, x): return x -class FinalStackUpsamplingDecoder(paddle.nn.Layer): +class FinalStackUpsamplingDecoder(nn.Layer): def __init__( self, target_shape_list: Tuple[Tuple[int, ...]], @@ -326,8 +322,8 @@ def __init__( self.in_dim = in_dim self.num_conv_per_up_list = num_conv_per_up_list self.num_group_list = [max(1, out_dim // 4) for out_dim in self.out_dim_list] - self.conv_block_list = paddle.nn.LayerList() - self.upsample_list = paddle.nn.LayerList() + self.conv_block_list = nn.LayerList() + self.upsample_list = nn.LayerList() for i in range(self.num_upsample): if i == 0: in_dim = in_dim @@ -349,7 +345,7 @@ def __init__( else: conv_in_dim = out_dim conv_block.append( - paddle.nn.Conv2D( + nn.Conv2D( kernel_size=(3, 3), padding=(1, 1), in_channels=conv_in_dim, @@ -357,7 +353,7 @@ def __init__( ) ) conv_block.append( - paddle.nn.GroupNorm( + nn.GroupNorm( num_groups=self.num_group_list[i], num_channels=out_dim ) ) @@ -366,7 +362,7 @@ def __init__( if activation != "leaky_relu" else nn.LeakyReLU(NEGATIVE_SLOPE) ) - conv_block = paddle.nn.Sequential(*conv_block) + conv_block = nn.Sequential(*conv_block) self.conv_block_list.append(conv_block) self.reset_parameters() @@ -686,7 +682,7 @@ def __init__( 
embed_dim=base_units, typ=pos_embed_type, maxH=H_in, maxW=W_in, maxT=T_in ) mem_shapes = self.encoder.get_mem_shapes() - self.z_proj = paddle.nn.Linear( + self.z_proj = nn.Linear( in_features=mem_shapes[-1][-1], out_features=mem_shapes[-1][-1] ) self.dec_pos_embed = cuboid_decoder.PosEmbed( @@ -799,7 +795,7 @@ def get_initial_encoder_final_decoder( new_input_shape = self.initial_encoder.patch_merge.get_out_shape( self.input_shape ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=self.base_units, out_features=C_out ) elif self.initial_downsample_type == "stack_conv": @@ -839,7 +835,7 @@ def get_initial_encoder_final_decoder( linear_init_mode=self.down_up_linear_init_mode, norm_init_mode=self.norm_init_mode, ) - self.dec_final_proj = paddle.nn.Linear( + self.dec_final_proj = nn.Linear( in_features=dec_target_shape_list[-1][-1], out_features=C_out ) new_input_shape = self.initial_encoder.get_out_shape_list(self.input_shape)[ @@ -892,7 +888,7 @@ def get_initial_z(self, final_mem, T_out): shape=[B, -1, -1, -1, -1] ) elif self.z_init_method == "nearest_interp": - initial_z = paddle.nn.functional.interpolate( + initial_z = nn.functional.interpolate( x=final_mem.transpose(perm=[0, 4, 1, 2, 3]), size=(T_out, final_mem.shape[2], final_mem.shape[3]), ).transpose(perm=[0, 2, 3, 4, 1]) diff --git a/ppsci/arch/cuboid_transformer_decoder.py b/ppsci/arch/cuboid_transformer_decoder.py index 6504717042..894363b1a8 100644 --- a/ppsci/arch/cuboid_transformer_decoder.py +++ b/ppsci/arch/cuboid_transformer_decoder.py @@ -12,7 +12,7 @@ from ppsci.utils import initializer -class PosEmbed(paddle.nn.Layer): +class PosEmbed(nn.Layer): """Pose embeding Args: @@ -37,20 +37,12 @@ def __init__(self, embed_dim, maxT, maxH, maxW, typ: str = "t+h+w"): self.maxW = maxW self.embed_dim = embed_dim if self.typ == "t+h+w": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.H_embed = paddle.nn.Embedding( - num_embeddings=maxH, embedding_dim=embed_dim - ) - self.W_embed = paddle.nn.Embedding( - num_embeddings=maxW, embedding_dim=embed_dim - ) + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.H_embed = nn.Embedding(num_embeddings=maxH, embedding_dim=embed_dim) + self.W_embed = nn.Embedding(num_embeddings=maxW, embedding_dim=embed_dim) elif self.typ == "t+hw": - self.T_embed = paddle.nn.Embedding( - num_embeddings=maxT, embedding_dim=embed_dim - ) - self.HW_embed = paddle.nn.Embedding( + self.T_embed = nn.Embedding(num_embeddings=maxT, embedding_dim=embed_dim) + self.HW_embed = nn.Embedding( num_embeddings=maxH * maxW, embedding_dim=embed_dim ) else: @@ -168,7 +160,7 @@ def compute_cuboid_cross_attention_mask( return attn_mask -class CuboidCrossAttentionLayer(paddle.nn.Layer): +class CuboidCrossAttentionLayer(nn.Layer): """Implements the cuboid cross attention. 
The idea of Cuboid Cross Attention is to extend the idea of cuboid self attention to work for the @@ -307,21 +299,19 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.q_proj = paddle.nn.Linear( - in_features=dim, out_features=dim, bias_attr=qkv_bias - ) - self.kv_proj = paddle.nn.Linear( + self.q_proj = nn.Linear(in_features=dim, out_features=dim, bias_attr=qkv_bias) + self.kv_proj = nn.Linear( in_features=dim, out_features=dim * 2, bias_attr=qkv_bias ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.attn_drop = nn.Dropout(p=attn_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, @@ -546,7 +536,7 @@ def forward(self, x, mem, mem_global_vectors=None): return x -class StackCuboidCrossAttentionBlock(paddle.nn.Layer): +class StackCuboidCrossAttentionBlock(nn.Layer): """A stack of cuboid cross attention layers. The advantage of cuboid attention is that we can combine cuboid attention building blocks with different @@ -648,7 +638,7 @@ def __init__( self.use_inter_ffn = use_inter_ffn self.use_global_vector = use_global_vector if self.use_inter_ffn: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -666,7 +656,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ cuboid_encoder.PositionwiseFFN( units=dim, @@ -682,7 +672,7 @@ def __init__( ) ] ) - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidCrossAttentionLayer( dim=dim, @@ -755,7 +745,7 @@ def forward(self, x, mem, mem_global_vector=None): return x -class Upsample3DLayer(paddle.nn.Layer): +class Upsample3DLayer(nn.Layer): """Upsampling based on nn.UpSampling and Conv3x3. If the temporal dimension remains the same: @@ -789,12 +779,10 @@ def __init__( self.out_dim = out_dim self.temporal_upsample = temporal_upsample if temporal_upsample: - self.up = paddle.nn.Upsample(size=target_size, mode="nearest") + self.up = nn.Upsample(size=target_size, mode="nearest") else: - self.up = paddle.nn.Upsample( - size=(target_size[1], target_size[2]), mode="nearest" - ) - self.conv = paddle.nn.Conv2D( + self.up = nn.Upsample(size=(target_size[1], target_size[2]), mode="nearest") + self.conv = nn.Conv2D( in_channels=dim, out_channels=out_dim, kernel_size=(kernel_size, kernel_size), @@ -855,7 +843,7 @@ def forward(self, x): ) -class CuboidTransformerDecoder(paddle.nn.Layer): +class CuboidTransformerDecoder(nn.Layer): """Decoder of the CuboidTransformer. 
For each block, we first apply the StackCuboidSelfAttention and then apply the StackCuboidCrossAttention @@ -1062,8 +1050,8 @@ def __init__( ) for _ in range(ele_depth) ] - self_blocks.append(paddle.nn.LayerList(sublayers=stack_cuboid_blocks)) - self.self_blocks = paddle.nn.LayerList(sublayers=self_blocks) + self_blocks.append(nn.LayerList(sublayers=stack_cuboid_blocks)) + self.self_blocks = nn.LayerList(sublayers=self_blocks) if block_cross_attn_patterns is not None: if isinstance(block_cross_attn_patterns, (tuple, list)): assert len(block_cross_attn_patterns) == self.num_blocks @@ -1121,9 +1109,9 @@ def __init__( assert ( len(block_cross_n_temporal) == self.num_blocks ), f"Incorrect input format! Received block_cross_n_temporal={block_cross_n_temporal}" - self.cross_blocks = paddle.nn.LayerList() + self.cross_blocks = nn.LayerList() for i in range(self.cross_start, self.num_blocks): - cross_block = paddle.nn.LayerList( + cross_block = nn.LayerList( sublayers=[ StackCuboidCrossAttentionBlock( dim=self.mem_shapes[i][-1], @@ -1157,7 +1145,7 @@ def __init__( self.cross_blocks.append(cross_block) if self.num_blocks > 1: if self.upsample_type == "upsample": - self.upsample_layers = paddle.nn.LayerList( + self.upsample_layers = nn.LayerList( sublayers=[ Upsample3DLayer( dim=self.mem_shapes[i + 1][-1], @@ -1174,7 +1162,7 @@ def __init__( else: raise NotImplementedError(f"{self.upsample_type} is invalid.") if self.hierarchical_pos_embed: - self.hierarchical_pos_embed_l = paddle.nn.LayerList( + self.hierarchical_pos_embed_l = nn.LayerList( sublayers=[ PosEmbed( embed_dim=self.mem_shapes[i][-1], diff --git a/ppsci/arch/cuboid_transformer_encoder.py b/ppsci/arch/cuboid_transformer_encoder.py index 34ec3efa53..79b2e6fd1d 100644 --- a/ppsci/arch/cuboid_transformer_encoder.py +++ b/ppsci/arch/cuboid_transformer_encoder.py @@ -15,7 +15,7 @@ NEGATIVE_SLOPE = 0.1 -class PatchMerging3D(paddle.nn.Layer): +class PatchMerging3D(nn.Layer): """Patch Merging Layer Args: @@ -47,7 +47,7 @@ def __init__( self.out_dim = out_dim self.downsample = downsample self.padding_type = padding_type - self.reduction = paddle.nn.Linear( + self.reduction = nn.Linear( in_features=downsample[0] * downsample[1] * downsample[2] * dim, out_features=out_dim, bias_attr=False, @@ -125,7 +125,7 @@ def forward(self, x): return x -class PositionwiseFFN(paddle.nn.Layer): +class PositionwiseFFN(nn.Layer): """The Position-wise FFN layer used in Transformer-like architectures If pre_norm is True: @@ -183,20 +183,20 @@ def __init__( ("pre_norm", pre_norm), ] ) - self.dropout_layer = paddle.nn.Dropout(p=dropout) - self.activation_dropout_layer = paddle.nn.Dropout(p=activation_dropout) - self.ffn_1 = paddle.nn.Linear( + self.dropout_layer = nn.Dropout(p=dropout) + self.activation_dropout_layer = nn.Dropout(p=activation_dropout) + self.ffn_1 = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if self._gated_proj: - self.ffn_1_gate = paddle.nn.Linear( + self.ffn_1_gate = nn.Linear( in_features=units, out_features=hidden_size, bias_attr=True ) if activation == "leaky_relu": self.activation = nn.LeakyReLU(NEGATIVE_SLOPE) else: self.activation = act_mod.get_activation(activation) - self.ffn_2 = paddle.nn.Linear( + self.ffn_2 = nn.Linear( in_features=hidden_size, out_features=units, bias_attr=True ) self.layer_norm = cuboid_utils.get_norm_layer( @@ -400,9 +400,9 @@ def masked_softmax(att_score, mask, axis: int = -1): att_score = att_score.masked_fill(paddle.logical_not(mask), -1e4) else: att_score = 
att_score.masked_fill(paddle.logical_not(mask), -1e18) - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) * mask + att_weights = nn.functional.softmax(x=att_score, axis=axis) * mask else: - att_weights = paddle.nn.functional.softmax(x=att_score, axis=axis) + att_weights = nn.functional.softmax(x=att_score, axis=axis) return att_weights @@ -451,7 +451,7 @@ def cuboid_reorder_reverse(data, cuboid_size, strategy, orig_data_shape): return data -class CuboidSelfAttentionLayer(paddle.nn.Layer): +class CuboidSelfAttentionLayer(nn.Layer): """Implements the cuboid self attention. The idea of Cuboid Self Attention is to divide the input tensor (T, H, W) into several non-overlapping cuboids. @@ -588,49 +588,47 @@ def __init__( self.register_buffer( name="relative_position_index", tensor=relative_position_index ) - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias - ) - self.attn_drop = paddle.nn.Dropout(p=attn_drop) + self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(p=attn_drop) if self.use_global_vector: if self.separate_global_qkv: - self.l2g_q_net = paddle.nn.Linear( + self.l2g_q_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.l2g_global_kv_net = paddle.nn.Linear( + self.l2g_global_kv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim * 2, bias_attr=qkv_bias, ) - self.g2l_global_q_net = paddle.nn.Linear( + self.g2l_global_q_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=dim, bias_attr=qkv_bias, ) - self.g2l_k_net = paddle.nn.Linear( + self.g2l_k_net = nn.Linear( in_features=dim, out_features=dim, bias_attr=qkv_bias ) - self.g2l_v_net = paddle.nn.Linear( + self.g2l_v_net = nn.Linear( in_features=dim, out_features=global_dim_ratio * dim, bias_attr=qkv_bias, ) if self.use_global_self_attn: - self.g2g_global_qkv_net = paddle.nn.Linear( + self.g2g_global_qkv_net = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim * 3, bias_attr=qkv_bias, ) else: - self.global_qkv = paddle.nn.Linear( + self.global_qkv = nn.Linear( in_features=dim, out_features=dim * 3, bias_attr=qkv_bias ) - self.global_attn_drop = paddle.nn.Dropout(p=attn_drop) + self.global_attn_drop = nn.Dropout(p=attn_drop) if use_final_proj: - self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) - self.proj_drop = paddle.nn.Dropout(p=proj_drop) + self.proj = nn.Linear(in_features=dim, out_features=dim) + self.proj_drop = nn.Dropout(p=proj_drop) if self.use_global_vector: - self.global_proj = paddle.nn.Linear( + self.global_proj = nn.Linear( in_features=global_dim_ratio * dim, out_features=global_dim_ratio * dim, ) @@ -955,7 +953,7 @@ def forward(self, x, global_vectors=None): return x -class StackCuboidSelfAttentionBlock(paddle.nn.Layer): +class StackCuboidSelfAttentionBlock(nn.Layer): """ - "use_inter_ffn" is True x --> attn1 -----+-------> ffn1 ---+---> attn2 --> ... 
--> ffn_k --> out @@ -1055,7 +1053,7 @@ def __init__( self.use_global_self_attn = use_global_self_attn self.global_dim_ratio = global_dim_ratio if self.use_inter_ffn: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1073,7 +1071,7 @@ def __init__( ] ) if self.use_global_vector_ffn and self.use_global_vector: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1091,7 +1089,7 @@ def __init__( ] ) else: - self.ffn_l = paddle.nn.LayerList( + self.ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=dim, @@ -1108,7 +1106,7 @@ def __init__( ] ) if self.use_global_vector_ffn and self.use_global_vector: - self.global_ffn_l = paddle.nn.LayerList( + self.global_ffn_l = nn.LayerList( sublayers=[ PositionwiseFFN( units=global_dim_ratio * dim, @@ -1124,7 +1122,7 @@ def __init__( ) ] ) - self.attn_l = paddle.nn.LayerList( + self.attn_l = nn.LayerList( sublayers=[ CuboidSelfAttentionLayer( dim=dim, @@ -1233,7 +1231,7 @@ def forward(self, x, global_vectors=None): return x -class CuboidTransformerEncoder(paddle.nn.Layer): +class CuboidTransformerEncoder(nn.Layer): """Encoder of the CuboidTransformer x --> attn_block --> patch_merge --> attn_block --> patch_merge --> ... --> out @@ -1346,7 +1344,7 @@ def __init__( self.block_units = block_units if self.num_blocks > 1: if downsample_type == "patch_merge": - self.down_layers = paddle.nn.LayerList( + self.down_layers = nn.LayerList( sublayers=[ PatchMerging3D( dim=self.block_units[i], @@ -1362,9 +1360,9 @@ def __init__( else: raise NotImplementedError(f"{downsample_type} is invalid.") if self.use_global_vector: - self.down_layer_global_proj = paddle.nn.LayerList( + self.down_layer_global_proj = nn.LayerList( sublayers=[ - paddle.nn.Linear( + nn.Linear( in_features=global_dim_ratio * self.block_units[i], out_features=global_dim_ratio * self.block_units[i + 1], ) @@ -1410,9 +1408,9 @@ def __init__( self.block_cuboid_size = block_cuboid_size self.block_strategy = block_strategy self.block_shift_size = block_shift_size - self.blocks = paddle.nn.LayerList( + self.blocks = nn.LayerList( sublayers=[ - paddle.nn.Sequential( + nn.Sequential( *[ StackCuboidSelfAttentionBlock( dim=self.block_units[i], diff --git a/ppsci/arch/cuboid_transformer_utils.py b/ppsci/arch/cuboid_transformer_utils.py index 456e975cfd..02ef060002 100644 --- a/ppsci/arch/cuboid_transformer_utils.py +++ b/ppsci/arch/cuboid_transformer_utils.py @@ -12,7 +12,7 @@ def round_to(dat, c): return dat + (dat - dat % c) % c -class RMSNorm(paddle.nn.Layer): +class RMSNorm(nn.Layer): """Root Mean Square Layer Normalization proposed in "[NeurIPS2019] Root Mean Square Layer Normalization" Args: @@ -94,7 +94,7 @@ def get_norm_layer( if normalization == "layer_norm": assert in_channels > 0 assert axis == -1 - norm_layer = paddle.nn.LayerNorm( + norm_layer = nn.LayerNorm( normalized_shape=in_channels, epsilon=epsilon, **kwargs ) elif normalization == "rms_norm": @@ -106,7 +106,7 @@ def get_norm_layer( ) return norm_layer elif normalization is None: - return paddle.nn.Identity() + return nn.Identity() else: raise NotImplementedError("The type of normalization must be str") @@ -117,7 +117,7 @@ def generalize_padding(x, pad_t, pad_h, pad_w, padding_type, t_pad_left=False): assert padding_type in ["zeros", "ignore", "nearest"] B, T, H, W, C = x.shape if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( 
x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T + pad_t, H + pad_h, W + pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) elif t_pad_left: @@ -138,7 +138,7 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): if pad_t == 0 and pad_h == 0 and pad_w == 0: return x if padding_type == "nearest": - return paddle.nn.functional.interpolate( + return nn.functional.interpolate( x=x.transpose(perm=[0, 4, 1, 2, 3]), size=(T - pad_t, H - pad_h, W - pad_w) ).transpose(perm=[0, 2, 3, 4, 1]) else: @@ -146,13 +146,13 @@ def generalize_unpadding(x, pad_t, pad_h, pad_w, padding_type): def apply_initialization( - m: paddle.nn.Layer, + m: nn.Layer, linear_mode: str = "0", conv_mode: str = "0", norm_mode: str = "0", embed_mode: str = "0", ): - if isinstance(m, paddle.nn.Linear): + if isinstance(m, nn.Linear): if linear_mode in ("0",): m.weight = initializer.kaiming_normal_(m.weight, nonlinearity="linear") elif linear_mode in ("1",): @@ -166,10 +166,10 @@ def apply_initialization( elif isinstance( m, ( - paddle.nn.Conv2D, - paddle.nn.Conv3D, - paddle.nn.Conv2DTranspose, - paddle.nn.Conv3DTranspose, + nn.Conv2D, + nn.Conv3D, + nn.Conv2DTranspose, + nn.Conv3DTranspose, ), ): if conv_mode in ("0",): @@ -180,19 +180,19 @@ def apply_initialization( raise NotImplementedError(f"{conv_mode} is invalid.") if hasattr(m, "bias") and m.bias is not None: m.bias = initializer.zeros_(m.bias) - elif isinstance(m, paddle.nn.LayerNorm): + elif isinstance(m, nn.LayerNorm): if norm_mode in ("0",): m.weight = initializer.zeros_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.GroupNorm): + elif isinstance(m, nn.GroupNorm): if norm_mode in ("0",): m.weight = initializer.ones_(m.weight) m.bias = initializer.zeros_(m.bias) else: raise NotImplementedError(f"{norm_mode} is invalid.") - elif isinstance(m, paddle.nn.Embedding): + elif isinstance(m, nn.Embedding): if embed_mode in ("0",): m.weight.data = initializer.trunc_normal_(m.weight.data, std=0.02) else: diff --git a/ppsci/arch/mlp.py b/ppsci/arch/mlp.py index fea0ea79d2..9a9d142a8d 100644 --- a/ppsci/arch/mlp.py +++ b/ppsci/arch/mlp.py @@ -103,7 +103,7 @@ def __init__(self, periods: Dict[str, Tuple[float, bool]]): ) # mu = 2*pi / period for sin/cos function for k, (p, trainable) in periods.items() } - self.freqs = paddle.nn.ParameterList(list(self.freqs_dict.values())) + self.freqs = nn.ParameterList(list(self.freqs_dict.values())) def forward(self, x: Dict[str, paddle.Tensor]): y = {k: v for k, v in x.items()} # shallow copy to avoid modifying input dict diff --git a/ppsci/arch/nowcastnet.py b/ppsci/arch/nowcastnet.py index 38d5209616..bc7538ad91 100644 --- a/ppsci/arch/nowcastnet.py +++ b/ppsci/arch/nowcastnet.py @@ -16,6 +16,7 @@ from typing import Tuple import paddle +from paddle import nn from ppsci.arch import base @@ -139,7 +140,7 @@ def forward_tensor(self, x): return gen_result.unsqueeze(axis=-1) -class Evolution_Network(paddle.nn.Layer): +class Evolution_Network(nn.Layer): def __init__(self, n_channels, n_classes, base_c=64, bilinear=True): super().__init__() self.n_channels = n_channels @@ -161,7 +162,7 @@ def __init__(self, n_channels, n_classes, base_c=64, bilinear=True): gamma = self.create_parameter( shape=param1.shape, dtype=param1.dtype, - default_initializer=paddle.nn.initializer.Assign(param1), + default_initializer=nn.initializer.Assign(param1), ) gamma.stop_gradient = False self.gamma = gamma @@ -190,26 +191,26 @@ def forward(self, x): return x, v -class 
DoubleConv(paddle.nn.Layer): +class DoubleConv(nn.Layer): def __init__(self, in_channels, out_channels, kernel=3, mid_channels=None): super().__init__() if not mid_channels: mid_channels = out_channels - self.double_conv = paddle.nn.Sequential( - paddle.nn.BatchNorm2D(num_features=in_channels), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + self.double_conv = nn.Sequential( + nn.BatchNorm2D(num_features=in_channels), + nn.ReLU(), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=in_channels, out_channels=mid_channels, kernel_size=kernel, padding=kernel // 2, ) ), - paddle.nn.BatchNorm2D(num_features=mid_channels), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + nn.BatchNorm2D(num_features=mid_channels), + nn.ReLU(), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=mid_channels, out_channels=out_channels, kernel_size=kernel, @@ -217,10 +218,10 @@ def __init__(self, in_channels, out_channels, kernel=3, mid_channels=None): ) ), ) - self.single_conv = paddle.nn.Sequential( - paddle.nn.BatchNorm2D(num_features=in_channels), - paddle.nn.utils.spectral_norm( - layer=paddle.nn.Conv2D( + self.single_conv = nn.Sequential( + nn.BatchNorm2D(num_features=in_channels), + nn.utils.spectral_norm( + layer=nn.Conv2D( in_channels=in_channels, out_channels=out_channels, kernel_size=kernel, @@ -236,11 +237,11 @@ def forward(self, x): return x -class Down(paddle.nn.Layer): +class Down(nn.Layer): def __init__(self, in_channels, out_channels, kernel=3): super().__init__() - self.maxpool_conv = paddle.nn.Sequential( - paddle.nn.MaxPool2D(kernel_size=2), + self.maxpool_conv = nn.Sequential( + nn.MaxPool2D(kernel_size=2), DoubleConv(in_channels, out_channels, kernel), ) @@ -249,18 +250,16 @@ def forward(self, x): return x -class Up(paddle.nn.Layer): +class Up(nn.Layer): def __init__(self, in_channels, out_channels, bilinear=True, kernel=3): super().__init__() if bilinear: - self.up = paddle.nn.Upsample( - scale_factor=2, mode="bilinear", align_corners=True - ) + self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) self.conv = DoubleConv( in_channels, out_channels, kernel=kernel, mid_channels=in_channels // 2 ) else: - self.up = paddle.nn.Conv2DTranspose( + self.up = nn.Conv2DTranspose( in_channels=in_channels, out_channels=in_channels // 2, kernel_size=2, @@ -273,25 +272,23 @@ def forward(self, x1, x2): # input is CHW diffY = x2.shape[2] - x1.shape[2] diffX = x2.shape[3] - x1.shape[3] - x1 = paddle.nn.functional.pad( + x1 = nn.functional.pad( x1, [diffX // 2, diffX - diffX // 2, diffY // 2, diffY - diffY // 2] ) x = paddle.concat(x=[x2, x1], axis=1) return self.conv(x) -class Up_S(paddle.nn.Layer): +class Up_S(nn.Layer): def __init__(self, in_channels, out_channels, bilinear=True, kernel=3): super().__init__() if bilinear: - self.up = paddle.nn.Upsample( - scale_factor=2, mode="bilinear", align_corners=True - ) + self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True) self.conv = DoubleConv( in_channels, out_channels, kernel=kernel, mid_channels=in_channels ) else: - self.up = paddle.nn.Conv2DTranspose( + self.up = nn.Conv2DTranspose( in_channels=in_channels, out_channels=in_channels, kernel_size=2, @@ -304,10 +301,10 @@ def forward(self, x): return self.conv(x) -class OutConv(paddle.nn.Layer): +class OutConv(nn.Layer): def __init__(self, in_channels, out_channels): super().__init__() - self.conv = paddle.nn.Conv2D( + self.conv = nn.Conv2D( in_channels=in_channels, out_channels=out_channels, 
kernel_size=1 ) @@ -315,7 +312,7 @@ def forward(self, x): return self.conv(x) -class Generative_Encoder(paddle.nn.Layer): +class Generative_Encoder(nn.Layer): def __init__(self, n_channels, base_c=64): super().__init__() base_c = base_c @@ -332,13 +329,13 @@ def forward(self, x): return x -class Generative_Decoder(paddle.nn.Layer): +class Generative_Decoder(nn.Layer): def __init__(self, opt): super().__init__() self.opt = opt nf = opt.ngf ic = opt.ic_feature - self.fc = paddle.nn.Conv2D( + self.fc = nn.Conv2D( in_channels=ic, out_channels=8 * nf, kernel_size=3, padding=1 ) self.head_0 = GenBlock(8 * nf, 8 * nf, opt) @@ -348,10 +345,10 @@ def __init__(self, opt): self.up_1 = GenBlock(2 * nf, 1 * nf, opt, double_conv=True) self.up_2 = GenBlock(1 * nf, 1 * nf, opt, double_conv=True) final_nc = nf * 1 - self.conv_img = paddle.nn.Conv2D( + self.conv_img = nn.Conv2D( in_channels=final_nc, out_channels=self.opt.gen_oc, kernel_size=3, padding=1 ) - self.up = paddle.nn.Upsample(scale_factor=2) + self.up = nn.Upsample(scale_factor=2) def forward(self, x, evo): x = self.fc(x) @@ -364,26 +361,26 @@ def forward(self, x, evo): x = self.up(x) x = self.up_1(x, evo) x = self.up_2(x, evo) - x = self.conv_img(paddle.nn.functional.leaky_relu(x=x, negative_slope=0.2)) + x = self.conv_img(nn.functional.leaky_relu(x=x, negative_slope=0.2)) return x -class GenBlock(paddle.nn.Layer): +class GenBlock(nn.Layer): def __init__(self, fin, fout, opt, use_se=False, dilation=1, double_conv=False): super().__init__() self.learned_shortcut = fin != fout fmiddle = min(fin, fout) self.opt = opt self.double_conv = double_conv - self.pad = paddle.nn.Pad2D(padding=dilation, mode="reflect") - self.conv_0 = paddle.nn.Conv2D( + self.pad = nn.Pad2D(padding=dilation, mode="reflect") + self.conv_0 = nn.Conv2D( in_channels=fin, out_channels=fmiddle, kernel_size=3, padding=0, dilation=dilation, ) - self.conv_1 = paddle.nn.Conv2D( + self.conv_1 = nn.Conv2D( in_channels=fmiddle, out_channels=fout, kernel_size=3, @@ -391,13 +388,13 @@ def __init__(self, fin, fout, opt, use_se=False, dilation=1, double_conv=False): dilation=dilation, ) if self.learned_shortcut: - self.conv_s = paddle.nn.Conv2D( + self.conv_s = nn.Conv2D( in_channels=fin, out_channels=fout, kernel_size=1, bias_attr=False ) - self.conv_0 = paddle.nn.utils.spectral_norm(layer=self.conv_0) - self.conv_1 = paddle.nn.utils.spectral_norm(layer=self.conv_1) + self.conv_0 = nn.utils.spectral_norm(layer=self.conv_0) + self.conv_1 = nn.utils.spectral_norm(layer=self.conv_1) if self.learned_shortcut: - self.conv_s = paddle.nn.utils.spectral_norm(layer=self.conv_s) + self.conv_s = nn.utils.spectral_norm(layer=self.conv_s) ic = opt.evo_ic self.norm_0 = SPADE(fin, ic) self.norm_1 = SPADE(fmiddle, ic) @@ -420,37 +417,37 @@ def shortcut(self, x, evo): return x_s def actvn(self, x): - return paddle.nn.functional.leaky_relu(x=x, negative_slope=0.2) + return nn.functional.leaky_relu(x=x, negative_slope=0.2) -class SPADE(paddle.nn.Layer): +class SPADE(nn.Layer): def __init__(self, norm_nc, label_nc): super().__init__() ks = 3 - self.param_free_norm = paddle.nn.InstanceNorm2D( + self.param_free_norm = nn.InstanceNorm2D( num_features=norm_nc, weight_attr=False, bias_attr=False, momentum=1 - 0.1 ) nhidden = 64 ks = 3 pw = ks // 2 - self.mlp_shared = paddle.nn.Sequential( - paddle.nn.Pad2D(padding=pw, mode="reflect"), - paddle.nn.Conv2D( + self.mlp_shared = nn.Sequential( + nn.Pad2D(padding=pw, mode="reflect"), + nn.Conv2D( in_channels=label_nc, out_channels=nhidden, kernel_size=ks, padding=0 ), - 
paddle.nn.ReLU(), + nn.ReLU(), ) - self.pad = paddle.nn.Pad2D(padding=pw, mode="reflect") - self.mlp_gamma = paddle.nn.Conv2D( + self.pad = nn.Pad2D(padding=pw, mode="reflect") + self.mlp_gamma = nn.Conv2D( in_channels=nhidden, out_channels=norm_nc, kernel_size=ks, padding=0 ) - self.mlp_beta = paddle.nn.Conv2D( + self.mlp_beta = nn.Conv2D( in_channels=nhidden, out_channels=norm_nc, kernel_size=ks, padding=0 ) def forward(self, x, evo): normalized = self.param_free_norm(x) - evo = paddle.nn.functional.adaptive_avg_pool2d(x=evo, output_size=x.shape[2:]) + evo = nn.functional.adaptive_avg_pool2d(x=evo, output_size=x.shape[2:]) actv = self.mlp_shared(evo) gamma = self.mlp_gamma(self.pad(actv)) beta = self.mlp_beta(self.pad(actv)) @@ -458,12 +455,12 @@ def forward(self, x, evo): return out -class Noise_Projector(paddle.nn.Layer): +class Noise_Projector(nn.Layer): def __init__(self, input_length): super().__init__() self.input_length = input_length - self.conv_first = paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.conv_first = nn.utils.spectral_norm( + nn.Conv2D( in_channels=self.input_length, out_channels=self.input_length * 2, kernel_size=3, @@ -484,29 +481,29 @@ def forward(self, x): return x -class ProjBlock(paddle.nn.Layer): +class ProjBlock(nn.Layer): def __init__(self, in_channel, out_channel): super().__init__() - self.one_conv = paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.one_conv = nn.utils.spectral_norm( + nn.Conv2D( in_channels=in_channel, out_channels=out_channel - in_channel, kernel_size=1, padding=0, ) ) - self.double_conv = paddle.nn.Sequential( - paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + self.double_conv = nn.Sequential( + nn.utils.spectral_norm( + nn.Conv2D( in_channels=in_channel, out_channels=out_channel, kernel_size=3, padding=1, ) ), - paddle.nn.ReLU(), - paddle.nn.utils.spectral_norm( - paddle.nn.Conv2D( + nn.ReLU(), + nn.utils.spectral_norm( + nn.Conv2D( in_channels=out_channel, out_channels=out_channel, kernel_size=3, @@ -538,7 +535,7 @@ def warp(input, flow, grid, mode="bilinear", padding_mode="zeros"): vgrid[:, 0, :, :] = 2.0 * vgrid[:, 0, :, :].clone() / max(W - 1, 1) - 1.0 vgrid[:, 1, :, :] = 2.0 * vgrid[:, 1, :, :].clone() / max(H - 1, 1) - 1.0 vgrid = vgrid.transpose(perm=[0, 2, 3, 1]) - output = paddle.nn.functional.grid_sample( + output = nn.functional.grid_sample( x=input.cpu(), grid=vgrid.cpu(), padding_mode=padding_mode, @@ -552,7 +549,7 @@ def l2normalize(v, eps=1e-12): return v / (v.norm() + eps) -class spectral_norm(paddle.nn.Layer): +class spectral_norm(nn.Layer): def __init__(self, module, name="weight", power_iterations=1): super().__init__() self.module = module @@ -596,7 +593,7 @@ def _make_params(self): out_0 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_0.stop_gradient = True u = out_0 @@ -605,7 +602,7 @@ def _make_params(self): out_1 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_1.stop_gradient = True v = out_1 @@ -615,7 +612,7 @@ def _make_params(self): out_2 = paddle.create_parameter( shape=tmp_w.shape, dtype=tmp_w.numpy().dtype, - default_initializer=paddle.nn.initializer.Assign(tmp_w), + default_initializer=nn.initializer.Assign(tmp_w), ) out_2.stop_gradient = False w_bar = out_2 @@ -636,7 +633,7 @@ def create_param(x): param 
= paddle.create_parameter( shape=x.shape, dtype=x.dtype, - default_initializer=paddle.nn.initializer.Assign(x), + default_initializer=nn.initializer.Assign(x), ) param.stop_gradient = x.stop_gradient return param diff --git a/ppsci/arch/phycrnet.py b/ppsci/arch/phycrnet.py index f020607be8..c72583ebf9 100644 --- a/ppsci/arch/phycrnet.py +++ b/ppsci/arch/phycrnet.py @@ -133,7 +133,7 @@ def __init__( self.num_convlstm = num_layers[1] # encoder - downsampling - self.encoder = paddle.nn.LayerList( + self.encoder = nn.LayerList( [ encoder_block( input_channels=self.input_channels[i], @@ -147,7 +147,7 @@ def __init__( ) # ConvLSTM - self.convlstm = paddle.nn.LayerList( + self.convlstm = nn.LayerList( [ ConvLSTMCell( input_channels=self.input_channels[i], @@ -170,7 +170,7 @@ def __init__( # initialize weights self.apply(_initialize_weights) - initializer_0 = paddle.nn.initializer.Constant(0.0) + initializer_0 = nn.initializer.Constant(0.0) initializer_0(self.output_layer.bias) self.enable_transform = True @@ -334,8 +334,8 @@ def __init__( padding_mode="circular", ) - initializer_0 = paddle.nn.initializer.Constant(0.0) - initializer_1 = paddle.nn.initializer.Constant(1.0) + initializer_0 = nn.initializer.Constant(0.0) + initializer_1 = nn.initializer.Constant(1.0) initializer_0(self.Wxi.bias) initializer_0(self.Wxf.bias) @@ -343,10 +343,10 @@ def __init__( initializer_1(self.Wxo.bias) def forward(self, x, h, c): - ci = paddle.nn.functional.sigmoid(self.Wxi(x) + self.Whi(h)) - cf = paddle.nn.functional.sigmoid(self.Wxf(x) + self.Whf(h)) + ci = nn.functional.sigmoid(self.Wxi(x) + self.Whi(h)) + cf = nn.functional.sigmoid(self.Wxf(x) + self.Whf(h)) cc = cf * c + ci * paddle.tanh(self.Wxc(x) + self.Whc(h)) - co = paddle.nn.functional.sigmoid(self.Wxo(x) + self.Who(h)) + co = nn.functional.sigmoid(self.Wxo(x) + self.Who(h)) ch = co * paddle.tanh(cc) return ch, cc @@ -387,7 +387,7 @@ def __init__( self.act = nn.ReLU() - initializer_0 = paddle.nn.initializer.Constant(0.0) + initializer_0 = nn.initializer.Constant(0.0) initializer_0(self.conv.bias) def forward(self, x): @@ -418,7 +418,7 @@ def __init__(self, der_filter, resol, kernel_size=3, name=""): self.filter.weight = self.create_parameter( shape=self.filter.weight.shape, dtype=self.filter.weight.dtype, - default_initializer=paddle.nn.initializer.Assign( + default_initializer=nn.initializer.Assign( paddle.to_tensor( der_filter, dtype=paddle.get_default_dtype(), stop_gradient=True ) @@ -455,7 +455,7 @@ def __init__(self, der_filter, resol, kernel_size=3, name=""): self.filter.weight = self.create_parameter( shape=self.filter.weight.shape, dtype=self.filter.weight.dtype, - default_initializer=paddle.nn.initializer.Assign( + default_initializer=nn.initializer.Assign( paddle.to_tensor( der_filter, dtype=paddle.get_default_dtype(), stop_gradient=True ) From a18e02889e7c0535336ae78fc26ef2e902459729 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 8 Jul 2024 14:01:57 +0800 Subject: [PATCH 3/3] update Extformer-MoE in docs --- README.md | 1 + docs/index.md | 1 + docs/zh/examples/extformer_moe.md | 6 +++--- examples/extformer_moe/requirements.txt | 2 ++ mkdocs.yml | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 examples/extformer_moe/requirements.txt diff --git a/README.md b/README.md index b7763837a4..0f1b7dfab0 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ PaddleScience 是一个基于深度学习框架 PaddlePaddle 开发的科学计 | 问题类型 | 案例名称 | 优化算法 | 模型类型 | 训练方式 | 数据集 | 参考资料 | 
|-----|---------|-----|---------|----|---------|---------|
+| 天气预报 | [Extformer-MoE 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/extformer_moe) | 数据驱动 | Extformer-MoE | 监督学习 | [ICAR-ENSO](https://tianchi.aliyun.com/dataset/98942) | - |
 | 天气预报 | [FourCastNet 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/fourcastnet) | 数据驱动 | FourCastNet | 监督学习 | [ERA5](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://arxiv.org/pdf/2202.11214.pdf) |
 | 天气预报 | [NowCastNet 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/nowcastnet) | 数据驱动 | NowCastNet | 监督学习 | [MRMS](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://www.nature.com/articles/s41586-023-06184-4) |
 | 天气预报 | [GraphCast 气象预报](https://paddlescience-docs.readthedocs.io/zh/latest/zh/examples/graphcast) | 数据驱动 | GraphCastNet | 监督学习 | - | [Paper](https://arxiv.org/abs/2212.12794) |
diff --git a/docs/index.md b/docs/index.md
index 195da54656..d5257fd1b9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -137,6 +137,7 @@
 | 问题类型 | 案例名称 | 优化算法 | 模型类型 | 训练方式 | 数据集 | 参考资料 |
 |-----|---------|-----|---------|----|---------|---------|
+| 天气预报 | [Extformer-MoE 气象预报](./zh/examples/extformer_moe.md) | 数据驱动 | Extformer-MoE | 监督学习 | [ICAR-ENSO](https://tianchi.aliyun.com/dataset/98942) | - |
 | 天气预报 | [FourCastNet 气象预报](./zh/examples/fourcastnet.md) | 数据驱动 | FourCastNet | 监督学习 | [ERA5](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://arxiv.org/pdf/2202.11214.pdf) |
 | 天气预报 | [NowCastNet 气象预报](./zh/examples/nowcastnet.md) | 数据驱动 | NowCastNet | 监督学习 | [MRMS](https://app.globus.org/file-manager?origin_id=945b3c9e-0f8c-11ed-8daf-9f359c660fbd&origin_path=%2F~%2Fdata%2F) | [Paper](https://www.nature.com/articles/s41586-023-06184-4) |
 | 天气预报 | [GraphCast 气象预报](./zh/examples/graphcast.md) | 数据驱动 | GraphCastNet | 监督学习 | - | [Paper](https://arxiv.org/abs/2212.12794) |
diff --git a/docs/zh/examples/extformer_moe.md b/docs/zh/examples/extformer_moe.md
index 6c4286b8a8..25bb101da4 100644
--- a/docs/zh/examples/extformer_moe.md
+++ b/docs/zh/examples/extformer_moe.md
@@ -2,15 +2,15 @@
 
 !!! note
 
-    开始训练、评估前，请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942)，并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。
-    若训练时显存不足，可指定 `MODEL.checkpoint_level` 为 0、1 或 2，此时使用 recompute 模式运行，以训练时间换取显存。
+    1. 开始训练、评估前，请先下载 [ICAR-ENSO数据集](https://tianchi.aliyun.com/dataset/98942)，并对应修改 yaml 配置文件中的 `FILE_PATH` 为解压后的数据集路径。
+    2. 开始训练、评估前，请安装 `xarray` 和 `h5netcdf`：`pip install -r requirements.txt`
+    3. 
若训练时显存不足,可指定 `MODEL.checkpoint_level` 为 `1` 或 `2`,此时使用 recompute 模式运行,以训练时间换取显存。 === "模型训练命令" ``` sh # ICAR-ENSO 数据预训练模型: Extformer-MoE python extformer_moe_enso_train.py - # python extformer_moe_enso_train.py MODEL.checkpoint_level=0 # using recompute to run in device with small GPU memory # python extformer_moe_enso_train.py MODEL.checkpoint_level=1 # using recompute to run in device with small GPU memory # python extformer_moe_enso_train.py MODEL.checkpoint_level=2 # using recompute to run in device with small GPU memory ``` diff --git a/examples/extformer_moe/requirements.txt b/examples/extformer_moe/requirements.txt new file mode 100644 index 0000000000..c0f424a290 --- /dev/null +++ b/examples/extformer_moe/requirements.txt @@ -0,0 +1,2 @@ +h5netcdf +xarray==2024.2.0 diff --git a/mkdocs.yml b/mkdocs.yml index e024b2a460..74af8b1b12 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -84,6 +84,7 @@ nav: - 材料科学(AI for Material): - hPINNs: zh/examples/hpinns.md - 地球科学(AI for Earth Science): + - Extformer-MoE: zh/examples/extformer_moe.md - FourCastNet: zh/examples/fourcastnet.md - NowcastNet: zh/examples/nowcastnet.md - DGMR: zh/examples/dgmr.md
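
Note on the `MODEL.checkpoint_level` option documented above: it trades extra training time for lower peak GPU memory via activation recomputation (recompute). The sketch below illustrates that general trade-off with Paddle's `paddle.distributed.fleet.utils.recompute`; it is a toy example under assumed names and shapes (`ToyBlock`, a 128-dim FFN), not the actual Extformer-MoE wiring patched in this series.

```python
# Illustrative only: shows how recompute lowers peak memory by re-running a
# sub-network in the backward pass instead of storing its activations.
# ToyBlock, the tensor shapes, and the hidden sizes are invented for this example.
import paddle
from paddle import nn
from paddle.distributed.fleet.utils import recompute


class ToyBlock(nn.Layer):
    def __init__(self, dim: int, use_recompute: bool = False):
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(dim, 4 * dim),
            nn.GELU(),
            nn.Linear(4 * dim, dim),
        )
        self.use_recompute = use_recompute

    def forward(self, x):
        if self.use_recompute and self.training:
            # Intermediate activations inside self.ffn are not kept; they are
            # recomputed during backward, reducing peak GPU memory usage.
            return x + recompute(self.ffn, x)
        return x + self.ffn(x)


x = paddle.randn([2, 16, 128])
x.stop_gradient = False  # ensure the recomputed subgraph participates in backward
y = ToyBlock(128, use_recompute=True)(x)
y.mean().backward()
```

Wrapping a sub-network in `recompute` is why higher `checkpoint_level` settings can run on smaller GPUs: memory is saved at the cost of re-executing parts of the forward pass during backpropagation.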