diff --git a/contrib/PanopticSeg/paddlepanseg/models/mask2former/_ms_deform_attn.py b/contrib/PanopticSeg/paddlepanseg/models/mask2former/_ms_deform_attn.py
index 2f1d318327..7be625f158 100644
--- a/contrib/PanopticSeg/paddlepanseg/models/mask2former/_ms_deform_attn.py
+++ b/contrib/PanopticSeg/paddlepanseg/models/mask2former/_ms_deform_attn.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 
 # Adapted from https://github.com/facebookresearch/Mask2Former
-# 
-# Original copyright info: 
+#
+# Original copyright info:
 # ------------------------------------------------------------------------------------------------
 # Deformable DETR
@@ -56,24 +56,23 @@ def slow_ms_deform_attn(value, value_spatial_shapes, sampling_locations,
             (b * num_heads, depth, h, w))
         sampling_grid_l_ = sampling_grids[:, :, :, i].transpose(
             (0, 2, 1, 3, 4)).flatten(0, 1)
-        sampling_value_l_ = F.grid_sample(
-            value_l_,
-            sampling_grid_l_,
-            mode='bilinear',
-            padding_mode='zeros',
-            align_corners=False)
+        sampling_value_l_ = F.grid_sample(value_l_,
+                                          sampling_grid_l_,
+                                          mode='bilinear',
+                                          padding_mode='zeros',
+                                          align_corners=False)
         sampling_value_list.append(sampling_value_l_)
     # (b, n_queries, num_heads, num_levels*num_points) -> (b, num_heads, n_queries, num_levels*num_points) -> (b, num_heads, 1, n_queries, num_levels*num_points)
     attention_weights = attention_weights.transpose((0, 2, 1, 3, 4)).reshape(
         (b * num_heads, 1, n_queries, num_levels * num_points))
-    output = (paddle.stack(
-        sampling_value_list,
-        axis=-2).flatten(-2) * attention_weights).sum(-1).reshape(
-            (b, num_heads * depth, n_queries))
+    output = (paddle.stack(sampling_value_list, axis=-2).flatten(-2) *
+              attention_weights).sum(-1).reshape(
+                  (b, num_heads * depth, n_queries))
     return output.transpose((0, 2, 1))
 
 
 class LinearWithFrozenBias(nn.Layer):
+
     def __init__(self,
                  in_features,
                  out_features,
@@ -83,11 +82,10 @@ def __init__(self,
         super().__init__()
         self._dtype = self._helper.get_default_dtype()
         self._weight_attr = weight_attr
-        self.weight = self.create_parameter(
-            shape=[in_features, out_features],
-            attr=self._weight_attr,
-            dtype=self._dtype,
-            is_bias=False)
+        self.weight = self.create_parameter(shape=[in_features, out_features],
+                                            attr=self._weight_attr,
+                                            dtype=self._dtype,
+                                            is_bias=False)
         bias_init_val = bias_init_val.flatten()
         bias_init_val = bias_init_val.astype(self._dtype)
         if bias_init_val.shape[0] != out_features:
@@ -107,12 +105,13 @@ def extra_repr(self):
 
 
 class MSDeformAttn(nn.Layer, THLinearInitMixin):
+
     def __init__(self, embed_dim=256, num_levels=4, num_heads=8, num_points=4):
         super().__init__()
         if embed_dim % num_heads != 0:
             raise ValueError(
-                "`embed_dim` must be divisible by `num_heads`, but got {} and {}".
-                format(embed_dim, num_heads))
+                "`embed_dim` must be divisible by `num_heads`, but got {} and {}"
+                .format(embed_dim, num_heads))
 
         self.im2col_step = 128
 
@@ -121,8 +120,8 @@ def __init__(self, embed_dim=256, num_levels=4, num_heads=8, num_points=4):
         self.num_heads = num_heads
         self.num_points = num_points
 
-        self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels *
-                                          num_points * 2)
+        self.sampling_offsets = nn.Linear(
+            embed_dim, num_heads * num_levels * num_points * 2)
         self.attention_weights = nn.Linear(embed_dim,
                                            num_heads * num_levels * num_points)
         self.value_proj = nn.Linear(embed_dim, embed_dim)
@@ -162,8 +161,8 @@ def forward(self,
                 input_padding_mask=None):
         n, len_q, _ = query.shape
         n, len_in, _ = input_flatten.shape
-        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]
-                ).sum() == len_in
+        assert (input_spatial_shapes[:, 0] *
+                input_spatial_shapes[:, 1]).sum() == len_in
 
         value = self.value_proj(input_flatten)
         if input_padding_mask is not None:
@@ -181,7 +180,7 @@ def forward(self,
         if reference_points.shape[-1] == 2:
             offset_normalizer = paddle.stack(
                 [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]],
-                -1)
+                -1).astype('float32')
             sampling_locations = reference_points[:, :, None, :, None, :] \
                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
         elif reference_points.shape[-1] == 4:
@@ -189,14 +188,16 @@ def forward(self,
                 + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5
         else:
             raise ValueError(
-                "Last dim of reference_points must be 2 or 4, but get {} instead.".
-                format(reference_points.shape[-1]))
+                "Last dim of reference_points must be 2 or 4, but get {} instead."
+                .format(reference_points.shape[-1]))
         if paddle.is_compiled_with_cuda():
             # GPU
             with use_custom_op('ms_deform_attn') as msda:
-                output = msda.ms_deform_attn(
-                    value, input_spatial_shapes, input_level_start_index,
-                    sampling_locations, attention_weights, self.im2col_step)
+                output = msda.ms_deform_attn(value, input_spatial_shapes,
+                                             input_level_start_index,
+                                             sampling_locations,
+                                             attention_weights,
+                                             self.im2col_step)
         else:
             # CPU
             output = slow_ms_deform_attn(value, input_spatial_shapes,
diff --git a/paddleseg/models/layers/ms_deformable_attention.py b/paddleseg/models/layers/ms_deformable_attention.py
index 8af9f36679..b9fba6f67b 100644
--- a/paddleseg/models/layers/ms_deformable_attention.py
+++ b/paddleseg/models/layers/ms_deformable_attention.py
@@ -11,6 +11,7 @@
 
 
 class MSDeformAttn(nn.Layer):
+
     def __init__(self,
                  d_model=256,
                  n_levels=4,
@@ -57,18 +58,20 @@ def __init__(self,
     @staticmethod
     def _is_power_of_2(n):
         if (not isinstance(n, int)) or (n < 0):
-            raise ValueError('invalid input for _is_power_of_2: {} (type: {})'.
-                             format(n, type(n)))
+            raise ValueError(
+                'invalid input for _is_power_of_2: {} (type: {})'.format(
+                    n, type(n)))
         return (n & (n - 1) == 0) and n != 0
 
     def _reset_parameters(self):
         constant_init(self.sampling_offsets.weight, value=0.)
-        thetas = paddle.arange(
-            self.n_heads, dtype='float32') * (2.0 * math.pi / self.n_heads)
+        thetas = paddle.arange(self.n_heads,
+                               dtype='float32') * (2.0 * math.pi / self.n_heads)
         grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
-        grid_init = (grid_init / grid_init.abs().max(
-            -1, keepdim=True)[0]).reshape([self.n_heads, 1, 1, 2]).tile(
-                [1, self.n_levels, self.n_points, 1])
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).reshape(
+                         [self.n_heads, 1, 1,
+                          2]).tile([1, self.n_levels, self.n_points, 1])
 
         for i in range(self.n_points):
             grid_init[:, :, i, :] *= i + 1
@@ -112,8 +115,8 @@ def masked_fill(x, mask, value):
 
         N, Len_q, _ = query.shape
         N, Len_in, _ = input_flatten.shape
-        assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]
-                ).sum() == Len_in
+        assert (input_spatial_shapes[:, 0] *
+                input_spatial_shapes[:, 1]).sum() == Len_in
 
         value = self.value_proj(input_flatten)
         if input_padding_mask is not None:
@@ -133,7 +136,7 @@ def masked_fill(x, mask, value):
         if reference_points.shape[-1] == 2:
             offset_normalizer = paddle.stack(
                 [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]],
-                -1)
+                -1).astype('float32')
             sampling_locations = reference_points[:, :, None, :, None, :] \
                 + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
         elif reference_points.shape[-1] == 4:
@@ -152,8 +155,10 @@ def masked_fill(x, mask, value):
                 "https://paddleseg.bj.bcebos.com/dygraph/customized_ops/ms_deform_attn.zip"
             )
             exit()
-        output = ms_deform_attn.ms_deform_attn(
-            value, input_spatial_shapes, input_level_start_index,
-            sampling_locations, attention_weights, self.im2col_step)
+        output = ms_deform_attn.ms_deform_attn(value, input_spatial_shapes,
+                                               input_level_start_index,
+                                               sampling_locations,
+                                               attention_weights,
+                                               self.im2col_step)
         output = self.output_proj(output)
         return output