diff --git a/paddlemix/models/groundingdino/ms_deform_attn.py b/paddlemix/models/groundingdino/ms_deform_attn.py index 5d29c9ea3e20f4..0a4dd728391ba0 100644 --- a/paddlemix/models/groundingdino/ms_deform_attn.py +++ b/paddlemix/models/groundingdino/ms_deform_attn.py @@ -190,7 +190,7 @@ def forward( ) if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape([1, 1, 1, self.num_levels, 1, 2]) + offset_normalizer = value_spatial_shapes.flip([1]).reshape([1, 1, 1, self.num_levels, 1, 2]).astype(sampling_offsets.dtype) sampling_locations = ( reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2]) + sampling_offsets / offset_normalizer ) diff --git a/paddlemix/models/groundingdino/utils.py b/paddlemix/models/groundingdino/utils.py index 42984c8ea91c02..b801af9ccc2273 100644 --- a/paddlemix/models/groundingdino/utils.py +++ b/paddlemix/models/groundingdino/utils.py @@ -105,10 +105,9 @@ def gen_encoder_output_proposals( grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N_, 1, 1, 2]) - grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale + grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale.astype(grid.dtype) if learnedwh is not None: - # import ipdb; ipdb.set_trace() wh = paddle.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) else: wh = paddle.ones_like(grid) * 0.05 * (2.0**lvl)