diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py index c5a6efa9895..fef282ae091 100644 --- a/gallery/plot_datapoints.py +++ b/gallery/plot_datapoints.py @@ -80,7 +80,7 @@ # corresponding image alongside the actual values: bounding_box = datapoints.BoundingBoxes( - [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, canvas_size=image.shape[-2:] ) print(bounding_box) @@ -108,7 +108,7 @@ def __getitem__(self, item): target["boxes"] = datapoints.BoundingBoxes( boxes, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=F.get_spatial_size(img), + canvas_size=F.get_size(img), ) target["labels"] = labels target["masks"] = datapoints.Mask(masks) @@ -129,7 +129,7 @@ def __call__(self, img, target): target["boxes"] = datapoints.BoundingBoxes( target["boxes"], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=F.get_spatial_size(img), + canvas_size=F.get_size(img), ) target["masks"] = datapoints.Mask(target["masks"]) return img, target diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py index c7bae878016..88916ba44f9 100644 --- a/gallery/plot_transforms_v2.py +++ b/gallery/plot_transforms_v2.py @@ -30,7 +30,7 @@ def load_data(): masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) bounding_boxes = datapoints.BoundingBoxes( - masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, canvas_size=image.shape[-2:] ) return path, image, bounding_boxes, masks, labels diff --git a/test/common_utils.py b/test/common_utils.py index c9cff035cac..b5edda3edb2 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -412,7 +412,7 @@ def load(self, device="cpu"): ) -def _parse_spatial_size(size, *, name="size"): +def _parse_canvas_size(size, *, name="size"): if size == "random": raise ValueError("This should never happen") elif isinstance(size, int) and size > 0: @@ -467,12 +467,13 @@ def load(self, device): @dataclasses.dataclass class ImageLoader(TensorLoader): spatial_size: Tuple[int, int] = dataclasses.field(init=False) num_channels: int = dataclasses.field(init=False) memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) def __post_init__(self): - self.spatial_size = self.shape[-2:] + self.spatial_size = self.canvas_size = self.shape[-2:] self.num_channels = self.shape[-3] def load(self, device): @@ -538,7 +539,7 @@ def make_image_loader( ): if not constant_alpha: raise ValueError("This should never happen") - size = _parse_spatial_size(size) + size = _parse_canvas_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device, memory_format): @@ -578,7 +579,7 @@ def make_image_loaders( def make_image_loader_for_interpolation( size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format ): - size = _parse_spatial_size(size) + size = _parse_canvas_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device, memory_format): @@ -623,43 +624,20 @@ def make_image_loaders_for_interpolation( class BoundingBoxesLoader(TensorLoader): format: datapoints.BoundingBoxFormat spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size def 
make_bounding_box( - size=None, + canvas_size=DEFAULT_SIZE, *, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=None, batch_dims=(), dtype=None, device="cpu", ): - """ - size: Size of the actual bounding box, i.e. - - (box[3] - box[1], box[2] - box[0]) for XYXY - - (H, W) for XYWH and CXCYWH - spatial_size: Size of the reference object, e.g. an image. Corresponds to the .spatial_size attribute on - returned datapoints.BoundingBoxes - - To generate a valid joint sample, you need to set spatial_size here to the same value as size on the other maker - functions, e.g. - - .. code:: - - image = make_image=(size=size) - bounding_boxes = make_bounding_box(spatial_size=size) - assert F.get_spatial_size(bounding_boxes) == F.get_spatial_size(image) - - For convenience, if both size and spatial_size are omitted, spatial_size defaults to the same value as size for all - other maker functions, e.g. - - .. code:: - - image = make_image=() - bounding_boxes = make_bounding_box() - assert F.get_spatial_size(bounding_boxes) == F.get_spatial_size(image) - """ - def sample_position(values, max_value): # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. # However, if we have batch_dims, we need tensors as limits. @@ -668,28 +646,16 @@ def sample_position(values, max_value): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] - if spatial_size is None: - if size is None: - spatial_size = DEFAULT_SIZE - else: - height, width = size - height_margin, width_margin = torch.randint(10, (2,)).tolist() - spatial_size = (height + height_margin, width + width_margin) - dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): return datapoints.BoundingBoxes( - torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size ) - if size is None: - h, w = [torch.randint(1, s, batch_dims) for s in spatial_size] - else: - h, w = [torch.full(batch_dims, s, dtype=torch.int) for s in size] - - y = sample_position(h, spatial_size[0]) - x = sample_position(w, spatial_size[1]) + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) if format is datapoints.BoundingBoxFormat.XYWH: parts = (x, y, w, h) @@ -706,15 +672,15 @@ def sample_position(values, max_value): raise ValueError(f"Format {format} is not supported") return datapoints.BoundingBoxes( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size ) -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): +def make_bounding_box_loader(*, extra_dims=(), format, canvas_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] - spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") + canvas_size = _parse_canvas_size(canvas_size, name="canvas_size") def fn(shape, dtype, device): *batch_dims, num_coordinates = shape @@ -722,21 +688,21 @@ def fn(shape, dtype, device): raise pytest.UsageError() return make_bounding_box( - format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + format=format, canvas_size=canvas_size, 
batch_dims=batch_dims, dtype=dtype, device=device ) - return BoundingBoxesLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) + return BoundingBoxesLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=canvas_size) def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(datapoints.BoundingBoxFormat), - spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + canvas_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtypes=(torch.float32, torch.float64, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): - yield make_bounding_box_loader(**params, spatial_size=spatial_size) + yield make_bounding_box_loader(**params, canvas_size=canvas_size) make_bounding_boxes = from_loaders(make_bounding_box_loaders) @@ -761,7 +727,7 @@ def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtyp def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects - size = _parse_spatial_size(size) + size = _parse_canvas_size(size) def fn(shape, dtype, device): *batch_dims, num_objects, height, width = shape @@ -802,7 +768,7 @@ def make_segmentation_mask_loader( size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 ): # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - spatial_size = _parse_spatial_size(size) + canvas_size = _parse_canvas_size(size) def fn(shape, dtype, device): *batch_dims, height, width = shape @@ -810,7 +776,7 @@ def fn(shape, dtype, device): (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device ) - return MaskLoader(fn, shape=(*extra_dims, *spatial_size), dtype=dtype) + return MaskLoader(fn, shape=(*extra_dims, *canvas_size), dtype=dtype) def make_segmentation_mask_loaders( @@ -860,7 +826,7 @@ def make_video_loader( extra_dims=(), dtype=torch.uint8, ): - size = _parse_spatial_size(size) + size = _parse_canvas_size(size) def fn(shape, dtype, device, memory_format): *batch_dims, num_frames, _, height, width = shape diff --git a/test/test_datapoints.py b/test/test_datapoints.py index a5f09043582..f0a44ec1720 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -27,7 +27,7 @@ def test_mask_instance(data): "format", ["XYXY", "CXCYWH", datapoints.BoundingBoxFormat.XYXY, datapoints.BoundingBoxFormat.XYWH] ) def test_bbox_instance(data, format): - bboxes = datapoints.BoundingBoxes(data, format=format, spatial_size=(32, 32)) + bboxes = datapoints.BoundingBoxes(data, format=format, canvas_size=(32, 32)) assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): @@ -164,7 +164,7 @@ def test_wrap_like(): [ datapoints.Image(torch.rand(3, 16, 16)), datapoints.Video(torch.rand(2, 3, 16, 16)), - datapoints.BoundingBoxes([0.0, 1.0, 2.0, 3.0], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(10, 10)), + datapoints.BoundingBoxes([0.0, 1.0, 2.0, 3.0], format=datapoints.BoundingBoxFormat.XYXY, canvas_size=(10, 10)), datapoints.Mask(torch.randint(0, 256, (16, 16), dtype=torch.uint8)), ], ) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index d1f24410703..4c50cf0b968 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -164,7 +164,7 @@ def 
test__copy_paste(self, label_type): labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { "boxes": BoundingBoxes( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", spatial_size=(32, 32) + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", canvas_size=(32, 32) ), "masks": Mask(masks), "labels": label_type(labels), @@ -179,7 +179,7 @@ def test__copy_paste(self, label_type): paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { "boxes": BoundingBoxes( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", spatial_size=(32, 32) + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", canvas_size=(32, 32) ), "masks": Mask(paste_masks), "labels": label_type(paste_labels), @@ -210,13 +210,13 @@ class TestFixedSizeCrop: def test__get_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - spatial_size = (11, 5) + canvas_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) flat_inputs = [ - make_image(size=spatial_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), + make_image(size=canvas_size, color_space="RGB"), + make_bounding_box(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -295,7 +295,7 @@ def test__transform(self, mocker, needs): def test__transform_culling(self, mocker): batch_size = 10 - spatial_size = (10, 10) + canvas_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( @@ -304,17 +304,17 @@ def test__transform_culling(self, mocker): needs_crop=True, top=0, left=0, - height=spatial_size[0], - width=spatial_size[1], + height=canvas_size[0], + width=canvas_size[1], is_valid=is_valid, needs_pad=False, ), ) bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) - masks = make_detection_mask(size=spatial_size, batch_dims=(batch_size,)) + masks = make_detection_mask(size=canvas_size, batch_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -334,7 +334,7 @@ def test__transform_culling(self, mocker): def test__transform_bounding_boxes_clamping(self, mocker): batch_size = 3 - spatial_size = (10, 10) + canvas_size = (10, 10) mocker.patch( "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", @@ -342,15 +342,15 @@ def test__transform_bounding_boxes_clamping(self, mocker): needs_crop=True, top=0, left=0, - height=spatial_size[0], - width=spatial_size[1], + height=canvas_size[0], + width=canvas_size[1], is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(batch_size,) ) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_boxes") @@ -496,7 +496,7 @@ def make_datapoints(): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), 
dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -505,7 +505,7 @@ def make_datapoints(): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } @@ -514,7 +514,7 @@ def make_datapoints(): datapoint_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index e5624d78fed..4c1815fddea 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -174,20 +174,20 @@ class TestSmoke: ) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_common(self, transform, adapter, container_type, image_or_video, device): - spatial_size = F.get_spatial_size(image_or_video) + canvas_size = F.get_size(image_or_video) input = dict( image_or_video=image_or_video, - image_datapoint=make_image(size=spatial_size), - video_datapoint=make_video(size=spatial_size), - image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), + image_datapoint=make_image(size=canvas_size), + video_datapoint=make_video(size=canvas_size), + image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), bounding_boxes_xyxy=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) + format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) ), bounding_boxes_xywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, batch_dims=(4,) + format=datapoints.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) ), bounding_boxes_cxcywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, batch_dims=(5,) + format=datapoints.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) ), bounding_boxes_degenerate_xyxy=datapoints.BoundingBoxes( [ @@ -199,7 +199,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [2, 2, 1, 1], # x1 > x2, y1 > y2 ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=canvas_size, ), bounding_boxes_degenerate_xywh=datapoints.BoundingBoxes( [ @@ -211,7 +211,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, -1], # negative height and width ], format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=spatial_size, + canvas_size=canvas_size, ), bounding_boxes_degenerate_cxcywh=datapoints.BoundingBoxes( [ @@ -223,10 +223,10 @@ def test_common(self, transform, adapter, container_type, image_or_video, device [0, 0, -1, -1], # negative height and width ], format=datapoints.BoundingBoxFormat.CXCYWH, - spatial_size=spatial_size, + canvas_size=canvas_size, ), - 
detection_mask=make_detection_mask(size=spatial_size), - segmentation_mask=make_segmentation_mask(size=spatial_size), + detection_mask=make_detection_mask(size=canvas_size), + segmentation_mask=make_segmentation_mask(size=canvas_size), int=0, float=0.0, bool=True, @@ -271,7 +271,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device # TODO: we should test that against all degenerate boxes above for format in list(datapoints.BoundingBoxFormat): sample = dict( - boxes=datapoints.BoundingBoxes([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)), + boxes=datapoints.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), labels=torch.tensor([3]), ) assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) @@ -473,11 +473,11 @@ def test_assertions(self): @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range, mocker): + def test__get_params(self, fill, side_range): transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - image = mocker.MagicMock(spec=datapoints.Image) - h, w = image.spatial_size = (24, 32) + h, w = size = (24, 32) + image = make_image(size) params = transform._get_params([image]) @@ -490,9 +490,7 @@ def test__get_params(self, fill, side_range, mocker): @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) def test__transform(self, fill, side_range, mocker): - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt = make_image((24, 32)) transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) @@ -559,11 +557,9 @@ def test_assertions(self): @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) @pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)]) - def test__get_params(self, padding, pad_if_needed, size, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) - h, w = image.spatial_size + def test__get_params(self, padding, pad_if_needed, size): + h, w = size = (24, 32) + image = make_image(size) transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) params = transform._get_params([image]) @@ -613,21 +609,16 @@ def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode ) - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (32, 32) + h, w = size = (32, 32) + inpt = make_image(size) - expected = mocker.MagicMock(spec=datapoints.Image) - expected.num_channels = 3 if isinstance(padding, int): - expected.spatial_size = (inpt.spatial_size[0] + padding, inpt.spatial_size[1] + padding) + new_size = (h + padding, w + padding) elif isinstance(padding, list): - expected.spatial_size = ( - inpt.spatial_size[0] + sum(padding[0::2]), - inpt.spatial_size[1] + sum(padding[1::2]), - ) + new_size = (h + sum(padding[0::2]), w + sum(padding[1::2])) else: - expected.spatial_size = inpt.spatial_size + new_size = size + expected = make_image(new_size) _ = mocker.patch("torchvision.transforms.v2.functional.pad", return_value=expected) fn_crop = mocker.patch("torchvision.transforms.v2.functional.crop") @@ -703,7 +694,7 @@ def test__transform(self, kernel_size, sigma, mocker): 
fn = mocker.patch("torchvision.transforms.v2.functional.gaussian_blur") inpt = mocker.MagicMock(spec=datapoints.Image) inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt.canvas_size = (24, 32) # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users @@ -749,16 +740,14 @@ def test_assertions(self): with pytest.raises(TypeError, match="Got inappropriate fill arg"): transforms.RandomPerspective(0.5, fill="abc") - def test__get_params(self, mocker): + def test__get_params(self): dscale = 0.5 transform = transforms.RandomPerspective(dscale) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + + image = make_image((24, 32)) params = transform._get_params([image]) - h, w = image.spatial_size assert "coefficients" in params assert len(params["coefficients"]) == 8 @@ -769,9 +758,9 @@ def test__transform(self, distortion_scale, mocker): transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) fn = mocker.patch("torchvision.transforms.v2.functional.perspective") - inpt = mocker.MagicMock(spec=datapoints.Image) - inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + + inpt = make_image((24, 32)) + # vfdev-5, Feature Request: let's store params as Transform attribute # This could be also helpful for users # Otherwise, we can mock transform._get_params @@ -809,17 +798,16 @@ def test_assertions(self): with pytest.raises(TypeError, match="Got inappropriate fill arg"): transforms.ElasticTransform(1.0, 2.0, fill="abc") - def test__get_params(self, mocker): + def test__get_params(self): alpha = 2.0 sigma = 3.0 transform = transforms.ElasticTransform(alpha, sigma) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + + h, w = size = (24, 32) + image = make_image(size) params = transform._get_params([image]) - h, w = image.spatial_size displacement = params["displacement"] assert displacement.shape == (1, h, w, 2) assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() @@ -845,7 +833,7 @@ def test__transform(self, alpha, sigma, mocker): fn = mocker.patch("torchvision.transforms.v2.functional.elastic") inpt = mocker.MagicMock(spec=datapoints.Image) inpt.num_channels = 3 - inpt.spatial_size = (24, 32) + inpt.canvas_size = (24, 32) # Let's mock transform._get_params to control the output: transform._get_params = mocker.MagicMock() @@ -856,7 +844,7 @@ def test__transform(self, alpha, sigma, mocker): class TestRandomErasing: - def test_assertions(self, mocker): + def test_assertions(self): with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): transforms.RandomErasing(value={}) @@ -872,9 +860,7 @@ def test_assertions(self, mocker): with pytest.raises(ValueError, match="Scale should be between 0 and 1"): transforms.RandomErasing(scale=[-1, 2]) - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + image = make_image((24, 32)) transform = transforms.RandomErasing(value=[1, 2, 3, 4]) @@ -882,10 +868,9 @@ def test_assertions(self, mocker): transform._get_params([image]) @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) - def test__get_params(self, value, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + def test__get_params(self, value): + image = make_image((24, 32)) + num_channels, 
height, width = F.get_dimensions(image) transform = transforms.RandomErasing(value=value) params = transform._get_params([image]) @@ -895,14 +880,14 @@ def test__get_params(self, value, mocker): i, j = params["i"], params["j"] assert isinstance(v, torch.Tensor) if value == "random": - assert v.shape == (image.num_channels, h, w) + assert v.shape == (num_channels, h, w) elif isinstance(value, (int, float)): assert v.shape == (1, 1, 1) elif isinstance(value, (list, tuple)): - assert v.shape == (image.num_channels, 1, 1) + assert v.shape == (num_channels, 1, 1) - assert 0 <= i <= image.spatial_size[0] - h - assert 0 <= j <= image.spatial_size[1] - w + assert 0 <= i <= height - h + assert 0 <= j <= width - w @pytest.mark.parametrize("p", [0, 1]) def test__transform(self, mocker, p): @@ -1061,14 +1046,13 @@ def test_assertions(self): class TestRandomIoUCrop: @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) - def test__get_params(self, device, options, mocker): - image = mocker.MagicMock(spec=datapoints.Image) - image.num_channels = 3 - image.spatial_size = (24, 32) + def test__get_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) bboxes = datapoints.BoundingBoxes( torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), format="XYXY", - spatial_size=image.spatial_size, + canvas_size=size, device=device, ) sample = [image, bboxes] @@ -1087,8 +1071,6 @@ def test__get_params(self, device, options, mocker): assert len(params["is_within_crop_area"]) > 0 assert params["is_within_crop_area"].dtype == torch.bool - orig_h = image.spatial_size[0] - orig_w = image.spatial_size[1] assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) @@ -1103,7 +1085,7 @@ def test__get_params(self, device, options, mocker): def test__transform_empty_params(self, mocker): transform = transforms.RandomIoUCrop(sampler_options=[2.0]) image = datapoints.Image(torch.rand(1, 3, 4, 4)) - bboxes = datapoints.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", spatial_size=(4, 4)) + bboxes = datapoints.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) label = torch.tensor([1]) sample = [image, bboxes, label] # Let's mock transform._get_params to control the output: @@ -1122,9 +1104,10 @@ def test_forward_assertion(self): def test__transform(self, mocker): transform = transforms.RandomIoUCrop() - image = datapoints.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), batch_dims=(6,)) - masks = make_detection_mask((32, 24), num_objects=6) + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_box(format="XYXY", canvas_size=size, batch_dims=(6,)) + masks = make_detection_mask(size, num_objects=6) sample = [image, bboxes, masks] @@ -1155,13 +1138,14 @@ def test__transform(self, mocker): class TestScaleJitter: - def test__get_params(self, mocker): - spatial_size = (24, 32) + def test__get_params(self): + canvas_size = (24, 32) target_size = (16, 12) scale_range = (0.5, 1.5) transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) + + sample = make_image(canvas_size) n_samples = 5 for _ in range(n_samples): @@ -1174,11 +1158,11 @@ def test__get_params(self, 
mocker): assert isinstance(size, tuple) and len(size) == 2 height, width = size - r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0] - r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1] + r_min = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[0] + r_max = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[1] - assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max) - assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max) + assert int(canvas_size[0] * r_min) <= height <= int(canvas_size[0] * r_max) + assert int(canvas_size[1] * r_min) <= width <= int(canvas_size[1] * r_max) def test__transform(self, mocker): interpolation_sentinel = mocker.MagicMock(spec=InterpolationMode) @@ -1206,12 +1190,12 @@ def test__transform(self, mocker): class TestRandomShortestSize: @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) - def test__get_params(self, min_size, max_size, mocker): - spatial_size = (3, 10) + def test__get_params(self, min_size, max_size): + canvas_size = (3, 10) transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) - sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size) + sample = make_image(canvas_size) params = transform._get_params([sample]) assert "size" in params @@ -1523,7 +1507,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) boxes[:, 2:] += boxes[:, :2] boxes = boxes.clamp(min=0, max=min(H, W)) - boxes = datapoints.BoundingBoxes(boxes, format="XYXY", spatial_size=(H, W)) + boxes = datapoints.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) masks = datapoints.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) @@ -1597,7 +1581,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): boxes = datapoints.BoundingBoxes( boxes, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(H, W), + canvas_size=(H, W), ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) @@ -1651,7 +1635,7 @@ def test_sanitize_bounding_boxes_errors(): good_bbox = datapoints.BoundingBoxes( [[0, 0, 10, 10]], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(20, 20), + canvas_size=(20, 20), ) with pytest.raises(ValueError, match="min_size must be >= 1"): @@ -1678,7 +1662,7 @@ def test_sanitize_bounding_boxes_errors(): [[0, 0, 10, 10]], ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(20, 20), + canvas_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} transforms.SanitizeBoundingBoxes()(different_sizes) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 9adec66b3c4..47a0b05b511 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -31,7 +31,7 @@ from torchvision.transforms import functional as legacy_F from torchvision.transforms.v2 import functional as prototype_F from torchvision.transforms.v2.functional import to_image_pil -from torchvision.transforms.v2.utils import query_spatial_size +from torchvision.transforms.v2.utils import query_size DEFAULT_MAKE_IMAGES_KWARGS = dict(color_spaces=["RGB"], extra_dims=[(4,)]) @@ -1090,7 +1090,7 @@ def make_label(extra_dims, categories): pil_image 
= to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1100,7 @@ def make_label(extra_dims, categories): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1110,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1172,7 +1172,7 @@ def __init__(self, size, fill=0): self.fill = v2_transforms._geometry._setup_fill_arg(fill) def _get_params(self, sample): - height, width = query_spatial_size(sample) + height, width = query_size(sample) padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] needs_padding = any(padding) return dict(padding=padding, needs_padding=needs_padding) diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 5d692b58108..230695ff93e 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -351,7 +351,7 @@ def test_scripted_smoke(self, info, args_kwargs, device): F.get_image_size, F.get_num_channels, F.get_num_frames, - F.get_spatial_size, + F.get_size, F.rgb_to_grayscale, F.uniform_temporal_subsample, ], @@ -568,27 +568,27 @@ class TestClampBoundingBoxes: [ dict(), dict(format=datapoints.BoundingBoxFormat.XYXY), - dict(spatial_size=(1, 1)), + dict(canvas_size=(1, 1)), ], ) def test_simple_tensor_insufficient_metadata(self, metadata): simple_tensor = next(make_bounding_boxes()).as_subclass(torch.Tensor) - with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` has to be passed")): + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): F.clamp_bounding_boxes(simple_tensor, **metadata) @pytest.mark.parametrize( "metadata", [ dict(format=datapoints.BoundingBoxFormat.XYXY), - dict(spatial_size=(1, 1)), - dict(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=(1, 1)), + dict(canvas_size=(1, 1)), + dict(format=datapoints.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), ], ) def test_datapoint_explicit_metadata(self, metadata): datapoint = next(make_bounding_boxes()) - with pytest.raises(ValueError, match=re.escape("`format` and `spatial_size` must not be passed")): + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): F.clamp_bounding_boxes(datapoint, **metadata) @@ -673,7 +673,7 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt # expected_bboxes.append(out_box) format = datapoints.BoundingBoxFormat.XYXY - spatial_size = (64, 76) + canvas_size = (64, 76) in_boxes = [ [10.0, 15.0, 25.0, 
35.0], [50.0, 5.0, 70.0, 22.0], @@ -684,23 +684,23 @@ def test_correctness_crop_bounding_boxes(device, format, top, left, height, widt in_boxes = convert_format_bounding_boxes(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) expected_bboxes = clamp_bounding_boxes( - datapoints.BoundingBoxes(expected_bboxes, format="XYXY", spatial_size=spatial_size) + datapoints.BoundingBoxes(expected_bboxes, format="XYXY", canvas_size=canvas_size) ).tolist() - output_boxes, output_spatial_size = F.crop_bounding_boxes( + output_boxes, output_canvas_size = F.crop_bounding_boxes( in_boxes, format, top, left, - spatial_size[0], - spatial_size[1], + canvas_size[0], + canvas_size[1], ) if format != datapoints.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_boxes(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - torch.testing.assert_close(output_spatial_size, spatial_size) + torch.testing.assert_close(output_canvas_size, canvas_size) @pytest.mark.parametrize("device", cpu_and_cuda()) @@ -737,7 +737,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): return bbox format = datapoints.BoundingBoxFormat.XYXY - spatial_size = (100, 100) + canvas_size = (100, 100) in_boxes = [ [10.0, 10.0, 20.0, 20.0], [5.0, 10.0, 15.0, 20.0], @@ -748,18 +748,18 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): expected_bboxes = torch.tensor(expected_bboxes, device=device) in_boxes = datapoints.BoundingBoxes( - in_boxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device + in_boxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device ) if format != datapoints.BoundingBoxFormat.XYXY: in_boxes = convert_format_bounding_boxes(in_boxes, datapoints.BoundingBoxFormat.XYXY, format) - output_boxes, output_spatial_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) + output_boxes, output_canvas_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) if format != datapoints.BoundingBoxFormat.XYXY: output_boxes = convert_format_bounding_boxes(output_boxes, format, datapoints.BoundingBoxFormat.XYXY) torch.testing.assert_close(output_boxes, expected_bboxes) - torch.testing.assert_close(output_spatial_size, size) + torch.testing.assert_close(output_canvas_size, size) def _parse_padding(padding): @@ -798,28 +798,28 @@ def _compute_expected_bbox(bbox, padding_): bbox = bbox.to(dtype) return bbox - def _compute_expected_spatial_size(bbox, padding_): + def _compute_expected_canvas_size(bbox, padding_): pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - height, width = bbox.spatial_size + height, width = bbox.canvas_size return height + pad_up + pad_down, width + pad_left + pad_right for bboxes in make_bounding_boxes(): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_spatial_size = bboxes.spatial_size + bboxes_canvas_size = bboxes.canvas_size - output_boxes, output_spatial_size = F.pad_bounding_boxes( - bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding + output_boxes, output_canvas_size = F.pad_bounding_boxes( + bboxes, format=bboxes_format, canvas_size=bboxes_canvas_size, padding=padding ) - torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding)) + torch.testing.assert_close(output_canvas_size, _compute_expected_canvas_size(bboxes, padding)) if bboxes.ndim < 2 or bboxes.shape[0] == 0: 
bboxes = [bboxes] expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) + bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, canvas_size=bboxes_canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, padding)) if len(expected_bboxes) > 1: @@ -887,24 +887,24 @@ def _compute_expected_bbox(bbox, pcoeffs_): out_bbox = datapoints.BoundingBoxes( out_bbox, format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=bbox.spatial_size, + canvas_size=bbox.canvas_size, dtype=bbox.dtype, device=bbox.device, ) return clamp_bounding_boxes(convert_format_bounding_boxes(out_bbox, new_format=bbox.format)) - spatial_size = (32, 38) + canvas_size = (32, 38) pcoeffs = _get_perspective_coeffs(startpoints, endpoints) inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): + for bboxes in make_bounding_boxes(canvas_size=canvas_size, extra_dims=((4,),)): bboxes = bboxes.to(device) output_bboxes = F.perspective_bounding_boxes( bboxes.as_subclass(torch.Tensor), format=bboxes.format, - spatial_size=bboxes.spatial_size, + canvas_size=bboxes.canvas_size, startpoints=None, endpoints=None, coefficients=pcoeffs, @@ -915,7 +915,7 @@ def _compute_expected_bbox(bbox, pcoeffs_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes.format, spatial_size=bboxes.spatial_size) + bbox = datapoints.BoundingBoxes(bbox, format=bboxes.format, canvas_size=bboxes.canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) @@ -932,15 +932,15 @@ def _compute_expected_bbox(bbox, pcoeffs_): def test_correctness_center_crop_bounding_boxes(device, output_size): def _compute_expected_bbox(bbox, output_size_): format_ = bbox.format - spatial_size_ = bbox.spatial_size + canvas_size_ = bbox.canvas_size dtype = bbox.dtype bbox = convert_format_bounding_boxes(bbox.float(), format_, datapoints.BoundingBoxFormat.XYWH) if len(output_size_) == 1: output_size_.append(output_size_[-1]) - cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5)) + cy = int(round((canvas_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((canvas_size_[1] - output_size_[1]) * 0.5)) out_bbox = [ bbox[0].item() - cx, bbox[1].item() - cy, @@ -949,16 +949,16 @@ def _compute_expected_bbox(bbox, output_size_): ] out_bbox = torch.tensor(out_bbox) out_bbox = convert_format_bounding_boxes(out_bbox, datapoints.BoundingBoxFormat.XYWH, format_) - out_bbox = clamp_bounding_boxes(out_bbox, format=format_, spatial_size=output_size) + out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) return out_bbox.to(dtype=dtype, device=bbox.device) for bboxes in make_bounding_boxes(extra_dims=((4,),)): bboxes = bboxes.to(device) bboxes_format = bboxes.format - bboxes_spatial_size = bboxes.spatial_size + bboxes_canvas_size = bboxes.canvas_size - output_boxes, output_spatial_size = F.center_crop_bounding_boxes( - bboxes, bboxes_format, bboxes_spatial_size, output_size + output_boxes, output_canvas_size = F.center_crop_bounding_boxes( + bboxes, bboxes_format, bboxes_canvas_size, output_size ) if bboxes.ndim < 2: @@ -966,7 +966,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = [] for bbox in bboxes: - bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, 
spatial_size=bboxes_spatial_size) + bbox = datapoints.BoundingBoxes(bbox, format=bboxes_format, canvas_size=bboxes_canvas_size) expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) if len(expected_bboxes) > 1: @@ -975,7 +975,7 @@ def _compute_expected_bbox(bbox, output_size_): expected_bboxes = expected_bboxes[0] torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_spatial_size, output_size) + torch.testing.assert_close(output_canvas_size, output_size) @pytest.mark.parametrize("device", cpu_and_cuda()) @@ -1003,11 +1003,11 @@ def _compute_expected_mask(mask, output_size): # Copied from test/test_functional_tensor.py @pytest.mark.parametrize("device", cpu_and_cuda()) -@pytest.mark.parametrize("spatial_size", ("small", "large")) +@pytest.mark.parametrize("canvas_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma): +def test_correctness_gaussian_blur_image_tensor(device, canvas_size, dt, ksize, sigma): fn = F.gaussian_blur_image_tensor # true_cv2_results = { @@ -1027,7 +1027,7 @@ def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") true_cv2_results = torch.load(p) - if spatial_size == "small": + if canvas_size == "small": tensor = ( torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) ) diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index f4e00a2b8f5..7d10fbed4dc 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -392,7 +392,7 @@ def assert_warns_antialias_default_value(): yield -def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, spatial_size, affine_matrix): +def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, canvas_size, affine_matrix): def transform(bbox): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype @@ -426,7 +426,7 @@ def transform(bbox): out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_boxes(out_bbox, format=format, spatial_size=spatial_size) + out_bbox = F.clamp_bounding_boxes(out_bbox, format=format, canvas_size=canvas_size) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox @@ -514,14 +514,14 @@ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): bounding_boxes = make_bounding_box( format=format, - spatial_size=self.INPUT_SIZE, + canvas_size=self.INPUT_SIZE, dtype=dtype, device=device, ) check_kernel( F.resize_bounding_boxes, bounding_boxes, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, size=size, **max_size_kwarg, check_scripted_vs_eager=not isinstance(size, int), @@ -588,8 +588,8 @@ def test_transform(self, size, device, make_input): check_transform(transforms.Resize, make_input(self.INPUT_SIZE, device=device), size=size, antialias=True) def _check_output_size(self, input, output, *, size, max_size): - assert 
tuple(F.get_spatial_size(output)) == self._compute_output_size( - input_size=F.get_spatial_size(input), size=size, max_size=max_size + assert tuple(F.get_size(output)) == self._compute_output_size( + input_size=F.get_size(input), size=size, max_size=max_size ) @pytest.mark.parametrize("size", OUTPUT_SIZES) @@ -613,9 +613,9 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn): torch.testing.assert_close(actual, expected, atol=1, rtol=0) def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): - old_height, old_width = bounding_boxes.spatial_size + old_height, old_width = bounding_boxes.canvas_size new_height, new_width = self._compute_output_size( - input_size=bounding_boxes.spatial_size, size=size, max_size=max_size + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size ) if (old_height, old_width) == (new_height, new_width): @@ -632,10 +632,10 @@ def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=Non expected_bboxes = reference_affine_bounding_boxes_helper( bounding_boxes, format=bounding_boxes.format, - spatial_size=(new_height, new_width), + canvas_size=(new_height, new_width), affine_matrix=affine_matrix, ) - return datapoints.BoundingBoxes.wrap_like(bounding_boxes, expected_bboxes, spatial_size=(new_height, new_width)) + return datapoints.BoundingBoxes.wrap_like(bounding_boxes, expected_bboxes, canvas_size=(new_height, new_width)) @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("size", OUTPUT_SIZES) @@ -645,7 +645,7 @@ def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_boxes = make_bounding_box(format=format, spatial_size=self.INPUT_SIZE) + bounding_boxes = make_bounding_box(format=format, canvas_size=self.INPUT_SIZE) actual = fn(bounding_boxes, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) @@ -762,7 +762,7 @@ def test_transform_unknown_size_error(self): def test_noop(self, size, make_input): input = make_input(self.INPUT_SIZE) - output = F.resize(input, size=F.get_spatial_size(input), antialias=True) + output = F.resize(input, size=F.get_size(input), antialias=True) # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there # is a good reason to break this, feel free to downgrade to an equality check. 
@@ -792,11 +792,11 @@ def test_no_regression_5405(self, make_input): input = make_input(self.INPUT_SIZE) - size = min(F.get_spatial_size(input)) + size = min(F.get_size(input)) max_size = size + 1 output = F.resize(input, size=size, max_size=max_size, antialias=True) - assert max(F.get_spatial_size(output)) == max_size + assert max(F.get_size(output)) == max_size class TestHorizontalFlip: @@ -814,7 +814,7 @@ def test_kernel_bounding_boxes(self, format, dtype, device): F.horizontal_flip_bounding_boxes, bounding_boxes, format=format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -874,7 +874,7 @@ def test_image_correctness(self, fn): def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): affine_matrix = np.array( [ - [-1, 0, bounding_boxes.spatial_size[1]], + [-1, 0, bounding_boxes.canvas_size[1]], [0, 1, 0], ], dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", @@ -883,7 +883,7 @@ def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): expected_bboxes = reference_affine_bounding_boxes_helper( bounding_boxes, format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix, ) @@ -995,7 +995,7 @@ def test_kernel_bounding_boxes(self, param, value, format, dtype, device): F.affine_bounding_boxes, bounding_boxes, format=format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, **{param: value}, check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), ) @@ -1133,7 +1133,7 @@ def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): if center is None: - center = [s * 0.5 for s in bounding_boxes.spatial_size[::-1]] + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] affine_matrix = self._compute_affine_matrix( angle=angle, translate=translate, scale=scale, shear=shear, center=center @@ -1143,7 +1143,7 @@ def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, expected_bboxes = reference_affine_bounding_boxes_helper( bounding_boxes, format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix, ) @@ -1202,7 +1202,7 @@ def test_transform_bounding_boxes_correctness(self, format, center, seed): @pytest.mark.parametrize("seed", list(range(10))) def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed): image = make_image() - height, width = F.get_spatial_size(image) + height, width = F.get_size(image) transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) @@ -1293,7 +1293,7 @@ def test_kernel_bounding_boxes(self, format, dtype, device): F.vertical_flip_bounding_boxes, bounding_boxes, format=format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, ) @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) @@ -1352,7 +1352,7 @@ def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): affine_matrix = np.array( [ [1, 0, 0], - [0, -1, bounding_boxes.spatial_size[0]], + [0, -1, bounding_boxes.canvas_size[0]], ], dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", ) @@ -1360,7 +1360,7 @@ def 
_reference_vertical_flip_bounding_boxes(self, bounding_boxes): expected_bboxes = reference_affine_bounding_boxes_helper( bounding_boxes, format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix, ) @@ -1449,7 +1449,7 @@ def test_kernel_bounding_boxes(self, param, value, format, dtype, device): F.rotate_bounding_boxes, bounding_boxes, format=format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, **kwargs, ) @@ -1555,7 +1555,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen raise ValueError("This reference currently does not support expand=True") if center is None: - center = [s * 0.5 for s in bounding_boxes.spatial_size[::-1]] + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] a = np.cos(angle * np.pi / 180.0) b = np.sin(angle * np.pi / 180.0) @@ -1572,7 +1572,7 @@ def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, cen expected_bboxes = reference_affine_bounding_boxes_helper( bounding_boxes, format=bounding_boxes.format, - spatial_size=bounding_boxes.spatial_size, + canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix, ) @@ -1834,7 +1834,7 @@ def make_inpt_with_bbox_and_mask(self, make_input): mask_dtype = torch.bool sample = { "inpt": make_input(size=(H, W), dtype=inpt_dtype), - "bbox": make_bounding_box(size=(H, W), dtype=bbox_dtype), + "bbox": make_bounding_box(canvas_size=(H, W), dtype=bbox_dtype), "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), } @@ -1988,7 +1988,7 @@ def test_error(self, T): for input_with_bad_type in ( F.to_pil_image(imgs[0]), datapoints.Mask(torch.rand(12, 12)), - datapoints.BoundingBoxes(torch.rand(2, 4), format="XYXY", spatial_size=12), + datapoints.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), ): with pytest.raises(ValueError, match="does not support PIL images, "): cutmix_mixup(input_with_bad_type) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 58c8bfd5815..f880dac6c67 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,16 +4,16 @@ import torch import torchvision.transforms.v2.utils -from common_utils import make_bounding_box, make_detection_mask, make_image +from common_utils import DEFAULT_SIZE, make_bounding_box, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_image_pil from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(color_space="RGB") -BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) -MASK = make_detection_mask(size=IMAGE.spatial_size) +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_box(DEFAULT_SIZE, format=datapoints.BoundingBoxFormat.XYXY) +MASK = make_detection_mask(DEFAULT_SIZE) @pytest.mark.parametrize( diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index 6f1c91ac62a..85eb24a806c 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -184,8 +184,8 @@ def float32_vs_uint8_fill_adapter(other_args, kwargs): return other_args, dict(kwargs, fill=fill) -def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, spatial_size, affine_matrix): - def transform(bbox, affine_matrix_, format_, spatial_size_): +def reference_affine_bounding_boxes_helper(bounding_boxes, *, 
format, canvas_size, affine_matrix): + def transform(bbox, affine_matrix_, format_, canvas_size_): # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 in_dtype = bbox.dtype if not torch.is_floating_point(bbox): @@ -218,14 +218,14 @@ def transform(bbox, affine_matrix_, format_, spatial_size_): out_bbox, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format_, inplace=True ) # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 - out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, spatial_size=spatial_size_) + out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_) out_bbox = out_bbox.to(dtype=in_dtype) return out_bbox if bounding_boxes.ndim < 2: bounding_boxes = [bounding_boxes] - expected_bboxes = [transform(bbox, affine_matrix, format, spatial_size) for bbox in bounding_boxes] + expected_bboxes = [transform(bbox, affine_matrix, format, canvas_size) for bbox in bounding_boxes] if len(expected_bboxes) > 1: expected_bboxes = torch.stack(expected_bboxes) else: @@ -321,11 +321,11 @@ def reference_crop_bounding_boxes(bounding_boxes, *, format, top, left, height, dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", ) - spatial_size = (height, width) + canvas_size = (height, width) expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, format=format, spatial_size=spatial_size, affine_matrix=affine_matrix + bounding_boxes, format=format, canvas_size=canvas_size, affine_matrix=affine_matrix ) - return expected_bboxes, spatial_size + return expected_bboxes, canvas_size def reference_inputs_crop_bounding_boxes(): @@ -507,7 +507,7 @@ def sample_inputs_pad_bounding_boxes(): yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, padding=padding, padding_mode="constant", ) @@ -530,7 +530,7 @@ def sample_inputs_pad_video(): yield ArgsKwargs(video_loader, padding=[1]) -def reference_pad_bounding_boxes(bounding_boxes, *, format, spatial_size, padding, padding_mode): +def reference_pad_bounding_boxes(bounding_boxes, *, format, canvas_size, padding, padding_mode): left, right, top, bottom = _parse_pad_padding(padding) @@ -542,11 +542,11 @@ def reference_pad_bounding_boxes(bounding_boxes, *, format, spatial_size, paddin dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", ) - height = spatial_size[0] + top + bottom - width = spatial_size[1] + left + right + height = canvas_size[0] + top + bottom + width = canvas_size[1] + left + right expected_bboxes = reference_affine_bounding_boxes_helper( - bounding_boxes, format=format, spatial_size=(height, width), affine_matrix=affine_matrix + bounding_boxes, format=format, canvas_size=(height, width), affine_matrix=affine_matrix ) return expected_bboxes, (height, width) @@ -558,7 +558,7 @@ def reference_inputs_pad_bounding_boxes(): yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, padding=padding, padding_mode="constant", ) @@ -660,7 +660,7 @@ def sample_inputs_perspective_bounding_boxes(): yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0], @@ 
-669,7 +669,7 @@ def sample_inputs_perspective_bounding_boxes(): format = datapoints.BoundingBoxFormat.XYXY loader = make_bounding_box_loader(format=format) yield ArgsKwargs( - loader, format=format, spatial_size=loader.spatial_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS + loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS ) @@ -742,13 +742,13 @@ def sample_inputs_perspective_video(): ) -def _get_elastic_displacement(spatial_size): - return torch.rand(1, *spatial_size, 2) +def _get_elastic_displacement(canvas_size): + return torch.rand(1, *canvas_size, 2) def sample_inputs_elastic_image_tensor(): for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): - displacement = _get_elastic_displacement(image_loader.spatial_size) + displacement = _get_elastic_displacement(image_loader.canvas_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -762,18 +762,18 @@ def reference_inputs_elastic_image_tensor(): F.InterpolationMode.BICUBIC, ], ): - displacement = _get_elastic_displacement(image_loader.spatial_size) + displacement = _get_elastic_displacement(image_loader.canvas_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) def sample_inputs_elastic_bounding_boxes(): for bounding_boxes_loader in make_bounding_box_loaders(): - displacement = _get_elastic_displacement(bounding_boxes_loader.spatial_size) + displacement = _get_elastic_displacement(bounding_boxes_loader.canvas_size) yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, displacement=displacement, ) @@ -850,7 +850,7 @@ def sample_inputs_center_crop_bounding_boxes(): yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, output_size=output_size, ) @@ -975,7 +975,7 @@ def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_for image.mul_(torch.iinfo(dtype).max).round_() return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True) - spatial_size = (256, 256) + canvas_size = (256, 256) for dtype, color_space, fn in itertools.product( [torch.uint8], ["GRAY", "RGB"], @@ -1005,7 +1005,7 @@ def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_for ], ], ): - image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *spatial_size), dtype=dtype) + image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *canvas_size), dtype=dtype) yield ArgsKwargs(image_loader) @@ -1487,7 +1487,7 @@ def sample_inputs_clamp_bounding_boxes(): yield ArgsKwargs( bounding_boxes_loader, format=bounding_boxes_loader.format, - spatial_size=bounding_boxes_loader.spatial_size, + canvas_size=bounding_boxes_loader.canvas_size, ) @@ -1502,7 +1502,7 @@ def sample_inputs_clamp_bounding_boxes(): _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] -def _get_five_ten_crop_spatial_size(size): +def _get_five_ten_crop_canvas_size(size): if isinstance(size, int): crop_height = crop_width = size elif len(size) == 1: @@ -1515,7 +1515,7 @@ def _get_five_ten_crop_spatial_size(size): def sample_inputs_five_crop_image_tensor(): for 
size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], + sizes=[_get_five_ten_crop_canvas_size(size)], color_spaces=["RGB"], dtypes=[torch.float32], ): @@ -1525,21 +1525,21 @@ def sample_inputs_five_crop_image_tensor(): def reference_inputs_five_crop_image_tensor(): for size in _FIVE_TEN_CROP_SIZES: for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8] + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader, size=size) def sample_inputs_five_crop_video(): size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]): + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): yield ArgsKwargs(video_loader, size=size) def sample_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], + sizes=[_get_five_ten_crop_canvas_size(size)], color_spaces=["RGB"], dtypes=[torch.float32], ): @@ -1549,14 +1549,14 @@ def sample_inputs_ten_crop_image_tensor(): def reference_inputs_ten_crop_image_tensor(): for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): for image_loader in make_image_loaders( - sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()], dtypes=[torch.uint8] + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] ): yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) def sample_inputs_ten_crop_video(): size = _FIVE_TEN_CROP_SIZES[0] - for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_spatial_size(size)]): + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): yield ArgsKwargs(video_loader, size=size) diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index b3dc46348bc..780a950403c 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -30,7 +30,7 @@ class BoundingBoxes(Datapoint): Args: data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. format (BoundingBoxFormat, str): Format of the bounding box. - spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + canvas_size (two-tuple of ints): Height and width of the corresponding image or video. dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from ``data``. device (torch.device, optional): Desired device of the bounding box. 
If omitted and ``data`` is a @@ -40,13 +40,13 @@ class BoundingBoxes(Datapoint): """ format: BoundingBoxFormat - spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] @classmethod - def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, spatial_size: Tuple[int, int]) -> BoundingBoxes: + def _wrap(cls, tensor: torch.Tensor, *, format: BoundingBoxFormat, canvas_size: Tuple[int, int]) -> BoundingBoxes: bounding_boxes = tensor.as_subclass(cls) bounding_boxes.format = format - bounding_boxes.spatial_size = spatial_size + bounding_boxes.canvas_size = canvas_size return bounding_boxes def __new__( @@ -54,7 +54,7 @@ def __new__( data: Any, *, format: Union[BoundingBoxFormat, str], - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], dtype: Optional[torch.dtype] = None, device: Optional[Union[torch.device, str, int]] = None, requires_grad: Optional[bool] = None, @@ -64,7 +64,7 @@ def __new__( if isinstance(format, str): format = BoundingBoxFormat[format.upper()] - return cls._wrap(tensor, format=format, spatial_size=spatial_size) + return cls._wrap(tensor, format=format, canvas_size=canvas_size) @classmethod def wrap_like( @@ -73,7 +73,7 @@ def wrap_like( tensor: torch.Tensor, *, format: Optional[BoundingBoxFormat] = None, - spatial_size: Optional[Tuple[int, int]] = None, + canvas_size: Optional[Tuple[int, int]] = None, ) -> BoundingBoxes: """Wrap a :class:`torch.Tensor` as :class:`BoundingBoxes` from a reference. @@ -82,7 +82,7 @@ def wrap_like( tensor (Tensor): Tensor to be wrapped as :class:`BoundingBoxes` format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the reference. - spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If + canvas_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If omitted, it is taken from the reference. 
""" @@ -92,21 +92,21 @@ def wrap_like( return cls._wrap( tensor, format=format if format is not None else other.format, - spatial_size=spatial_size if spatial_size is not None else other.spatial_size, + canvas_size=canvas_size if canvas_size is not None else other.canvas_size, ) def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] - return self._make_repr(format=self.format, spatial_size=self.spatial_size) + return self._make_repr(format=self.format, canvas_size=self.canvas_size) def horizontal_flip(self) -> BoundingBoxes: output = self._F.horizontal_flip_bounding_boxes( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size ) return BoundingBoxes.wrap_like(self, output) def vertical_flip(self) -> BoundingBoxes: output = self._F.vertical_flip_bounding_boxes( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size ) return BoundingBoxes.wrap_like(self, output) @@ -117,25 +117,25 @@ def resize( # type: ignore[override] max_size: Optional[int] = None, antialias: Optional[Union[str, bool]] = "warn", ) -> BoundingBoxes: - output, spatial_size = self._F.resize_bounding_boxes( + output, canvas_size = self._F.resize_bounding_boxes( self.as_subclass(torch.Tensor), - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, size=size, max_size=max_size, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def crop(self, top: int, left: int, height: int, width: int) -> BoundingBoxes: - output, spatial_size = self._F.crop_bounding_boxes( + output, canvas_size = self._F.crop_bounding_boxes( self.as_subclass(torch.Tensor), self.format, top=top, left=left, height=height, width=width ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def center_crop(self, output_size: List[int]) -> BoundingBoxes: - output, spatial_size = self._F.center_crop_bounding_boxes( - self.as_subclass(torch.Tensor), format=self.format, spatial_size=self.spatial_size, output_size=output_size + output, canvas_size = self._F.center_crop_bounding_boxes( + self.as_subclass(torch.Tensor), format=self.format, canvas_size=self.canvas_size, output_size=output_size ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def resized_crop( self, @@ -147,10 +147,10 @@ def resized_crop( interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, antialias: Optional[Union[str, bool]] = "warn", ) -> BoundingBoxes: - output, spatial_size = self._F.resized_crop_bounding_boxes( + output, canvas_size = self._F.resized_crop_bounding_boxes( self.as_subclass(torch.Tensor), self.format, top, left, height, width, size=size ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def pad( self, @@ -158,14 +158,14 @@ def pad( fill: Optional[Union[int, float, List[float]]] = None, padding_mode: str = "constant", ) -> BoundingBoxes: - output, spatial_size = self._F.pad_bounding_boxes( + output, canvas_size = self._F.pad_bounding_boxes( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, 
+ canvas_size=self.canvas_size, padding=padding, padding_mode=padding_mode, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def rotate( self, @@ -175,15 +175,15 @@ def rotate( center: Optional[List[float]] = None, fill: _FillTypeJIT = None, ) -> BoundingBoxes: - output, spatial_size = self._F.rotate_bounding_boxes( + output, canvas_size = self._F.rotate_bounding_boxes( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, angle=angle, expand=expand, center=center, ) - return BoundingBoxes.wrap_like(self, output, spatial_size=spatial_size) + return BoundingBoxes.wrap_like(self, output, canvas_size=canvas_size) def affine( self, @@ -198,7 +198,7 @@ def affine( output = self._F.affine_bounding_boxes( self.as_subclass(torch.Tensor), self.format, - self.spatial_size, + self.canvas_size, angle, translate=translate, scale=scale, @@ -218,7 +218,7 @@ def perspective( output = self._F.perspective_bounding_boxes( self.as_subclass(torch.Tensor), format=self.format, - spatial_size=self.spatial_size, + canvas_size=self.canvas_size, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients, @@ -232,6 +232,6 @@ def elastic( fill: _FillTypeJIT = None, ) -> BoundingBoxes: output = self._F.elastic_bounding_boxes( - self.as_subclass(torch.Tensor), self.format, self.spatial_size, displacement=displacement + self.as_subclass(torch.Tensor), self.format, self.canvas_size, displacement=displacement ) return BoundingBoxes.wrap_like(self, output) diff --git a/torchvision/datapoints/_datapoint.py b/torchvision/datapoints/_datapoint.py index 35072159d7f..2059a3a18a0 100644 --- a/torchvision/datapoints/_datapoint.py +++ b/torchvision/datapoints/_datapoint.py @@ -138,7 +138,7 @@ def __deepcopy__(self: D, memo: Dict[int, Any]) -> D: # *not* happen for `deepcopy(Tensor)`. A side-effect from detaching is that the `Tensor.requires_grad` # attribute is cleared, so we need to refill it before we return. # Note: We don't explicitly handle deep-copying of the metadata here. The only metadata we currently have is - # `BoundingBoxes.format` and `BoundingBoxes.spatial_size`, which are immutable and thus implicitly deep-copied by + # `BoundingBoxes.format` and `BoundingBoxes.canvas_size`, which are immutable and thus implicitly deep-copied by # `BoundingBoxes.clone()`. 
return self.detach().clone().requires_grad_(self.requires_grad) # type: ignore[return-value] diff --git a/torchvision/datapoints/_dataset_wrapper.py b/torchvision/datapoints/_dataset_wrapper.py index 26e94972bde..f1e7857264a 100644 --- a/torchvision/datapoints/_dataset_wrapper.py +++ b/torchvision/datapoints/_dataset_wrapper.py @@ -341,13 +341,13 @@ def coco_dectection_wrapper_factory(dataset, target_keys): default={"image_id", "boxes", "labels"}, ) - def segmentation_to_mask(segmentation, *, spatial_size): + def segmentation_to_mask(segmentation, *, canvas_size): from pycocotools import mask segmentation = ( - mask.frPyObjects(segmentation, *spatial_size) + mask.frPyObjects(segmentation, *canvas_size) if isinstance(segmentation, dict) - else mask.merge(mask.frPyObjects(segmentation, *spatial_size)) + else mask.merge(mask.frPyObjects(segmentation, *canvas_size)) ) return torch.from_numpy(mask.decode(segmentation)) @@ -359,7 +359,7 @@ def wrapper(idx, sample): if not target: return image, dict(image_id=image_id) - spatial_size = tuple(F.get_spatial_size(image)) + canvas_size = tuple(F.get_size(image)) batched_target = list_of_dicts_to_dict_of_lists(target) target = {} @@ -372,7 +372,7 @@ def wrapper(idx, sample): datapoints.BoundingBoxes( batched_target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=spatial_size, + canvas_size=canvas_size, ), new_format=datapoints.BoundingBoxFormat.XYXY, ) @@ -381,7 +381,7 @@ def wrapper(idx, sample): target["masks"] = datapoints.Mask( torch.stack( [ - segmentation_to_mask(segmentation, spatial_size=spatial_size) + segmentation_to_mask(segmentation, canvas_size=canvas_size) for segmentation in batched_target["segmentation"] ] ), @@ -456,7 +456,7 @@ def wrapper(idx, sample): for bndbox in batched_instances["bndbox"] ], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ) if "labels" in target_keys: @@ -493,7 +493,7 @@ def wrapper(idx, sample): datapoints.BoundingBoxes( item, format=datapoints.BoundingBoxFormat.XYWH, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ), new_format=datapoints.BoundingBoxFormat.XYXY, ), @@ -543,7 +543,7 @@ def wrapper(idx, sample): target["boxes"] = datapoints.BoundingBoxes( batched_target["bbox"], format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(image.height, image.width), + canvas_size=(image.height, image.width), ) if "labels" in target_keys: @@ -638,7 +638,7 @@ def wrapper(idx, sample): if "bbox" in target_keys: target["bbox"] = F.convert_format_bounding_boxes( datapoints.BoundingBoxes( - target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, spatial_size=(image.height, image.width) + target["bbox"], format=datapoints.BoundingBoxFormat.XYWH, canvas_size=(image.height, image.width) ), new_format=datapoints.BoundingBoxFormat.XYXY, ) diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py index e47a6c10fc3..2ebf4954d02 100644 --- a/torchvision/datapoints/_image.py +++ b/torchvision/datapoints/_image.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import PIL.Image import torch @@ -56,14 +56,6 @@ def wrap_like(cls, other: Image, tensor: torch.Tensor) -> Image: def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr() - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # 
type: ignore[return-value] - - @property - def num_channels(self) -> int: - return self.shape[-3] - def horizontal_flip(self) -> Image: output = self._F.horizontal_flip_image_tensor(self.as_subclass(torch.Tensor)) return Image.wrap_like(self, output) diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py index 0135d793d32..bc50b30583c 100644 --- a/torchvision/datapoints/_mask.py +++ b/torchvision/datapoints/_mask.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import PIL.Image import torch @@ -51,10 +51,6 @@ def wrap_like( ) -> Mask: return cls._wrap(tensor) - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # type: ignore[return-value] - def horizontal_flip(self) -> Mask: output = self._F.horizontal_flip_mask(self.as_subclass(torch.Tensor)) return Mask.wrap_like(self, output) diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py index a6fbe2bd473..d527a68a4d1 100644 --- a/torchvision/datapoints/_video.py +++ b/torchvision/datapoints/_video.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Union import torch from torchvision.transforms.functional import InterpolationMode @@ -46,18 +46,6 @@ def wrap_like(cls, other: Video, tensor: torch.Tensor) -> Video: def __repr__(self, *, tensor_contents: Any = None) -> str: # type: ignore[override] return self._make_repr() - @property - def spatial_size(self) -> Tuple[int, int]: - return tuple(self.shape[-2:]) # type: ignore[return-value] - - @property - def num_channels(self) -> int: - return self.shape[-3] - - @property - def num_frames(self) -> int: - return self.shape[-4] - def horizontal_flip(self) -> Video: output = self._F.horizontal_flip_video(self.as_subclass(torch.Tensor)) return Video.wrap_like(self, output) diff --git a/torchvision/prototype/transforms/_augment.py b/torchvision/prototype/transforms/_augment.py index 0e50fb75588..a5f883a24d7 100644 --- a/torchvision/prototype/transforms/_augment.py +++ b/torchvision/prototype/transforms/_augment.py @@ -11,7 +11,7 @@ from torchvision.transforms.v2._transform import _RandomApplyTransform from torchvision.transforms.v2.functional._geometry import _check_interpolation -from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_spatial_size +from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_size class _BaseMixupCutmix(_RandomApplyTransform): @@ -64,7 +64,7 @@ class RandomCutmix(_BaseMixupCutmix): def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: lam = float(self._dist.sample(())) # type: ignore[arg-type] - H, W = query_spatial_size(flat_inputs) + H, W = query_size(flat_inputs) r_x = torch.randint(W, ()) r_y = torch.randint(H, ()) diff --git a/torchvision/prototype/transforms/_geometry.py b/torchvision/prototype/transforms/_geometry.py index b328c132070..28aff8416d2 100644 --- a/torchvision/prototype/transforms/_geometry.py +++ b/torchvision/prototype/transforms/_geometry.py @@ -7,7 +7,7 @@ from torchvision.prototype.datapoints import Label, OneHotLabel from torchvision.transforms.v2 import functional as F, Transform from torchvision.transforms.v2._utils import _setup_fill_arg, _setup_size -from torchvision.transforms.v2.utils import has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size +from torchvision.transforms.v2.utils import 
has_any, is_simple_tensor, query_bounding_boxes, query_size class FixedSizeCrop(Transform): @@ -46,7 +46,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: ) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) new_height = min(height, self.crop_height) new_width = min(width, self.crop_width) @@ -67,7 +67,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: if needs_crop and bounding_boxes is not None: format = bounding_boxes.format - bounding_boxes, spatial_size = F.crop_bounding_boxes( + bounding_boxes, canvas_size = F.crop_bounding_boxes( bounding_boxes.as_subclass(torch.Tensor), format=format, top=top, @@ -75,7 +75,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: height=new_height, width=new_width, ) - bounding_boxes = F.clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size) + bounding_boxes = F.clamp_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size) height_and_width = F.convert_format_bounding_boxes( bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYWH )[..., 2:] @@ -115,9 +115,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: elif isinstance(inpt, datapoints.BoundingBoxes): inpt = datapoints.BoundingBoxes.wrap_like( inpt, - F.clamp_bounding_boxes( - inpt[params["is_valid"]], format=inpt.format, spatial_size=inpt.spatial_size - ), + F.clamp_bounding_boxes(inpt[params["is_valid"]], format=inpt.format, canvas_size=inpt.canvas_size), ) if params["needs_pad"]: diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 2c6844c969e..3291c2f5004 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -12,7 +12,7 @@ from ._transform import _RandomApplyTransform, Transform from ._utils import _parse_labels_getter -from .utils import has_any, is_simple_tensor, query_chw, query_spatial_size +from .utils import has_any, is_simple_tensor, query_chw, query_size class RandomErasing(_RandomApplyTransform): @@ -284,7 +284,7 @@ class Cutmix(_BaseMixupCutmix): def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: lam = float(self._dist.sample(())) # type: ignore[arg-type] - H, W = query_spatial_size(flat_inputs) + H, W = query_size(flat_inputs) r_x = torch.randint(W, size=(1,)) r_y = torch.randint(H, size=(1,)) diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 785e1f6970b..2921903da8f 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -9,7 +9,7 @@ from torchvision.transforms import _functional_tensor as _FT from torchvision.transforms.v2 import AutoAugmentPolicy, functional as F, InterpolationMode, Transform from torchvision.transforms.v2.functional._geometry import _check_interpolation -from torchvision.transforms.v2.functional._meta import get_spatial_size +from torchvision.transforms.v2.functional._meta import get_size from ._utils import _setup_fill_arg from .utils import check_type, is_simple_tensor @@ -312,7 +312,7 @@ def _get_policies( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -403,7 +403,7 @@ def __init__( 
def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -474,7 +474,7 @@ def __init__( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(image_or_video) + height, width = get_size(image_or_video) transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -568,7 +568,7 @@ def _sample_dirichlet(self, params: torch.Tensor) -> torch.Tensor: def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, orig_image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_spatial_size(orig_image_or_video) + height, width = get_size(orig_image_or_video) if isinstance(orig_image_or_video, torch.Tensor): image_or_video = orig_image_or_video diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index a64f7a40e4b..9e7ca64d41c 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -22,7 +22,7 @@ _setup_float_or_seq, _setup_size, ) -from .utils import has_all, has_any, is_simple_tensor, query_bounding_boxes, query_spatial_size +from .utils import has_all, has_any, is_simple_tensor, query_bounding_boxes, query_size class RandomHorizontalFlip(_RandomApplyTransform): @@ -267,7 +267,7 @@ def __init__( self._log_ratio = torch.log(torch.tensor(self.ratio)) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) area = height * width log_ratio = self._log_ratio @@ -558,7 +558,7 @@ def __init__( raise ValueError(f"Invalid canvas side range provided {side_range}.") def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(flat_inputs) + orig_h, orig_w = query_size(flat_inputs) r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0]) canvas_width = int(orig_w * r) @@ -735,7 +735,7 @@ def __init__( self.center = center def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item() if self.translate is not None: @@ -859,7 +859,7 @@ def __init__( self.padding_mode = padding_mode def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - padded_height, padded_width = query_spatial_size(flat_inputs) + padded_height, padded_width = query_size(flat_inputs) if self.padding is not None: pad_left, pad_right, pad_top, pad_bottom = self.padding @@ -972,7 +972,7 @@ def __init__( self._fill = _setup_fill_arg(fill) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - height, width = query_spatial_size(flat_inputs) + height, width = query_size(flat_inputs) distortion_scale = self.distortion_scale @@ -1072,7 +1072,7 @@ def __init__( self._fill = _setup_fill_arg(fill) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - size = list(query_spatial_size(flat_inputs)) + size = list(query_size(flat_inputs)) dx = torch.rand([1, 1] + size) * 2 - 1 if self.sigma[0] > 0.0: @@ -1164,7 +1164,7 @@ def _check_inputs(self, flat_inputs: 
List[Any]) -> None: ) def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_h, orig_w = query_spatial_size(flat_inputs) + orig_h, orig_w = query_size(flat_inputs) bboxes = query_bounding_boxes(flat_inputs) while True: @@ -1282,7 +1282,7 @@ def __init__( self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(flat_inputs) + orig_height, orig_width = query_size(flat_inputs) scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0]) r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale @@ -1347,7 +1347,7 @@ def __init__( self.antialias = antialias def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: - orig_height, orig_width = query_spatial_size(flat_inputs) + orig_height, orig_width = query_size(flat_inputs) min_size = self.min_size[int(torch.randint(len(self.min_size), ()))] r = min_size / min(orig_height, orig_width) diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 71cc159c907..f0b62221083 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -30,7 +30,7 @@ def _transform(self, inpt: datapoints.BoundingBoxes, params: Dict[str, Any]) -> class ClampBoundingBoxes(Transform): """[BETA] Clamp bounding boxes to their corresponding image dimensions. - The clamping is done according to the bounding boxes' ``spatial_size`` meta-data. + The clamping is done according to the bounding boxes' ``canvas_size`` meta-data. .. v2betastatus:: ClampBoundingBoxes transform diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index a4cb594b2b3..a799070ee1e 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -408,7 +408,7 @@ def forward(self, *inputs: Any) -> Any: valid = (ws >= self.min_size) & (hs >= self.min_size) & (boxes >= 0).all(dim=-1) # TODO: Do we really need to check for out of bounds here? All # transforms should be clamping anyway, so this should never happen? 
- image_h, image_w = boxes.spatial_size + image_h, image_w = boxes.canvas_size valid &= (boxes[:, 0] <= image_w) & (boxes[:, 2] <= image_w) valid &= (boxes[:, 1] <= image_h) & (boxes[:, 3] <= image_h) diff --git a/torchvision/transforms/v2/functional/__init__.py b/torchvision/transforms/v2/functional/__init__.py index 16f5ff50071..24b4b4218e0 100644 --- a/torchvision/transforms/v2/functional/__init__.py +++ b/torchvision/transforms/v2/functional/__init__.py @@ -15,12 +15,12 @@ get_num_channels_image_pil, get_num_channels_video, get_num_channels, - get_spatial_size_bounding_boxes, - get_spatial_size_image_tensor, - get_spatial_size_image_pil, - get_spatial_size_mask, - get_spatial_size_video, - get_spatial_size, + get_size_bounding_boxes, + get_size_image_tensor, + get_size_image_pil, + get_size_mask, + get_size_video, + get_size, ) # usort: skip from ._augment import erase, erase_image_pil, erase_image_tensor, erase_video diff --git a/torchvision/transforms/v2/functional/_deprecated.py b/torchvision/transforms/v2/functional/_deprecated.py index 99097aecc66..f27d0b29deb 100644 --- a/torchvision/transforms/v2/functional/_deprecated.py +++ b/torchvision/transforms/v2/functional/_deprecated.py @@ -19,6 +19,6 @@ def to_tensor(inpt: Any) -> torch.Tensor: def get_image_size(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]) -> List[int]: warnings.warn( "The function `get_image_size(...)` is deprecated and will be removed in a future release. " - "Instead, please use `get_spatial_size(...)` which returns `[h, w]` instead of `[w, h]`." + "Instead, please use `get_size(...)` which returns `[h, w]` instead of `[w, h]`." ) return _F.get_image_size(inpt) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 469e58ff9c4..a24507256be 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -23,7 +23,7 @@ from torchvision.utils import _log_api_usage_once -from ._meta import clamp_bounding_boxes, convert_format_bounding_boxes, get_spatial_size_image_pil +from ._meta import clamp_bounding_boxes, convert_format_bounding_boxes, get_size_image_pil from ._utils import is_simple_tensor @@ -52,18 +52,18 @@ def horizontal_flip_mask(mask: torch.Tensor) -> torch.Tensor: def horizontal_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_boxes.shape bounding_boxes = bounding_boxes.clone().reshape(-1, 4) if format == datapoints.BoundingBoxFormat.XYXY: - bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(spatial_size[1]).neg_() + bounding_boxes[:, [2, 0]] = bounding_boxes[:, [0, 2]].sub_(canvas_size[1]).neg_() elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(spatial_size[1]).neg_() + bounding_boxes[:, 0].add_(bounding_boxes[:, 2]).sub_(canvas_size[1]).neg_() else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_boxes[:, 0].sub_(spatial_size[1]).neg_() + bounding_boxes[:, 0].sub_(canvas_size[1]).neg_() return bounding_boxes.reshape(shape) @@ -102,18 +102,18 @@ def vertical_flip_mask(mask: torch.Tensor) -> torch.Tensor: def vertical_flip_bounding_boxes( - bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_boxes: torch.Tensor, format: 
datapoints.BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: shape = bounding_boxes.shape bounding_boxes = bounding_boxes.clone().reshape(-1, 4) if format == datapoints.BoundingBoxFormat.XYXY: - bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(spatial_size[0]).neg_() + bounding_boxes[:, [1, 3]] = bounding_boxes[:, [3, 1]].sub_(canvas_size[0]).neg_() elif format == datapoints.BoundingBoxFormat.XYWH: - bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(spatial_size[0]).neg_() + bounding_boxes[:, 1].add_(bounding_boxes[:, 3]).sub_(canvas_size[0]).neg_() else: # format == datapoints.BoundingBoxFormat.CXCYWH: - bounding_boxes[:, 1].sub_(spatial_size[0]).neg_() + bounding_boxes[:, 1].sub_(canvas_size[0]).neg_() return bounding_boxes.reshape(shape) @@ -146,7 +146,7 @@ def vertical_flip(inpt: datapoints._InputTypeJIT) -> datapoints._InputTypeJIT: def _compute_resized_output_size( - spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> List[int]: if isinstance(size, int): size = [size] @@ -155,7 +155,7 @@ def _compute_resized_output_size( "max_size should only be passed if size specifies the length of the smaller edge, " "i.e. size should be an int or a sequence of length 1 in torchscript mode." ) - return __compute_resized_output_size(spatial_size, size=size, max_size=max_size) + return __compute_resized_output_size(canvas_size, size=size, max_size=max_size) def resize_image_tensor( @@ -275,13 +275,13 @@ def resize_mask(mask: torch.Tensor, size: List[int], max_size: Optional[int] = N def resize_bounding_boxes( - bounding_boxes: torch.Tensor, spatial_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None + bounding_boxes: torch.Tensor, canvas_size: Tuple[int, int], size: List[int], max_size: Optional[int] = None ) -> Tuple[torch.Tensor, Tuple[int, int]]: - old_height, old_width = spatial_size - new_height, new_width = _compute_resized_output_size(spatial_size, size=size, max_size=max_size) + old_height, old_width = canvas_size + new_height, new_width = _compute_resized_output_size(canvas_size, size=size, max_size=max_size) if (new_height, new_width) == (old_height, old_width): - return bounding_boxes, spatial_size + return bounding_boxes, canvas_size w_ratio = new_width / old_width h_ratio = new_height / old_height @@ -643,7 +643,7 @@ def affine_image_pil( # it is visually better to estimate the center without 0.5 offset # otherwise image rotated by 90 degrees is shifted vs output image of torch.rot90 or F_t.affine if center is None: - height, width = get_spatial_size_image_pil(image) + height, width = get_size_image_pil(image) center = [width * 0.5, height * 0.5] matrix = _get_inverse_affine_matrix(center, angle, translate, scale, shear) @@ -653,7 +653,7 @@ def affine_image_pil( def _affine_bounding_boxes_with_expand( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -662,7 +662,7 @@ def _affine_bounding_boxes_with_expand( expand: bool = False, ) -> Tuple[torch.Tensor, Tuple[int, int]]: if bounding_boxes.numel() == 0: - return bounding_boxes, spatial_size + return bounding_boxes, canvas_size original_shape = bounding_boxes.shape original_dtype = bounding_boxes.dtype @@ -680,7 +680,7 @@ def _affine_bounding_boxes_with_expand( ) if center is None: - height, width = spatial_size + height, width = 
canvas_size center = [width * 0.5, height * 0.5] affine_vector = _get_inverse_affine_matrix(center, angle, translate, scale, shear, inverted=False) @@ -710,7 +710,7 @@ def _affine_bounding_boxes_with_expand( if expand: # Compute minimum point for transformed image frame: # Points are Top-Left, Top-Right, Bottom-Left, Bottom-Right points. - height, width = spatial_size + height, width = canvas_size points = torch.tensor( [ [0.0, 0.0, 1.0], @@ -728,21 +728,21 @@ def _affine_bounding_boxes_with_expand( # Estimate meta-data for image with inverted=True and with center=[0,0] affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear) new_width, new_height = _compute_affine_output_size(affine_vector, width, height) - spatial_size = (new_height, new_width) + canvas_size = (new_height, new_width) - out_bboxes = clamp_bounding_boxes(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size) + out_bboxes = clamp_bounding_boxes(out_bboxes, format=datapoints.BoundingBoxFormat.XYXY, canvas_size=canvas_size) out_bboxes = convert_format_bounding_boxes( out_bboxes, old_format=datapoints.BoundingBoxFormat.XYXY, new_format=format, inplace=True ).reshape(original_shape) out_bboxes = out_bboxes.to(original_dtype) - return out_bboxes, spatial_size + return out_bboxes, canvas_size def affine_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: Union[int, float], translate: List[float], scale: float, @@ -752,7 +752,7 @@ def affine_bounding_boxes( out_box, _ = _affine_bounding_boxes_with_expand( bounding_boxes, format=format, - spatial_size=spatial_size, + canvas_size=canvas_size, angle=angle, translate=translate, scale=scale, @@ -930,7 +930,7 @@ def rotate_image_pil( def rotate_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], angle: float, expand: bool = False, center: Optional[List[float]] = None, @@ -941,7 +941,7 @@ def rotate_bounding_boxes( return _affine_bounding_boxes_with_expand( bounding_boxes, format=format, - spatial_size=spatial_size, + canvas_size=canvas_size, angle=-angle, translate=[0.0, 0.0], scale=1.0, @@ -1168,7 +1168,7 @@ def pad_mask( def pad_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], padding: List[int], padding_mode: str = "constant", ) -> Tuple[torch.Tensor, Tuple[int, int]]: @@ -1184,12 +1184,12 @@ def pad_bounding_boxes( pad = [left, top, 0, 0] bounding_boxes = bounding_boxes + torch.tensor(pad, dtype=bounding_boxes.dtype, device=bounding_boxes.device) - height, width = spatial_size + height, width = canvas_size height += top + bottom width += left + right - spatial_size = (height, width) + canvas_size = (height, width) - return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size + return clamp_bounding_boxes(bounding_boxes, format=format, canvas_size=canvas_size), canvas_size def pad_video( @@ -1261,9 +1261,9 @@ def crop_bounding_boxes( sub = [left, top, 0, 0] bounding_boxes = bounding_boxes - torch.tensor(sub, dtype=bounding_boxes.dtype, device=bounding_boxes.device) - spatial_size = (height, width) + canvas_size = (height, width) - return clamp_bounding_boxes(bounding_boxes, format=format, spatial_size=spatial_size), spatial_size + return clamp_bounding_boxes(bounding_boxes, format=format, 
canvas_size=canvas_size), canvas_size def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor: @@ -1412,7 +1412,7 @@ def perspective_image_pil( def perspective_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], startpoints: Optional[List[List[int]]], endpoints: Optional[List[List[int]]], coefficients: Optional[List[float]] = None, @@ -1493,7 +1493,7 @@ def perspective_bounding_boxes( out_bboxes = clamp_bounding_boxes( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype), format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=canvas_size, ) # out_bboxes should be of shape [N boxes, 4] @@ -1651,7 +1651,7 @@ def _create_identity_grid(size: Tuple[int, int], device: torch.device, dtype: to def elastic_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], displacement: torch.Tensor, ) -> torch.Tensor: if bounding_boxes.numel() == 0: @@ -1670,7 +1670,7 @@ def elastic_bounding_boxes( convert_format_bounding_boxes(bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY) ).reshape(-1, 4) - id_grid = _create_identity_grid(spatial_size, device=device, dtype=dtype) + id_grid = _create_identity_grid(canvas_size, device=device, dtype=dtype) # We construct an approximation of inverse grid as inv_grid = id_grid - displacement # This is not an exact inverse of the grid inv_grid = id_grid.sub_(displacement) @@ -1683,7 +1683,7 @@ def elastic_bounding_boxes( index_x, index_y = index_xy[:, 0], index_xy[:, 1] # Transform points: - t_size = torch.tensor(spatial_size[::-1], device=displacement.device, dtype=displacement.dtype) + t_size = torch.tensor(canvas_size[::-1], device=displacement.device, dtype=displacement.dtype) transformed_points = inv_grid[0, index_y, index_x, :].add_(1).mul_(0.5 * t_size).sub_(0.5) transformed_points = transformed_points.reshape(-1, 4, 2) @@ -1691,7 +1691,7 @@ def elastic_bounding_boxes( out_bboxes = clamp_bounding_boxes( torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_boxes.dtype), format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=spatial_size, + canvas_size=canvas_size, ) return convert_format_bounding_boxes( @@ -1804,13 +1804,13 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor @torch.jit.unused def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL.Image.Image: crop_height, crop_width = _center_crop_parse_output_size(output_size) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_height > image_height or crop_width > image_width: padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) image = pad_image_pil(image, padding_ltrb, fill=0) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_width == image_width and crop_height == image_height: return image @@ -1821,11 +1821,11 @@ def center_crop_image_pil(image: PIL.Image.Image, output_size: List[int]) -> PIL def center_crop_bounding_boxes( bounding_boxes: torch.Tensor, format: datapoints.BoundingBoxFormat, - spatial_size: Tuple[int, int], + canvas_size: Tuple[int, int], output_size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: crop_height, crop_width = 
_center_crop_parse_output_size(output_size) - crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *spatial_size) + crop_top, crop_left = _center_crop_compute_crop_anchor(crop_height, crop_width, *canvas_size) return crop_bounding_boxes( bounding_boxes, format, top=crop_top, left=crop_left, height=crop_height, width=crop_width ) @@ -1905,7 +1905,7 @@ def resized_crop_bounding_boxes( size: List[int], ) -> Tuple[torch.Tensor, Tuple[int, int]]: bounding_boxes, _ = crop_bounding_boxes(bounding_boxes, format, top, left, height, width) - return resize_bounding_boxes(bounding_boxes, spatial_size=(height, width), size=size) + return resize_bounding_boxes(bounding_boxes, canvas_size=(height, width), size=size) def resized_crop_mask( @@ -2000,7 +2000,7 @@ def five_crop_image_pil( image: PIL.Image.Image, size: List[int] ) -> Tuple[PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image, PIL.Image.Image]: crop_height, crop_width = _parse_five_crop_size(size) - image_height, image_width = get_spatial_size_image_pil(image) + image_height, image_width = get_size_image_pil(image) if crop_width > image_width or crop_height > image_height: raise ValueError(f"Requested crop size {size} is bigger than input size {(image_height, image_width)}") diff --git a/torchvision/transforms/v2/functional/_meta.py b/torchvision/transforms/v2/functional/_meta.py index f564b180389..91b370675b9 100644 --- a/torchvision/transforms/v2/functional/_meta.py +++ b/torchvision/transforms/v2/functional/_meta.py @@ -26,23 +26,29 @@ def get_dimensions_image_tensor(image: torch.Tensor) -> List[int]: get_dimensions_image_pil = _FP.get_dimensions +def get_dimensions_video(video: torch.Tensor) -> List[int]: + return get_dimensions_image_tensor(video) + + def get_dimensions(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoTypeJIT]) -> List[int]: if not torch.jit.is_scripting(): _log_api_usage_once(get_dimensions) if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_dimensions_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)): - channels = inpt.num_channels - height, width = inpt.spatial_size - return [channels, height, width] - elif isinstance(inpt, PIL.Image.Image): - return get_dimensions_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " - f"but got {type(inpt)} instead." - ) + + for typ, get_size_fn in { + datapoints.Image: get_dimensions_image_tensor, + datapoints.Video: get_dimensions_video, + PIL.Image.Image: get_dimensions_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " + f"but got {type(inpt)} instead." + ) def get_num_channels_image_tensor(image: torch.Tensor) -> int: @@ -69,15 +75,19 @@ def get_num_channels(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoType if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_num_channels_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video)): - return inpt.num_channels - elif isinstance(inpt, PIL.Image.Image): - return get_num_channels_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " - f"but got {type(inpt)} instead." 
- ) + + for typ, get_size_fn in { + datapoints.Image: get_num_channels_image_tensor, + datapoints.Video: get_num_channels_video, + PIL.Image.Image: get_num_channels_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, an `Image` or `Video` datapoint, or a PIL image, " + f"but got {type(inpt)} instead." + ) # We changed the names to ensure it can be used not only for images but also videos. Thus, we just alias it without @@ -85,7 +95,7 @@ def get_num_channels(inpt: Union[datapoints._ImageTypeJIT, datapoints._VideoType get_image_num_channels = get_num_channels -def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: +def get_size_image_tensor(image: torch.Tensor) -> List[int]: hw = list(image.shape[-2:]) ndims = len(hw) if ndims == 2: @@ -95,39 +105,48 @@ def get_spatial_size_image_tensor(image: torch.Tensor) -> List[int]: @torch.jit.unused -def get_spatial_size_image_pil(image: PIL.Image.Image) -> List[int]: +def get_size_image_pil(image: PIL.Image.Image) -> List[int]: width, height = _FP.get_image_size(image) return [height, width] -def get_spatial_size_video(video: torch.Tensor) -> List[int]: - return get_spatial_size_image_tensor(video) +def get_size_video(video: torch.Tensor) -> List[int]: + return get_size_image_tensor(video) -def get_spatial_size_mask(mask: torch.Tensor) -> List[int]: - return get_spatial_size_image_tensor(mask) +def get_size_mask(mask: torch.Tensor) -> List[int]: + return get_size_image_tensor(mask) @torch.jit.unused -def get_spatial_size_bounding_boxes(bounding_boxes: datapoints.BoundingBoxes) -> List[int]: - return list(bounding_boxes.spatial_size) +def get_size_bounding_boxes(bounding_box: datapoints.BoundingBoxes) -> List[int]: + return list(bounding_box.canvas_size) -def get_spatial_size(inpt: datapoints._InputTypeJIT) -> List[int]: +def get_size(inpt: datapoints._InputTypeJIT) -> List[int]: if not torch.jit.is_scripting(): - _log_api_usage_once(get_spatial_size) + _log_api_usage_once(get_size) if torch.jit.is_scripting() or is_simple_tensor(inpt): - return get_spatial_size_image_tensor(inpt) - elif isinstance(inpt, (datapoints.Image, datapoints.Video, datapoints.BoundingBoxes, datapoints.Mask)): - return list(inpt.spatial_size) - elif isinstance(inpt, PIL.Image.Image): - return get_spatial_size_image_pil(inpt) - else: - raise TypeError( - f"Input can either be a plain tensor, any TorchVision datapoint, or a PIL image, " - f"but got {type(inpt)} instead." - ) + return get_size_image_tensor(inpt) + + # TODO: This is just the poor man's version of a dispatcher. This will be properly addressed with + # https://github.com/pytorch/vision/pull/7747 when we can register the kernels above without the need to have + # a method on the datapoint class. + for typ, get_size_fn in { + datapoints.Image: get_size_image_tensor, + datapoints.BoundingBoxes: get_size_bounding_boxes, + datapoints.Mask: get_size_mask, + datapoints.Video: get_size_video, + PIL.Image.Image: get_size_image_pil, + }.items(): + if isinstance(inpt, typ): + return get_size_fn(inpt) + + raise TypeError( + f"Input can either be a plain tensor, any TorchVision datapoint, or a PIL image, " + f"but got {type(inpt)} instead." 
+ ) def get_num_frames_video(video: torch.Tensor) -> int: @@ -141,7 +160,7 @@ def get_num_frames(inpt: datapoints._VideoTypeJIT) -> int: if torch.jit.is_scripting() or is_simple_tensor(inpt): return get_num_frames_video(inpt) elif isinstance(inpt, datapoints.Video): - return inpt.num_frames + return get_num_frames_video(inpt) else: raise TypeError(f"Input can either be a plain tensor or a `Video` datapoint, but got {type(inpt)} instead.") @@ -240,7 +259,7 @@ def convert_format_bounding_boxes( def _clamp_bounding_boxes( - bounding_boxes: torch.Tensor, format: BoundingBoxFormat, spatial_size: Tuple[int, int] + bounding_boxes: torch.Tensor, format: BoundingBoxFormat, canvas_size: Tuple[int, int] ) -> torch.Tensor: # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every # BoundingBoxFormat instead of converting back and forth @@ -249,8 +268,8 @@ def _clamp_bounding_boxes( xyxy_boxes = convert_format_bounding_boxes( bounding_boxes, old_format=format, new_format=datapoints.BoundingBoxFormat.XYXY, inplace=True ) - xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1]) - xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0]) + xyxy_boxes[..., 0::2].clamp_(min=0, max=canvas_size[1]) + xyxy_boxes[..., 1::2].clamp_(min=0, max=canvas_size[0]) out_boxes = convert_format_bounding_boxes( xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True ) @@ -260,21 +279,20 @@ def _clamp_bounding_boxes( def clamp_bounding_boxes( inpt: datapoints._InputTypeJIT, format: Optional[BoundingBoxFormat] = None, - spatial_size: Optional[Tuple[int, int]] = None, + canvas_size: Optional[Tuple[int, int]] = None, ) -> datapoints._InputTypeJIT: if not torch.jit.is_scripting(): _log_api_usage_once(clamp_bounding_boxes) if torch.jit.is_scripting() or is_simple_tensor(inpt): - if format is None or spatial_size is None: - raise ValueError("For simple tensor inputs, `format` and `spatial_size` has to be passed.") - return _clamp_bounding_boxes(inpt, format=format, spatial_size=spatial_size) + + if format is None or canvas_size is None: + raise ValueError("For simple tensor inputs, `format` and `canvas_size` have to be passed.") + return _clamp_bounding_boxes(inpt, format=format, canvas_size=canvas_size) elif isinstance(inpt, datapoints.BoundingBoxes): - if format is not None or spatial_size is not None: - raise ValueError("For bounding box datapoint inputs, `format` and `spatial_size` must not be passed.") - output = _clamp_bounding_boxes( - inpt.as_subclass(torch.Tensor), format=inpt.format, spatial_size=inpt.spatial_size - ) + if format is not None or canvas_size is not None: + raise ValueError("For bounding box datapoint inputs, `format` and `canvas_size` must not be passed.") + output = _clamp_bounding_boxes(inpt.as_subclass(torch.Tensor), format=inpt.format, canvas_size=inpt.canvas_size) return datapoints.BoundingBoxes.wrap_like(inpt, output) else: raise TypeError( diff --git a/torchvision/transforms/v2/utils.py b/torchvision/transforms/v2/utils.py index 978333296d0..dd9f4489dee 100644 --- a/torchvision/transforms/v2/utils.py +++ b/torchvision/transforms/v2/utils.py @@ -6,15 +6,15 @@ from torchvision import datapoints from torchvision._utils import sequence_to_str -from torchvision.transforms.v2.functional import get_dimensions, get_spatial_size, is_simple_tensor +from torchvision.transforms.v2.functional import get_dimensions, get_size, is_simple_tensor def query_bounding_boxes(flat_inputs: List[Any]) -> datapoints.BoundingBoxes: bounding_boxes = [inpt 
for inpt in flat_inputs if isinstance(inpt, datapoints.BoundingBoxes)] if not bounding_boxes: - raise TypeError("No bounding box was found in the sample") + raise TypeError("No bounding boxes were found in the sample") elif len(bounding_boxes) > 1: - raise ValueError("Found multiple bounding boxes in the sample") + raise ValueError("Found multiple bounding box instances in the sample") return bounding_boxes.pop() @@ -22,7 +22,7 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: chws = { tuple(get_dimensions(inpt)) for inpt in flat_inputs - if isinstance(inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video)) or is_simple_tensor(inpt) + if check_type(inpt, (is_simple_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video)) } if not chws: raise TypeError("No image or video was found in the sample") @@ -32,14 +32,21 @@ def query_chw(flat_inputs: List[Any]) -> Tuple[int, int, int]: return c, h, w -def query_spatial_size(flat_inputs: List[Any]) -> Tuple[int, int]: +def query_size(flat_inputs: List[Any]) -> Tuple[int, int]: sizes = { - tuple(get_spatial_size(inpt)) + tuple(get_size(inpt)) for inpt in flat_inputs - if isinstance( inpt, (datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBoxes) + if check_type( inpt, ( is_simple_tensor, datapoints.Image, PIL.Image.Image, datapoints.Video, datapoints.Mask, datapoints.BoundingBoxes, ), ) - or is_simple_tensor(inpt) } if not sizes: raise TypeError("No image, video, mask or bounding box was found in the sample")
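
For reference, a minimal usage sketch of the renamed API (not part of the patch; it assumes a torchvision build that already contains the changes above, and the image size and box values are illustrative only):

import torch

from torchvision import datapoints
from torchvision.transforms.v2 import functional as F
from torchvision.transforms.v2.utils import query_size

# An image datapoint; its size is read with get_size (formerly get_spatial_size) as [H, W].
image = datapoints.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))

# Bounding boxes now carry `canvas_size` (formerly `spatial_size`): the height/width of the
# image they are defined on.
boxes = datapoints.BoundingBoxes(
    [[17, 16, 344, 479]],
    format=datapoints.BoundingBoxFormat.XYXY,
    canvas_size=F.get_size(image),
)

# For BoundingBoxes datapoints, clamp_bounding_boxes reads format/canvas_size from the input ...
clamped = F.clamp_bounding_boxes(boxes)

# ... while plain tensors require both to be passed explicitly.
clamped_plain = F.clamp_bounding_boxes(
    boxes.as_subclass(torch.Tensor),
    format=datapoints.BoundingBoxFormat.XYXY,
    canvas_size=(480, 640),
)

# query_size (formerly query_spatial_size) agrees across all inputs in a sample.
assert query_size([image, boxes, clamped]) == (480, 640)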