diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index d26b21fbc7e..7eb92ae1d8b 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -390,21 +390,6 @@ def was_applied(output, inpt):
         assert transform.was_applied(output, input)
 
 
-class TestPad:
-    def test_assertions(self):
-        with pytest.raises(TypeError, match="Got inappropriate padding arg"):
-            transforms.Pad("abc")
-
-        with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
-            transforms.Pad([-0.7, 0, 0.7])
-
-        with pytest.raises(TypeError, match="Got inappropriate fill arg"):
-            transforms.Pad(12, fill="abc")
-
-        with pytest.raises(ValueError, match="Padding mode should be either"):
-            transforms.Pad(12, padding_mode="abc")
-
-
 class TestRandomZoomOut:
     def test_assertions(self):
         with pytest.raises(TypeError, match="Got inappropriate fill arg"):
diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py
index 3ad88ec1e51..92635f8cdbd 100644
--- a/test/test_transforms_v2_consistency.py
+++ b/test/test_transforms_v2_consistency.py
@@ -109,21 +109,6 @@ def __init__(
         ],
         make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]),
     ),
-    ConsistencyConfig(
-        v2_transforms.Pad,
-        legacy_transforms.Pad,
-        [
-            NotScriptableArgsKwargs(3),
-            ArgsKwargs([3]),
-            ArgsKwargs([2, 3]),
-            ArgsKwargs([3, 2, 1, 4]),
-            NotScriptableArgsKwargs(5, fill=1, padding_mode="constant"),
-            ArgsKwargs([5], fill=1, padding_mode="constant"),
-            NotScriptableArgsKwargs(5, padding_mode="edge"),
-            NotScriptableArgsKwargs(5, padding_mode="reflect"),
-            NotScriptableArgsKwargs(5, padding_mode="symmetric"),
-        ],
-    ),
     *[
         ConsistencyConfig(
             v2_transforms.LinearTransformation,
diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py
index a7d8496708c..cb72e8ce28b 100644
--- a/test/test_transforms_v2_functional.py
+++ b/test/test_transforms_v2_functional.py
@@ -524,75 +524,6 @@ def test_tv_tensor_explicit_metadata(self, metadata):
 # `transforms_v2_kernel_infos.py`
 
 
-def _parse_padding(padding):
-    if isinstance(padding, int):
-        return [padding] * 4
-    if isinstance(padding, list):
-        if len(padding) == 1:
-            return padding * 4
-        if len(padding) == 2:
-            return padding * 2  # [left, up, right, down]
-
-    return padding
-
-
-@pytest.mark.parametrize("device", cpu_and_cuda())
-@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]])
-def test_correctness_pad_bounding_boxes(device, padding):
-    def _compute_expected_bbox(bbox, format, padding_):
-        pad_left, pad_up, _, _ = _parse_padding(padding_)
-
-        dtype = bbox.dtype
-        bbox = (
-            bbox.clone()
-            if format == tv_tensors.BoundingBoxFormat.XYXY
-            else convert_bounding_box_format(bbox, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXY)
-        )
-
-        bbox[0::2] += pad_left
-        bbox[1::2] += pad_up
-
-        bbox = convert_bounding_box_format(bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format)
-        if bbox.dtype != dtype:
-            # Temporary cast to original dtype
-            # e.g. float32 -> int
-            bbox = bbox.to(dtype)
-        return bbox
-
-    def _compute_expected_canvas_size(bbox, padding_):
-        pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_)
-        height, width = bbox.canvas_size
-        return height + pad_up + pad_down, width + pad_left + pad_right
-
-    for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)):
-        bboxes = bboxes.to(device)
-        bboxes_format = bboxes.format
-        bboxes_canvas_size = bboxes.canvas_size
-
-        output_boxes, output_canvas_size = F.pad_bounding_boxes(
-            bboxes, format=bboxes_format, canvas_size=bboxes_canvas_size, padding=padding
-        )
-
-        torch.testing.assert_close(output_canvas_size, _compute_expected_canvas_size(bboxes, padding))
-
-        expected_bboxes = torch.stack(
-            [_compute_expected_bbox(b, bboxes_format, padding) for b in bboxes.reshape(-1, 4).unbind()]
-        ).reshape(bboxes.shape)
-
-        torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0)
-
-
-@pytest.mark.parametrize("device", cpu_and_cuda())
-def test_correctness_pad_segmentation_mask_on_fixed_input(device):
-    mask = torch.ones((1, 3, 3), dtype=torch.long, device=device)
-
-    out_mask = F.pad_mask(mask, padding=[1, 1, 1, 1])
-
-    expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device)
-    expected_mask[:, 1:-1, 1:-1] = 1
-    torch.testing.assert_close(out_mask, expected_mask)
-
-
 @pytest.mark.parametrize("device", cpu_and_cuda())
 @pytest.mark.parametrize(
     "startpoints, endpoints",
diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py
index 99fecb0ce9d..e8a0e11b85e 100644
--- a/test/test_transforms_v2_refactored.py
+++ b/test/test_transforms_v2_refactored.py
@@ -3346,3 +3346,171 @@ def test_transform_errors_warnings(self):
         for param in ["scale", "ratio"]:
             with pytest.warns(match="Scale and ratio should be of kind"):
                 transforms.RandomResizedCrop(size=self.INPUT_SIZE, **{param: [1, 0]})
+
+
+class TestPad:
+    EXHAUSTIVE_TYPE_PADDINGS = [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
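+    # Only a representative subset is used for the correctness tests below:
+    # plain ints and lists with more than one element. The remaining variants
+    # exist for type coverage and are exercised in test_kernel_image.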
+    CORRECTNESS_PADDINGS = [
+        padding
+        for padding in EXHAUSTIVE_TYPE_PADDINGS
+        if isinstance(padding, int) or isinstance(padding, list) and len(padding) > 1
+    ]
+    PADDING_MODES = ["constant", "symmetric", "edge", "reflect"]
+
+    @param_value_parametrization(
+        padding=EXHAUSTIVE_TYPE_PADDINGS,
+        fill=EXHAUSTIVE_TYPE_FILLS,
+        padding_mode=PADDING_MODES,
+    )
+    @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_kernel_image(self, param, value, dtype, device):
+        if param == "fill":
+            value = adapt_fill(value, dtype=dtype)
+        kwargs = {param: value}
+        if param != "padding":
+            kwargs["padding"] = [1]
+
+        image = make_image(dtype=dtype, device=device)
+
+        check_kernel(
+            F.pad_image,
+            image,
+            **kwargs,
+            check_scripted_vs_eager=not (
+                (param == "padding" and isinstance(value, int))
+                # See https://github.com/pytorch/vision/pull/7252#issue-1585585521 for details
+                or (
+                    param == "fill"
+                    and (
+                        isinstance(value, tuple) or (isinstance(value, list) and any(isinstance(v, int) for v in value))
+                    )
+                )
+            ),
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    def test_kernel_bounding_boxes(self, format):
+        bounding_boxes = make_bounding_boxes(format=format)
+        check_kernel(
+            F.pad_bounding_boxes,
+            bounding_boxes,
+            format=bounding_boxes.format,
+            canvas_size=bounding_boxes.canvas_size,
+            padding=[1],
+        )
+
+    @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"])
+    def test_kernel_bounding_boxes_errors(self, padding_mode):
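+        # Only constant padding is meaningful for bounding boxes, so every
+        # other mode is expected to raise.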
+        bounding_boxes = make_bounding_boxes()
+        with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"):
+            F.pad_bounding_boxes(
+                bounding_boxes,
+                format=bounding_boxes.format,
+                canvas_size=bounding_boxes.canvas_size,
+                padding=[1],
+                padding_mode=padding_mode,
+            )
+
+    @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+    def test_kernel_mask(self, make_mask):
+        check_kernel(F.pad_mask, make_mask(), padding=[1])
+
+    @pytest.mark.parametrize("fill", [[1], (0,), [1, 0, 1], (0, 1, 0)])
+    def test_kernel_mask_errors(self, fill):
+        with pytest.raises(ValueError, match="Non-scalar fill value is not supported"):
+            check_kernel(F.pad_mask, make_segmentation_mask(), padding=[1], fill=fill)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_functional(self, make_input):
+        check_functional(F.pad, make_input(), padding=[1])
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.pad_image, torch.Tensor),
+            # The PIL kernel uses fill=0 as default rather than fill=None like all the others.
+            # Since the whole fill story is already really inconsistent, we won't introduce yet another case to allow
+            # for this test to pass.
+            # See https://github.com/pytorch/vision/issues/6623 for a discussion.
+            # (F._pad_image_pil, PIL.Image.Image),
+            (F.pad_image, tv_tensors.Image),
+            (F.pad_bounding_boxes, tv_tensors.BoundingBoxes),
+            (F.pad_mask, tv_tensors.Mask),
+            (F.pad_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.pad, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_transform(self, make_input):
+        check_transform(transforms.Pad(padding=[1]), make_input())
+
+    def test_transform_errors(self):
+        with pytest.raises(TypeError, match="Got inappropriate padding arg"):
+            transforms.Pad("abc")
+
+        with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"):
+            transforms.Pad([-0.7, 0, 0.7])
+
+        with pytest.raises(TypeError, match="Got inappropriate fill arg"):
+            transforms.Pad(12, fill="abc")
+
+        with pytest.raises(ValueError, match="Padding mode should be either"):
+            transforms.Pad(12, padding_mode="abc")
+
+    @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS)
+    @pytest.mark.parametrize(
+        ("padding_mode", "fill"),
+        [
+            *[("constant", fill) for fill in CORRECTNESS_FILLS],
+            *[(padding_mode, None) for padding_mode in ["symmetric", "edge", "reflect"]],
+        ],
+    )
+    @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)])
+    def test_image_correctness(self, padding, padding_mode, fill, fn):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        actual = fn(image, padding=padding, padding_mode=padding_mode, fill=fill)
+        expected = F.to_image(F.pad(F.to_pil_image(image), padding=padding, padding_mode=padding_mode, fill=fill))
+
+        assert_equal(actual, expected)
+
+    def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding):
+        if isinstance(padding, int):
+            padding = [padding]
+        left, top, right, bottom = padding * (4 // len(padding))
+
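+        # A single int pads all four sides, a pair is interpreted as
+        # (left/right, top/bottom), and four values as (left, top, right,
+        # bottom); repeating the list 4 // len(padding) times normalizes all
+        # of them to the four-value form. Padding then merely translates the
+        # boxes by (left, top), which is modeled as an affine translation.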
+        affine_matrix = np.array(
+            [
+                [1, 0, left],
+                [0, 1, top],
+            ],
+        )
+
+        height = bounding_boxes.canvas_size[0] + top + bottom
+        width = bounding_boxes.canvas_size[1] + left + right
+
+        return reference_affine_bounding_boxes_helper(
+            bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)
+        )
+
+    @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS)
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("dtype", [torch.int64, torch.float32])
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)])
+    def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn):
+        bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device)
+
+        actual = fn(bounding_boxes, padding=padding)
+        expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding)
+
+        assert_equal(actual, expected)
diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py
index 0346ca89ad9..0071c6d3c51 100644
--- a/test/transforms_v2_dispatcher_infos.py
+++ b/test/transforms_v2_dispatcher_infos.py
@@ -1,7 +1,7 @@
 import pytest
 import torchvision.transforms.v2.functional as F
 from torchvision import tv_tensors
-from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition
+from transforms_v2_kernel_infos import KERNEL_INFOS
 from transforms_v2_legacy_utils import InfoBase, TestMark
 
 __all__ = ["DispatcherInfo", "DISPATCHER_INFOS"]
@@ -111,20 +111,6 @@ def xfail_jit_python_scalar_arg(name, *, reason=None):
 
 
 DISPATCHER_INFOS = [
-    DispatcherInfo(
-        F.pad,
-        kernels={
-            tv_tensors.Image: F.pad_image,
-            tv_tensors.Video: F.pad_video,
-            tv_tensors.BoundingBoxes: F.pad_bounding_boxes,
-            tv_tensors.Mask: F.pad_mask,
-        },
-        pil_kernel_info=PILKernelInfo(F._pad_image_pil, kernel_name="pad_image_pil"),
-        test_marks=[
-            xfail_jit("F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition),
-            xfail_jit_python_scalar_arg("padding"),
-        ],
-    ),
     DispatcherInfo(
         F.perspective,
         kernels={
diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py
index e6efb7a349a..3e0d2e0eeca 100644
--- a/test/transforms_v2_kernel_infos.py
+++ b/test/transforms_v2_kernel_infos.py
@@ -1,14 +1,13 @@
 import functools
 import itertools
 
-import numpy as np
 import PIL.Image
 import pytest
 import torch.testing
 import torchvision.transforms.v2.functional as F
 from torchvision import tv_tensors
-from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding
-from transforms_v2_legacy_utils import (
+from torchvision.transforms._functional_tensor import _max_value as get_max_value
+from transforms_v2_legacy_utils import (  # noqa: F401
     ArgsKwargs,
     combinations_grid,
     DEFAULT_PORTRAIT_SPATIAL_SIZE,
@@ -183,211 +182,6 @@ def float32_vs_uint8_fill_adapter(other_args, kwargs):
     return other_args, dict(kwargs, fill=fill)
 
 
-def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, canvas_size, affine_matrix):
-    def transform(bbox, affine_matrix_, format_, canvas_size_):
-        # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1
-        in_dtype = bbox.dtype
-        if not torch.is_floating_point(bbox):
-            bbox = bbox.float()
-        bbox_xyxy = F.convert_bounding_box_format(
-            bbox.as_subclass(torch.Tensor),
-            old_format=format_,
-            new_format=tv_tensors.BoundingBoxFormat.XYXY,
-            inplace=True,
-        )
-        points = np.array(
-            [
-                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
-                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
-                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
-                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
-            ]
-        )
-        transformed_points = np.matmul(points, affine_matrix_.T)
-        out_bbox = torch.tensor(
-            [
-                np.min(transformed_points[:, 0]).item(),
-                np.min(transformed_points[:, 1]).item(),
-                np.max(transformed_points[:, 0]).item(),
-                np.max(transformed_points[:, 1]).item(),
-            ],
-            dtype=bbox_xyxy.dtype,
-        )
-        out_bbox = F.convert_bounding_box_format(
-            out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_, inplace=True
-        )
-        # It is important to clamp before casting, especially for CXCYWH format, dtype=int64
-        out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_)
-        out_bbox = out_bbox.to(dtype=in_dtype)
-        return out_bbox
-
-    return torch.stack(
-        [transform(b, affine_matrix, format, canvas_size) for b in bounding_boxes.reshape(-1, 4).unbind()]
-    ).reshape(bounding_boxes.shape)
-
-
-_PAD_PARAMS = combinations_grid(
-    padding=[[1], [1, 1], [1, 1, 2, 2]],
-    padding_mode=["constant", "symmetric", "edge", "reflect"],
-)
-
-
-def sample_inputs_pad_image_tensor():
-    make_pad_image_loaders = functools.partial(
-        make_image_loaders, sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32]
-    )
-
-    for image_loader, padding in itertools.product(
-        make_pad_image_loaders(),
-        [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]],
-    ):
-        yield ArgsKwargs(image_loader, padding=padding)
-
-    for image_loader in make_pad_image_loaders():
-        for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype):
-            yield ArgsKwargs(image_loader, padding=[1], fill=fill)
-
-    for image_loader, padding_mode in itertools.product(
-        # We branch for non-constant padding and integer inputs
-        make_pad_image_loaders(dtypes=[torch.uint8]),
-        ["constant", "symmetric", "edge", "reflect"],
-    ):
-        yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode)
-
-    # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides
-    # negative padding, this is already handled by the inputs above.
-    for image_loader in make_pad_image_loaders():
-        yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric")
-
-
-def reference_inputs_pad_image_tensor():
-    for image_loader, params in itertools.product(
-        make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]), _PAD_PARAMS
-    ):
-        for fill in get_fills(
-            num_channels=image_loader.num_channels,
-            dtype=image_loader.dtype,
-        ):
-            # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it?
-            if isinstance(fill, (list, tuple)):
-                continue
-
-            yield ArgsKwargs(image_loader, fill=fill, **params)
-
-
-def sample_inputs_pad_bounding_boxes():
-    for bounding_boxes_loader, padding in itertools.product(
-        make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
-    ):
-        yield ArgsKwargs(
-            bounding_boxes_loader,
-            format=bounding_boxes_loader.format,
-            canvas_size=bounding_boxes_loader.canvas_size,
-            padding=padding,
-            padding_mode="constant",
-        )
-
-
-def sample_inputs_pad_mask():
-    for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]):
-        yield ArgsKwargs(mask_loader, padding=[1])
-
-
-def reference_inputs_pad_mask():
-    for mask_loader, fill, params in itertools.product(
-        make_mask_loaders(num_objects=[1], extra_dims=[()]), [None, 127], _PAD_PARAMS
-    ):
-        yield ArgsKwargs(mask_loader, fill=fill, **params)
-
-
-def sample_inputs_pad_video():
-    for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]):
-        yield ArgsKwargs(video_loader, padding=[1])
-
-
-def reference_pad_bounding_boxes(bounding_boxes, *, format, canvas_size, padding, padding_mode):
-
-    left, right, top, bottom = _parse_pad_padding(padding)
-
-    affine_matrix = np.array(
-        [
-            [1, 0, left],
-            [0, 1, top],
-        ],
-        dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32",
-    )
-
-    height = canvas_size[0] + top + bottom
-    width = canvas_size[1] + left + right
-
-    expected_bboxes = reference_affine_bounding_boxes_helper(
-        bounding_boxes, format=format, canvas_size=(height, width), affine_matrix=affine_matrix
-    )
-    return expected_bboxes, (height, width)
-
-
-def reference_inputs_pad_bounding_boxes():
-    for bounding_boxes_loader, padding in itertools.product(
-        make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]]
-    ):
-        yield ArgsKwargs(
-            bounding_boxes_loader,
-            format=bounding_boxes_loader.format,
-            canvas_size=bounding_boxes_loader.canvas_size,
-            padding=padding,
-            padding_mode="constant",
-        )
-
-
-def pad_xfail_jit_fill_condition(args_kwargs):
-    fill = args_kwargs.kwargs.get("fill")
-    if not isinstance(fill, (list, tuple)):
-        return False
-    elif isinstance(fill, tuple):
-        return True
-    else:  # isinstance(fill, list):
-        return all(isinstance(f, int) for f in fill)
-
-
-KERNEL_INFOS.extend(
-    [
-        KernelInfo(
-            F.pad_image,
-            sample_inputs_fn=sample_inputs_pad_image_tensor,
-            reference_fn=pil_reference_wrapper(F._pad_image_pil),
-            reference_inputs_fn=reference_inputs_pad_image_tensor,
-            float32_vs_uint8=float32_vs_uint8_fill_adapter,
-            closeness_kwargs=float32_vs_uint8_pixel_difference(),
-            test_marks=[
-                xfail_jit_python_scalar_arg("padding"),
-                xfail_jit(
-                    "F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition
-                ),
-            ],
-        ),
-        KernelInfo(
-            F.pad_bounding_boxes,
-            sample_inputs_fn=sample_inputs_pad_bounding_boxes,
-            reference_fn=reference_pad_bounding_boxes,
-            reference_inputs_fn=reference_inputs_pad_bounding_boxes,
-            test_marks=[
-                xfail_jit_python_scalar_arg("padding"),
-            ],
-        ),
-        KernelInfo(
-            F.pad_mask,
-            sample_inputs_fn=sample_inputs_pad_mask,
-            reference_fn=pil_reference_wrapper(F._pad_image_pil),
-            reference_inputs_fn=reference_inputs_pad_mask,
-            float32_vs_uint8=float32_vs_uint8_fill_adapter,
-        ),
-        KernelInfo(
-            F.pad_video,
-            sample_inputs_fn=sample_inputs_pad_video,
-        ),
-    ]
-)
-
-
 _PERSPECTIVE_COEFFS = [
     [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
     [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],