From 2583987b24cc98dd7b691d93b65aeb507ac6599a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 13:16:03 +0200 Subject: [PATCH 01/17] extract make_* functions out of make_*_loader --- test/common_utils.py | 46 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index abefd07c43d..28d36855d63 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -492,6 +492,34 @@ def get_num_channels(color_space): return num_channels +def make_image( + spatial_size, + *, + color_space="RGB", + batch_dims=(), + dtype=torch.float32, + device="cpu", + constant_alpha=True, + memory_format=torch.contiguous_format, +): + spatial_size = _parse_spatial_size(spatial_size) + num_channels = get_num_channels(color_space) + max_value = get_max_value(dtype) + + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *spatial_size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha: + data[..., -1, :, :] = max_value + + return datapoints.Image(data) + + def make_image_loader( size="random", *, @@ -505,20 +533,20 @@ def make_image_loader( num_channels = get_num_channels(color_space) def fn(shape, dtype, device, memory_format): - max_value = get_max_value(dtype) - data = torch.testing.make_tensor( - shape, low=0, high=max_value, dtype=dtype, device=device, memory_format=memory_format + *batch_dims, _, spatial_size = shape + return make_image( + spatial_size, + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + constant_alpha=constant_alpha, + memory_format=memory_format, ) - if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha: - data[..., -1, :, :] = max_value - return datapoints.Image(data) return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, memory_format=memory_format) -make_image = from_loader(make_image_loader) - - def make_image_loaders( *, sizes=DEFAULT_SPATIAL_SIZES, From 093e6d6d58c5a971ce5b732c776313f36879f1c5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 13:35:42 +0200 Subject: [PATCH 02/17] fix arg name --- test/common_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 28d36855d63..cf7c8623ace 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -493,7 +493,7 @@ def get_num_channels(color_space): def make_image( - spatial_size, + size, *, color_space="RGB", batch_dims=(), @@ -502,7 +502,7 @@ def make_image( constant_alpha=True, memory_format=torch.contiguous_format, ): - spatial_size = _parse_spatial_size(spatial_size) + spatial_size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) max_value = get_max_value(dtype) From c1850ac64b7b621de1e53e762b6ed6a3956f6e5f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 13:59:50 +0200 Subject: [PATCH 03/17] port remaining helpers --- test/common_utils.py | 149 ++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 58 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index cf7c8623ace..f6cea90c3b0 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -496,10 +496,10 @@ def make_image( size, *, color_space="RGB", + constant_alpha=True, batch_dims=(), dtype=torch.float32, device="cpu", - constant_alpha=True, memory_format=torch.contiguous_format, ): spatial_size = 
_parse_spatial_size(size) @@ -629,59 +629,64 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): ).reshape(low.shape) +def make_bounding_box( + format=datapoints.BoundingBoxFormat.XYXY, spatial_size="random", batch_dims=(), dtype=torch.float32, device="cpu" +): + if isinstance(format, str): + format = datapoints.BoundingBoxFormat[format] + + spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") + + if any(dim == 0 for dim in batch_dims): + return datapoints.BoundingBox( + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size + ) + + height, width = spatial_size + if format == datapoints.BoundingBoxFormat.XYXY: + x1 = torch.randint(0, width // 2, batch_dims) + y1 = torch.randint(0, height // 2, batch_dims) + x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 + y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 + parts = (x1, y1, x2, y2) + elif format == datapoints.BoundingBoxFormat.XYWH: + x = torch.randint(0, width // 2, batch_dims) + y = torch.randint(0, height // 2, batch_dims) + w = randint_with_tensor_bounds(1, width - x) + h = randint_with_tensor_bounds(1, height - y) + parts = (x, y, w, h) + elif format == datapoints.BoundingBoxFormat.CXCYWH: + cx = torch.randint(1, width - 1, batch_dims) + cy = torch.randint(1, height - 1, batch_dims) + w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) + h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) + parts = (cx, cy, w, h) + else: + raise ValueError(f"Can't make bounding box in format {format}") + + return datapoints.BoundingBox( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size + ) + + def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] - if format not in { - datapoints.BoundingBoxFormat.XYXY, - datapoints.BoundingBoxFormat.XYWH, - datapoints.BoundingBoxFormat.CXCYWH, - }: - raise pytest.UsageError(f"Can't make bounding box in format {format}") spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") def fn(shape, dtype, device): - *extra_dims, num_coordinates = shape + *batch_dims, num_coordinates = shape if num_coordinates != 4: raise pytest.UsageError() - if any(dim == 0 for dim in extra_dims): - return datapoints.BoundingBox( - torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size - ) - - height, width = spatial_size - - if format == datapoints.BoundingBoxFormat.XYXY: - x1 = torch.randint(0, width // 2, extra_dims) - y1 = torch.randint(0, height // 2, extra_dims) - x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 - y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 - parts = (x1, y1, x2, y2) - elif format == datapoints.BoundingBoxFormat.XYWH: - x = torch.randint(0, width // 2, extra_dims) - y = torch.randint(0, height // 2, extra_dims) - w = randint_with_tensor_bounds(1, width - x) - h = randint_with_tensor_bounds(1, height - y) - parts = (x, y, w, h) - else: # format == features.BoundingBoxFormat.CXCYWH: - cx = torch.randint(1, width - 1, extra_dims) - cy = torch.randint(1, height - 1, extra_dims) - w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) - h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) - parts = (cx, cy, w, h) - - return datapoints.BoundingBox( - torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, 
spatial_size=spatial_size + return make_bounding_box( + format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device ) return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size) -make_bounding_box = from_loader(make_bounding_box_loader) - - def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, @@ -700,21 +705,31 @@ class MaskLoader(TensorLoader): pass +def make_detection_mask(size, *, num_objects="random", batch_dims=(), dtype=torch.bool, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + spatial_size = _parse_spatial_size(size) + num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects + + data = torch.testing.make_tensor( + (*batch_dims, num_objects, *spatial_size), low=0, high=2, dtype=dtype, device=device + ) + return datapoints.Mask(data) + + def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects size = _parse_spatial_size(size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects def fn(shape, dtype, device): - data = torch.testing.make_tensor(shape, low=0, high=2, dtype=dtype, device=device) - return datapoints.Mask(data) + *batch_dims, num_objects, height, width = shape + return make_detection_mask( + (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device + ) return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) -make_detection_mask = from_loader(make_detection_mask_loader) - - def make_detection_mask_loaders( sizes=DEFAULT_SPATIAL_SIZES, num_objects=(1, 0, "random"), @@ -728,19 +743,28 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): - # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values - size = _parse_spatial_size(size) +def make_segmentation_mask(size, *, num_categories="random", batch_dims=(), dtype=torch.uint8, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + spatial_size = _parse_spatial_size(size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories - def fn(shape, dtype, device): - data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=dtype, device=device) - return datapoints.Mask(data) + data = torch.testing.make_tensor( + (*batch_dims, *spatial_size), low=0, high=num_categories, dtype=dtype, device=device + ) + return datapoints.Mask(data) - return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) +def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): + # This produces "segmentation" masks, i.e. 
`(*, H, W)`, where the category is encoded in the values + spatial_size = _parse_spatial_size(size) + + def fn(shape, dtype, device): + *batch_dims, height, width = shape + return make_segmentation_mask( + (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device + ) -make_segmentation_mask = from_loader(make_segmentation_mask_loader) + return MaskLoader(fn, shape=(*extra_dims, *spatial_size), dtype=dtype) def make_segmentation_mask_loaders( @@ -778,6 +802,12 @@ class VideoLoader(ImageLoader): pass +def make_video(size, *, num_frames="random", batch_dims=(), **kwargs): + num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames + + return datapoints.Video(make_image(size=size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + def make_video_loader( size="random", *, @@ -790,17 +820,20 @@ def make_video_loader( num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames def fn(shape, dtype, device, memory_format): - video = make_image( - size=shape[-2:], extra_dims=shape[:-3], dtype=dtype, device=device, memory_format=memory_format + *batch_dims, num_frames, _, height, width = shape + return make_video( + (height, width), + num_frames=num_frames, + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + memory_format=memory_format, ) - return datapoints.Video(video) return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) -make_video = from_loader(make_video_loader) - - def make_video_loaders( *, sizes=DEFAULT_SPATIAL_SIZES, From ec7e472839693f9c78a6e02a5a806af7b0d6d72b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 21:15:47 +0200 Subject: [PATCH 04/17] update --- test/common_utils.py | 22 +++++++++++--------- test/test_transforms_v2.py | 19 ++++++++--------- test/test_transforms_v2_consistency.py | 17 +++++++++++----- test/test_transforms_v2_refactored.py | 28 +++++++++----------------- test/test_transforms_v2_utils.py | 4 ++-- 5 files changed, 46 insertions(+), 44 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index f6cea90c3b0..05d8325db3b 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -496,14 +496,14 @@ def make_image( size, *, color_space="RGB", - constant_alpha=True, batch_dims=(), - dtype=torch.float32, + dtype=None, device="cpu", memory_format=torch.contiguous_format, ): spatial_size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) + dtype = dtype or torch.uint8 max_value = get_max_value(dtype) data = torch.testing.make_tensor( @@ -514,7 +514,7 @@ def make_image( device=device, memory_format=memory_format, ) - if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha: + if color_space in {"GRAY_ALPHA", "RGBA"}: data[..., -1, :, :] = max_value return datapoints.Image(data) @@ -529,18 +529,19 @@ def make_image_loader( constant_alpha=True, memory_format=torch.contiguous_format, ): + if not constant_alpha: + raise ValueError("This should never happen") size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) def fn(shape, dtype, device, memory_format): - *batch_dims, _, spatial_size = shape + *batch_dims, _, height, width = shape return make_image( - spatial_size, + (height, width), color_space=color_space, batch_dims=batch_dims, dtype=dtype, device=device, - constant_alpha=constant_alpha, memory_format=memory_format, ) @@ -630,12 +631,13 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): def 
make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size="random", batch_dims=(), dtype=torch.float32, device="cpu" + format=datapoints.BoundingBoxFormat.XYXY, spatial_size="random", batch_dims=(), dtype=None, device="cpu" ): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") + dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): return datapoints.BoundingBox( @@ -705,10 +707,11 @@ class MaskLoader(TensorLoader): pass -def make_detection_mask(size, *, num_objects="random", batch_dims=(), dtype=torch.bool, device="cpu"): +def make_detection_mask(size, *, num_objects="random", batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" spatial_size = _parse_spatial_size(size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects + dtype = dtype or torch.bool data = torch.testing.make_tensor( (*batch_dims, num_objects, *spatial_size), low=0, high=2, dtype=dtype, device=device @@ -743,10 +746,11 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask(size, *, num_categories="random", batch_dims=(), dtype=torch.uint8, device="cpu"): +def make_segmentation_mask(size, *, num_categories="random", batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" spatial_size = _parse_spatial_size(size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories + dtype = dtype or torch.uint8 data = torch.testing.make_tensor( (*batch_dims, *spatial_size), low=0, high=num_categories, dtype=dtype, device=device diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 093c378aa72..db225429262 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -17,6 +17,7 @@ assert_equal, assert_run_python_script, cpu_and_cuda, + DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_bounding_boxes, make_detection_mask, @@ -167,8 +168,8 @@ class TestSmoke: @pytest.mark.parametrize( "image_or_video", [ - make_image(), - make_video(), + make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_pil_images(color_spaces=["RGB"])), next(make_vanilla_tensor_images()), ], @@ -182,13 +183,13 @@ def test_common(self, transform, adapter, container_type, image_or_video, device video_datapoint=make_video(size=spatial_size), image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), bounding_box_xyxy=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(3,) + format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) ), bounding_box_xywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, extra_dims=(4,) + format=datapoints.BoundingBoxFormat.XYWH, spatial_size=spatial_size, batch_dims=(4,) ), bounding_box_cxcywh=make_bounding_box( - format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, extra_dims=(5,) + format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, batch_dims=(5,) ), bounding_box_degenerate_xyxy=datapoints.BoundingBox( [ @@ -352,7 +353,7 @@ def test_random_resized_crop(self, transform, input): next(make_vanilla_tensor_images()), 
next(make_vanilla_tensor_images()), next(make_pil_images()), - make_image(), + make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_videos()), ], 3, @@ -1124,7 +1125,7 @@ def test__transform(self, mocker): transform = transforms.RandomIoUCrop() image = datapoints.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,)) + bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), batch_dims=(6,)) masks = make_detection_mask((32, 24), num_objects=6) sample = [image, bboxes, masks] @@ -1346,8 +1347,8 @@ class TestToDtype: ) def test_call(self, dtype, expected_dtypes): sample = dict( - video=make_video(dtype=torch.int64), - image=make_image(dtype=torch.uint8), + video=make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), + image=make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32), str="str", int=0, diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index f035dde45ed..ccaa973d8d2 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -17,6 +17,7 @@ ArgsKwargs, assert_close, assert_equal, + DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image, @@ -708,8 +709,14 @@ def test_call_consistency(config, args_kwargs): id=transform_cls.__name__, ) for transform_cls, get_params_args_kwargs in [ - (v2_transforms.RandomResizedCrop, ArgsKwargs(make_image(), scale=[0.3, 0.7], ratio=[0.5, 1.5])), - (v2_transforms.RandomErasing, ArgsKwargs(make_image(), scale=(0.3, 0.7), ratio=(0.5, 1.5))), + ( + v2_transforms.RandomResizedCrop, + ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), + ), + ( + v2_transforms.RandomErasing, + ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), + ), (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), (v2_transforms.ElasticTransform, ArgsKwargs(alpha=[15.3, 27.2], sigma=[2.5, 3.9], size=[17, 31])), (v2_transforms.GaussianBlur, ArgsKwargs(0.3, 1.4)), @@ -1090,7 +1097,7 @@ def make_label(extra_dims, categories): pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1100,7 +1107,7 @@ def make_label(extra_dims, categories): tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: @@ -1110,7 +1117,7 @@ def make_label(extra_dims, categories): datapoint_image = make_image(size=size, color_space="RGB") target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: diff --git 
a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 2130a8cf50a..43dc9e466ec 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -310,34 +310,24 @@ def wrapper(input, *args, **kwargs): def make_input(input_type, *, dtype=None, device="cpu", spatial_size=(17, 11), mask_type="segmentation", **kwargs): if input_type in {torch.Tensor, PIL.Image.Image, datapoints.Image}: - input = make_image(size=spatial_size, dtype=dtype or torch.uint8, device=device, **kwargs) + input = make_image(size=spatial_size, dtype=dtype, device=device, **kwargs) if input_type is torch.Tensor: input = input.as_subclass(torch.Tensor) elif input_type is PIL.Image.Image: input = F.to_image_pil(input) elif input_type is datapoints.BoundingBox: - kwargs.setdefault("format", datapoints.BoundingBoxFormat.XYXY) - input = make_bounding_box( - dtype=dtype or torch.float32, - device=device, - spatial_size=spatial_size, - **kwargs, - ) + input = make_bounding_box(dtype=dtype, device=device, spatial_size=spatial_size, **kwargs) elif input_type is datapoints.Mask: - if mask_type == "segmentation": - make_mask = make_segmentation_mask - default_dtype = torch.uint8 - elif mask_type == "detection": - make_mask = make_detection_mask - default_dtype = torch.bool - else: - raise ValueError(f"`mask_type` can be `'segmentation'` or `'detection'`, but got {mask_type}.") - input = make_mask(size=spatial_size, dtype=dtype or default_dtype, device=device, **kwargs) + make_mask = { + "segmentation": make_segmentation_mask, + "detection": make_detection_mask, + }[mask_type] + input = make_mask(size=spatial_size, dtype=dtype, device=device, **kwargs) elif input_type is datapoints.Video: - input = make_video(size=spatial_size, dtype=dtype or torch.uint8, device=device, **kwargs) + input = make_video(size=spatial_size, dtype=dtype, device=device, **kwargs) else: raise TypeError( - f"Input can either be a plain tensor, any TorchVision datapoint, or a PIL image, " + f"Input type can either be torch.Tensor, PIL.Image.Image, or any TorchVision datapoint class, " f"but got {input_type} instead." 
) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 198ab39a475..98271b893d6 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,14 +4,14 @@ import torch import torchvision.transforms.v2.utils -from common_utils import make_bounding_box, make_detection_mask, make_image +from common_utils import DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_image_pil from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(color_space="RGB") +IMAGE = make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) MASK = make_detection_mask(size=IMAGE.spatial_size) From 5b56570762cf27befd8168077b70b6d19e92ade0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 21:33:42 +0200 Subject: [PATCH 05/17] size -> spatial_size --- test/common_utils.py | 23 ++++++++--------------- test/test_prototype_transforms.py | 16 ++++++++-------- test/test_transforms_v2.py | 18 +++++++++--------- test/test_transforms_v2_consistency.py | 22 +++++++++++----------- test/test_transforms_v2_refactored.py | 10 +++++----- test/test_transforms_v2_utils.py | 4 ++-- 6 files changed, 43 insertions(+), 50 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 05d8325db3b..feae7717ecb 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -493,15 +493,9 @@ def get_num_channels(color_space): def make_image( - size, - *, - color_space="RGB", - batch_dims=(), - dtype=None, - device="cpu", - memory_format=torch.contiguous_format, + spatial_size, *, color_space="RGB", batch_dims=(), dtype=None, device="cpu", memory_format=torch.contiguous_format ): - spatial_size = _parse_spatial_size(size) + spatial_size = _parse_spatial_size(spatial_size) num_channels = get_num_channels(color_space) dtype = dtype or torch.uint8 max_value = get_max_value(dtype) @@ -707,9 +701,9 @@ class MaskLoader(TensorLoader): pass -def make_detection_mask(size, *, num_objects="random", batch_dims=(), dtype=None, device="cpu"): +def make_detection_mask(spatial_size, *, num_objects="random", batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" - spatial_size = _parse_spatial_size(size) + spatial_size = _parse_spatial_size(spatial_size) num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects dtype = dtype or torch.bool @@ -746,9 +740,9 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask(size, *, num_categories="random", batch_dims=(), dtype=None, device="cpu"): +def make_segmentation_mask(spatial_size, *, num_categories="random", batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" - spatial_size = _parse_spatial_size(size) + spatial_size = _parse_spatial_size(spatial_size) num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories dtype = dtype or torch.uint8 @@ -806,10 +800,9 @@ class VideoLoader(ImageLoader): pass -def make_video(size, *, num_frames="random", batch_dims=(), **kwargs): +def make_video(spatial_size, *, num_frames="random", batch_dims=(), **kwargs): num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames - - return datapoints.Video(make_image(size=size, batch_dims=(*batch_dims, num_frames), **kwargs)) + return datapoints.Video(make_image(spatial_size, batch_dims=(*batch_dims, num_frames), **kwargs)) def make_video_loader( diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 255c3b5c32f..52b572c7628 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -215,7 +215,7 @@ def test__get_params(self, mocker): transform = transforms.FixedSizeCrop(size=crop_size) flat_inputs = [ - make_image(size=spatial_size, color_space="RGB"), + make_image(spatial_size=spatial_size, color_space="RGB"), make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -314,7 +314,7 @@ def test__transform_culling(self, mocker): bounding_boxes = make_bounding_box( format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) ) - masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,)) + masks = make_detection_mask(spatial_size=spatial_size) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -494,29 +494,29 @@ def make_datapoints(): size = (600, 800) num_objects = 22 - pil_image = to_image_pil(make_image(size=size, color_space="RGB")) + pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } yield (pil_image, target) - tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) + tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } yield (tensor_image, target) - datapoint_image = make_image(size=size, color_space="RGB") + datapoint_image = make_image(spatial_size=size, color_space="RGB") target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } yield (datapoint_image, target) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 
db225429262..9cc95312a06 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -168,8 +168,8 @@ class TestSmoke: @pytest.mark.parametrize( "image_or_video", [ - make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), - make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_video(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_pil_images(color_spaces=["RGB"])), next(make_vanilla_tensor_images()), ], @@ -179,8 +179,8 @@ def test_common(self, transform, adapter, container_type, image_or_video, device spatial_size = F.get_spatial_size(image_or_video) input = dict( image_or_video=image_or_video, - image_datapoint=make_image(size=spatial_size), - video_datapoint=make_video(size=spatial_size), + image_datapoint=make_image(spatial_size=spatial_size), + video_datapoint=make_video(spatial_size=spatial_size), image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), bounding_box_xyxy=make_bounding_box( format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) @@ -227,8 +227,8 @@ def test_common(self, transform, adapter, container_type, image_or_video, device format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, ), - detection_mask=make_detection_mask(size=spatial_size), - segmentation_mask=make_segmentation_mask(size=spatial_size), + detection_mask=make_detection_mask(spatial_size=spatial_size), + segmentation_mask=make_segmentation_mask(spatial_size=spatial_size), int=0, float=0.0, bool=True, @@ -353,7 +353,7 @@ def test_random_resized_crop(self, transform, input): next(make_vanilla_tensor_images()), next(make_vanilla_tensor_images()), next(make_pil_images()), - make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_videos()), ], 3, @@ -1347,8 +1347,8 @@ class TestToDtype: ) def test_call(self, dtype, expected_dtypes): sample = dict( - video=make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), - image=make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), + video=make_video(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), + image=make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32), str="str", int=0, diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index ccaa973d8d2..cf1b6b6b52c 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -711,11 +711,11 @@ def test_call_consistency(config, args_kwargs): for transform_cls, get_params_args_kwargs in [ ( v2_transforms.RandomResizedCrop, - ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), + ArgsKwargs(make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), ), ( v2_transforms.RandomErasing, - ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), + ArgsKwargs(make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), ), (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), (v2_transforms.ElasticTransform, ArgsKwargs(alpha=[15.3, 27.2], sigma=[2.5, 3.9], size=[17, 31])), @@ -724,7 +724,7 @@ def test_call_consistency(config, args_kwargs): v2_transforms.RandomAffine, ArgsKwargs(degrees=[-20.0, 10.0], translate=None, 
scale_ranges=None, shears=None, img_size=[15, 29]), ), - (v2_transforms.RandomCrop, ArgsKwargs(make_image(size=(61, 47)), output_size=(19, 25))), + (v2_transforms.RandomCrop, ArgsKwargs(make_image(spatial_size=(61, 47)), output_size=(19, 25))), (v2_transforms.RandomPerspective, ArgsKwargs(23, 17, 0.5)), (v2_transforms.RandomRotation, ArgsKwargs(degrees=[-20.0, 10.0])), (v2_transforms.AutoAugment, ArgsKwargs(5)), @@ -1095,33 +1095,33 @@ def make_datapoints(self, with_mask=True): def make_label(extra_dims, categories): return torch.randint(categories, extra_dims, dtype=torch.int64) - pil_image = to_image_pil(make_image(size=size, color_space="RGB")) + pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) yield (pil_image, target) - tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) + tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) yield (tensor_image, target) - datapoint_image = make_image(size=size, color_space="RGB") + datapoint_image = make_image(spatial_size=size, color_space="RGB") target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) yield (datapoint_image, target) @@ -1203,8 +1203,8 @@ def make_datapoints(self, supports_pil=True, image_dtype=torch.uint8): conv_fns.extend([torch.Tensor, lambda x: x]) for conv_fn in conv_fns: - datapoint_image = make_image(size=size, color_space="RGB", dtype=image_dtype) - datapoint_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) + datapoint_image = make_image(spatial_size=size, color_space="RGB", dtype=image_dtype) + datapoint_mask = make_segmentation_mask(spatial_size=size, num_categories=num_categories, dtype=torch.uint8) dp = (conv_fn(datapoint_image), datapoint_mask) dp_ref = ( diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 43dc9e466ec..be768900ece 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -308,23 +308,23 @@ def wrapper(input, *args, **kwargs): return wrapper -def make_input(input_type, *, dtype=None, device="cpu", spatial_size=(17, 11), mask_type="segmentation", **kwargs): +def make_input(input_type, *, spatial_size=(17, 11), mask_type="segmentation", **kwargs): if input_type in {torch.Tensor, PIL.Image.Image, datapoints.Image}: - input = make_image(size=spatial_size, dtype=dtype, device=device, **kwargs) + input = 
make_image(spatial_size=spatial_size, **kwargs) if input_type is torch.Tensor: input = input.as_subclass(torch.Tensor) elif input_type is PIL.Image.Image: input = F.to_image_pil(input) elif input_type is datapoints.BoundingBox: - input = make_bounding_box(dtype=dtype, device=device, spatial_size=spatial_size, **kwargs) + input = make_bounding_box(spatial_size=spatial_size) elif input_type is datapoints.Mask: make_mask = { "segmentation": make_segmentation_mask, "detection": make_detection_mask, }[mask_type] - input = make_mask(size=spatial_size, dtype=dtype, device=device, **kwargs) + input = make_mask(spatial_size, **kwargs) elif input_type is datapoints.Video: - input = make_video(size=spatial_size, dtype=dtype, device=device, **kwargs) + input = make_video(spatial_size, **kwargs) else: raise TypeError( f"Input type can either be torch.Tensor, PIL.Image.Image, or any TorchVision datapoint class, " diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 98271b893d6..ad30c223530 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -11,9 +11,9 @@ from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") +IMAGE = make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) -MASK = make_detection_mask(size=IMAGE.spatial_size) +MASK = make_detection_mask(spatial_size=IMAGE.spatial_size) @pytest.mark.parametrize( From 9b620c619596bb5942575d490a319015fbe3d40e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 21:41:43 +0200 Subject: [PATCH 06/17] fix prototype tests --- test/test_prototype_transforms.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 52b572c7628..80e5162ed30 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -9,6 +9,7 @@ from common_utils import ( assert_equal, DEFAULT_EXTRA_DIMS, + DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image, @@ -79,8 +80,8 @@ def test_mixup_cutmix(transform, input): for unsup_data in [ make_label(), make_bounding_box(format="XYXY"), - make_detection_mask(), - make_segmentation_mask(), + make_detection_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_segmentation_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE), ]: input_copy["unsupported"] = unsup_data with pytest.raises(TypeError, match=err_msg): @@ -216,7 +217,7 @@ def test__get_params(self, mocker): flat_inputs = [ make_image(spatial_size=spatial_size, color_space="RGB"), - make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape), + make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -312,9 +313,9 @@ def test__transform_culling(self, mocker): ) bounding_boxes = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) ) - masks = make_detection_mask(spatial_size=spatial_size) + masks = make_detection_mask(spatial_size=spatial_size, batch_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -350,7 +351,7 @@ def 
test__transform_bounding_box_clamping(self, mocker): ) bounding_box = make_bounding_box( - format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,) + format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) ) mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") @@ -389,9 +390,9 @@ class TestPermuteDimensions: ) def test_call(self, dims, inverse_dims): sample = dict( - image=make_image(), - bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY), - video=make_video(), + image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE), + bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE), str="str", int=0, ) @@ -433,9 +434,9 @@ class TestTransposeDimensions: ) def test_call(self, dims): sample = dict( - image=make_image(), - bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY), - video=make_video(), + image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE), + bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE), str="str", int=0, ) @@ -496,7 +497,7 @@ def make_datapoints(): pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } @@ -505,7 +506,7 @@ def make_datapoints(): tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB")) target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } @@ -514,7 +515,7 @@ def make_datapoints(): datapoint_image = make_image(spatial_size=size, color_space="RGB") target = { - "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float), + "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), } From e14986d8eefbfd1c6796ae5dde4fb9b0f3a676eb Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Mon, 3 Jul 2023 23:33:44 +0200 Subject: [PATCH 07/17] fix reference consistency test --- test/test_transforms_v2_consistency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index cf1b6b6b52c..f65a3407b7c 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -1105,7 +1105,7 @@ def make_label(extra_dims, categories): yield (pil_image, target) - tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB")) + tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB", dtype=torch.float32)) target = { "boxes": 
make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), @@ -1115,7 +1115,7 @@ def make_label(extra_dims, categories): yield (tensor_image, target) - datapoint_image = make_image(spatial_size=size, color_space="RGB") + datapoint_image = make_image(spatial_size=size, color_space="RGB", dtype=torch.float32) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), From 53e13cc39eb9b51cd652940d5f588c7f179ba1e7 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 4 Jul 2023 14:33:35 +0200 Subject: [PATCH 08/17] remove random parameters --- test/common_utils.py | 47 ++++++++------- test/test_transforms_v2.py | 2 +- test/transforms_v2_kernel_infos.py | 93 ++++++++++++++++-------------- 3 files changed, 73 insertions(+), 69 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index feae7717ecb..2656449cf8c 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -399,6 +399,9 @@ def load(self, device="cpu"): ) +# new v2 default +DEFAULT_SPATIAL_SIZE = (17, 11) +# old v2 defaults DEFAULT_SQUARE_SPATIAL_SIZE = 15 DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) @@ -406,13 +409,12 @@ def load(self, device="cpu"): DEFAULT_LANDSCAPE_SPATIAL_SIZE, DEFAULT_PORTRAIT_SPATIAL_SIZE, DEFAULT_SQUARE_SPATIAL_SIZE, - "random", ) def _parse_spatial_size(size, *, name="size"): if size == "random": - return tuple(torch.randint(15, 33, (2,)).tolist()) + raise ValueError("This should never happen") elif isinstance(size, int) and size > 0: return (size, size) elif ( @@ -515,7 +517,7 @@ def make_image( def make_image_loader( - size="random", + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, color_space="RGB", extra_dims=(), @@ -563,7 +565,7 @@ def make_image_loaders( def make_image_loader_for_interpolation( - size="random", *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format + size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format ): size = _parse_spatial_size(size) num_channels = get_num_channels(color_space) @@ -625,7 +627,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): def make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size="random", batch_dims=(), dtype=None, device="cpu" + format=datapoints.BoundingBoxFormat.XYXY, spatial_size=DEFAULT_SPATIAL_SIZE, batch_dims=(), dtype=None, device="cpu" ): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] @@ -665,7 +667,7 @@ def make_bounding_box( ) -def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32): +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] @@ -687,7 +689,7 @@ def make_bounding_box_loaders( *, extra_dims=DEFAULT_EXTRA_DIMS, formats=tuple(datapoints.BoundingBoxFormat), - spatial_size="random", + spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtypes=(torch.float32, torch.float64, torch.int64), ): for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): @@ -701,10 +703,9 @@ class MaskLoader(TensorLoader): pass -def make_detection_mask(spatial_size, *, num_objects="random", batch_dims=(), dtype=None, device="cpu"): +def make_detection_mask(spatial_size, 
*, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" spatial_size = _parse_spatial_size(spatial_size) - num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects dtype = dtype or torch.bool data = torch.testing.make_tensor( @@ -713,10 +714,9 @@ def make_detection_mask(spatial_size, *, num_objects="random", batch_dims=(), dt return datapoints.Mask(data) -def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8): +def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects size = _parse_spatial_size(size) - num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects def fn(shape, dtype, device): *batch_dims, num_objects, height, width = shape @@ -729,7 +729,7 @@ def fn(shape, dtype, device): def make_detection_mask_loaders( sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, "random"), + num_objects=(1, 0, 5), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), ): @@ -740,10 +740,9 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask(spatial_size, *, num_categories="random", batch_dims=(), dtype=None, device="cpu"): +def make_segmentation_mask(spatial_size, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" spatial_size = _parse_spatial_size(spatial_size) - num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories dtype = dtype or torch.uint8 data = torch.testing.make_tensor( @@ -752,7 +751,9 @@ def make_segmentation_mask(spatial_size, *, num_categories="random", batch_dims= return datapoints.Mask(data) -def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8): +def make_segmentation_mask_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 +): # This produces "segmentation" masks, i.e. 
`(*, H, W)`, where the category is encoded in the values spatial_size = _parse_spatial_size(size) @@ -768,7 +769,7 @@ def fn(shape, dtype, device): def make_segmentation_mask_loaders( *, sizes=DEFAULT_SPATIAL_SIZES, - num_categories=(1, 2, "random"), + num_categories=(1, 2, 10), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), ): @@ -782,8 +783,8 @@ def make_segmentation_mask_loaders( def make_mask_loaders( *, sizes=DEFAULT_SPATIAL_SIZES, - num_objects=(1, 0, "random"), - num_categories=(1, 2, "random"), + num_objects=(1, 0, 5), + num_categories=(1, 2, 10), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8,), ): @@ -800,21 +801,19 @@ class VideoLoader(ImageLoader): pass -def make_video(spatial_size, *, num_frames="random", batch_dims=(), **kwargs): - num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames +def make_video(spatial_size, *, num_frames=3, batch_dims=(), **kwargs): return datapoints.Video(make_image(spatial_size, batch_dims=(*batch_dims, num_frames), **kwargs)) def make_video_loader( - size="random", + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, color_space="RGB", - num_frames="random", + num_frames=3, extra_dims=(), dtype=torch.uint8, ): size = _parse_spatial_size(size) - num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames def fn(shape, dtype, device, memory_format): *batch_dims, num_frames, _, height, width = shape @@ -838,7 +837,7 @@ def make_video_loaders( "GRAY", "RGB", ), - num_frames=(1, 0, "random"), + num_frames=(1, 0, 3), extra_dims=DEFAULT_EXTRA_DIMS, dtypes=(torch.uint8, torch.float32, torch.float64), ): diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 9cc95312a06..1c6ee23c95f 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -290,7 +290,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device ], dtypes=[torch.uint8], extra_dims=[(), (4,)], - **(dict(num_frames=["random"]) if fn is make_videos else dict()), + **(dict(num_frames=[3]) if fn is make_videos else dict()), ) for fn in [ make_images, diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index cae8d3157e9..d9533cbb725 100644 --- a/test/transforms_v2_kernel_infos.py +++ b/test/transforms_v2_kernel_infos.py @@ -11,6 +11,7 @@ from common_utils import ( ArgsKwargs, combinations_grid, + DEFAULT_PORTRAIT_SPATIAL_SIZE, get_num_channels, ImageLoader, InfoBase, @@ -296,7 +297,7 @@ def sample_inputs_crop_bounding_box(): def sample_inputs_crop_mask(): - for mask_loader in make_mask_loaders(sizes=[(16, 17)], num_categories=["random"], num_objects=["random"]): + for mask_loader in make_mask_loaders(sizes=[(16, 17)], num_categories=[10], num_objects=[5]): yield ArgsKwargs(mask_loader, top=4, left=3, height=7, width=8) @@ -306,7 +307,7 @@ def reference_inputs_crop_mask(): def sample_inputs_crop_video(): - for video_loader in make_video_loaders(sizes=[(16, 17)], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[(16, 17)], num_frames=[3]): yield ArgsKwargs(video_loader, top=4, left=3, height=7, width=8) @@ -415,7 +416,7 @@ def sample_inputs_resized_crop_mask(): def sample_inputs_resized_crop_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0]) @@ -457,7 +458,7 @@ def sample_inputs_resized_crop_video(): def sample_inputs_pad_image_tensor(): 
make_pad_image_loaders = functools.partial( - make_image_loaders, sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32] + make_image_loaders, sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32] ) for image_loader, padding in itertools.product( @@ -512,7 +513,7 @@ def sample_inputs_pad_bounding_box(): def sample_inputs_pad_mask(): - for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): yield ArgsKwargs(mask_loader, padding=[1]) @@ -524,7 +525,7 @@ def reference_inputs_pad_mask(): def sample_inputs_pad_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, padding=[1]) @@ -620,7 +621,7 @@ def pad_xfail_jit_fill_condition(args_kwargs): def sample_inputs_perspective_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs( image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0] @@ -672,7 +673,7 @@ def sample_inputs_perspective_bounding_box(): def sample_inputs_perspective_mask(): - for mask_loader in make_mask_loaders(sizes=["random"]): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) yield ArgsKwargs(make_detection_mask_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) @@ -686,7 +687,7 @@ def reference_inputs_perspective_mask(): def sample_inputs_perspective_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) yield ArgsKwargs(make_video_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) @@ -745,7 +746,7 @@ def _get_elastic_displacement(spatial_size): def sample_inputs_elastic_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): displacement = _get_elastic_displacement(image_loader.spatial_size) for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) @@ -777,13 +778,13 @@ def sample_inputs_elastic_bounding_box(): def sample_inputs_elastic_mask(): - for mask_loader in make_mask_loaders(sizes=["random"]): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): displacement = _get_elastic_displacement(mask_loader.shape[-2:]) yield ArgsKwargs(mask_loader, displacement=displacement) def sample_inputs_elastic_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): displacement = _get_elastic_displacement(video_loader.shape[-2:]) yield ArgsKwargs(video_loader, displacement=displacement) @@ -854,7 +855,7 @@ def sample_inputs_center_crop_bounding_box(): def 
sample_inputs_center_crop_mask(): - for mask_loader in make_mask_loaders(sizes=["random"], num_categories=["random"], num_objects=["random"]): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): height, width = mask_loader.shape[-2:] yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2)) @@ -867,7 +868,7 @@ def reference_inputs_center_crop_mask(): def sample_inputs_center_crop_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): height, width = video_loader.shape[-2:] yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2)) @@ -947,7 +948,7 @@ def sample_inputs_gaussian_blur_video(): def sample_inputs_equalize_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader) @@ -1008,7 +1009,7 @@ def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_for def sample_inputs_equalize_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader) @@ -1031,7 +1032,7 @@ def sample_inputs_equalize_video(): def sample_inputs_invert_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader) @@ -1041,7 +1042,7 @@ def reference_inputs_invert_image_tensor(): def sample_inputs_invert_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader) @@ -1067,7 +1068,7 @@ def sample_inputs_invert_video(): def sample_inputs_posterize_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0]) @@ -1080,7 +1081,7 @@ def reference_inputs_posterize_image_tensor(): def sample_inputs_posterize_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) @@ -1110,7 +1111,7 @@ def _get_solarize_thresholds(dtype): def sample_inputs_solarize_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype))) @@ -1125,7 +1126,7 @@ def uint8_to_float32_threshold_adapter(other_args, kwargs): def sample_inputs_solarize_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, 
threshold=next(_get_solarize_thresholds(video_loader.dtype))) @@ -1149,7 +1150,7 @@ def sample_inputs_solarize_video(): def sample_inputs_autocontrast_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader) @@ -1159,7 +1160,7 @@ def reference_inputs_autocontrast_image_tensor(): def sample_inputs_autocontrast_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader) @@ -1189,7 +1190,7 @@ def sample_inputs_autocontrast_video(): def sample_inputs_adjust_sharpness_image_tensor(): for image_loader in make_image_loaders( - sizes=["random", (2, 2)], + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE, (2, 2)], color_spaces=("GRAY", "RGB"), ): yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) @@ -1204,7 +1205,7 @@ def reference_inputs_adjust_sharpness_image_tensor(): def sample_inputs_adjust_sharpness_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) @@ -1228,7 +1229,7 @@ def sample_inputs_adjust_sharpness_video(): def sample_inputs_erase_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"]): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): # FIXME: make the parameters more diverse h, w = 6, 7 v = torch.rand(image_loader.num_channels, h, w) @@ -1236,7 +1237,7 @@ def sample_inputs_erase_image_tensor(): def sample_inputs_erase_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): # FIXME: make the parameters more diverse h, w = 6, 7 v = torch.rand(video_loader.num_channels, h, w) @@ -1261,7 +1262,7 @@ def sample_inputs_erase_video(): def sample_inputs_adjust_brightness_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0]) @@ -1274,7 +1275,7 @@ def reference_inputs_adjust_brightness_image_tensor(): def sample_inputs_adjust_brightness_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, brightness_factor=_ADJUST_BRIGHTNESS_FACTORS[0]) @@ -1301,7 +1302,7 @@ def sample_inputs_adjust_brightness_video(): def sample_inputs_adjust_contrast_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) @@ -1314,7 +1315,7 @@ def reference_inputs_adjust_contrast_image_tensor(): def sample_inputs_adjust_contrast_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in 
make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) @@ -1353,7 +1354,7 @@ def sample_inputs_adjust_contrast_video(): def sample_inputs_adjust_gamma_image_tensor(): gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) @@ -1367,7 +1368,7 @@ def reference_inputs_adjust_gamma_image_tensor(): def sample_inputs_adjust_gamma_video(): gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) @@ -1397,7 +1398,7 @@ def sample_inputs_adjust_gamma_video(): def sample_inputs_adjust_hue_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) @@ -1410,7 +1411,7 @@ def reference_inputs_adjust_hue_image_tensor(): def sample_inputs_adjust_hue_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) @@ -1439,7 +1440,7 @@ def sample_inputs_adjust_hue_video(): def sample_inputs_adjust_saturation_image_tensor(): - for image_loader in make_image_loaders(sizes=["random"], color_spaces=("GRAY", "RGB")): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) @@ -1452,7 +1453,7 @@ def reference_inputs_adjust_saturation_image_tensor(): def sample_inputs_adjust_saturation_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) @@ -1612,7 +1613,7 @@ def wrapper(input_tensor, *other_args, **kwargs): def sample_inputs_normalize_image_tensor(): for image_loader, (mean, std) in itertools.product( - make_image_loaders(sizes=["random"], color_spaces=["RGB"], dtypes=[torch.float32]), + make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32]), _NORMALIZE_MEANS_STDS, ): yield ArgsKwargs(image_loader, mean=mean, std=std) @@ -1637,7 +1638,7 @@ def reference_inputs_normalize_image_tensor(): def sample_inputs_normalize_video(): mean, std = _NORMALIZE_MEANS_STDS[0] for video_loader in make_video_loaders( - sizes=["random"], color_spaces=["RGB"], num_frames=["random"], dtypes=[torch.float32] + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[3], dtypes=[torch.float32] ): yield ArgsKwargs(video_loader, mean=mean, std=std) @@ -1671,7 +1672,9 @@ def sample_inputs_convert_dtype_image_tensor(): # conversion cannot be performed safely continue - for image_loader in make_image_loaders(sizes=["random"], color_spaces=["RGB"], dtypes=[input_dtype]): + for 
image_loader in make_image_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[input_dtype] + ): yield ArgsKwargs(image_loader, dtype=output_dtype) @@ -1736,7 +1739,7 @@ def reference_inputs_convert_dtype_image_tensor(): def sample_inputs_convert_dtype_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): yield ArgsKwargs(video_loader) @@ -1781,7 +1784,7 @@ def sample_inputs_convert_dtype_video(): def sample_inputs_uniform_temporal_subsample_video(): - for video_loader in make_video_loaders(sizes=["random"], num_frames=[4]): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[4]): yield ArgsKwargs(video_loader, num_samples=2) @@ -1797,7 +1800,9 @@ def reference_uniform_temporal_subsample_video(x, num_samples): def reference_inputs_uniform_temporal_subsample_video(): - for video_loader in make_video_loaders(sizes=["random"], color_spaces=["RGB"], num_frames=[10]): + for video_loader in make_video_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[10] + ): for num_samples in range(1, video_loader.shape[-4] + 1): yield ArgsKwargs(video_loader, num_samples) From cdb7e17351e18f19caabfa06dc0667da5304712f Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 4 Jul 2023 14:38:10 +0200 Subject: [PATCH 09/17] move default spatial size into low level functions --- test/common_utils.py | 60 ++++++++++++++++----------- test/test_transforms_v2_refactored.py | 10 ++--- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 2656449cf8c..837f7e1f5b9 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -495,18 +495,20 @@ def get_num_channels(color_space): def make_image( - spatial_size, *, color_space="RGB", batch_dims=(), dtype=None, device="cpu", memory_format=torch.contiguous_format + spatial_size=DEFAULT_SPATIAL_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, ): - spatial_size = _parse_spatial_size(spatial_size) - num_channels = get_num_channels(color_space) - dtype = dtype or torch.uint8 max_value = get_max_value(dtype) - data = torch.testing.make_tensor( - (*batch_dims, num_channels, *spatial_size), + (*batch_dims, get_num_channels(color_space), *spatial_size), low=0, high=max_value, - dtype=dtype, + dtype=dtype or torch.uint8, device=device, memory_format=memory_format, ) @@ -627,12 +629,16 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): def make_bounding_box( - format=datapoints.BoundingBoxFormat.XYXY, spatial_size=DEFAULT_SPATIAL_SIZE, batch_dims=(), dtype=None, device="cpu" + spatial_size=DEFAULT_SPATIAL_SIZE, + *, + format=datapoints.BoundingBoxFormat.XYXY, + batch_dims=(), + dtype=None, + device="cpu", ): if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] - spatial_size = _parse_spatial_size(spatial_size, name="spatial_size") dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): @@ -703,15 +709,17 @@ class MaskLoader(TensorLoader): pass -def make_detection_mask(spatial_size, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): +def make_detection_mask(spatial_size=DEFAULT_SPATIAL_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. 
(*, N, H, W), where each object is encoded as one of N boolean masks""" - spatial_size = _parse_spatial_size(spatial_size) - dtype = dtype or torch.bool - - data = torch.testing.make_tensor( - (*batch_dims, num_objects, *spatial_size), low=0, high=2, dtype=dtype, device=device + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, num_objects, *spatial_size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) ) - return datapoints.Mask(data) def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): @@ -740,15 +748,19 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask(spatial_size, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): +def make_segmentation_mask( + spatial_size=DEFAULT_SPATIAL_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu" +): """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" - spatial_size = _parse_spatial_size(spatial_size) - dtype = dtype or torch.uint8 - - data = torch.testing.make_tensor( - (*batch_dims, *spatial_size), low=0, high=num_categories, dtype=dtype, device=device + return datapoints.Mask( + torch.testing.make_tensor( + (*batch_dims, *spatial_size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) ) - return datapoints.Mask(data) def make_segmentation_mask_loader( @@ -801,7 +813,7 @@ class VideoLoader(ImageLoader): pass -def make_video(spatial_size, *, num_frames=3, batch_dims=(), **kwargs): +def make_video(spatial_size=DEFAULT_SPATIAL_SIZE, *, num_frames=3, batch_dims=(), **kwargs): return datapoints.Video(make_image(spatial_size, batch_dims=(*batch_dims, num_frames), **kwargs)) diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index be768900ece..57fce82f254 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -308,23 +308,23 @@ def wrapper(input, *args, **kwargs): return wrapper -def make_input(input_type, *, spatial_size=(17, 11), mask_type="segmentation", **kwargs): +def make_input(input_type, *, mask_type="segmentation", **kwargs): if input_type in {torch.Tensor, PIL.Image.Image, datapoints.Image}: - input = make_image(spatial_size=spatial_size, **kwargs) + input = make_image(**kwargs) if input_type is torch.Tensor: input = input.as_subclass(torch.Tensor) elif input_type is PIL.Image.Image: input = F.to_image_pil(input) elif input_type is datapoints.BoundingBox: - input = make_bounding_box(spatial_size=spatial_size) + input = make_bounding_box() elif input_type is datapoints.Mask: make_mask = { "segmentation": make_segmentation_mask, "detection": make_detection_mask, }[mask_type] - input = make_mask(spatial_size, **kwargs) + input = make_mask(**kwargs) elif input_type is datapoints.Video: - input = make_video(spatial_size, **kwargs) + input = make_video(**kwargs) else: raise TypeError( f"Input type can either be torch.Tensor, PIL.Image.Image, or any TorchVision datapoint class, " From 9b3073196b5b10b36e985559c9a4c6e371eb03ea Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 11:47:43 +0200 Subject: [PATCH 10/17] add size parameter to make_bounding_box --- test/common_utils.py | 66 ++++++++++++++++-------------- test/transforms_v2_kernel_infos.py | 3 ++ 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 
837f7e1f5b9..ca4dc78811b 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -616,29 +616,32 @@ class BoundingBoxLoader(TensorLoader): spatial_size: Tuple[int, int] -def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): - low, high = torch.broadcast_tensors( - *[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))] - ) - return torch.stack( - [ - torch.randint(low_scalar, high_scalar, (), **kwargs) - for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist()) - ] - ).reshape(low.shape) - - def make_bounding_box( - spatial_size=DEFAULT_SPATIAL_SIZE, + size=None, *, format=datapoints.BoundingBoxFormat.XYXY, + spatial_size=None, batch_dims=(), dtype=None, device="cpu", ): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. + return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) + if isinstance(format, str): format = datapoints.BoundingBoxFormat[format] + if spatial_size is None: + if size is None: + spatial_size = DEFAULT_SPATIAL_SIZE + else: + height, width = size + height_margin, width_margin = torch.randint(10, (2,)).tolist() + spatial_size = (height + height_margin, width + width_margin) + spatial_height, spatial_width = spatial_size + dtype = dtype or torch.float32 if any(dim == 0 for dim in batch_dims): @@ -646,27 +649,28 @@ def make_bounding_box( torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size ) - height, width = spatial_size - if format == datapoints.BoundingBoxFormat.XYXY: - x1 = torch.randint(0, width // 2, batch_dims) - y1 = torch.randint(0, height // 2, batch_dims) - x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 - y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 - parts = (x1, y1, x2, y2) - elif format == datapoints.BoundingBoxFormat.XYWH: - x = torch.randint(0, width // 2, batch_dims) - y = torch.randint(0, height // 2, batch_dims) - w = randint_with_tensor_bounds(1, width - x) - h = randint_with_tensor_bounds(1, height - y) + if size is None: + h = torch.randint(1, spatial_height - 1, batch_dims) + w = torch.randint(1, spatial_width - 1, batch_dims) + else: + h, w = [torch.full(batch_dims, v, dtype=torch.int) for v in size] + + y = sample_position(h, spatial_height) + x = sample_position(w, spatial_width) + + if format is datapoints.BoundingBoxFormat.XYWH: parts = (x, y, w, h) - elif format == datapoints.BoundingBoxFormat.CXCYWH: - cx = torch.randint(1, width - 1, batch_dims) - cy = torch.randint(1, height - 1, batch_dims) - w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) - h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) + elif format is datapoints.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is datapoints.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 parts = (cx, cy, w, h) else: - raise ValueError(f"Can't make bounding box in format {format}") + raise ValueError(f"Format {format} is not supported") return datapoints.BoundingBox( torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py index d9533cbb725..dc04fbfc7a9 100644 --- a/test/transforms_v2_kernel_infos.py +++ 
b/test/transforms_v2_kernel_infos.py @@ -261,6 +261,9 @@ def reference_inputs_convert_format_bounding_box(): reference_fn=reference_convert_format_bounding_box, reference_inputs_fn=reference_inputs_convert_format_bounding_box, logs_usage=True, + closeness_kwargs={ + (("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0), + }, ), ) From 38b589e7222ca6d0f39ceb84107c097aa7e9f684 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 13:09:44 +0200 Subject: [PATCH 11/17] spatial_size -> size --- test/common_utils.py | 24 ++++++++---------- test/test_prototype_transforms.py | 16 ++++++------ test/test_transforms_v2.py | 18 ++++++------- test/test_transforms_v2_consistency.py | 22 ++++++++-------- test/test_transforms_v2_refactored.py | 35 +++++++++++++------------- test/test_transforms_v2_utils.py | 4 +-- 6 files changed, 59 insertions(+), 60 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index ca4dc78811b..6c813f47c03 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -400,7 +400,7 @@ def load(self, device="cpu"): # new v2 default -DEFAULT_SPATIAL_SIZE = (17, 11) +DEFAULT_SIZE = (17, 11) # old v2 defaults DEFAULT_SQUARE_SPATIAL_SIZE = 15 DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) @@ -495,7 +495,7 @@ def get_num_channels(color_space): def make_image( - spatial_size=DEFAULT_SPATIAL_SIZE, + size=DEFAULT_SIZE, *, color_space="RGB", batch_dims=(), @@ -505,7 +505,7 @@ def make_image( ): max_value = get_max_value(dtype) data = torch.testing.make_tensor( - (*batch_dims, get_num_channels(color_space), *spatial_size), + (*batch_dims, get_num_channels(color_space), *size), low=0, high=max_value, dtype=dtype or torch.uint8, @@ -635,7 +635,7 @@ def sample_position(values, max_value): if spatial_size is None: if size is None: - spatial_size = DEFAULT_SPATIAL_SIZE + spatial_size = DEFAULT_SIZE else: height, width = size height_margin, width_margin = torch.randint(10, (2,)).tolist() @@ -713,11 +713,11 @@ class MaskLoader(TensorLoader): pass -def make_detection_mask(spatial_size=DEFAULT_SPATIAL_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): +def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" return datapoints.Mask( torch.testing.make_tensor( - (*batch_dims, num_objects, *spatial_size), + (*batch_dims, num_objects, *size), low=0, high=2, dtype=dtype or torch.bool, @@ -752,13 +752,11 @@ def make_detection_mask_loaders( make_detection_masks = from_loaders(make_detection_mask_loaders) -def make_segmentation_mask( - spatial_size=DEFAULT_SPATIAL_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu" -): +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" return datapoints.Mask( torch.testing.make_tensor( - (*batch_dims, *spatial_size), + (*batch_dims, *size), low=0, high=num_categories, dtype=dtype or torch.uint8, @@ -817,8 +815,8 @@ class VideoLoader(ImageLoader): pass -def make_video(spatial_size=DEFAULT_SPATIAL_SIZE, *, num_frames=3, batch_dims=(), **kwargs): - return datapoints.Video(make_image(spatial_size, batch_dims=(*batch_dims, num_frames), **kwargs)) +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return datapoints.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) def make_video_loader( @@ -836,8 +834,8 @@ def fn(shape, dtype, device, memory_format): return make_video( (height, width), num_frames=num_frames, - color_space=color_space, batch_dims=batch_dims, + color_space=color_space, dtype=dtype, device=device, memory_format=memory_format, diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 80e5162ed30..cfbcc7c0557 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -216,7 +216,7 @@ def test__get_params(self, mocker): transform = transforms.FixedSizeCrop(size=crop_size) flat_inputs = [ - make_image(spatial_size=spatial_size, color_space="RGB"), + make_image(size=spatial_size, color_space="RGB"), make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape), ] params = transform._get_params(flat_inputs) @@ -315,7 +315,7 @@ def test__transform_culling(self, mocker): bounding_boxes = make_bounding_box( format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,) ) - masks = make_detection_mask(spatial_size=spatial_size, batch_dims=(batch_size,)) + masks = make_detection_mask(size=spatial_size, batch_dims=(batch_size,)) labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) @@ -495,29 +495,29 @@ def make_datapoints(): size = (600, 800) num_objects = 22 - pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB")) + pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } yield (pil_image, target) - tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB")) + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), + "masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } yield (tensor_image, target) - datapoint_image = make_image(spatial_size=size, color_space="RGB") + datapoint_image = make_image(size=size, color_space="RGB") target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), - "masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long), + "masks": 
make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long), } yield (datapoint_image, target) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 1c6ee23c95f..0f6c4cc3b2e 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -168,8 +168,8 @@ class TestSmoke: @pytest.mark.parametrize( "image_or_video", [ - make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), - make_video(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_pil_images(color_spaces=["RGB"])), next(make_vanilla_tensor_images()), ], @@ -179,8 +179,8 @@ def test_common(self, transform, adapter, container_type, image_or_video, device spatial_size = F.get_spatial_size(image_or_video) input = dict( image_or_video=image_or_video, - image_datapoint=make_image(spatial_size=spatial_size), - video_datapoint=make_video(spatial_size=spatial_size), + image_datapoint=make_image(size=spatial_size), + video_datapoint=make_video(size=spatial_size), image_pil=next(make_pil_images(sizes=[spatial_size], color_spaces=["RGB"])), bounding_box_xyxy=make_bounding_box( format=datapoints.BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(3,) @@ -227,8 +227,8 @@ def test_common(self, transform, adapter, container_type, image_or_video, device format=datapoints.BoundingBoxFormat.CXCYWH, spatial_size=spatial_size, ), - detection_mask=make_detection_mask(spatial_size=spatial_size), - segmentation_mask=make_segmentation_mask(spatial_size=spatial_size), + detection_mask=make_detection_mask(size=spatial_size), + segmentation_mask=make_segmentation_mask(size=spatial_size), int=0, float=0.0, bool=True, @@ -353,7 +353,7 @@ def test_random_resized_crop(self, transform, input): next(make_vanilla_tensor_images()), next(make_vanilla_tensor_images()), next(make_pil_images()), - make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), next(make_videos()), ], 3, @@ -1347,8 +1347,8 @@ class TestToDtype: ) def test_call(self, dtype, expected_dtypes): sample = dict( - video=make_video(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), - image=make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), + video=make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), + image=make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32), str="str", int=0, diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index f65a3407b7c..3f631d7ac94 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -711,11 +711,11 @@ def test_call_consistency(config, args_kwargs): for transform_cls, get_params_args_kwargs in [ ( v2_transforms.RandomResizedCrop, - ArgsKwargs(make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), + ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), ), ( v2_transforms.RandomErasing, - ArgsKwargs(make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), + ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), ), (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), (v2_transforms.ElasticTransform, ArgsKwargs(alpha=[15.3, 27.2], 
sigma=[2.5, 3.9], size=[17, 31])), @@ -724,7 +724,7 @@ def test_call_consistency(config, args_kwargs): v2_transforms.RandomAffine, ArgsKwargs(degrees=[-20.0, 10.0], translate=None, scale_ranges=None, shears=None, img_size=[15, 29]), ), - (v2_transforms.RandomCrop, ArgsKwargs(make_image(spatial_size=(61, 47)), output_size=(19, 25))), + (v2_transforms.RandomCrop, ArgsKwargs(make_image(size=(61, 47)), output_size=(19, 25))), (v2_transforms.RandomPerspective, ArgsKwargs(23, 17, 0.5)), (v2_transforms.RandomRotation, ArgsKwargs(degrees=[-20.0, 10.0])), (v2_transforms.AutoAugment, ArgsKwargs(5)), @@ -1095,33 +1095,33 @@ def make_datapoints(self, with_mask=True): def make_label(extra_dims, categories): return torch.randint(categories, extra_dims, dtype=torch.int64) - pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB")) + pil_image = to_image_pil(make_image(size=size, color_space="RGB")) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (pil_image, target) - tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB", dtype=torch.float32)) + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (tensor_image, target) - datapoint_image = make_image(spatial_size=size, color_space="RGB", dtype=torch.float32) + datapoint_image = make_image(size=size, color_space="RGB", dtype=torch.float32) target = { "boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), "labels": make_label(extra_dims=(num_objects,), categories=80), } if with_mask: - target["masks"] = make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long) + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) yield (datapoint_image, target) @@ -1203,8 +1203,8 @@ def make_datapoints(self, supports_pil=True, image_dtype=torch.uint8): conv_fns.extend([torch.Tensor, lambda x: x]) for conv_fn in conv_fns: - datapoint_image = make_image(spatial_size=size, color_space="RGB", dtype=image_dtype) - datapoint_mask = make_segmentation_mask(spatial_size=size, num_categories=num_categories, dtype=torch.uint8) + datapoint_image = make_image(size=size, color_space="RGB", dtype=image_dtype) + datapoint_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) dp = (conv_fn(datapoint_image), datapoint_mask) dp_ref = ( diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 57fce82f254..e44cd5e9264 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -316,7 +316,7 @@ def make_input(input_type, *, mask_type="segmentation", **kwargs): elif input_type is PIL.Image.Image: input = F.to_image_pil(input) elif input_type is 
datapoints.BoundingBox: - input = make_bounding_box() + input = make_bounding_box(**kwargs) elif input_type is datapoints.Mask: make_mask = { "segmentation": make_segmentation_mask, @@ -506,7 +506,7 @@ def test_kernel_image_tensor(self, size, interpolation, use_max_size, antialias, check_kernel( F.resize_image_tensor, - make_input(datapoints.Image, dtype=dtype, device=device, spatial_size=self.INPUT_SIZE), + make_input(datapoints.Image, dtype=dtype, device=device, size=self.INPUT_SIZE), size=size, interpolation=interpolation, **max_size_kwarg, @@ -540,14 +540,14 @@ def test_kernel_bounding_box(self, format, size, use_max_size, dtype, device): def test_kernel_mask(self, mask_type): check_kernel( F.resize_mask, - make_input(datapoints.Mask, spatial_size=self.INPUT_SIZE, mask_type=mask_type), + make_input(datapoints.Mask, size=self.INPUT_SIZE, mask_type=mask_type), size=self.OUTPUT_SIZES[-1], ) def test_kernel_video(self): check_kernel( F.resize_video, - make_input(datapoints.Video, spatial_size=self.INPUT_SIZE), + make_input(datapoints.Video, size=self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True, ) @@ -568,7 +568,7 @@ def test_dispatcher(self, size, input_type, kernel): check_dispatcher( F.resize, kernel, - make_input(input_type, spatial_size=self.INPUT_SIZE), + make_input(input_type, size=self.INPUT_SIZE), size=size, antialias=True, check_scripted_smoke=not isinstance(size, int), @@ -595,7 +595,7 @@ def test_dispatcher_signature(self, kernel, input_type): [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], ) def test_transform(self, size, device, input_type): - input = make_input(input_type, device=device, spatial_size=self.INPUT_SIZE) + input = make_input(input_type, device=device, size=self.INPUT_SIZE) check_transform( transforms.Resize, @@ -619,7 +619,7 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu", spatial_size=self.INPUT_SIZE) + image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu", size=self.INPUT_SIZE) actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) expected = F.to_image_tensor( @@ -662,7 +662,7 @@ def test_bounding_box_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_box = make_input(datapoints.BoundingBox, spatial_size=self.INPUT_SIZE) + bounding_box = make_input(datapoints.BoundingBox, size=self.INPUT_SIZE) actual = fn(bounding_box, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_box(bounding_box, size=size, **max_size_kwarg) @@ -676,7 +676,7 @@ def test_bounding_box_correctness(self, format, size, use_max_size, fn): [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.Video], ) def test_pil_interpolation_compat_smoke(self, interpolation, input_type): - input = make_input(input_type, spatial_size=self.INPUT_SIZE) + input = make_input(input_type, size=self.INPUT_SIZE) with ( contextlib.nullcontext() @@ -692,9 +692,7 @@ def test_pil_interpolation_compat_smoke(self, interpolation, input_type): def test_dispatcher_pil_antialias_warning(self): with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): - F.resize( - make_input(PIL.Image.Image, spatial_size=self.INPUT_SIZE), 
size=self.OUTPUT_SIZES[0], antialias=False - ) + F.resize(make_input(PIL.Image.Image, size=self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( @@ -711,7 +709,7 @@ def test_max_size_error(self, size, input_type): match = "size should be an int or a sequence of length 1" with pytest.raises(ValueError, match=match): - F.resize(make_input(input_type, spatial_size=self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) + F.resize(make_input(input_type, size=self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) @pytest.mark.parametrize( @@ -725,7 +723,7 @@ def test_antialias_warning(self, interpolation, input_type): else assert_no_warnings() ): F.resize( - make_input(input_type, spatial_size=self.INPUT_SIZE), + make_input(input_type, size=self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], interpolation=interpolation, ) @@ -742,7 +740,7 @@ def test_interpolation_int(self, interpolation, input_type): if issubclass(input_type, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: return - input = make_input(input_type, spatial_size=self.INPUT_SIZE) + input = make_input(input_type, size=self.INPUT_SIZE) expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) actual = F.resize( @@ -763,7 +761,10 @@ def test_transform_unknown_size_error(self): [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], ) def test_noop(self, size, input_type): - input = make_input(input_type, spatial_size=self.INPUT_SIZE) + input = make_input( + input_type, + **{"spatial_size" if issubclass(input_type, datapoints.BoundingBox) else "size": self.INPUT_SIZE}, + ) output = F.resize(input, size=size, antialias=True) @@ -785,7 +786,7 @@ def test_no_regression_5405(self, input_type): # Checks that `max_size` is not ignored if `size == small_edge_size` # See https://github.com/pytorch/vision/issues/5405 - input = make_input(input_type, spatial_size=self.INPUT_SIZE) + input = make_input(input_type, size=self.INPUT_SIZE) size = min(F.get_spatial_size(input)) max_size = size + 1 diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index ad30c223530..98271b893d6 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -11,9 +11,9 @@ from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") +IMAGE = make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) -MASK = make_detection_mask(spatial_size=IMAGE.spatial_size) +MASK = make_detection_mask(size=IMAGE.spatial_size) @pytest.mark.parametrize( From d07343d43594250b23045015ddba54ac4f559866 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 13:42:27 +0200 Subject: [PATCH 12/17] [PoC] remove make_input uses from TestResize --- test/common_utils.py | 10 +- test/test_transforms_v2_refactored.py | 166 ++++++++++++++------------ 2 files changed, 101 insertions(+), 75 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 6c813f47c03..009d9dfb21a 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -27,7 +27,7 @@ from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, 
TensorLikePair from torchvision import datapoints, io from torchvision.transforms._functional_tensor import _max_value as get_max_value -from torchvision.transforms.v2.functional import convert_dtype_image_tensor, to_image_tensor +from torchvision.transforms.v2.functional import convert_dtype_image_tensor, to_image_pil, to_image_tensor IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) @@ -518,6 +518,14 @@ def make_image( return datapoints.Image(data) +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_image_pil(make_image(*args, **kwargs)) + + def make_image_loader( size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index e44cd5e9264..ad2ddd6e051 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -20,6 +20,8 @@ make_bounding_box, make_detection_mask, make_image, + make_image_pil, + make_image_tensor, make_segmentation_mask, make_video, set_rng_seed, @@ -506,7 +508,7 @@ def test_kernel_image_tensor(self, size, interpolation, use_max_size, antialias, check_kernel( F.resize_image_tensor, - make_input(datapoints.Image, dtype=dtype, device=device, size=self.INPUT_SIZE), + make_image(self.INPUT_SIZE, dtype=dtype, device=device), size=size, interpolation=interpolation, **max_size_kwarg, @@ -524,8 +526,11 @@ def test_kernel_bounding_box(self, format, size, use_max_size, dtype, device): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_box = make_input( - datapoints.BoundingBox, dtype=dtype, device=device, format=format, spatial_size=self.INPUT_SIZE + bounding_box = make_bounding_box( + format=format, + spatial_size=self.INPUT_SIZE, + dtype=dtype, + device=device, ) check_kernel( F.resize_bounding_box, @@ -536,53 +541,44 @@ def test_kernel_bounding_box(self, format, size, use_max_size, dtype, device): check_scripted_vs_eager=not isinstance(size, int), ) - @pytest.mark.parametrize("mask_type", ["segmentation", "detection"]) - def test_kernel_mask(self, mask_type): - check_kernel( - F.resize_mask, - make_input(datapoints.Mask, size=self.INPUT_SIZE, mask_type=mask_type), - size=self.OUTPUT_SIZES[-1], - ) + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1]) def test_kernel_video(self): - check_kernel( - F.resize_video, - make_input(datapoints.Video, size=self.INPUT_SIZE), - size=self.OUTPUT_SIZES[-1], - antialias=True, - ) + check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "make_input"), [ - (torch.Tensor, F.resize_image_tensor), - (PIL.Image.Image, F.resize_image_pil), - (datapoints.Image, F.resize_image_tensor), - (datapoints.BoundingBox, F.resize_bounding_box), - (datapoints.Mask, F.resize_mask), - (datapoints.Video, F.resize_video), + (F.resize_image_tensor, make_image_tensor), + (F.resize_image_pil, make_image_pil), + (F.resize_image_tensor, make_image), + (F.resize_bounding_box, make_bounding_box), + (F.resize_mask, make_segmentation_mask), + (F.resize_video, make_video), ], ) - def test_dispatcher(self, size, input_type, kernel): + def 
test_dispatcher(self, size, kernel, make_input): check_dispatcher( F.resize, kernel, - make_input(input_type, size=self.INPUT_SIZE), + make_input(self.INPUT_SIZE), size=size, antialias=True, check_scripted_smoke=not isinstance(size, int), ) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "input_type"), [ - (torch.Tensor, F.resize_image_tensor), - (PIL.Image.Image, F.resize_image_pil), - (datapoints.Image, F.resize_image_tensor), - (datapoints.BoundingBox, F.resize_bounding_box), - (datapoints.Mask, F.resize_mask), - (datapoints.Video, F.resize_video), + (F.resize_image_tensor, torch.Tensor), + (F.resize_image_pil, PIL.Image.Image), + (F.resize_image_tensor, datapoints.Image), + (F.resize_bounding_box, datapoints.BoundingBox), + (F.resize_mask, datapoints.Mask), + (F.resize_video, datapoints.Video), ], ) def test_dispatcher_signature(self, kernel, input_type): @@ -591,18 +587,19 @@ def test_dispatcher_signature(self, kernel, input_type): @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_box, + make_segmentation_mask, + make_detection_mask, + make_video, + ], ) - def test_transform(self, size, device, input_type): - input = make_input(input_type, device=device, size=self.INPUT_SIZE) - - check_transform( - transforms.Resize, - input, - size=size, - antialias=True, - ) + def test_transform(self, size, device, make_input): + check_transform(transforms.Resize, make_input(self.INPUT_SIZE, device=device), size=size, antialias=True) def _check_output_size(self, input, output, *, size, max_size): assert tuple(F.get_spatial_size(output)) == self._compute_output_size( @@ -619,7 +616,7 @@ def test_image_correctness(self, size, interpolation, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu", size=self.INPUT_SIZE) + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) expected = F.to_image_tensor( @@ -662,7 +659,7 @@ def test_bounding_box_correctness(self, format, size, use_max_size, fn): if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): return - bounding_box = make_input(datapoints.BoundingBox, size=self.INPUT_SIZE) + bounding_box = make_bounding_box(format=format, spatial_size=self.INPUT_SIZE) actual = fn(bounding_box, size=size, **max_size_kwarg) expected = self._reference_resize_bounding_box(bounding_box, size=size, **max_size_kwarg) @@ -672,11 +669,11 @@ def test_bounding_box_correctness(self, format, size, use_max_size, fn): @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], ) - def test_pil_interpolation_compat_smoke(self, interpolation, input_type): - input = make_input(input_type, size=self.INPUT_SIZE) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) with ( contextlib.nullcontext() @@ -696,10 +693,18 @@ def 
test_dispatcher_pil_antialias_warning(self): @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_box, + make_segmentation_mask, + make_detection_mask, + make_video, + ], ) - def test_max_size_error(self, size, input_type): + def test_max_size_error(self, size, make_input): if isinstance(size, int) or len(size) == 1: max_size = (size if isinstance(size, int) else size[0]) - 1 match = "must be strictly greater than the requested size" @@ -709,39 +714,39 @@ def test_max_size_error(self, size, input_type): match = "size should be an int or a sequence of length 1" with pytest.raises(ValueError, match=match): - F.resize(make_input(input_type, size=self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, datapoints.Image, datapoints.Video], + "make_input", + [make_image_tensor, make_image, make_video], ) - def test_antialias_warning(self, interpolation, input_type): + def test_antialias_warning(self, interpolation, make_input): with ( assert_warns_antialias_default_value() if interpolation in {transforms.InterpolationMode.BILINEAR, transforms.InterpolationMode.BICUBIC} else assert_no_warnings() ): F.resize( - make_input(input_type, size=self.INPUT_SIZE), + make_input(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], interpolation=interpolation, ) @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], ) - def test_interpolation_int(self, interpolation, input_type): + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a # difference and thus we don't test it here. 
- if issubclass(input_type, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: return - input = make_input(input_type, size=self.INPUT_SIZE) - expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) actual = F.resize( input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True @@ -757,14 +762,19 @@ def test_transform_unknown_size_error(self): "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] ) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_box, + make_segmentation_mask, + make_detection_mask, + make_video, + ], ) - def test_noop(self, size, input_type): - input = make_input( - input_type, - **{"spatial_size" if issubclass(input_type, datapoints.BoundingBox) else "size": self.INPUT_SIZE}, - ) + def test_noop(self, size, make_input): + input = make_input(**{"spatial_size" if make_input is make_bounding_box else "size": self.INPUT_SIZE}) output = F.resize(input, size=size, antialias=True) @@ -779,14 +789,22 @@ def test_noop(self, size, input_type): assert output is input @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_box, + make_segmentation_mask, + make_detection_mask, + make_video, + ], ) - def test_no_regression_5405(self, input_type): + def test_no_regression_5405(self, make_input): # Checks that `max_size` is not ignored if `size == small_edge_size` # See https://github.com/pytorch/vision/issues/5405 - input = make_input(input_type, size=self.INPUT_SIZE) + input = make_input(self.INPUT_SIZE) size = min(F.get_spatial_size(input)) max_size = size + 1 From df2f87115715a956c75f31dbf33fec3f4b08a9c3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 14:07:11 +0200 Subject: [PATCH 13/17] remove make_input --- test/test_transforms_v2_refactored.py | 288 ++++++++++++-------------- 1 file changed, 129 insertions(+), 159 deletions(-) diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index ad2ddd6e051..00d395336f4 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -310,32 +310,6 @@ def wrapper(input, *args, **kwargs): return wrapper -def make_input(input_type, *, mask_type="segmentation", **kwargs): - if input_type in {torch.Tensor, PIL.Image.Image, datapoints.Image}: - input = make_image(**kwargs) - if input_type is torch.Tensor: - input = input.as_subclass(torch.Tensor) - elif input_type is PIL.Image.Image: - input = F.to_image_pil(input) - elif input_type is datapoints.BoundingBox: - input = make_bounding_box(**kwargs) - elif input_type is datapoints.Mask: - make_mask = { - "segmentation": make_segmentation_mask, - "detection": make_detection_mask, - }[mask_type] - input = make_mask(**kwargs) - elif input_type is datapoints.Video: - input = make_video(**kwargs) - else: - raise TypeError( - f"Input type can either be torch.Tensor, PIL.Image.Image, or any TorchVision datapoint class, " - f"but got {input_type} instead." 
- ) - - return input - - def param_value_parametrization(**kwargs): """Helper function to turn @@ -689,7 +663,7 @@ def test_pil_interpolation_compat_smoke(self, interpolation, make_input): def test_dispatcher_pil_antialias_warning(self): with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): - F.resize(make_input(PIL.Image.Image, size=self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) @pytest.mark.parametrize("size", OUTPUT_SIZES) @pytest.mark.parametrize( @@ -817,13 +791,13 @@ class TestHorizontalFlip: @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_image_tensor(self, dtype, device): - check_kernel(F.horizontal_flip_image_tensor, make_input(torch.Tensor, dtype=dtype, device=device)) + check_kernel(F.horizontal_flip_image_tensor, make_image(dtype=dtype, device=device)) @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_box(self, format, dtype, device): - bounding_box = make_input(datapoints.BoundingBox, dtype=dtype, device=device, format=format) + bounding_box = make_bounding_box(format=format, dtype=dtype, device=device) check_kernel( F.horizontal_flip_bounding_box, bounding_box, @@ -831,56 +805,54 @@ def test_kernel_bounding_box(self, format, dtype, device): spatial_size=bounding_box.spatial_size, ) - @pytest.mark.parametrize("mask_type", ["segmentation", "detection"]) - def test_kernel_mask(self, mask_type): - check_kernel(F.horizontal_flip_mask, make_input(datapoints.Mask, mask_type=mask_type)) + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) def test_kernel_video(self): - check_kernel(F.horizontal_flip_video, make_input(datapoints.Video)) + check_kernel(F.horizontal_flip_video, make_video()) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "make_input"), [ - (torch.Tensor, F.horizontal_flip_image_tensor), - (PIL.Image.Image, F.horizontal_flip_image_pil), - (datapoints.Image, F.horizontal_flip_image_tensor), - (datapoints.BoundingBox, F.horizontal_flip_bounding_box), - (datapoints.Mask, F.horizontal_flip_mask), - (datapoints.Video, F.horizontal_flip_video), + (F.horizontal_flip_image_tensor, make_image_tensor), + (F.horizontal_flip_image_pil, make_image_pil), + (F.horizontal_flip_image_tensor, make_image), + (F.horizontal_flip_bounding_box, make_bounding_box), + (F.horizontal_flip_mask, make_segmentation_mask), + (F.horizontal_flip_video, make_video), ], ) - def test_dispatcher(self, kernel, input_type): - check_dispatcher(F.horizontal_flip, kernel, make_input(input_type)) + def test_dispatcher(self, kernel, make_input): + check_dispatcher(F.horizontal_flip, kernel, make_input()) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "input_type"), [ - (torch.Tensor, F.horizontal_flip_image_tensor), - (PIL.Image.Image, F.horizontal_flip_image_pil), - (datapoints.Image, F.horizontal_flip_image_tensor), - (datapoints.BoundingBox, F.horizontal_flip_bounding_box), - (datapoints.Mask, F.horizontal_flip_mask), - (datapoints.Video, F.horizontal_flip_video), + (F.horizontal_flip_image_tensor, torch.Tensor), + (F.horizontal_flip_image_pil, PIL.Image.Image), + 
(F.horizontal_flip_image_tensor, datapoints.Image), + (F.horizontal_flip_bounding_box, datapoints.BoundingBox), + (F.horizontal_flip_mask, datapoints.Mask), + (F.horizontal_flip_video, datapoints.Video), ], ) def test_dispatcher_signature(self, kernel, input_type): check_dispatcher_signatures_match(F.horizontal_flip, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, input_type, device): - input = make_input(input_type, device=device) - - check_transform(transforms.RandomHorizontalFlip, input, p=1) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip, make_input(device=device), p=1) @pytest.mark.parametrize( "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) def test_image_correctness(self, fn): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + image = make_image(dtype=torch.uint8, device="cpu") actual = fn(image) expected = F.to_image_tensor(F.horizontal_flip(F.to_image_pil(image))) @@ -910,7 +882,7 @@ def _reference_horizontal_flip_bounding_box(self, bounding_box): "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] ) def test_bounding_box_correctness(self, format, fn): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) actual = fn(bounding_box) expected = self._reference_horizontal_flip_bounding_box(bounding_box) @@ -918,12 +890,12 @@ def test_bounding_box_correctness(self, format, fn): torch.testing.assert_close(actual, expected) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform_noop(self, input_type, device): - input = make_input(input_type, device=device) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) transform = transforms.RandomHorizontalFlip(p=0) @@ -988,7 +960,7 @@ def test_kernel_image_tensor(self, param, value, dtype, device): value = adapt_fill(value, dtype=dtype) self._check_kernel( F.affine_image_tensor, - make_input(torch.Tensor, dtype=dtype, device=device), + make_image(dtype=dtype, device=device), **{param: value}, check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), check_cuda_vs_cpu=dict(atol=1, rtol=0) @@ -1006,58 +978,58 @@ def test_kernel_image_tensor(self, param, value, dtype, device): @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_box(self, param, value, format, dtype, device): - bounding_box = make_input(datapoints.BoundingBox, format=format, dtype=dtype, device=device) + bounding_box = make_bounding_box(format=format, dtype=dtype, device=device) self._check_kernel( F.affine_bounding_box, - make_input(datapoints.BoundingBox, format=format, dtype=dtype, device=device), + bounding_box, format=format, spatial_size=bounding_box.spatial_size, **{param: 
value}, check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), ) - @pytest.mark.parametrize("mask_type", ["segmentation", "detection"]) - def test_kernel_mask(self, mask_type): - self._check_kernel(F.affine_mask, make_input(datapoints.Mask, mask_type=mask_type)) + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + self._check_kernel(F.affine_mask, make_mask()) def test_kernel_video(self): - self._check_kernel(F.affine_video, make_input(datapoints.Video)) + self._check_kernel(F.affine_video, make_video()) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "make_input"), [ - (torch.Tensor, F.affine_image_tensor), - (PIL.Image.Image, F.affine_image_pil), - (datapoints.Image, F.affine_image_tensor), - (datapoints.BoundingBox, F.affine_bounding_box), - (datapoints.Mask, F.affine_mask), - (datapoints.Video, F.affine_video), + (F.affine_image_tensor, make_image_tensor), + (F.affine_image_pil, make_image_pil), + (F.affine_image_tensor, make_image), + (F.affine_bounding_box, make_bounding_box), + (F.affine_mask, make_segmentation_mask), + (F.affine_video, make_video), ], ) - def test_dispatcher(self, kernel, input_type): - check_dispatcher(F.affine, kernel, make_input(input_type), **self._MINIMAL_AFFINE_KWARGS) + def test_dispatcher(self, kernel, make_input): + check_dispatcher(F.affine, kernel, make_input(), **self._MINIMAL_AFFINE_KWARGS) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "input_type"), [ - (torch.Tensor, F.affine_image_tensor), - (PIL.Image.Image, F.affine_image_pil), - (datapoints.Image, F.affine_image_tensor), - (datapoints.BoundingBox, F.affine_bounding_box), - (datapoints.Mask, F.affine_mask), - (datapoints.Video, F.affine_video), + (F.affine_image_tensor, torch.Tensor), + (F.affine_image_pil, PIL.Image.Image), + (F.affine_image_tensor, datapoints.Image), + (F.affine_bounding_box, datapoints.BoundingBox), + (F.affine_mask, datapoints.Mask), + (F.affine_video, datapoints.Video), ], ) def test_dispatcher_signature(self, kernel, input_type): check_dispatcher_signatures_match(F.affine, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, input_type, device): - input = make_input(input_type, device=device) + def test_transform(self, make_input, device): + input = make_input(device=device) check_transform(transforms.RandomAffine, input, **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES) @@ -1071,7 +1043,7 @@ def test_transform(self, input_type, device): ) @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + image = make_image(dtype=torch.uint8, device="cpu") fill = adapt_fill(fill, dtype=torch.uint8) @@ -1108,7 +1080,7 @@ def test_functional_image_correctness(self, angle, translate, scale, shear, cent @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_image_correctness(self, center, interpolation, fill, seed): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + 
image = make_image(dtype=torch.uint8, device="cpu") fill = adapt_fill(fill, dtype=torch.uint8) @@ -1172,7 +1144,7 @@ def _reference_affine_bounding_box(self, bounding_box, *, angle, translate, scal @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_box_correctness(self, format, angle, translate, scale, shear, center): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) actual = F.affine( bounding_box, @@ -1197,7 +1169,7 @@ def test_functional_bounding_box_correctness(self, format, angle, translate, sca @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_box_correctness(self, format, center, seed): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) @@ -1217,7 +1189,7 @@ def test_transform_bounding_box_correctness(self, format, center, seed): @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"]) @pytest.mark.parametrize("seed", list(range(10))) def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed): - image = make_input(torch.Tensor) + image = make_image() height, width = F.get_spatial_size(image) transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) @@ -1298,13 +1270,13 @@ class TestVerticalFlip: @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_image_tensor(self, dtype, device): - check_kernel(F.vertical_flip_image_tensor, make_input(torch.Tensor, dtype=dtype, device=device)) + check_kernel(F.vertical_flip_image_tensor, make_image(dtype=dtype, device=device)) @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) @pytest.mark.parametrize("device", cpu_and_cuda()) def test_kernel_bounding_box(self, format, dtype, device): - bounding_box = make_input(datapoints.BoundingBox, dtype=dtype, device=device, format=format) + bounding_box = make_bounding_box(format=format, dtype=dtype, device=device) check_kernel( F.vertical_flip_bounding_box, bounding_box, @@ -1312,54 +1284,52 @@ def test_kernel_bounding_box(self, format, dtype, device): spatial_size=bounding_box.spatial_size, ) - @pytest.mark.parametrize("mask_type", ["segmentation", "detection"]) - def test_kernel_mask(self, mask_type): - check_kernel(F.vertical_flip_mask, make_input(datapoints.Mask, mask_type=mask_type)) + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) def test_kernel_video(self): - check_kernel(F.vertical_flip_video, make_input(datapoints.Video)) + check_kernel(F.vertical_flip_video, make_video()) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "make_input"), [ - (torch.Tensor, F.vertical_flip_image_tensor), - (PIL.Image.Image, F.vertical_flip_image_pil), - (datapoints.Image, F.vertical_flip_image_tensor), - (datapoints.BoundingBox, F.vertical_flip_bounding_box), - (datapoints.Mask, F.vertical_flip_mask), - (datapoints.Video, F.vertical_flip_video), + (F.vertical_flip_image_tensor, 
make_image_tensor), + (F.vertical_flip_image_pil, make_image_pil), + (F.vertical_flip_image_tensor, make_image), + (F.vertical_flip_bounding_box, make_bounding_box), + (F.vertical_flip_mask, make_segmentation_mask), + (F.vertical_flip_video, make_video), ], ) - def test_dispatcher(self, kernel, input_type): - check_dispatcher(F.vertical_flip, kernel, make_input(input_type)) + def test_dispatcher(self, kernel, make_input): + check_dispatcher(F.vertical_flip, kernel, make_input()) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "input_type"), [ - (torch.Tensor, F.vertical_flip_image_tensor), - (PIL.Image.Image, F.vertical_flip_image_pil), - (datapoints.Image, F.vertical_flip_image_tensor), - (datapoints.BoundingBox, F.vertical_flip_bounding_box), - (datapoints.Mask, F.vertical_flip_mask), - (datapoints.Video, F.vertical_flip_video), + (F.vertical_flip_image_tensor, torch.Tensor), + (F.vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image_tensor, datapoints.Image), + (F.vertical_flip_bounding_box, datapoints.BoundingBox), + (F.vertical_flip_mask, datapoints.Mask), + (F.vertical_flip_video, datapoints.Video), ], ) def test_dispatcher_signature(self, kernel, input_type): check_dispatcher_signatures_match(F.vertical_flip, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, input_type, device): - input = make_input(input_type, device=device) - - check_transform(transforms.RandomVerticalFlip, input, p=1) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip, make_input(device=device), p=1) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) def test_image_correctness(self, fn): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + image = make_image(dtype=torch.uint8, device="cpu") actual = fn(image) expected = F.to_image_tensor(F.vertical_flip(F.to_image_pil(image))) @@ -1387,7 +1357,7 @@ def _reference_vertical_flip_bounding_box(self, bounding_box): @pytest.mark.parametrize("format", list(datapoints.BoundingBoxFormat)) @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) def test_bounding_box_correctness(self, format, fn): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) actual = fn(bounding_box) expected = self._reference_vertical_flip_bounding_box(bounding_box) @@ -1395,12 +1365,12 @@ def test_bounding_box_correctness(self, format, fn): torch.testing.assert_close(actual, expected) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform_noop(self, input_type, device): - input = make_input(input_type, device=device) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) transform = transforms.RandomVerticalFlip(p=0) @@ -1443,7 +1413,7 @@ def 
test_kernel_image_tensor(self, param, value, dtype, device): kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] check_kernel( F.rotate_image_tensor, - make_input(torch.Tensor, dtype=dtype, device=device), + make_image(dtype=dtype, device=device), **kwargs, check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), ) @@ -1461,7 +1431,7 @@ def test_kernel_bounding_box(self, param, value, format, dtype, device): if param != "angle": kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] - bounding_box = make_input(datapoints.BoundingBox, dtype=dtype, device=device, format=format) + bounding_box = make_bounding_box(format=format, dtype=dtype, device=device) check_kernel( F.rotate_bounding_box, @@ -1471,50 +1441,50 @@ def test_kernel_bounding_box(self, param, value, format, dtype, device): **kwargs, ) - @pytest.mark.parametrize("mask_type", ["segmentation", "detection"]) - def test_kernel_mask(self, mask_type): - check_kernel(F.rotate_mask, make_input(datapoints.Mask, mask_type=mask_type), **self._MINIMAL_AFFINE_KWARGS) + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) def test_kernel_video(self): - check_kernel(F.rotate_video, make_input(datapoints.Video), **self._MINIMAL_AFFINE_KWARGS) + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "make_input"), [ - (torch.Tensor, F.rotate_image_tensor), - (PIL.Image.Image, F.rotate_image_pil), - (datapoints.Image, F.rotate_image_tensor), - (datapoints.BoundingBox, F.rotate_bounding_box), - (datapoints.Mask, F.rotate_mask), - (datapoints.Video, F.rotate_video), + (F.rotate_image_tensor, make_image_tensor), + (F.rotate_image_pil, make_image_pil), + (F.rotate_image_tensor, make_image), + (F.rotate_bounding_box, make_bounding_box), + (F.rotate_mask, make_segmentation_mask), + (F.rotate_video, make_video), ], ) - def test_dispatcher(self, kernel, input_type): - check_dispatcher(F.rotate, kernel, make_input(input_type), **self._MINIMAL_AFFINE_KWARGS) + def test_dispatcher(self, kernel, make_input): + check_dispatcher(F.rotate, kernel, make_input(), **self._MINIMAL_AFFINE_KWARGS) @pytest.mark.parametrize( - ("input_type", "kernel"), + ("kernel", "input_type"), [ - (torch.Tensor, F.rotate_image_tensor), - (PIL.Image.Image, F.rotate_image_pil), - (datapoints.Image, F.rotate_image_tensor), - (datapoints.BoundingBox, F.rotate_bounding_box), - (datapoints.Mask, F.rotate_mask), - (datapoints.Video, F.rotate_video), + (F.rotate_image_tensor, torch.Tensor), + (F.rotate_image_pil, PIL.Image.Image), + (F.rotate_image_tensor, datapoints.Image), + (F.rotate_bounding_box, datapoints.BoundingBox), + (F.rotate_mask, datapoints.Mask), + (F.rotate_video, datapoints.Video), ], ) def test_dispatcher_signature(self, kernel, input_type): check_dispatcher_signatures_match(F.rotate, kernel=kernel, input_type=input_type) @pytest.mark.parametrize( - "input_type", - [torch.Tensor, PIL.Image.Image, datapoints.Image, datapoints.BoundingBox, datapoints.Mask, datapoints.Video], + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_box, make_segmentation_mask, make_video], ) @pytest.mark.parametrize("device", cpu_and_cuda()) - def test_transform(self, input_type, device): - input = make_input(input_type, device=device) - - check_transform(transforms.RandomRotation, input, 
**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation, make_input(device=device), **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES + ) @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @@ -1524,7 +1494,7 @@ def test_transform(self, input_type, device): @pytest.mark.parametrize("expand", [False, True]) @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + image = make_image(dtype=torch.uint8, device="cpu") fill = adapt_fill(fill, dtype=torch.uint8) @@ -1546,7 +1516,7 @@ def test_functional_image_correctness(self, angle, center, interpolation, expand @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_image_correctness(self, center, interpolation, expand, fill, seed): - image = make_input(torch.Tensor, dtype=torch.uint8, device="cpu") + image = make_image(dtype=torch.uint8, device="cpu") fill = adapt_fill(fill, dtype=torch.uint8) @@ -1602,7 +1572,7 @@ def _reference_rotate_bounding_box(self, bounding_box, *, angle, expand, center) @pytest.mark.parametrize("expand", [False]) @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) def test_functional_bounding_box_correctness(self, format, angle, expand, center): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) actual = F.rotate(bounding_box, angle=angle, expand=expand, center=center) expected = self._reference_rotate_bounding_box(bounding_box, angle=angle, expand=expand, center=center) @@ -1615,7 +1585,7 @@ def test_functional_bounding_box_correctness(self, format, angle, expand, center @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) @pytest.mark.parametrize("seed", list(range(5))) def test_transform_bounding_box_correctness(self, format, expand, center, seed): - bounding_box = make_input(datapoints.BoundingBox, format=format) + bounding_box = make_bounding_box(format=format) transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) From 181ac2a27441b32ee40d52d249f846c545015429 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 14:22:16 +0200 Subject: [PATCH 14/17] refactor TestResize::test_noop --- test/test_transforms_v2_refactored.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py index 00d395336f4..69180b99dbc 100644 --- a/test/test_transforms_v2_refactored.py +++ b/test/test_transforms_v2_refactored.py @@ -748,9 +748,9 @@ def test_transform_unknown_size_error(self): ], ) def test_noop(self, size, make_input): - input = make_input(**{"spatial_size" if make_input is make_bounding_box else "size": self.INPUT_SIZE}) + input = make_input(self.INPUT_SIZE) - output = F.resize(input, size=size, antialias=True) + output = F.resize(input, size=F.get_spatial_size(input), antialias=True) # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there # is a good reason to break this, feel free to downgrade to an equality check. 
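
A note on the refactored TestResize.test_noop above: it relies on the behavior called out in the inline comment, namely that resizing an input to its own spatial size currently hands the input back unchanged. Below is a minimal sketch of that property, assuming only the v2 functional API already used in this series (F.resize, F.get_spatial_size) and a plain image tensor rather than a datapoint wrapper:

.. code::

    import torch
    from torchvision.transforms.v2 import functional as F

    # Any (C, H, W) image tensor will do for this illustration.
    image = torch.rand(3, 17, 11)

    # Ask for the input's current spatial size, i.e. a no-op resize.
    output = F.resize(image, size=F.get_spatial_size(image), antialias=True)

    # Equality is the guaranteed property ...
    assert torch.equal(output, image)
    # ... and, as the comment in the patch notes, the current kernels additionally
    # return the very same object for plain tensors. That is behavior, not contract.
    assert output is image

For wrapped inputs such as datapoints.Image, the datapoint object may be recreated around the same underlying data even when nothing changes, so an equality (or data-pointer) check is the safer assertion there; the identity check in the test only pins down today's behavior for plain tensors.
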
From ee1754a9591b66610af8758261cf47976bf068f8 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 14:35:12 +0200 Subject: [PATCH 15/17] cleanup --- test/test_prototype_transforms.py | 17 ++++++++--------- test/test_transforms_v2.py | 11 +++++------ test/test_transforms_v2_consistency.py | 11 ++--------- test/test_transforms_v2_utils.py | 4 ++-- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index cfbcc7c0557..c574979e22c 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -9,7 +9,6 @@ from common_utils import ( assert_equal, DEFAULT_EXTRA_DIMS, - DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image, @@ -80,8 +79,8 @@ def test_mixup_cutmix(transform, input): for unsup_data in [ make_label(), make_bounding_box(format="XYXY"), - make_detection_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE), - make_segmentation_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_detection_mask(), + make_segmentation_mask(), ]: input_copy["unsupported"] = unsup_data with pytest.raises(TypeError, match=err_msg): @@ -390,9 +389,9 @@ class TestPermuteDimensions: ) def test_call(self, dims, inverse_dims): sample = dict( - image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE), - bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), - video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE), + image=make_image(), + bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY), + video=make_video(), str="str", int=0, ) @@ -434,9 +433,9 @@ class TestTransposeDimensions: ) def test_call(self, dims): sample = dict( - image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE), - bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE), - video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE), + image=make_image(), + bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY), + video=make_video(), str="str", int=0, ) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 0f6c4cc3b2e..3743581794f 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -17,7 +17,6 @@ assert_equal, assert_run_python_script, cpu_and_cuda, - DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_bounding_boxes, make_detection_mask, @@ -168,8 +167,8 @@ class TestSmoke: @pytest.mark.parametrize( "image_or_video", [ - make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), - make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(), + make_video(), next(make_pil_images(color_spaces=["RGB"])), next(make_vanilla_tensor_images()), ], @@ -353,7 +352,7 @@ def test_random_resized_crop(self, transform, input): next(make_vanilla_tensor_images()), next(make_vanilla_tensor_images()), next(make_pil_images()), - make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), + make_image(), next(make_videos()), ], 3, @@ -1347,8 +1346,8 @@ class TestToDtype: ) def test_call(self, dtype, expected_dtypes): sample = dict( - video=make_video(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.int64), - image=make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.uint8), + video=make_video(dtype=torch.int64), + image=make_image(dtype=torch.uint8), bounding_box=make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, dtype=torch.float32), str="str", int=0, diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 3f631d7ac94..bf297473bc2 100644 --- 
a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -17,7 +17,6 @@ ArgsKwargs, assert_close, assert_equal, - DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image, @@ -709,14 +708,8 @@ def test_call_consistency(config, args_kwargs): id=transform_cls.__name__, ) for transform_cls, get_params_args_kwargs in [ - ( - v2_transforms.RandomResizedCrop, - ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=[0.3, 0.7], ratio=[0.5, 1.5]), - ), - ( - v2_transforms.RandomErasing, - ArgsKwargs(make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE), scale=(0.3, 0.7), ratio=(0.5, 1.5)), - ), + (v2_transforms.RandomResizedCrop, ArgsKwargs(make_image(), scale=[0.3, 0.7], ratio=[0.5, 1.5])), + (v2_transforms.RandomErasing, ArgsKwargs(make_image(), scale=(0.3, 0.7), ratio=(0.5, 1.5))), (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), (v2_transforms.ElasticTransform, ArgsKwargs(alpha=[15.3, 27.2], sigma=[2.5, 3.9], size=[17, 31])), (v2_transforms.GaussianBlur, ArgsKwargs(0.3, 1.4)), diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py index 98271b893d6..198ab39a475 100644 --- a/test/test_transforms_v2_utils.py +++ b/test/test_transforms_v2_utils.py @@ -4,14 +4,14 @@ import torch import torchvision.transforms.v2.utils -from common_utils import DEFAULT_PORTRAIT_SPATIAL_SIZE, make_bounding_box, make_detection_mask, make_image +from common_utils import make_bounding_box, make_detection_mask, make_image from torchvision import datapoints from torchvision.transforms.v2.functional import to_image_pil from torchvision.transforms.v2.utils import has_all, has_any -IMAGE = make_image(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, color_space="RGB") +IMAGE = make_image(color_space="RGB") BOUNDING_BOX = make_bounding_box(format=datapoints.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size) MASK = make_detection_mask(size=IMAGE.spatial_size) From ba94e48d2c99a01643691b3f9154ce92e62184a9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 16:22:00 +0200 Subject: [PATCH 16/17] add comment --- test/common_utils.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 009d9dfb21a..72ecf104301 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -633,6 +633,32 @@ def make_bounding_box( dtype=None, device="cpu", ): + """ + size: Size of the actual bounding box, i.e. + - (box[3] - box[1], box[2] - box[0]) for XYXY + - (H, W) for XYWH and CXCYWH + spatial_size: Size of the reference object, e.g. an image. Corresponds to the .spatial_size attribute on + returned datapoints.BoundingBox + + To generate a valid joint sample, you need to set spatial_size here to the same value as size on the other maker + functions, e.g. + + .. code:: + + image = make_image=(size=size) + bounding_box = make_bounding_box(spatial_size=size) + assert F.get_spatial_size(bounding_box) == F.get_spatial_size(image) + + For convenience, if both size and spatial_size are omitted, spatial_size defaults to the same value as size for all + other maker functions, e.g. + + .. code:: + + image = make_image=() + bounding_box = make_bounding_box() + assert F.get_spatial_size(bounding_box) == F.get_spatial_size(image) + """ + def sample_position(values, max_value): # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. 
# However, if we have batch_dims, we need tensors as limits. @@ -648,7 +674,6 @@ def sample_position(values, max_value): height, width = size height_margin, width_margin = torch.randint(10, (2,)).tolist() spatial_size = (height + height_margin, width + width_margin) - spatial_height, spatial_width = spatial_size dtype = dtype or torch.float32 @@ -658,13 +683,12 @@ def sample_position(values, max_value): ) if size is None: - h = torch.randint(1, spatial_height - 1, batch_dims) - w = torch.randint(1, spatial_width - 1, batch_dims) + h, w = [torch.randint(1, s, batch_dims) for s in spatial_size] else: - h, w = [torch.full(batch_dims, v, dtype=torch.int) for v in size] + h, w = [torch.full(batch_dims, s, dtype=torch.int) for s in size] - y = sample_position(h, spatial_height) - x = sample_position(w, spatial_width) + y = sample_position(h, spatial_size[0]) + x = sample_position(w, spatial_size[1]) if format is datapoints.BoundingBoxFormat.XYWH: parts = (x, y, w, h) From 519f0fad9af28e8dc95206df449ffb40cd5ac104 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 5 Jul 2023 20:39:13 +0200 Subject: [PATCH 17/17] remove obsolete tests --- test/test_transforms_v2_functional.py | 157 -------------------------- 1 file changed, 157 deletions(-) diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py index 79ea20d854e..465cc227107 100644 --- a/test/test_transforms_v2_functional.py +++ b/test/test_transforms_v2_functional.py @@ -665,163 +665,6 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): return true_matrix -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) -def test_correctness_rotate_bounding_box(angle, expand, center): - def _compute_expected_bbox(bbox, angle_, expand_, center_): - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - affine_matrix = affine_matrix[:2, :] - - height, width = bbox.spatial_size - bbox_xyxy = convert_format_bounding_box(bbox, new_format=datapoints.BoundingBoxFormat.XYXY) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - # image frame - [0.0, 0.0, 1.0], - [0.0, height, 1.0], - [width, height, 1.0], - [width, 0.0, 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - float(np.min(transformed_points[:4, 0])), - float(np.min(transformed_points[:4, 1])), - float(np.max(transformed_points[:4, 0])), - float(np.max(transformed_points[:4, 1])), - ] - if expand_: - tr_x = np.min(transformed_points[4:, 0]) - tr_y = np.min(transformed_points[4:, 1]) - out_bbox[0] -= tr_x - out_bbox[1] -= tr_y - out_bbox[2] -= tr_x - out_bbox[3] -= tr_y - - height = int(height - 2 * tr_y) - width = int(width - 2 * tr_x) - - out_bbox = datapoints.BoundingBox( - out_bbox, - format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=(height, width), - dtype=bbox.dtype, - device=bbox.device, - ) - out_bbox = clamp_bounding_box(convert_format_bounding_box(out_bbox, new_format=bbox.format)) - return out_bbox, (height, width) - - spatial_size = (32, 38) - - for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)): - bboxes_format = bboxes.format - bboxes_spatial_size = bboxes.spatial_size - - output_bboxes, output_spatial_size = F.rotate_bounding_box( - 
bboxes.as_subclass(torch.Tensor), - format=bboxes_format, - spatial_size=bboxes_spatial_size, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = datapoints.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size) - expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_) - expected_bboxes.append(expected_bbox) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0) - torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0) - - -@pytest.mark.parametrize("device", cpu_and_cuda()) -@pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 -def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): - # Check transformation against known expected output - format = datapoints.BoundingBoxFormat.XYXY - spatial_size = (64, 64) - # xyxy format - in_boxes = [ - [1, 1, 5, 5], - [1, spatial_size[0] - 6, 5, spatial_size[0] - 2], - [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2], - [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10], - ] - in_boxes = torch.tensor(in_boxes, dtype=torch.float64, device=device) - # Tested parameters - angle = 45 - center = None if expand else [12, 23] - - # # Expected bboxes computed using Detectron2: - # from detectron2.data.transforms import RotationTransform, AugmentationList - # from detectron2.data.transforms import AugInput - # import cv2 - # inpt = AugInput(im1, boxes=np.array(in_boxes, dtype="float32")) - # augs = AugmentationList([RotationTransform(*size, angle, expand=expand, center=center, interp=cv2.INTER_NEAREST), ]) - # out = augs(inpt) - # print(inpt.boxes) - if expand: - expected_bboxes = [ - [1.65937957, 42.67157288, 7.31623382, 48.32842712], - [41.96446609, 82.9766594, 47.62132034, 88.63351365], - [82.26955262, 42.67157288, 87.92640687, 48.32842712], - [31.35786438, 31.35786438, 59.64213562, 59.64213562], - ] - else: - expected_bboxes = [ - [-11.33452378, 12.39339828, -5.67766953, 18.05025253], - [28.97056275, 52.69848481, 34.627417, 58.35533906], - [69.27564928, 12.39339828, 74.93250353, 18.05025253], - [18.36396103, 1.07968978, 46.64823228, 29.36396103], - ] - expected_bboxes = clamp_bounding_box( - datapoints.BoundingBox(expected_bboxes, format="XYXY", spatial_size=spatial_size) - ).tolist() - - output_boxes, _ = F.rotate_bounding_box( - in_boxes, - format=format, - spatial_size=spatial_size, - angle=angle, - expand=expand, - center=center, - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_cuda()) -def test_correctness_rotate_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees - expected_mask = torch.rot90(mask, k=1, dims=(-2, -1)) - out_mask = F.rotate_mask(mask, 90, expand=False) - torch.testing.assert_close(out_mask, 
expected_mask) - - @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "format",