extract make_* functions out of make_*_loader #7717

Merged 20 commits on Jul 5, 2023
Changes from 8 commits
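
In short, the refactor below pulls the tensor construction out of the fn closures inside the make_*_loader helpers and into standalone make_* functions (make_image, make_bounding_box, make_detection_mask, make_segmentation_mask, make_video); the loaders then delegate to them, whereas previously the standalone variants were only derived via from_loader. A rough, self-contained sketch of the pattern (make_thing, the *_old suffix, and the concrete shapes are illustrative stand-ins, not the actual torchvision code):

import torch

# before: the tensor is only ever built inside the loader's closure
def make_thing_loader_old(*, dtype=torch.uint8):
    def fn(shape, dtype, device):
        return torch.testing.make_tensor(shape, dtype=dtype, device=device)
    return fn  # the real helpers wrap fn in a *Loader object

# after: a standalone make_thing that tests can call directly ...
def make_thing(spatial_size, *, batch_dims=(), dtype=None, device="cpu"):
    dtype = dtype or torch.uint8
    return torch.testing.make_tensor((*batch_dims, *spatial_size), dtype=dtype, device=device)

# ... and a loader whose closure simply delegates to it
def make_thing_loader(*, dtype=torch.uint8):
    def fn(shape, dtype, device):
        *batch_dims, height, width = shape
        return make_thing((height, width), batch_dims=batch_dims, dtype=dtype, device=device)
    return fn  # the real helpers wrap fn in a *Loader object

# direct construction no longer has to round-trip through a loader:
sample = make_thing((16, 16), batch_dims=(2,))  # tensor of shape (2, 16, 16)
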
190 changes: 124 additions & 66 deletions test/common_utils.py
@@ -492,6 +492,28 @@ def get_num_channels(color_space):
return num_channels


def make_image(
spatial_size, *, color_space="RGB", batch_dims=(), dtype=None, device="cpu", memory_format=torch.contiguous_format
):
spatial_size = _parse_spatial_size(spatial_size)
num_channels = get_num_channels(color_space)
dtype = dtype or torch.uint8
max_value = get_max_value(dtype)

data = torch.testing.make_tensor(
(*batch_dims, num_channels, *spatial_size),
low=0,
high=max_value,
dtype=dtype,
device=device,
memory_format=memory_format,
)
if color_space in {"GRAY_ALPHA", "RGBA"}:
data[..., -1, :, :] = max_value

return datapoints.Image(data)
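# Not part of the diff: a minimal usage sketch of the extracted helper, with
# illustrative sizes. Given the defaults above, the first call should yield a
# uint8 RGB datapoints.Image of shape (3, 32, 32); the second a batched RGBA
# image of shape (2, 4, 32, 32) with the alpha channel set to the max value.
example_image = make_image((32, 32))
example_batch = make_image((32, 32), color_space="RGBA", batch_dims=(2,))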


def make_image_loader(
size="random",
*,
@@ -501,24 +523,25 @@ def make_image_loader(
constant_alpha=True,
memory_format=torch.contiguous_format,
):
if not constant_alpha:
raise ValueError("This should never happen")
size = _parse_spatial_size(size)
num_channels = get_num_channels(color_space)

def fn(shape, dtype, device, memory_format):
max_value = get_max_value(dtype)
data = torch.testing.make_tensor(
shape, low=0, high=max_value, dtype=dtype, device=device, memory_format=memory_format
*batch_dims, _, height, width = shape
return make_image(
(height, width),
color_space=color_space,
batch_dims=batch_dims,
dtype=dtype,
device=device,
memory_format=memory_format,
)
if color_space in {"GRAY_ALPHA", "RGBA"} and constant_alpha:
data[..., -1, :, :] = max_value
return datapoints.Image(data)

return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, memory_format=memory_format)


make_image = from_loader(make_image_loader)


def make_image_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
@@ -601,59 +624,65 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
).reshape(low.shape)


def make_bounding_box(
format=datapoints.BoundingBoxFormat.XYXY, spatial_size="random", batch_dims=(), dtype=None, device="cpu"
):
if isinstance(format, str):
format = datapoints.BoundingBoxFormat[format]

spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")
dtype = dtype or torch.float32

if any(dim == 0 for dim in batch_dims):
return datapoints.BoundingBox(
torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
)

height, width = spatial_size
if format == datapoints.BoundingBoxFormat.XYXY:
x1 = torch.randint(0, width // 2, batch_dims)
y1 = torch.randint(0, height // 2, batch_dims)
x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1
y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1
parts = (x1, y1, x2, y2)
elif format == datapoints.BoundingBoxFormat.XYWH:
x = torch.randint(0, width // 2, batch_dims)
y = torch.randint(0, height // 2, batch_dims)
w = randint_with_tensor_bounds(1, width - x)
h = randint_with_tensor_bounds(1, height - y)
parts = (x, y, w, h)
elif format == datapoints.BoundingBoxFormat.CXCYWH:
cx = torch.randint(1, width - 1, batch_dims)
cy = torch.randint(1, height - 1, batch_dims)
w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1)
h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1)
parts = (cx, cy, w, h)
else:
raise ValueError(f"Can't make bounding box in format {format}")

return datapoints.BoundingBox(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
)
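# Not part of the diff: illustrative calls to the extracted helper. Per the code
# above, format may be an enum value or a string, a plain call returns a single
# box of shape (4,), and batch_dims prepends leading dimensions, e.g. (5, 4).
example_box = make_bounding_box(format="XYWH", spatial_size=(32, 32))
example_boxes = make_bounding_box(spatial_size=(32, 32), batch_dims=(5,))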


def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32):
if isinstance(format, str):
format = datapoints.BoundingBoxFormat[format]
if format not in {
datapoints.BoundingBoxFormat.XYXY,
datapoints.BoundingBoxFormat.XYWH,
datapoints.BoundingBoxFormat.CXCYWH,
}:
raise pytest.UsageError(f"Can't make bounding box in format {format}")

spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")

def fn(shape, dtype, device):
*extra_dims, num_coordinates = shape
*batch_dims, num_coordinates = shape
if num_coordinates != 4:
raise pytest.UsageError()

if any(dim == 0 for dim in extra_dims):
return datapoints.BoundingBox(
torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
)

height, width = spatial_size

if format == datapoints.BoundingBoxFormat.XYXY:
x1 = torch.randint(0, width // 2, extra_dims)
y1 = torch.randint(0, height // 2, extra_dims)
x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1
y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1
parts = (x1, y1, x2, y2)
elif format == datapoints.BoundingBoxFormat.XYWH:
x = torch.randint(0, width // 2, extra_dims)
y = torch.randint(0, height // 2, extra_dims)
w = randint_with_tensor_bounds(1, width - x)
h = randint_with_tensor_bounds(1, height - y)
parts = (x, y, w, h)
else: # format == features.BoundingBoxFormat.CXCYWH:
cx = torch.randint(1, width - 1, extra_dims)
cy = torch.randint(1, height - 1, extra_dims)
w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1)
h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1)
parts = (cx, cy, w, h)

return datapoints.BoundingBox(
torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
return make_bounding_box(
format=format, spatial_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device
)

return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size)


make_bounding_box = from_loader(make_bounding_box_loader)


def make_bounding_box_loaders(
*,
extra_dims=DEFAULT_EXTRA_DIMS,
@@ -672,21 +701,32 @@ class MaskLoader(TensorLoader):
pass


def make_detection_mask(spatial_size, *, num_objects="random", batch_dims=(), dtype=None, device="cpu"):
"""Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks"""
spatial_size = _parse_spatial_size(spatial_size)
num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects
dtype = dtype or torch.bool

data = torch.testing.make_tensor(
(*batch_dims, num_objects, *spatial_size), low=0, high=2, dtype=dtype, device=device
)
return datapoints.Mask(data)
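# Not part of the diff: illustrative usage of the extracted helper. With the
# defaults above this should yield a boolean datapoints.Mask of shape
# (num_objects, H, W), here (4, 32, 32), one channel per object.
example_det_masks = make_detection_mask((32, 32), num_objects=4)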


def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8):
# This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects
size = _parse_spatial_size(size)
num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects

def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=2, dtype=dtype, device=device)
return datapoints.Mask(data)
*batch_dims, num_objects, height, width = shape
return make_detection_mask(
(height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device
)

return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype)


make_detection_mask = from_loader(make_detection_mask_loader)


def make_detection_mask_loaders(
sizes=DEFAULT_SPATIAL_SIZES,
num_objects=(1, 0, "random"),
@@ -700,19 +740,29 @@ def make_detection_mask_loaders(
make_detection_masks = from_loaders(make_detection_mask_loaders)


def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
# This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
size = _parse_spatial_size(size)
def make_segmentation_mask(spatial_size, *, num_categories="random", batch_dims=(), dtype=None, device="cpu"):
"""Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value"""
spatial_size = _parse_spatial_size(spatial_size)
num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories
dtype = dtype or torch.uint8

data = torch.testing.make_tensor(
(*batch_dims, *spatial_size), low=0, high=num_categories, dtype=dtype, device=device
)
return datapoints.Mask(data)
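# Not part of the diff: illustrative usage of the extracted helper. With the
# defaults above this should yield a uint8 datapoints.Mask of shape (H, W),
# here (32, 32), with pixel values in [0, num_categories).
example_seg_mask = make_segmentation_mask((32, 32), num_categories=10)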

def fn(shape, dtype, device):
data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=dtype, device=device)
return datapoints.Mask(data)

return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype)
def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
# This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
spatial_size = _parse_spatial_size(size)

def fn(shape, dtype, device):
*batch_dims, height, width = shape
return make_segmentation_mask(
(height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device
)

make_segmentation_mask = from_loader(make_segmentation_mask_loader)
return MaskLoader(fn, shape=(*extra_dims, *spatial_size), dtype=dtype)


def make_segmentation_mask_loaders(
@@ -750,6 +800,11 @@ class VideoLoader(ImageLoader):
pass


def make_video(spatial_size, *, num_frames="random", batch_dims=(), **kwargs):
num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames
return datapoints.Video(make_image(spatial_size, batch_dims=(*batch_dims, num_frames), **kwargs))
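# Not part of the diff: illustrative usage of the extracted helper. With the
# make_image defaults this should yield a uint8 RGB datapoints.Video of shape
# (num_frames, 3, H, W), here (3, 3, 32, 32).
example_video = make_video((32, 32), num_frames=3)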


def make_video_loader(
size="random",
*,
@@ -762,17 +817,20 @@ def make_video_loader(
num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames

def fn(shape, dtype, device, memory_format):
video = make_image(
size=shape[-2:], extra_dims=shape[:-3], dtype=dtype, device=device, memory_format=memory_format
*batch_dims, num_frames, _, height, width = shape
return make_video(
(height, width),
num_frames=num_frames,
color_space=color_space,
batch_dims=batch_dims,
dtype=dtype,
device=device,
memory_format=memory_format,
)
return datapoints.Video(video)

return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype)


make_video = from_loader(make_video_loader)


def make_video_loaders(
*,
sizes=DEFAULT_SPATIAL_SIZES,
45 changes: 23 additions & 22 deletions test/test_prototype_transforms.py
@@ -9,6 +9,7 @@
from common_utils import (
assert_equal,
DEFAULT_EXTRA_DIMS,
DEFAULT_PORTRAIT_SPATIAL_SIZE,
make_bounding_box,
make_detection_mask,
make_image,
@@ -79,8 +80,8 @@ def test_mixup_cutmix(transform, input):
for unsup_data in [
make_label(),
make_bounding_box(format="XYXY"),
make_detection_mask(),
make_segmentation_mask(),
make_detection_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE),
make_segmentation_mask(DEFAULT_PORTRAIT_SPATIAL_SIZE),
]:
input_copy["unsupported"] = unsup_data
with pytest.raises(TypeError, match=err_msg):
@@ -215,8 +216,8 @@ def test__get_params(self, mocker):
transform = transforms.FixedSizeCrop(size=crop_size)

flat_inputs = [
make_image(size=spatial_size, color_space="RGB"),
make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape),
make_image(spatial_size=spatial_size, color_space="RGB"),
make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=batch_shape),
]
params = transform._get_params(flat_inputs)

@@ -312,9 +313,9 @@ def test__transform_culling(self, mocker):
)

bounding_boxes = make_bounding_box(
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,)
)
masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,))
masks = make_detection_mask(spatial_size=spatial_size, batch_dims=(batch_size,))
labels = make_label(extra_dims=(batch_size,))

transform = transforms.FixedSizeCrop((-1, -1))
@@ -350,7 +351,7 @@ def test__transform_bounding_box_clamping(self, mocker):
)

bounding_box = make_bounding_box(
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
format=BoundingBoxFormat.XYXY, spatial_size=spatial_size, batch_dims=(batch_size,)
)
mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box")

Expand Down Expand Up @@ -389,9 +390,9 @@ class TestPermuteDimensions:
)
def test_call(self, dims, inverse_dims):
sample = dict(
image=make_image(),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY),
video=make_video(),
image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE),
video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE),
str="str",
int=0,
)
@@ -433,9 +434,9 @@ class TestTransposeDimensions:
)
def test_call(self, dims):
sample = dict(
image=make_image(),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY),
video=make_video(),
image=make_image(DEFAULT_PORTRAIT_SPATIAL_SIZE),
bounding_box=make_bounding_box(format=BoundingBoxFormat.XYXY, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE),
video=make_video(DEFAULT_PORTRAIT_SPATIAL_SIZE),
str="str",
int=0,
)
@@ -494,29 +495,29 @@ def make_datapoints():
size = (600, 800)
num_objects = 22

pil_image = to_image_pil(make_image(size=size, color_space="RGB"))
pil_image = to_image_pil(make_image(spatial_size=size, color_space="RGB"))
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
"masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long),
}

yield (pil_image, target)

tensor_image = torch.Tensor(make_image(size=size, color_space="RGB"))
tensor_image = torch.Tensor(make_image(spatial_size=size, color_space="RGB"))
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
"masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long),
}

yield (tensor_image, target)

datapoint_image = make_image(size=size, color_space="RGB")
datapoint_image = make_image(spatial_size=size, color_space="RGB")
target = {
"boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
"boxes": make_bounding_box(spatial_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float),
"labels": make_label(extra_dims=(num_objects,), categories=80),
"masks": make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long),
"masks": make_detection_mask(spatial_size=size, num_objects=num_objects, dtype=torch.long),
}

yield (datapoint_image, target)