diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py
index eba23d0cd52..94a0433ae48 100644
--- a/test/prototype_common_utils.py
+++ b/test/prototype_common_utils.py
@@ -184,13 +184,13 @@ def load(self, device="cpu"):
         return args, kwargs


-DEFAULT_SQUARE_IMAGE_SIZE = 15
-DEFAULT_LANDSCAPE_IMAGE_SIZE = (7, 33)
-DEFAULT_PORTRAIT_IMAGE_SIZE = (31, 9)
-DEFAULT_IMAGE_SIZES = (DEFAULT_LANDSCAPE_IMAGE_SIZE, DEFAULT_PORTRAIT_IMAGE_SIZE, DEFAULT_SQUARE_IMAGE_SIZE, "random")
+DEFAULT_SQUARE_SPATIAL_SIZE = 15
+DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33)
+DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9)
+DEFAULT_SPATIAL_SIZES = (DEFAULT_LANDSCAPE_SPATIAL_SIZE, DEFAULT_PORTRAIT_SPATIAL_SIZE, DEFAULT_SQUARE_SPATIAL_SIZE, "random")


-def _parse_image_size(size, *, name="size"):
+def _parse_spatial_size(size, *, name="size"):
     if size == "random":
         return tuple(torch.randint(15, 33, (2,)).tolist())
     elif isinstance(size, int) and size > 0:
@@ -246,11 +246,11 @@ def load(self, device):
 @dataclasses.dataclass
 class ImageLoader(TensorLoader):
     color_space: features.ColorSpace
-    image_size: Tuple[int, int] = dataclasses.field(init=False)
+    spatial_size: Tuple[int, int] = dataclasses.field(init=False)
     num_channels: int = dataclasses.field(init=False)

     def __post_init__(self):
-        self.image_size = self.shape[-2:]
+        self.spatial_size = self.shape[-2:]
         self.num_channels = self.shape[-3]
@@ -277,7 +277,7 @@ def make_image_loader(
     dtype=torch.float32,
     constant_alpha=True,
 ):
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_channels = get_num_channels(color_space)

     def fn(shape, dtype, device):
@@ -295,7 +295,7 @@ def fn(shape, dtype, device):

 def make_image_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     color_spaces=(
         features.ColorSpace.GRAY,
         features.ColorSpace.GRAY_ALPHA,
@@ -316,7 +316,7 @@ def make_image_loaders(
 @dataclasses.dataclass
 class BoundingBoxLoader(TensorLoader):
     format: features.BoundingBoxFormat
-    image_size: Tuple[int, int]
+    spatial_size: Tuple[int, int]


 def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
@@ -331,7 +331,7 @@ def randint_with_tensor_bounds(arg1, arg2=None, **kwargs):
     ).reshape(low.shape)


-def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtype=torch.float32):
+def make_bounding_box_loader(*, extra_dims=(), format, spatial_size="random", dtype=torch.float32):
     if isinstance(format, str):
         format = features.BoundingBoxFormat[format]
     if format not in {
@@ -341,7 +341,7 @@ def make_bounding_box_loader(*, extra_dims=(), format, image_size="random", dtyp
     }:
         raise pytest.UsageError(f"Can't make bounding box in format {format}")

-    image_size = _parse_image_size(image_size, name="image_size")
+    spatial_size = _parse_spatial_size(spatial_size, name="spatial_size")

     def fn(shape, dtype, device):
         *extra_dims, num_coordinates = shape
@@ -350,10 +350,10 @@ def fn(shape, dtype, device):

         if any(dim == 0 for dim in extra_dims):
             return features.BoundingBox(
-                torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=image_size
+                torch.empty(*extra_dims, 4, dtype=dtype, device=device), format=format, spatial_size=spatial_size
             )

-        height, width = image_size
+        height, width = spatial_size

         if format == features.BoundingBoxFormat.XYXY:
             x1 = torch.randint(0, width // 2, extra_dims)
@@ -375,10 +375,10 @@ def fn(shape, dtype, device):
             parts = (cx, cy, w, h)

         return features.BoundingBox(
-            torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=image_size
+            torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, spatial_size=spatial_size
         )

-    return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, image_size=image_size)
+    return BoundingBoxLoader(fn, shape=(*extra_dims, 4), dtype=dtype, format=format, spatial_size=spatial_size)


 make_bounding_box = from_loader(make_bounding_box_loader)
@@ -388,11 +388,11 @@ def make_bounding_box_loaders(
     *,
     extra_dims=DEFAULT_EXTRA_DIMS,
     formats=tuple(features.BoundingBoxFormat),
-    image_size="random",
+    spatial_size="random",
     dtypes=(torch.float32, torch.int64),
 ):
     for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes):
-        yield make_bounding_box_loader(**params, image_size=image_size)
+        yield make_bounding_box_loader(**params, spatial_size=spatial_size)


 make_bounding_boxes = from_loaders(make_bounding_box_loaders)
@@ -475,7 +475,7 @@ class MaskLoader(TensorLoader):

 def make_detection_mask_loader(size="random", *, num_objects="random", extra_dims=(), dtype=torch.uint8):
     # This produces "detection" masks, i.e. `(*, N, H, W)`, where `N` denotes the number of objects
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_objects = int(torch.randint(1, 11, ())) if num_objects == "random" else num_objects

     def fn(shape, dtype, device):
@@ -489,7 +489,7 @@ def fn(shape, dtype, device):

 def make_detection_mask_loaders(
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_objects=(1, 0, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
     dtypes=(torch.uint8,),
@@ -503,7 +503,7 @@ def make_detection_mask_loaders(

 def make_segmentation_mask_loader(size="random", *, num_categories="random", extra_dims=(), dtype=torch.uint8):
     # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_categories = int(torch.randint(1, 11, ())) if num_categories == "random" else num_categories

     def fn(shape, dtype, device):
@@ -518,7 +518,7 @@ def fn(shape, dtype, device):

 def make_segmentation_mask_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_categories=(1, 2, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
     dtypes=(torch.uint8,),
@@ -532,7 +532,7 @@ def make_segmentation_mask_loaders(

 def make_mask_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     num_objects=(1, 0, "random"),
     num_categories=(1, 2, "random"),
     extra_dims=DEFAULT_EXTRA_DIMS,
@@ -559,7 +559,7 @@ def make_video_loader(
     extra_dims=(),
     dtype=torch.uint8,
 ):
-    size = _parse_image_size(size)
+    size = _parse_spatial_size(size)
     num_frames = int(torch.randint(1, 5, ())) if num_frames == "random" else num_frames

     def fn(shape, dtype, device):
@@ -576,7 +576,7 @@ def fn(shape, dtype, device):

 def make_video_loaders(
     *,
-    sizes=DEFAULT_IMAGE_SIZES,
+    sizes=DEFAULT_SPATIAL_SIZES,
     color_spaces=(
         features.ColorSpace.GRAY,
         features.ColorSpace.RGB,
diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py
index 62721c90f4a..27832859f07 100644
--- a/test/prototype_transforms_kernel_infos.py
+++ b/test/prototype_transforms_kernel_infos.py
@@ -145,7 +145,7 @@ def sample_inputs_horizontal_flip_bounding_box():
         formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32]
     ):
         yield ArgsKwargs(
-            bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.spatial_size
+            bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
         )
@@ -185,9 +185,9 @@ def sample_inputs_horizontal_flip_video():
     )


-def _get_resize_sizes(image_size):
-    height, width = image_size
-    length = max(image_size)
+def _get_resize_sizes(spatial_size):
+    height, width = spatial_size
+    length = max(spatial_size)
     yield length
     yield [length]
     yield (length,)
@@ -252,7 +252,7 @@ def reference_inputs_resize_image_tensor():
 def sample_inputs_resize_bounding_box():
     for bounding_box_loader in make_bounding_box_loaders():
         for size in _get_resize_sizes(bounding_box_loader.spatial_size):
-            yield ArgsKwargs(bounding_box_loader, size=size, image_size=bounding_box_loader.spatial_size)
+            yield ArgsKwargs(bounding_box_loader, size=size, spatial_size=bounding_box_loader.spatial_size)


 def sample_inputs_resize_mask():
@@ -394,7 +394,7 @@ def sample_inputs_affine_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
-            image_size=bounding_box_loader.spatial_size,
+            spatial_size=bounding_box_loader.spatial_size,
             **affine_params,
         )
@@ -422,9 +422,9 @@ def _compute_affine_matrix(angle, translate, scale, shear, center):
     return true_matrix


-def reference_affine_bounding_box(bounding_box, *, format, image_size, angle, translate, scale, shear, center=None):
+def reference_affine_bounding_box(bounding_box, *, format, spatial_size, angle, translate, scale, shear, center=None):
     if center is None:
-        center = [s * 0.5 for s in image_size[::-1]]
+        center = [s * 0.5 for s in spatial_size[::-1]]

     def transform(bbox):
         affine_matrix = _compute_affine_matrix(angle, translate, scale, shear, center)
@@ -473,7 +473,7 @@ def reference_inputs_affine_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
-            image_size=bounding_box_loader.spatial_size,
+            spatial_size=bounding_box_loader.spatial_size,
             **affine_kwargs,
         )
@@ -650,7 +650,7 @@ def sample_inputs_vertical_flip_bounding_box():
         formats=[features.BoundingBoxFormat.XYXY], dtypes=[torch.float32]
     ):
         yield ArgsKwargs(
-            bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.spatial_size
+            bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
         )
@@ -729,7 +729,7 @@ def sample_inputs_rotate_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
-            image_size=bounding_box_loader.spatial_size,
+            spatial_size=bounding_box_loader.spatial_size,
             angle=_ROTATE_ANGLES[0],
         )
@@ -1001,7 +1001,7 @@ def sample_inputs_pad_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
-            image_size=bounding_box_loader.spatial_size,
+            spatial_size=bounding_box_loader.spatial_size,
             padding=padding,
             padding_mode="constant",
         )
@@ -1131,8 +1131,8 @@ def sample_inputs_perspective_video():
     )


-def _get_elastic_displacement(image_size):
-    return torch.rand(1, *image_size, 2)
+def _get_elastic_displacement(spatial_size):
+    return torch.rand(1, *spatial_size, 2)


 def sample_inputs_elastic_image_tensor():
@@ -1212,7 +1212,7 @@ def sample_inputs_elastic_video():
     )


-_CENTER_CROP_IMAGE_SIZES = [(16, 16), (7, 33), (31, 9)]
+_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)]
 _CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)]
@@ -1231,7 +1231,7 @@ def sample_inputs_center_crop_image_tensor():

 def reference_inputs_center_crop_image_tensor():
     for image_loader, output_size in itertools.product(
-        make_image_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES
+        make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()]), _CENTER_CROP_OUTPUT_SIZES
     ):
         yield ArgsKwargs(image_loader, output_size=output_size)
@@ -1241,7 +1241,7 @@ def sample_inputs_center_crop_bounding_box():
         yield ArgsKwargs(
             bounding_box_loader,
             format=bounding_box_loader.format,
-            image_size=bounding_box_loader.spatial_size,
+            spatial_size=bounding_box_loader.spatial_size,
             output_size=output_size,
         )
@@ -1254,7 +1254,7 @@ def sample_inputs_center_crop_mask():

 def reference_inputs_center_crop_mask():
     for mask_loader, output_size in itertools.product(
-        make_mask_loaders(sizes=_CENTER_CROP_IMAGE_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES
+        make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES
     ):
         yield ArgsKwargs(mask_loader, output_size=output_size)
@@ -1820,7 +1820,7 @@ def sample_inputs_adjust_saturation_video():
 def sample_inputs_clamp_bounding_box():
     for bounding_box_loader in make_bounding_box_loaders():
         yield ArgsKwargs(
-            bounding_box_loader, format=bounding_box_loader.format, image_size=bounding_box_loader.spatial_size
+            bounding_box_loader, format=bounding_box_loader.format, spatial_size=bounding_box_loader.spatial_size
         )
@@ -1834,7 +1834,7 @@ def sample_inputs_clamp_bounding_box():
 _FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]]


-def _get_five_ten_crop_image_size(size):
+def _get_five_ten_crop_spatial_size(size):
     if isinstance(size, int):
         crop_height = crop_width = size
     elif len(size) == 1:
@@ -1847,28 +1847,28 @@ def sample_inputs_five_crop_image_tensor():
     for size in _FIVE_TEN_CROP_SIZES:
         for image_loader in make_image_loaders(
-            sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]
+            sizes=[_get_five_ten_crop_spatial_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]
         ):
             yield ArgsKwargs(image_loader, size=size)


 def reference_inputs_five_crop_image_tensor():
     for size in _FIVE_TEN_CROP_SIZES:
-        for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]):
+        for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]):
             yield ArgsKwargs(image_loader, size=size)


 def sample_inputs_ten_crop_image_tensor():
     for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
         for image_loader in make_image_loaders(
-            sizes=[_get_five_ten_crop_image_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]
+            sizes=[_get_five_ten_crop_spatial_size(size)], color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]
         ):
             yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)


 def reference_inputs_ten_crop_image_tensor():
     for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]):
-        for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_image_size(size)], extra_dims=[()]):
+        for image_loader in make_image_loaders(sizes=[_get_five_ten_crop_spatial_size(size)], extra_dims=[()]):
             yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip)
diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py
index d5e49078259..2c095fa6e81 100644
--- a/test/test_prototype_transforms.py
+++ b/test/test_prototype_transforms.py
@@ -559,7 +559,7 @@ def test__transform(self, degrees, expand, fill, center, mocker):

     @pytest.mark.parametrize("angle", [34, -87])
     @pytest.mark.parametrize("expand", [False, True])
-    def test_boundingbox_image_size(self, angle, expand):
+    def test_boundingbox_spatial_size(self, angle, expand):
         # Specific test for BoundingBox.rotate
         bbox = features.BoundingBox(
             torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, spatial_size=(32, 32)
@@ -1281,7 +1281,7 @@ def test__transform(self, mocker):
         transform = transforms.RandomIoUCrop()

         image = features.Image(torch.rand(3, 32, 24))
-        bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,))
+        bboxes = make_bounding_box(format="XYXY", spatial_size=(32, 24), extra_dims=(6,))
         label = features.Label(torch.randint(0, 10, size=(6,)))
         ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1))
         masks = make_detection_mask((32, 24), num_objects=6)
@@ -1329,12 +1329,12 @@ def test__transform(self, mocker):

 class TestScaleJitter:
     def test__get_params(self, mocker):
-        image_size = (24, 32)
+        spatial_size = (24, 32)
         target_size = (16, 12)
         scale_range = (0.5, 1.5)

         transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range)
-        sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size)
+        sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size)

         n_samples = 5
         for _ in range(n_samples):
@@ -1347,11 +1347,11 @@ def test__get_params(self, mocker):
             assert isinstance(size, tuple) and len(size) == 2
             height, width = size

-            r_min = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[0]
-            r_max = min(target_size[1] / image_size[0], target_size[0] / image_size[1]) * scale_range[1]
+            r_min = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[0]
+            r_max = min(target_size[1] / spatial_size[0], target_size[0] / spatial_size[1]) * scale_range[1]

-            assert int(image_size[0] * r_min) <= height <= int(image_size[0] * r_max)
-            assert int(image_size[1] * r_min) <= width <= int(image_size[1] * r_max)
+            assert int(spatial_size[0] * r_min) <= height <= int(spatial_size[0] * r_max)
+            assert int(spatial_size[1] * r_min) <= width <= int(spatial_size[1] * r_max)

     def test__transform(self, mocker):
         interpolation_sentinel = mocker.MagicMock()
@@ -1379,13 +1379,13 @@ def test__transform(self, mocker):

 class TestRandomShortestSize:
     def test__get_params(self, mocker):
-        image_size = (3, 10)
+        spatial_size = (3, 10)
         min_size = [5, 9]
         max_size = 20

         transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)

-        sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size)
+        sample = mocker.MagicMock(spec=features.Image, num_channels=3, spatial_size=spatial_size)
         params = transform._get_params(sample)

         assert "size" in params
@@ -1550,14 +1550,14 @@ class TestFixedSizeCrop:
     def test__get_params(self, mocker):
         crop_size = (7, 7)
         batch_shape = (10,)
-        image_size = (11, 5)
+        spatial_size = (11, 5)

         transform = transforms.FixedSizeCrop(size=crop_size)

         sample = dict(
-            image=make_image(size=image_size, color_space=features.ColorSpace.RGB),
+            image=make_image(size=spatial_size, color_space=features.ColorSpace.RGB),
             bounding_boxes=make_bounding_box(
-                format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape
+                format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=batch_shape
             ),
         )
         params = transform._get_params(sample)
@@ -1638,7 +1638,7 @@ def test__transform(self, mocker, needs):

     def test__transform_culling(self, mocker):
         batch_size = 10
-        image_size = (10, 10)
+        spatial_size = (10, 10)

         is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool)
         mocker.patch(
@@ -1647,17 +1647,17 @@ def test__transform_culling(self, mocker):
                 needs_crop=True,
                 top=0,
                 left=0,
-                height=image_size[0],
-                width=image_size[1],
+                height=spatial_size[0],
+                width=spatial_size[1],
                 is_valid=is_valid,
                 needs_pad=False,
             ),
         )

         bounding_boxes = make_bounding_box(
-            format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,)
+            format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
         )
-        masks = make_detection_mask(size=image_size, extra_dims=(batch_size,))
+        masks = make_detection_mask(size=spatial_size, extra_dims=(batch_size,))
         labels = make_label(extra_dims=(batch_size,))

         transform = transforms.FixedSizeCrop((-1, -1))
@@ -1678,7 +1678,7 @@ def test__transform_culling(self, mocker):

     def test__transform_bounding_box_clamping(self, mocker):
         batch_size = 3
-        image_size = (10, 10)
+        spatial_size = (10, 10)

         mocker.patch(
             "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params",
@@ -1686,15 +1686,15 @@ def test__transform_bounding_box_clamping(self, mocker):
                 needs_crop=True,
                 top=0,
                 left=0,
-                height=image_size[0],
-                width=image_size[1],
+                height=spatial_size[0],
+                width=spatial_size[1],
                 is_valid=torch.full((batch_size,), fill_value=True),
                 needs_pad=False,
             ),
         )

         bounding_box = make_bounding_box(
-            format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,)
+            format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, extra_dims=(batch_size,)
         )

         mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box")
diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py
index c8debe1e293..8d5d05dc142 100644
--- a/test/test_prototype_transforms_consistency.py
+++ b/test/test_prototype_transforms_consistency.py
@@ -871,7 +871,7 @@ def make_datapoints(self, with_mask=True):
             pil_image = to_image_pil(make_image(size=size, color_space=features.ColorSpace.RGB))
             target = {
-                "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
+                "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
                 "labels": make_label(extra_dims=(num_objects,), categories=80),
             }
             if with_mask:
@@ -881,7 +881,7 @@ def make_datapoints(self, with_mask=True):
             tensor_image = torch.Tensor(make_image(size=size, color_space=features.ColorSpace.RGB))
             target = {
-                "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
+                "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
                 "labels": make_label(extra_dims=(num_objects,), categories=80),
             }
             if with_mask:
@@ -891,7 +891,7 @@ def make_datapoints(self, with_mask=True):
             feature_image = make_image(size=size, color_space=features.ColorSpace.RGB)
             target = {
-                "boxes": make_bounding_box(image_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
+                "boxes": make_bounding_box(spatial_size=size, format="XYXY", extra_dims=(num_objects,), dtype=torch.float),
                 "labels": make_label(extra_dims=(num_objects,), categories=80),
             }
             if with_mask:
diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py
index b1b06b6288b..be34547b618 100644
--- a/test/test_prototype_transforms_functional.py
+++ b/test/test_prototype_transforms_functional.py
@@ -333,16 +333,16 @@ def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_):
 @pytest.mark.parametrize("device", cpu_and_gpu())
 def test_correctness_affine_bounding_box_on_fixed_input(device):
     # Check transformation against known expected output
-    image_size = (64, 64)
+    spatial_size = (64, 64)
     # xyxy format
     in_boxes = [
         [20, 25, 35, 45],
         [50, 5, 70, 22],
-        [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10],
+        [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10],
         [1, 1, 5, 5],
     ]
     in_boxes = features.BoundingBox(
-        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=image_size, dtype=torch.float64, device=device
+        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device
     )
     # Tested parameters
     angle = 63
@@ -355,9 +355,9 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
     # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox
     # expected_bboxes = []
     # for in_box in in_boxes:
-    #     n_in_box = normalize_bbox(in_box, *image_size)
-    #     n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size)
-    #     out_box = denormalize_bbox(n_out_box, *image_size)
+    #     n_in_box = normalize_bbox(in_box, *spatial_size)
+    #     n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *spatial_size)
+    #     out_box = denormalize_bbox(n_out_box, *spatial_size)
     #     expected_bboxes.append(out_box)
     expected_bboxes = [
         (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695),
@@ -371,7 +371,7 @@ def test_correctness_affine_bounding_box_on_fixed_input(device):
         in_boxes.format,
         in_boxes.spatial_size,
         angle,
-        (dx * image_size[1], dy * image_size[0]),
+        (dx * spatial_size[1], dy * spatial_size[0]),
         scale,
         shear=(0, 0),
     )
@@ -455,16 +455,16 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):
             (height, width),
         )

-    image_size = (32, 38)
+    spatial_size = (32, 38)

-    for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)):
+    for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)):
         bboxes_format = bboxes.format
-        bboxes_image_size = bboxes.spatial_size
+        bboxes_spatial_size = bboxes.spatial_size

-        output_bboxes, output_image_size = F.rotate_bounding_box(
+        output_bboxes, output_spatial_size = F.rotate_bounding_box(
             bboxes,
             bboxes_format,
-            image_size=bboxes_image_size,
+            spatial_size=bboxes_spatial_size,
             angle=angle,
             expand=expand,
             center=center,
@@ -472,38 +472,38 @@ def _compute_expected_bbox(bbox, angle_, expand_, center_):

         center_ = center
         if center_ is None:
-            center_ = [s * 0.5 for s in bboxes_image_size[::-1]]
+            center_ = [s * 0.5 for s in bboxes_spatial_size[::-1]]

         if bboxes.ndim < 2:
             bboxes = [bboxes]

         expected_bboxes = []
         for bbox in bboxes:
-            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_image_size)
-            expected_bbox, expected_image_size = _compute_expected_bbox(bbox, -angle, expand, center_)
+            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
+            expected_bbox, expected_spatial_size = _compute_expected_bbox(bbox, -angle, expand, center_)
             expected_bboxes.append(expected_bbox)
         if len(expected_bboxes) > 1:
             expected_bboxes = torch.stack(expected_bboxes)
         else:
             expected_bboxes = expected_bboxes[0]
         torch.testing.assert_close(output_bboxes, expected_bboxes, atol=1, rtol=0)
-        torch.testing.assert_close(output_image_size, expected_image_size, atol=1, rtol=0)
+        torch.testing.assert_close(output_spatial_size, expected_spatial_size, atol=1, rtol=0)


 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize("expand", [False])  # expand=True does not match D2
 def test_correctness_rotate_bounding_box_on_fixed_input(device, expand):
     # Check transformation against known expected output
-    image_size = (64, 64)
+    spatial_size = (64, 64)
     # xyxy format
     in_boxes = [
         [1, 1, 5, 5],
-        [1, image_size[0] - 6, 5, image_size[0] - 2],
-        [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2],
-        [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10],
+        [1, spatial_size[0] - 6, 5, spatial_size[0] - 2],
+        [spatial_size[1] - 6, spatial_size[0] - 6, spatial_size[1] - 2, spatial_size[0] - 2],
+        [spatial_size[1] // 2 - 10, spatial_size[0] // 2 - 10, spatial_size[1] // 2 + 10, spatial_size[0] // 2 + 10],
     ]
     in_boxes = features.BoundingBox(
-        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=image_size, dtype=torch.float64, device=device
+        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, dtype=torch.float64, device=device
     )
     # Tested parameters
     angle = 45
@@ -597,7 +597,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
     if format != features.BoundingBoxFormat.XYXY:
         in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)

-    output_boxes, output_image_size = F.crop_bounding_box(
+    output_boxes, output_spatial_size = F.crop_bounding_box(
         in_boxes,
         format,
         top,
@@ -610,7 +610,7 @@ def test_correctness_crop_bounding_box(device, format, top, left, height, width,
         output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)

     torch.testing.assert_close(output_boxes.tolist(), expected_bboxes)
-    torch.testing.assert_close(output_image_size, size)
+    torch.testing.assert_close(output_spatial_size, size)


 @pytest.mark.parametrize("device", cpu_and_gpu())
@@ -658,7 +658,7 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
         bbox[3] = (bbox[3] - top_) * size_[0] / height_
         return bbox

-    image_size = (100, 100)
+    spatial_size = (100, 100)
     # xyxy format
     in_boxes = [
         [10.0, 10.0, 20.0, 20.0],
@@ -670,18 +670,18 @@ def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
     expected_bboxes = torch.tensor(expected_bboxes, device=device)

     in_boxes = features.BoundingBox(
-        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=image_size, device=device
+        in_boxes, format=features.BoundingBoxFormat.XYXY, spatial_size=spatial_size, device=device
     )
     if format != features.BoundingBoxFormat.XYXY:
         in_boxes = convert_format_bounding_box(in_boxes, features.BoundingBoxFormat.XYXY, format)

-    output_boxes, output_image_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size)
+    output_boxes, output_spatial_size = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size)

     if format != features.BoundingBoxFormat.XYXY:
         output_boxes = convert_format_bounding_box(output_boxes, format, features.BoundingBoxFormat.XYXY)

     torch.testing.assert_close(output_boxes, expected_bboxes)
-    torch.testing.assert_close(output_image_size, size)
+    torch.testing.assert_close(output_spatial_size, size)


 def _parse_padding(padding):
@@ -718,7 +718,7 @@ def _compute_expected_bbox(bbox, padding_):
         bbox = bbox.to(bbox_dtype)
         return bbox

-    def _compute_expected_image_size(bbox, padding_):
+    def _compute_expected_spatial_size(bbox, padding_):
         pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_)
         height, width = bbox.spatial_size
         return height + pad_up + pad_down, width + pad_left + pad_right
@@ -726,20 +726,20 @@ def _compute_expected_image_size(bbox, padding_):
     for bboxes in make_bounding_boxes():
         bboxes = bboxes.to(device)
         bboxes_format = bboxes.format
-        bboxes_image_size = bboxes.spatial_size
+        bboxes_spatial_size = bboxes.spatial_size

-        output_boxes, output_image_size = F.pad_bounding_box(
-            bboxes, format=bboxes_format, image_size=bboxes_image_size, padding=padding
+        output_boxes, output_spatial_size = F.pad_bounding_box(
+            bboxes, format=bboxes_format, spatial_size=bboxes_spatial_size, padding=padding
         )

-        torch.testing.assert_close(output_image_size, _compute_expected_image_size(bboxes, padding))
+        torch.testing.assert_close(output_spatial_size, _compute_expected_spatial_size(bboxes, padding))

         if bboxes.ndim < 2 or bboxes.shape[0] == 0:
             bboxes = [bboxes]

         expected_bboxes = []
         for bbox in bboxes:
-            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_image_size)
+            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
             expected_bboxes.append(_compute_expected_bbox(bbox, padding))

         if len(expected_bboxes) > 1:
@@ -815,15 +815,15 @@ def _compute_expected_bbox(bbox, pcoeffs_):
             out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
         )

-    image_size = (32, 38)
+    spatial_size = (32, 38)

     pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
     inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)

-    for bboxes in make_bounding_boxes(image_size=image_size, extra_dims=((4,),)):
+    for bboxes in make_bounding_boxes(spatial_size=spatial_size, extra_dims=((4,),)):
         bboxes = bboxes.to(device)
         bboxes_format = bboxes.format
-        bboxes_image_size = bboxes.spatial_size
+        bboxes_spatial_size = bboxes.spatial_size

         output_bboxes = F.perspective_bounding_box(
             bboxes,
@@ -836,7 +836,7 @@ def _compute_expected_bbox(bbox, pcoeffs_):

         expected_bboxes = []
         for bbox in bboxes:
-            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_image_size)
+            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
             expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs))
         if len(expected_bboxes) > 1:
             expected_bboxes = torch.stack(expected_bboxes)
@@ -853,14 +853,14 @@ def _compute_expected_bbox(bbox, pcoeffs_):
 def test_correctness_center_crop_bounding_box(device, output_size):
     def _compute_expected_bbox(bbox, output_size_):
         format_ = bbox.format
-        image_size_ = bbox.spatial_size
+        spatial_size_ = bbox.spatial_size
         bbox = convert_format_bounding_box(bbox, format_, features.BoundingBoxFormat.XYWH)

         if len(output_size_) == 1:
             output_size_.append(output_size_[-1])

-        cy = int(round((image_size_[0] - output_size_[0]) * 0.5))
-        cx = int(round((image_size_[1] - output_size_[1]) * 0.5))
+        cy = int(round((spatial_size_[0] - output_size_[0]) * 0.5))
+        cx = int(round((spatial_size_[1] - output_size_[1]) * 0.5))
         out_bbox = [
             bbox[0].item() - cx,
             bbox[1].item() - cy,
@@ -879,10 +879,10 @@ def _compute_expected_bbox(bbox, output_size_):
     for bboxes in make_bounding_boxes(extra_dims=((4,),)):
         bboxes = bboxes.to(device)
         bboxes_format = bboxes.format
-        bboxes_image_size = bboxes.spatial_size
+        bboxes_spatial_size = bboxes.spatial_size

-        output_boxes, output_image_size = F.center_crop_bounding_box(
-            bboxes, bboxes_format, bboxes_image_size, output_size
+        output_boxes, output_spatial_size = F.center_crop_bounding_box(
+            bboxes, bboxes_format, bboxes_spatial_size, output_size
         )

         if bboxes.ndim < 2:
@@ -890,7 +890,7 @@ def _compute_expected_bbox(bbox, output_size_):

         expected_bboxes = []
         for bbox in bboxes:
-            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_image_size)
+            bbox = features.BoundingBox(bbox, format=bboxes_format, spatial_size=bboxes_spatial_size)
             expected_bboxes.append(_compute_expected_bbox(bbox, output_size))

         if len(expected_bboxes) > 1:
@@ -898,7 +898,7 @@ def _compute_expected_bbox(bbox, output_size_):
         else:
             expected_bboxes = expected_bboxes[0]
         torch.testing.assert_close(output_boxes, expected_bboxes)
-        torch.testing.assert_close(output_image_size, output_size)
+        torch.testing.assert_close(output_spatial_size, output_size)


 @pytest.mark.parametrize("device", cpu_and_gpu())
@@ -926,11 +926,11 @@ def _compute_expected_mask(mask, output_size):

 # Copied from test/test_functional_tensor.py
 @pytest.mark.parametrize("device", cpu_and_gpu())
-@pytest.mark.parametrize("image_size", ("small", "large"))
+@pytest.mark.parametrize("spatial_size", ("small", "large"))
 @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16])
 @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)])
 @pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)])
-def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma):
+def test_correctness_gaussian_blur_image_tensor(device, spatial_size, dt, ksize, sigma):
     fn = F.gaussian_blur_image_tensor

     # true_cv2_results = {
@@ -950,7 +950,7 @@ def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, s
     p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt")
     true_cv2_results = torch.load(p)

-    if image_size == "small":
+    if spatial_size == "small":
         tensor = (
             torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device)
         )
diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py
index 5301559999a..3d5960c9625 100644
--- a/test/test_prototype_transforms_utils.py
+++ b/test/test_prototype_transforms_utils.py
@@ -11,7 +11,7 @@

 IMAGE = make_image(color_space=features.ColorSpace.RGB)
-BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.spatial_size)
+BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, spatial_size=IMAGE.spatial_size)
 MASK = make_detection_mask(size=IMAGE.spatial_size)
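For reference, a minimal usage sketch (not part of the diff) of the renamed test helpers after this change. It assumes the helpers are importable from `test/prototype_common_utils.py` (i.e. the `test/` directory is on `sys.path`); the concrete sizes are illustrative only.

```python
# Illustrative sketch only: exercises the renamed `spatial_size` keyword
# (formerly `image_size`) of the test helpers touched by this diff.
from prototype_common_utils import make_bounding_box, make_image  # assumes test/ is on sys.path
from torchvision.prototype import features

spatial_size = (32, 24)  # (height, width)

# Image helper still takes `size`; its spatial metadata is exposed as `.spatial_size`.
image = make_image(size=spatial_size, color_space=features.ColorSpace.RGB)

# Bounding box helper now takes `spatial_size=` instead of `image_size=`.
boxes = make_bounding_box(
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=spatial_size,
    extra_dims=(6,),
)

# Both features carry the same spatial metadata after the rename.
assert tuple(image.spatial_size) == tuple(boxes.spatial_size) == spatial_size
```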