Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[prototype] Restore BC on perspective #6902

Merged
merged 9 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions test/prototype_transforms_kernel_infos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,38 +1184,38 @@ def reference_inputs_pad_bounding_box():
def sample_inputs_perspective_image_tensor():
    """Yield sample calls for the perspective image-tensor kernel.

    Exercises every fill flavor (None, scalar float/int, single-element list,
    per-channel list) against the first fixed set of perspective coefficients.
    """
    for image_loader in make_image_loaders(sizes=["random"]):
        for fill in [None, 128.0, 128, [12.0], [12.0 + c for c in range(image_loader.num_channels)]]:
            # startpoints/endpoints are passed positionally as None because the
            # precomputed coefficients are supplied directly.
            yield ArgsKwargs(image_loader, None, None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: While correct, can we pass keywords here to make this call more readable?

Suggested change
yield ArgsKwargs(image_loader, None, None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0])
yield ArgsKwargs(image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to make this change on the tests. I wouldn't want to make it on the main code because the arguments are positional not keyword args. Any concerns on that?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nono, fine with that. The test definition is just hard to read without knowledge of the exact signature of the kernel.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something, somewhere in the testing framework seems to send multiple parameters to the kernel. I don't want to waste too much time on this, I've reverted the named params change and we can do the optimization on a follow up as it doesn't affect the main implementation or the quality of the tests.



def reference_inputs_perspective_image_tensor():
    """Yield reference calls comparing the tensor kernel against the PIL kernel."""
    for image_loader, coefficients in itertools.product(make_image_loaders(extra_dims=[()]), _PERSPECTIVE_COEFFS):
        # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it?
        for fill in [None, 128.0, 128, [12.0 + c for c in range(image_loader.num_channels)]]:
            # startpoints/endpoints are None because coefficients are given directly.
            yield ArgsKwargs(image_loader, None, None, fill=fill, coefficients=coefficients)


def sample_inputs_perspective_bounding_box():
    """Yield sample calls for the perspective bounding-box kernel."""
    for bounding_box_loader in make_bounding_box_loaders():
        # Positional order follows the kernel signature:
        # (bounding_box, format, startpoints, endpoints, *, coefficients).
        yield ArgsKwargs(
            bounding_box_loader, bounding_box_loader.format, None, None, coefficients=_PERSPECTIVE_COEFFS[0]
        )


def sample_inputs_perspective_mask():
    """Yield sample calls for the perspective mask kernel."""
    for mask_loader in make_mask_loaders(sizes=["random"]):
        # startpoints/endpoints are None because coefficients are given directly.
        yield ArgsKwargs(mask_loader, None, None, coefficients=_PERSPECTIVE_COEFFS[0])


def reference_inputs_perspective_mask():
    """Yield reference calls for the perspective mask kernel over all coefficient sets."""
    for mask_loader, perspective_coeffs in itertools.product(
        make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS
    ):
        yield ArgsKwargs(mask_loader, None, None, coefficients=perspective_coeffs)


def sample_inputs_perspective_video():
    """Yield sample calls for the perspective video kernel."""
    for video_loader in make_video_loaders(sizes=["random"], num_frames=["random"]):
        # startpoints/endpoints are None because coefficients are given directly.
        yield ArgsKwargs(video_loader, None, None, coefficients=_PERSPECTIVE_COEFFS[0])


KERNEL_INFOS.extend(
Expand Down
6 changes: 3 additions & 3 deletions test/test_prototype_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,8 +917,8 @@ def test__get_params(self, mocker):
params = transform._get_params([image])

h, w = image.spatial_size
assert "perspective_coeffs" in params
assert len(params["perspective_coeffs"]) == 8
assert "coefficients" in params
assert len(params["coefficients"]) == 8

@pytest.mark.parametrize("distortion_scale", [0.1, 0.7])
def test__transform(self, distortion_scale, mocker):
Expand All @@ -940,7 +940,7 @@ def test__transform(self, distortion_scale, mocker):
params = transform._get_params([inpt])

fill = transforms._utils._convert_fill_arg(fill)
fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation)
fn.assert_called_once_with(inpt, None, None, **params, fill=fill, interpolation=interpolation)


class TestElasticTransform:
Expand Down
4 changes: 3 additions & 1 deletion test/test_prototype_transforms_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,7 +874,9 @@ def _compute_expected_bbox(bbox, pcoeffs_):
output_bboxes = F.perspective_bounding_box(
bboxes,
bboxes_format,
perspective_coeffs=pcoeffs,
None,
None,
coefficients=pcoeffs,
)

if bboxes.ndim < 2:
Expand Down
8 changes: 6 additions & 2 deletions torchvision/prototype/features/_bounding_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,15 @@ def affine(

def perspective(
    self,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> BoundingBox:
    """Apply a perspective transform to this bounding box and re-wrap the result.

    ``interpolation`` and ``fill`` are accepted for interface parity with the
    other features but are not forwarded — the box kernel does not resample pixels.
    """
    # The kernel signature is (bounding_box, format, startpoints, endpoints, ...):
    # format must precede the point lists, otherwise arguments are misbound.
    output = self._F.perspective_bounding_box(
        self.as_subclass(torch.Tensor), self.format, startpoints, endpoints, coefficients=coefficients
    )
    return BoundingBox.wrap_like(self, output)

def elastic(
Expand Down
4 changes: 3 additions & 1 deletion torchvision/prototype/features/_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,11 @@ def affine(

def perspective(
    self,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> _Feature:
    # Base-class no-op: plain features pass through unchanged; subclasses
    # (Image, Video, Mask, BoundingBox) override with real dispatch.
    return self

Expand Down
11 changes: 9 additions & 2 deletions torchvision/prototype/features/_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,19 @@ def affine(

def perspective(
    self,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> Image:
    """Apply a perspective transform, dispatching to the image-tensor kernel."""
    output = self._F.perspective_image_tensor(
        self.as_subclass(torch.Tensor),
        startpoints,
        endpoints,
        interpolation=interpolation,
        fill=fill,
        coefficients=coefficients,
    )
    return Image.wrap_like(self, output)

Expand Down
8 changes: 6 additions & 2 deletions torchvision/prototype/features/_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,15 @@ def affine(

def perspective(
    self,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.NEAREST,
    fill: FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> Mask:
    """Apply a perspective transform to this mask.

    ``interpolation`` is accepted for interface parity but not forwarded — the
    mask kernel always uses nearest-neighbor resampling internally.
    """
    output = self._F.perspective_mask(
        self.as_subclass(torch.Tensor), startpoints, endpoints, fill=fill, coefficients=coefficients
    )
    return Mask.wrap_like(self, output)

def elastic(
Expand Down
11 changes: 9 additions & 2 deletions torchvision/prototype/features/_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,19 @@ def affine(

def perspective(
    self,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> Video:
    """Apply a perspective transform, dispatching to the video kernel."""
    output = self._F.perspective_video(
        self.as_subclass(torch.Tensor),
        startpoints,
        endpoints,
        interpolation=interpolation,
        fill=fill,
        coefficients=coefficients,
    )
    return Video.wrap_like(self, output)

Expand Down
6 changes: 4 additions & 2 deletions torchvision/prototype/transforms/_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,15 +524,17 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
endpoints = [topleft, topright, botright, botleft]
perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints)
return dict(perspective_coeffs=perspective_coeffs)
return dict(coefficients=perspective_coeffs)

def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
    """Apply the sampled perspective to one input.

    ``params`` carries the precomputed ``coefficients`` from ``_get_params``,
    so startpoints/endpoints are passed as None.
    """
    fill = self.fill[type(inpt)]
    return F.perspective(
        inpt,
        None,
        None,
        fill=fill,
        interpolation=self.interpolation,
        **params,
    )


Expand Down
64 changes: 50 additions & 14 deletions torchvision/prototype/transforms/functional/_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from torchvision.transforms.functional import (
_compute_resized_output_size as __compute_resized_output_size,
_get_inverse_affine_matrix,
_get_perspective_coeffs,
InterpolationMode,
pil_modes_mapping,
pil_to_tensor,
Expand Down Expand Up @@ -906,12 +907,30 @@ def crop(inpt: features.InputTypeJIT, top: int, left: int, height: int, width: i
return crop_image_pil(inpt, top, left, height, width)


def _perspective_coefficients(
startpoints: Optional[List[List[int]]],
endpoints: Optional[List[List[int]]],
coefficients: Optional[List[float]],
) -> List[float]:
if coefficients is not None:
if len(coefficients) != 8:
raise ValueError("Argument coefficients should have 8 float values")
return coefficients
elif startpoints is not None and endpoints is not None:
return _get_perspective_coeffs(startpoints, endpoints)
else:
raise ValueError("Either the startpoints/endpoints or the coefficients must have non `None` values.")
datumbox marked this conversation as resolved.
Show resolved Hide resolved


def perspective_image_tensor(
image: torch.Tensor,
perspective_coeffs: List[float],
startpoints: Optional[List[List[int]]],
endpoints: Optional[List[List[int]]],
interpolation: InterpolationMode = InterpolationMode.BILINEAR,
fill: features.FillTypeJIT = None,
coefficients: Optional[List[float]] = None,
) -> torch.Tensor:
perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
if image.numel() == 0:
return image

Expand All @@ -934,21 +953,24 @@ def perspective_image_tensor(
@torch.jit.unused
def perspective_image_pil(
    image: PIL.Image.Image,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BICUBIC,
    fill: features.FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> PIL.Image.Image:
    """PIL backend: resolve the coefficients, then delegate to the functional PIL kernel."""
    perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)
    return _FP.perspective(image, perspective_coeffs, interpolation=pil_modes_mapping[interpolation], fill=fill)


def perspective_bounding_box(
bounding_box: torch.Tensor,
format: features.BoundingBoxFormat,
perspective_coeffs: List[float],
startpoints: Optional[List[List[int]]],
endpoints: Optional[List[List[int]]],
coefficients: Optional[List[float]] = None,
) -> torch.Tensor:

if len(perspective_coeffs) != 8:
raise ValueError("Argument perspective_coeffs should have 8 float values")
perspective_coeffs = _perspective_coefficients(startpoints, endpoints, coefficients)

original_shape = bounding_box.shape
bounding_box = (
Expand Down Expand Up @@ -1029,8 +1051,10 @@ def perspective_bounding_box(

def perspective_mask(
mask: torch.Tensor,
perspective_coeffs: List[float],
startpoints: Optional[List[List[int]]],
endpoints: Optional[List[List[int]]],
fill: features.FillTypeJIT = None,
coefficients: Optional[List[float]] = None,
) -> torch.Tensor:
if mask.ndim < 3:
mask = mask.unsqueeze(0)
Expand All @@ -1039,7 +1063,7 @@ def perspective_mask(
needs_squeeze = False

output = perspective_image_tensor(
mask, perspective_coeffs=perspective_coeffs, interpolation=InterpolationMode.NEAREST, fill=fill
mask, startpoints, endpoints, interpolation=InterpolationMode.NEAREST, fill=fill, coefficients=coefficients
)

if needs_squeeze:
Expand All @@ -1050,25 +1074,37 @@ def perspective_mask(

def perspective_video(
    video: torch.Tensor,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: features.FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> torch.Tensor:
    """Apply a perspective transform to a video by reusing the image-tensor kernel."""
    return perspective_image_tensor(
        video, startpoints, endpoints, interpolation=interpolation, fill=fill, coefficients=coefficients
    )


def perspective(
    inpt: features.InputTypeJIT,
    startpoints: Optional[List[List[int]]],
    endpoints: Optional[List[List[int]]],
    interpolation: InterpolationMode = InterpolationMode.BILINEAR,
    fill: features.FillTypeJIT = None,
    coefficients: Optional[List[float]] = None,
) -> features.InputTypeJIT:
    """Dispatch a perspective transform to the tensor, feature, or PIL implementation."""
    if isinstance(inpt, torch.Tensor) and (torch.jit.is_scripting() or not isinstance(inpt, features._Feature)):
        # Plain tensors — and any tensor while scripting — use the tensor kernel directly.
        return perspective_image_tensor(
            inpt, startpoints, endpoints, interpolation=interpolation, fill=fill, coefficients=coefficients
        )
    elif isinstance(inpt, features._Feature):
        # Features dispatch through their own method so subclass-specific kernels run.
        return inpt.perspective(
            startpoints, endpoints, interpolation=interpolation, fill=fill, coefficients=coefficients
        )
    else:
        # Anything else is assumed to be a PIL image.
        return perspective_image_pil(
            inpt, startpoints, endpoints, interpolation=interpolation, fill=fill, coefficients=coefficients
        )


def elastic_image_tensor(
Expand Down