diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 908408bd39cce..60c2993219d9f 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -11,19 +11,71 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import math import numbers import random import traceback from collections.abc import Iterable, Sequence +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Literal, + Protocol, + TypeVar, + overload, +) import numpy as np +from typing_extensions import TypeAlias import paddle from . import functional as F +if TYPE_CHECKING: + import numpy.typing as npt + from PIL.Image import Image as PILImage + + from paddle import Tensor + from paddle._typing import DataLayoutImage, Size2, Size3, Size4 + + _TransformInputKeys: TypeAlias = Sequence[ + Literal["image", "coords", "boxes", "mask"] + ] + _InterpolationPil: TypeAlias = Literal[ + "nearest", "bilinear", "bicubic", "lanczos", "hamming" + ] + _InterpolationCv2: TypeAlias = Literal[ + "nearest", "bilinear", "area", "bicubic", "lanczos" + ] + _PaddingMode: TypeAlias = Literal[ + "constant", "edge", "reflect", "symmetric" + ] + +_InputT = TypeVar( + "_InputT", "Tensor", "PILImage", "npt.NDArray[Any]", contravariant=True +) +_RetT = TypeVar( + "_RetT", "Tensor", "PILImage", "npt.NDArray[Any]", covariant=True +) + + +class _Transform(Protocol, Generic[_InputT, _RetT]): + @overload + def __call__(self, data: _InputT) -> _RetT: + ... + + @overload + def __call__(self, data: tuple[_InputT, ...]) -> tuple[_RetT, ...]: + ... + + def __call__(self, data) -> Any: + ... + + __all__ = [] @@ -69,7 +121,7 @@ def _check_input( return value -class Compose: +class Compose(_Transform[_InputT, _RetT]): """ Composes several transforms together use for composing list of transforms together for a dataset transform. @@ -97,10 +149,20 @@ class Compose: (811, 608) [1] """ - def __init__(self, transforms): + transforms: Sequence[_Transform[Any, Any]] + + def __init__(self, transforms: Sequence[_Transform[Any, Any]]) -> None: self.transforms = transforms - def __call__(self, data): + @overload + def __call__(self, data: _InputT) -> _RetT: + ... + + @overload + def __call__(self, data: tuple[_InputT, ...]) -> tuple[_RetT, ...]: + ... + + def __call__(self, data) -> Any: for f in self.transforms: try: data = f(data) @@ -113,7 +175,7 @@ def __call__(self, data): raise e return data - def __repr__(self): + def __repr__(self) -> str: format_string = self.__class__.__name__ + '(' for t in self.transforms: format_string += '\n' @@ -122,7 +184,7 @@ def __repr__(self): return format_string -class BaseTransform: +class BaseTransform(_Transform[_InputT, _RetT]): """ Base class of all transforms used in computer vision. @@ -229,7 +291,10 @@ class BaseTransform: """ - def __init__(self, keys=None): + keys: _TransformInputKeys + params: Any + + def __init__(self, keys: _TransformInputKeys | None = None) -> None: if keys is None: keys = ("image",) elif not isinstance(keys, Sequence): @@ -245,7 +310,15 @@ def __init__(self, keys=None): def _get_params(self, inputs): pass - def __call__(self, inputs): + @overload + def __call__(self, inputs: _InputT) -> _RetT: + ... + + @overload + def __call__(self, inputs: tuple[_InputT, ...]) -> tuple[_RetT, ...]: + ... + + def __call__(self, inputs) -> Any: """Apply transform on single input data""" if not isinstance(inputs, tuple): inputs = (inputs,) @@ -280,8 +353,11 @@ def _apply_boxes(self, boxes): def _apply_mask(self, mask): raise NotImplementedError + def _apply_coords(self, coords): + raise NotImplementedError + -class ToTensor(BaseTransform): +class ToTensor(BaseTransform[_InputT, "Tensor"]): """Convert a ``PIL.Image`` or ``numpy.ndarray`` to ``paddle.Tensor``. Converts a PIL.Image or numpy.ndarray (H x W x C) to a paddle.Tensor of shape (C x H x W). @@ -328,7 +404,13 @@ class ToTensor(BaseTransform): paddle.float32 """ - def __init__(self, data_format='CHW', keys=None): + data_format: DataLayoutImage + + def __init__( + self, + data_format: DataLayoutImage = 'CHW', + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) self.data_format = data_format @@ -343,7 +425,7 @@ def _apply_image(self, img): return F.to_tensor(img, self.data_format) -class Resize(BaseTransform): +class Resize(BaseTransform[_InputT, _RetT]): """Resize the input Image to the given size. Args: @@ -394,7 +476,15 @@ class Resize(BaseTransform): (150, 200) """ - def __init__(self, size, interpolation='bilinear', keys=None): + size: Size2 + interpolation: _InterpolationPil | _InterpolationCv2 + + def __init__( + self, + size: Size2, + interpolation: _InterpolationPil | _InterpolationCv2 = 'bilinear', + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) assert isinstance(size, int) or ( isinstance(size, Iterable) and len(size) == 2 @@ -406,7 +496,7 @@ def _apply_image(self, img): return F.resize(img, self.size, self.interpolation) -class RandomResizedCrop(BaseTransform): +class RandomResizedCrop(BaseTransform[_InputT, _RetT]): """Crop the input data to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. @@ -456,14 +546,19 @@ class RandomResizedCrop(BaseTransform): """ + size: Size2 + scale: Sequence[float] + ratio: Sequence[float] + interpolation: _InterpolationPil | _InterpolationCv2 + def __init__( self, - size, - scale=(0.08, 1.0), - ratio=(3.0 / 4, 4.0 / 3), - interpolation='bilinear', - keys=None, - ): + size: Size2, + scale: Sequence[float] = (0.08, 1.0), + ratio: Sequence[float] = (3.0 / 4, 4.0 / 3), + interpolation: _InterpolationPil | _InterpolationCv2 = 'bilinear', + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) if isinstance(size, int): self.size = (size, size) @@ -612,7 +707,7 @@ def _apply_image(self, img): return F.resize(cropped_img, self.size, self.interpolation) -class CenterCrop(BaseTransform): +class CenterCrop(BaseTransform[_InputT, _RetT]): """Crops the given the input data at the center. Args: @@ -642,7 +737,11 @@ class CenterCrop(BaseTransform): """ - def __init__(self, size, keys=None): + size: Size2 + + def __init__( + self, size: Size2, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) if isinstance(size, numbers.Number): self.size = (int(size), int(size)) @@ -653,7 +752,7 @@ def _apply_image(self, img): return F.center_crop(img, self.size) -class RandomHorizontalFlip(BaseTransform): +class RandomHorizontalFlip(BaseTransform[_InputT, _RetT]): """Horizontally flip the input data randomly with a given probability. Args: @@ -688,7 +787,11 @@ class RandomHorizontalFlip(BaseTransform): """ - def __init__(self, prob=0.5, keys=None): + prob: float + + def __init__( + self, prob: float = 0.5, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) assert 0 <= prob <= 1, "probability must be between 0 and 1" self.prob = prob @@ -712,7 +815,7 @@ def _static_apply_image(self, img): ) -class RandomVerticalFlip(BaseTransform): +class RandomVerticalFlip(BaseTransform[_InputT, _RetT]): """Vertically flip the input data randomly with a given probability. Args: @@ -747,7 +850,11 @@ class RandomVerticalFlip(BaseTransform): """ - def __init__(self, prob=0.5, keys=None): + prob: float + + def __init__( + self, prob: float = 0.5, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) assert 0 <= prob <= 1, "probability must be between 0 and 1" self.prob = prob @@ -771,7 +878,7 @@ def _static_apply_image(self, img): ) -class Normalize(BaseTransform): +class Normalize(BaseTransform[_InputT, _RetT]): """Normalize the input data with mean and standard deviation. Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform will normalize each channel of the input data. @@ -814,9 +921,19 @@ class Normalize(BaseTransform): """ + mean: Sequence[float] + std: Sequence[float] + data_format: DataLayoutImage + to_rgb: bool + def __init__( - self, mean=0.0, std=1.0, data_format='CHW', to_rgb=False, keys=None - ): + self, + mean: float | Sequence[float] = 0.0, + std: float | Sequence[float] = 1.0, + data_format: DataLayoutImage = 'CHW', + to_rgb: bool = False, + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) if isinstance(mean, numbers.Number): mean = [mean, mean, mean] @@ -835,7 +952,7 @@ def _apply_image(self, img): ) -class Transpose(BaseTransform): +class Transpose(BaseTransform[_InputT, _RetT]): """Transpose input data to a target format. For example, most transforms use HWC mode image, while the Neural Network might use CHW mode input tensor. @@ -869,7 +986,13 @@ class Transpose(BaseTransform): """ - def __init__(self, order=(2, 0, 1), keys=None): + order: Sequence[int] + + def __init__( + self, + order: Sequence[int] = (2, 0, 1), + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) self.order = order @@ -885,7 +1008,7 @@ def _apply_image(self, img): return img.transpose(self.order) -class BrightnessTransform(BaseTransform): +class BrightnessTransform(BaseTransform[_InputT, _RetT]): """Adjust brightness of the image. Args: @@ -920,7 +1043,11 @@ class BrightnessTransform(BaseTransform): """ - def __init__(self, value, keys=None): + value: float + + def __init__( + self, value: float, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) self.value = _check_input(value, 'brightness') @@ -932,7 +1059,7 @@ def _apply_image(self, img): return F.adjust_brightness(img, brightness_factor) -class ContrastTransform(BaseTransform): +class ContrastTransform(BaseTransform[_InputT, _RetT]): """Adjust contrast of the image. Args: @@ -963,7 +1090,11 @@ class ContrastTransform(BaseTransform): """ - def __init__(self, value, keys=None): + value: float + + def __init__( + self, value: float, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) if value < 0: raise ValueError("contrast value should be non-negative") @@ -977,7 +1108,7 @@ def _apply_image(self, img): return F.adjust_contrast(img, contrast_factor) -class SaturationTransform(BaseTransform): +class SaturationTransform(BaseTransform[_InputT, _RetT]): """Adjust saturation of the image. Args: @@ -1007,7 +1138,11 @@ class SaturationTransform(BaseTransform): (224, 224) """ - def __init__(self, value, keys=None): + value: float + + def __init__( + self, value: float, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) self.value = _check_input(value, 'saturation') @@ -1019,7 +1154,7 @@ def _apply_image(self, img): return F.adjust_saturation(img, saturation_factor) -class HueTransform(BaseTransform): +class HueTransform(BaseTransform[_InputT, _RetT]): """Adjust hue of the image. Args: @@ -1050,7 +1185,11 @@ class HueTransform(BaseTransform): """ - def __init__(self, value, keys=None): + value: float + + def __init__( + self, value: float, keys: _TransformInputKeys | None = None + ) -> None: super().__init__(keys) self.value = _check_input( value, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False @@ -1064,7 +1203,7 @@ def _apply_image(self, img): return F.adjust_hue(img, hue_factor) -class ColorJitter(BaseTransform): +class ColorJitter(BaseTransform[_InputT, _RetT]): """Randomly change the brightness, contrast, saturation and hue of an image. Args: @@ -1101,9 +1240,19 @@ class ColorJitter(BaseTransform): """ + brightness: float + contrast: float + saturation: float + hue: float + def __init__( - self, brightness=0, contrast=0, saturation=0, hue=0, keys=None - ): + self, + brightness: float = 0, + contrast: float = 0, + saturation: float = 0, + hue: float = 0, + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) self.brightness = brightness self.contrast = contrast @@ -1152,7 +1301,7 @@ def _apply_image(self, img): return transform(img) -class RandomCrop(BaseTransform): +class RandomCrop(BaseTransform[_InputT, _RetT]): """Crops the given CV Image at a random location. Args: @@ -1209,15 +1358,21 @@ class RandomCrop(BaseTransform): [3, 224, 224] """ + size: Size2 + padding: Size2 | Size4 | None + pad_if_needed: bool + fill: Size3 + padding_mode: _PaddingMode + def __init__( self, - size, - padding=None, - pad_if_needed=False, - fill=0, - padding_mode='constant', - keys=None, - ): + size: Size2, + padding: Size2 | Size4 | None = None, + pad_if_needed: bool = False, + fill: Size3 = 0, + padding_mode: _PaddingMode = 'constant', + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) if isinstance(size, numbers.Number): self.size = (int(size), int(size)) @@ -1280,7 +1435,7 @@ def _apply_image(self, img): return F.crop(img, i, j, h, w) -class Pad(BaseTransform): +class Pad(BaseTransform[_InputT, _RetT]): """Pads the given CV Image on all sides with the given "pad" value. Args: @@ -1325,7 +1480,17 @@ class Pad(BaseTransform): (228, 228) """ - def __init__(self, padding, fill=0, padding_mode='constant', keys=None): + padding: Size2 | Size4 + fill: Size3 + padding_mode: _PaddingMode + + def __init__( + self, + padding: Size2 | Size4, + fill: Size3 = 0, + padding_mode: _PaddingMode = 'constant', + keys: _TransformInputKeys | None = None, + ) -> None: assert isinstance(padding, (numbers.Number, list, tuple)) assert isinstance(fill, (numbers.Number, str, list, tuple)) assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] @@ -1382,7 +1547,7 @@ def _setup_angle(x, name, req_sizes=(2,)): return [float(d) for d in x] -class RandomAffine(BaseTransform): +class RandomAffine(BaseTransform[_InputT, _RetT]): """Random affine transformation of the image. Args: @@ -1439,17 +1604,25 @@ class RandomAffine(BaseTransform): [3, 256, 300] """ + degrees: Sequence[float] + translate: tuple[float, float] | None + scale: tuple[float, float] | None + shear: float | Sequence[float] | None + interpolation: _InterpolationPil | _InterpolationCv2 + fill: Size3 + center: tuple[float, float] + def __init__( self, - degrees, - translate=None, - scale=None, - shear=None, - interpolation='nearest', - fill=0, - center=None, - keys=None, - ): + degrees: float | Sequence[float], + translate: tuple[float, float] | None = None, + scale: tuple[float, float] | None = None, + shear: float | Sequence[float] | None = None, + interpolation: _InterpolationPil | _InterpolationCv2 = 'nearest', + fill: Size3 = 0, + center: tuple[float, float] = None, + keys: _TransformInputKeys | None = None, + ) -> None: self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) super().__init__(keys) @@ -1545,7 +1718,7 @@ def _apply_image(self, img): ) -class RandomRotation(BaseTransform): +class RandomRotation(BaseTransform[_InputT, _RetT]): """Rotates the image by angle. Args: @@ -1593,15 +1766,21 @@ class RandomRotation(BaseTransform): (150, 200) """ + degrees: Sequence[float] + interpolation: _InterpolationPil | _InterpolationCv2 + expand: bool + center: tuple[float, float] + fill: Size3 + def __init__( self, - degrees, - interpolation='nearest', - expand=False, - center=None, - fill=0, - keys=None, - ): + degrees: float | Sequence[float], + interpolation: _InterpolationPil | _InterpolationCv2 = 'nearest', + expand: bool = False, + center: tuple[float, float] = None, + fill: Size3 = 0, + keys: _TransformInputKeys | None = None, + ) -> None: if isinstance(degrees, numbers.Number): if degrees < 0: raise ValueError( @@ -1647,7 +1826,7 @@ def _apply_image(self, img): ) -class RandomPerspective(BaseTransform): +class RandomPerspective(BaseTransform[_InputT, _RetT]): """Random perspective transformation with a given probability. Args: @@ -1691,14 +1870,19 @@ class RandomPerspective(BaseTransform): [3, 200, 150] """ + prob: float + distortion_scale: float + interpolation: _InterpolationPil | _InterpolationCv2 + fill: Size3 + def __init__( self, - prob=0.5, - distortion_scale=0.5, - interpolation='nearest', - fill=0, - keys=None, - ): + prob: float = 0.5, + distortion_scale: float = 0.5, + interpolation: _InterpolationPil | _InterpolationCv2 = 'nearest', + fill: Size3 = 0, + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) assert 0 <= prob <= 1, "probability must be between 0 and 1" assert ( @@ -1712,7 +1896,9 @@ def __init__( self.interpolation = interpolation self.fill = fill - def get_params(self, width, height, distortion_scale): + def get_params( + self, width: int, height: int, distortion_scale: float + ) -> tuple[list[list[int]], list[list[int]]]: """ Returns: startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, @@ -1783,7 +1969,7 @@ def _apply_image(self, img): return img -class Grayscale(BaseTransform): +class Grayscale(BaseTransform[_InputT, _RetT]): """Converts image to grayscale. Args: @@ -1814,7 +2000,13 @@ class Grayscale(BaseTransform): (224, 224) """ - def __init__(self, num_output_channels=1, keys=None): + num_output_channels: int + + def __init__( + self, + num_output_channels: int = 1, + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) self.num_output_channels = num_output_channels @@ -1829,7 +2021,7 @@ def _apply_image(self, img): return F.to_grayscale(img, self.num_output_channels) -class RandomErasing(BaseTransform): +class RandomErasing(BaseTransform[_InputT, _RetT]): """Erase the pixels in a rectangle region selected randomly. Args: @@ -1873,15 +2065,21 @@ class RandomErasing(BaseTransform): """ + prob: float + scale: Sequence[float] + ratio: Sequence[float] + value: int | float | Sequence[float] | str + inplace: bool + def __init__( self, - prob=0.5, - scale=(0.02, 0.33), - ratio=(0.3, 3.3), - value=0, - inplace=False, - keys=None, - ): + prob: float = 0.5, + scale: Sequence[float] = (0.02, 0.33), + ratio: Sequence[float] = (0.3, 3.3), + value: float | Sequence[float] | str = 0, + inplace: bool = False, + keys: _TransformInputKeys | None = None, + ) -> None: super().__init__(keys) assert isinstance( scale, (tuple, list)