From cead4bf08f9920f48be07b4b85bd73105d149134 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 21 Feb 2023 11:20:33 +0100
Subject: [PATCH 01/27] add ffmpeg to Linux CPU and GPU unittest workflows
 (#7295)

---
 .github/workflows/test-linux-cpu.yml | 2 +-
 .github/workflows/test-linux-gpu.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml
index 19521cdd011..769ee4f841b 100644
--- a/.github/workflows/test-linux-cpu.yml
+++ b/.github/workflows/test-linux-cpu.yml
@@ -39,7 +39,7 @@ jobs:
           fi

           # Create Conda Env
-          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy
+          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3'
           conda activate /work/ci_env

           # Install PyTorch, Torchvision, and testing libraries
diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
index 831de27e350..95d06402db1 100644
--- a/.github/workflows/test-linux-gpu.yml
+++ b/.github/workflows/test-linux-gpu.yml
@@ -43,7 +43,7 @@ jobs:
           fi

           # Create Conda Env
-          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy
+          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3'
           conda activate /work/ci_env

           # Install PyTorch, Torchvision, and testing libraries

From df1f2d6ed3de69f3984797024f5caf8eb11a086b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 21 Feb 2023 11:02:29 +0000
Subject: [PATCH 02/27] Update transforms docs sub-structure (#7291)

Co-authored-by: Philip Meier
---
 docs/source/transforms.rst | 87 ++++++++++++++----------------------
 1 file changed, 36 insertions(+), 51 deletions(-)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 5909b68966b..d831b81e37f 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -14,11 +14,10 @@ transformations.
 This is useful if you have to build a more complex transformation
 pipeline (e.g. in the case of segmentation tasks).

-Most transformations accept both `PIL `_
-images and tensor images, although some transformations are :ref:`PIL-only
-<transforms_pil_only>` and some are :ref:`tensor-only
-<transforms_tensor_only>`. The :ref:`conversion_transforms` may be used to
-convert to and from PIL images.
+Most transformations accept both `PIL `_ images
+and tensor images, although some transformations are PIL-only and some are
+tensor-only. The :ref:`conversion_transforms` may be used to convert to and from
+PIL images, or for converting dtypes and ranges.

 The transformations that accept tensor images also accept batches of tensor
 images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a
@@ -70,8 +69,10 @@ The following examples illustrate the use of the available transforms:
 produce the same results.

-Scriptable transforms
---------------------
+Transforms scriptability
+------------------------
+
+.. TODO: Add note about v2 scriptability (in next PR)

 In order to script the transformations, please use ``torch.nn.Sequential``
 instead of :class:`Compose`.

 .. code:: python

     transforms = torch.nn.Sequential(
         transforms.CenterCrop(10),
         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
     )
     scripted_transforms = torch.jit.script(transforms)

 Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``,
 does not require `lambda` functions or ``PIL.Image``.

 For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``.

-Compositions of transforms
--------------------------
+Geometry
+--------

 ..
autosummary:: :toctree: generated/ :template: class.rst - Compose - + Resize + RandomCrop + RandomResizedCrop + CenterCrop + FiveCrop + TenCrop + Pad + RandomAffine + RandomPerspective + RandomRotation + RandomHorizontalFlip + RandomVerticalFlip -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +Color +----- .. autosummary:: :toctree: generated/ :template: class.rst - CenterCrop ColorJitter - FiveCrop Grayscale - Pad - RandomAffine - RandomApply - RandomCrop RandomGrayscale - RandomHorizontalFlip - RandomPerspective - RandomResizedCrop - RandomRotation - RandomVerticalFlip - Resize - TenCrop GaussianBlur RandomInvert RandomPosterize @@ -130,23 +128,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +----------- .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +------------- .. autosummary:: :toctree: generated/ @@ -155,12 +150,12 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype + Lambda .. _conversion_transforms: -Conversion Transforms ---------------------- +Conversion +---------- .. autosummary:: :toctree: generated/ @@ -169,20 +164,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +----------------- `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
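A minimal invocation sketch (illustrative only; the policy choice and input are placeholders, not part of this patch):

.. code:: python

    import torch
    from torchvision import transforms

    img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8 image tensor
    augmenter = transforms.AutoAugment(transforms.AutoAugmentPolicy.IMAGENET)
    augmented = augmenter(img)  # same shape as the input; ops are re-sampled per call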
Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that From c7a20ba5f60991b6169c639b613b3c128c339446 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:38:12 +0000 Subject: [PATCH 03/27] Added docs for v2 transforms (part 1) (#7297) Co-authored-by: vfdev Co-authored-by: Philip Meier --- docs/source/conf.py | 2 + docs/source/transforms.rst | 40 ++ torchvision/transforms/v2/_augment.py | 32 ++ torchvision/transforms/v2/_auto_augment.py | 80 ++++ torchvision/transforms/v2/_color.py | 139 +++++++ torchvision/transforms/v2/_container.py | 65 ++++ torchvision/transforms/v2/_deprecated.py | 25 ++ torchvision/transforms/v2/_geometry.py | 350 +++++++++++++++++- torchvision/transforms/v2/_meta.py | 21 ++ torchvision/transforms/v2/_misc.py | 68 ++++ torchvision/transforms/v2/_type_conversion.py | 30 ++ 11 files changed, 850 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..304a1cc6e22 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,8 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index d831b81e37f..00d929d0675 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -98,17 +98,29 @@ Geometry :template: class.rst Resize + v2.Resize RandomCrop + v2.RandomCrop RandomResizedCrop + v2.RandomResizedCrop CenterCrop + v2.CenterCrop FiveCrop + v2.FiveCrop TenCrop + v2.TenCrop Pad + v2.Pad RandomAffine + v2.RandomAffine RandomPerspective + v2.RandomPerspective RandomRotation + v2.RandomRotation RandomHorizontalFlip + v2.RandomHorizontalFlip RandomVerticalFlip + v2.RandomVerticalFlip Color ----- @@ -118,15 +130,25 @@ Color :template: class.rst ColorJitter + v2.ColorJitter Grayscale + v2.Grayscale RandomGrayscale + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize Composition ----------- @@ -136,9 +158,13 @@ Composition :template: class.rst Compose + v2.Compose RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder Miscellaneous ------------- @@ -148,9 +174,13 @@ Miscellaneous :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing + v2.RandomErasing Lambda + v2.Lambda .. _conversion_transforms: @@ -162,9 +192,15 @@ Conversion :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor ConvertImageDtype + v2.ConvertImageDtype + v2.ConvertDtype Auto-Augmentation ----------------- @@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. 
_functional_transforms: diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..b5aac9ca9a2 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -13,6 +13,38 @@ class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. + + .. betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p: probability that the random erasing operation will be performed. + scale: range of proportion of erased area against input image. + ratio: range of aspect ratio of erased area. + value: erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace: boolean to make this transform inplace. Default set to False. + + Returns: + Erased input. + + Example: + >>> from torchvision.transforms import v2 as transforms + >>> + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..98e23b99796 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,24 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. betastatus:: AutoAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +336,27 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. betastatus:: RandAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. 
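As a hedged usage sketch for the AutoAugment transform documented above (the input tensor and policy are illustrative placeholders, not part of the patch):

    >>> import torch
    >>> import torchvision
    >>> torchvision.disable_beta_transforms_warning()  # silences the beta notice, cf. conf.py above
    >>> from torchvision.transforms import AutoAugmentPolicy
    >>> from torchvision.transforms.v2 import AutoAugment
    >>> img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8, [..., 3, H, W]
    >>> out = AutoAugment(policy=AutoAugmentPolicy.IMAGENET)(img)
    >>> out.shape  # spatial size and dtype are preserved
    torch.Size([3, 224, 224])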
+ If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int): Number of augmentation transformations to apply sequentially. + magnitude (int): Magnitude for all the transformations. + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +418,24 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. betastatus:: TrivialAugmentWide transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +487,29 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. betastatus:: AugMix transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int): The severity of base augmentation operators. Default is ``3``. + mixture_width (int): The number of augmentation chains. Default is ``3``. + chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. 
If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..785a3965e60 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,23 @@ class Grayscale(Transform): + """[BETA] Convert images or videos to grayscale. + + .. betastatus:: Grayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + + Returns: + PIL Image: Grayscale version of the input. + + - If ``num_output_channels == 1`` : returned image is single channel + - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +46,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + + .. betastatus:: RandomGrayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +85,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + + .. betastatus:: ColorJitter transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. + contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. 
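A sketch of calling the transform documented above (the jitter factors are arbitrary values, not defaults):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> img = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
    >>> jitter = transforms.ColorJitter(brightness=0.5, contrast=0.3, saturation=0.3, hue=0.1)
    >>> out = jitter(img)  # same shape and dtype; factors are re-sampled on every call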
+ """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -205,6 +266,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image randomly with a given probability. + + .. betastatus:: RandomEqualize transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +285,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image randomly with a given probability. + + .. betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +304,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image randomly with a given probability by reducing the + number of bits for each color channel. + + .. betastatus:: RandomPosterize transform + + If the image is torch Tensor, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +329,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image randomly with a given probability by inverting all pixel + values above a threshold. + + .. betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +354,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + + .. betastatus:: RandomAutocontrast transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". 
+ + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +373,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image randomly with a given probability. + + .. betastatus:: RandomAdjustSharpness transform + + If the image is torch Tensor, + it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..66da9c187c0 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -9,6 +9,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +60,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,6 +115,12 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. 
betastatus:: RandomChoice transform + + This transform does not support torchscript.""" + def __init__( self, transforms: Sequence[Callable], @@ -99,6 +157,13 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. betastatus:: RandomOrder transform + + This transform does not support torchscript. + """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..c44e6b08d11 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + + .. betastatus:: ToTensor transform + + .. warning:: + :class:`v2.ToTensor` is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..af8ca4b6471 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,18 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomHorizontalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +45,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomVerticalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +64,62 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input image/box/mask to the given size. + + .. 
betastatus:: Resize transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences + in the performance of a network. Therefore, it is preferable to train and serve a model with the same input + types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors + closer. + + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + max_size (int, optional): The maximum allowed for the longer edge of + the resized image: if the longer edge of the image is greater + than ``max_size`` after being resized according to ``size``, then + the image is resized again so that the longer edge is equal to + ``max_size``. As a result, ``size`` might be overruled, i.e. the + smaller edge may be shorter than ``size``. This is only supported + if ``size`` is an int (or a sequence of length 1 in torchscript + mode). + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.Resize def __init__( @@ -76,6 +156,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): + """[BETA] Crops the given image/box/mask at the center. + + .. betastatus:: CenterCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. 
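Given the ``antialias`` default change flagged above, a sketch that passes it explicitly (sizes are illustrative):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> resize = transforms.Resize(size=(224, 224), antialias=True)  # explicit value, unaffected by the v0.17 change
    >>> out = resize(torch.rand(3, 480, 640))  # float tensor, bilinear interpolation by default
    >>> out.shape
    torch.Size([3, 224, 224])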
+ + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +181,53 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + + .. betastatus:: RandomResizedCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + A crop of the original image is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,7 +305,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ + """[BETA] Crop the given image/box/mask into four corners and the central crop. + + .. betastatus:: FiveCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. 
Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + Example: >>> class BatchMultiCrop(transforms.Transform): ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): @@ -209,8 +367,27 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ + """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. betastatus:: TenCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions. + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +426,46 @@ def _transform( class Pad(Transform): + """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + + .. betastatus:: Pad transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -323,6 +540,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the image/box/mask by angle. + + .. betastatus:: RandomRotation transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number): Pixel fill value for the area outside the rotated + image. Default is ``0``. If given a number, the value is used for all bands respectively. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -363,6 +608,42 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation of the image/box/mask keeping center invariant. + + .. betastatus:: RandomAffine transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. 
+ interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +724,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the given image/box/mask at a random location. + + .. betastatus:: RandomCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions, + but if non-constant padding is used, the input is expected to have at most 2 leading dimensions + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,6 +879,25 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + + .. betastatus:: RandomPerspective transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float): probability of the image being transformed. Default is 0.5. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..7d0f0ec39f9 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -22,6 +22,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): + """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + + .. betastatus:: ConvertDtype transform + + This function does not support PIL Image. + + Args: + dtype (torch.dtype): Desired data type of the output + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as + well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + _v1_transform_cls = _transforms.ConvertImageDtype _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6dd0755cfbb..6998d416c91 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -21,6 +21,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Lambda(Transform): + """[BETA] Apply a user-defined lambda as a transform. + + .. betastatus:: Lambda transform + + This transform does not support torchscript. + + Args: + lambd (function): Lambda/function to be used for transform. 
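An illustrative use of Lambda, restricting it to tensors via the ``*types`` argument of ``__init__`` below (the clamp function is a made-up example):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> clamp = transforms.Lambda(lambda x: x.clamp(0, 1), torch.Tensor)
    >>> clamp(torch.tensor([-0.5, 0.5, 1.5]))
    tensor([0.0000, 0.5000, 1.0000])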
+ """ + def __init__(self, lambd: Callable[[Any], Any], *types: Type): super().__init__() self.lambd = lambd @@ -42,6 +52,26 @@ def extra_repr(self) -> str: class LinearTransformation(Transform): + """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline. + + .. betastatus:: LinearTransformation transform + + This transform does not support PIL Image. + Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and + subtract mean_vector from it which is then followed by computing the dot + product with the transformation matrix and then reshaping the tensor to its + original shape. + + Applications: + whitening transformation: Suppose X is a column vector zero-centered data. + Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X), + perform SVD on this matrix and pass it as transformation_matrix. + + Args: + transformation_matrix (Tensor): tensor [D x D], D = C x H x W + mean_vector (Tensor): tensor [D], D = C x H x W + """ + _v1_transform_cls = _transforms.LinearTransformation _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) @@ -105,6 +135,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Normalize(Transform): + """[BETA] Normalize a tensor image with mean and standard deviation. + + .. betastatus:: Normalize transform + + This transform does not support PIL Image. + Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n`` + channels, this transform will normalize each channel of the input + ``torch.*Tensor`` i.e., + ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` + + .. note:: + This transform acts out of place, i.e., it does not mutate the input tensor. + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + inplace(bool,optional): Bool to make this operation in-place. + + """ + _v1_transform_cls = _transforms.Normalize _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video) @@ -125,6 +175,24 @@ def _transform( class GaussianBlur(Transform): + """[BETA] Blurs image with randomly chosen Gaussian blur. + + .. betastatus:: GausssianBlur transform + + If the image is torch Tensor, it is expected + to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + kernel_size (int or sequence): Size of the Gaussian kernel. + sigma (float or tuple of float (min, max)): Standard deviation to be used for + creating kernel to perform blurring. If float, sigma is fixed. If it is tuple + of float (min, max), sigma is chosen uniformly at random to lie in the + given range. + + Returns: + PIL Image or Tensor: Gaussian blurred version of the input image. + """ + _v1_transform_cls = _transforms.GaussianBlur def __init__( diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 984d5ba50c0..b0743feb10d 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -11,6 +11,15 @@ class PILToTensor(Transform): + """[BETA] Convert a ``PIL Image`` to a tensor of the same type. + + .. betastatus:: PILToTensor transform + + This transform does not support torchscript. + + Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). 
+ """ + _transformed_types = (PIL.Image.Image,) def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor: @@ -27,6 +36,27 @@ def _transform( class ToImagePIL(Transform): + """[BETA] Convert a tensor or an ndarray to PIL Image. + + .. betastatus:: ToImagePIL transform + + This transform does not support torchscript. + + Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape + H x W x C to a PIL Image while preserving the value range. + + Args: + mode (`PIL.Image mode`_): color space and pixel depth of input data (optional). + If ``mode`` is ``None`` (default) there are some assumptions made about the input data: + - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``. + - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``. + - If the input has 2 channels, the ``mode`` is assumed to be ``LA``. + - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, + ``short``). + + .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes + """ + _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray) def __init__(self, mode: Optional[str] = None) -> None: From 07023255c664f10f6ab442f3c65325cb1a81eae0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 22 Feb 2023 11:34:07 +0100 Subject: [PATCH 04/27] reduce GHA log output (#7267) Co-authored-by: vfdev --- .github/workflows/test-linux-cpu.yml | 6 +++--- .github/workflows/test-linux-gpu.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 769ee4f841b..8a9f7d33b49 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -39,7 +39,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -50,8 +50,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 95d06402db1..d1275071bf7 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -43,7 +43,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -54,8 +54,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 From 
011ebd7478ae273a164e62157c0ed6c459eb2fc5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 22 Feb 2023 13:27:17 +0100 Subject: [PATCH 05/27] align transforms v2 signatures with v1 (#7301) Co-authored-by: vfdev --- test/test_transforms_v2_consistency.py | 9 ++++++--- torchvision/transforms/v2/_container.py | 2 +- torchvision/transforms/v2/_geometry.py | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 125d7ec7a3f..43f17c9b15a 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -540,9 +540,12 @@ def test_signature_consistency(config): f"not. Please add a default value." ) - legacy_kinds = {name: param.kind for name, param in legacy_params.items()} - prototype_kinds = {name: prototype_params[name].kind for name in legacy_kinds.keys()} - assert prototype_kinds == legacy_kinds + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature def check_call_consistency( diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 66da9c187c0..08282962ffd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -124,8 +124,8 @@ class RandomChoice(Transform): def __init__( self, transforms: Sequence[Callable], - probabilities: Optional[List[float]] = None, p: Optional[List[float]] = None, + probabilities: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index af8ca4b6471..4d7a5fca384 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -575,8 +575,8 @@ def __init__( degrees: Union[numbers.Number, Sequence], interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST, expand: bool = False, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, center: Optional[List[float]] = None, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) @@ -903,9 +903,9 @@ class RandomPerspective(_RandomApplyTransform): def __init__( self, distortion_scale: float = 0.5, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, - interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, p: float = 0.5, + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__(p=p) @@ -966,8 +966,8 @@ def __init__( self, alpha: Union[float, Sequence[float]] = 50.0, sigma: Union[float, Sequence[float]] = 5.0, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.alpha = _setup_float_or_seq(alpha, "alpha", 2) From 72d48e277837516564c1ceee77669595c71f27cc Mon Sep 17 
00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 12:05:31 +0000 Subject: [PATCH 06/27] Extend default heuristic of SanitizeBoundingBoxes to support tuples (#7304) Co-authored-by: Philip Meier --- test/test_transforms_v2.py | 54 +++++++++++++++++++++++------- torchvision/transforms/v2/_misc.py | 31 ++++++++++++----- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 2e43c86f91d..a1e1cb720d5 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1935,7 +1935,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): @pytest.mark.parametrize( "labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None) ) -def test_sanitize_bounding_boxes(min_size, labels_getter): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + H, W = 256, 128 boxes_and_validity = [ @@ -1970,35 +1977,56 @@ def test_sanitize_bounding_boxes(min_size, labels_getter): ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) - + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) sample = { - "image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8), + "image": input_img, "labels": labels, "boxes": boxes, - "whatever": torch.rand(10), + "whatever": whatever, "None": None, "masks": masks, } + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) - assert out["image"] is sample["image"] - assert out["whatever"] is sample["whatever"] + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): - assert out["labels"] is sample["labels"] + assert out_labels is labels else: - assert isinstance(out["labels"], torch.Tensor) - assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0] + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] # This works because we conveniently set labels to arange(num_boxes) - assert out["labels"].tolist() == valid_indices + assert out_labels.tolist() == valid_indices @pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) -def test_sanitize_bounding_boxes_default_heuristic(key): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): labels = torch.arange(10) - d = {key: labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + sample = {key: labels, "another_key": "whatever"} + if sample_type is tuple: + sample = (None, sample, "whatever_again") + assert 
transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6998d416c91..8cc4aa6a3db 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -1,7 +1,7 @@ import collections import warnings from contextlib import suppress -from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Type, Union +from typing import Any, Callable, cast, Dict, List, Mapping, Optional, Sequence, Type, Union import PIL.Image @@ -269,7 +269,9 @@ def __init__( elif callable(labels_getter): self._labels_getter = labels_getter elif isinstance(labels_getter, str): - self._labels_getter = lambda inputs: inputs[labels_getter] + self._labels_getter = lambda inputs: SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)[ + labels_getter # type: ignore[index] + ] elif labels_getter is None: self._labels_getter = None else: @@ -278,10 +280,27 @@ def __init__( f"Got {labels_getter} of type {type(labels_getter)}." ) + @staticmethod + def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]: + # datasets outputs may be plain dicts like {"img": ..., "labels": ..., "bbox": ...} + # or tuples like (img, {"labels":..., "bbox": ...}) + # This hacky helper accounts for both structures. + if isinstance(inputs, tuple): + inputs = inputs[1] + + if not isinstance(inputs, collections.abc.Mapping): + raise ValueError( + f"If labels_getter is a str or 'default', " + f"then the input to forward() must be a dict or a tuple whose second element is a dict." + f" Got {type(inputs)} instead." + ) + return inputs + @staticmethod def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]: - # Tries to find a "label" key, otherwise tries for the first key that contains "label" - case insensitive + # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive # Returns None if nothing is found + inputs = SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs) candidate_key = None with suppress(StopIteration): candidate_key = next(key for key in inputs.keys() if key.lower() == "labels") @@ -298,12 +317,6 @@ def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Ten def forward(self, *inputs: Any) -> Any: inputs = inputs if len(inputs) > 1 else inputs[0] - if isinstance(self.labels_getter, str) and not isinstance(inputs, collections.abc.Mapping): - raise ValueError( - f"If labels_getter is a str or 'default' (got {self.labels_getter}), " - f"then the input to forward() must be a dict. Got {type(inputs)} instead." 
- ) - if self._labels_getter is None: labels = None else: From b598de48d3e6b8293ba8eb0315da5e6504026d44 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 23 Feb 2023 14:46:35 +0100 Subject: [PATCH 07/27] add end-to-end example gallery for transforms v2 (#7302) Co-authored-by: Nicolas Hug --- docs/requirements.txt | 1 + gallery/assets/coco/images/000000000001.jpg | 1 + gallery/assets/coco/images/000000000002.jpg | 1 + gallery/assets/coco/instances.json | 1 + gallery/plot_transforms_v2_e2e.py | 152 ++++++++++++++++++++ 5 files changed, 156 insertions(+) create mode 120000 gallery/assets/coco/images/000000000001.jpg create mode 120000 gallery/assets/coco/images/000000000002.jpg create mode 100644 gallery/assets/coco/instances.json create mode 100644 gallery/plot_transforms_v2_e2e.py diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae7..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py new file mode 100644 index 00000000000..938578e4af9 --- /dev/null +++ 
b/gallery/plot_transforms_v2_e2e.py
@@ -0,0 +1,152 @@
+"""
+==================================================
+transforms v2: End-to-end object detection example
+==================================================
+
+Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
+``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This example
+showcases end-to-end object detection training using the stable ``torchvision.datasets`` and ``torchvision.models`` as
+well as the new ``torchvision.transforms.v2`` API.
+"""
+
+import pathlib
+from collections import defaultdict
+
+import PIL.Image
+
+import torch
+import torch.utils.data
+
+import torchvision
+
+
+# sphinx_gallery_thumbnail_number = -1
+def show(sample):
+    import matplotlib.pyplot as plt
+
+    from torchvision.transforms.v2 import functional as F
+    from torchvision.utils import draw_bounding_boxes
+
+    image, target = sample
+    if isinstance(image, PIL.Image.Image):
+        image = F.to_image_tensor(image)
+    image = F.convert_dtype(image, torch.uint8)
+    annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3)
+
+    fig, ax = plt.subplots()
+    ax.imshow(annotated_image.permute(1, 2, 0).numpy())
+    ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
+    fig.tight_layout()
+
+    fig.show()
+
+
+# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that
+# some APIs may slightly change in the future
+torchvision.disable_beta_transforms_warning()
+
+from torchvision import models, datasets
+import torchvision.transforms.v2 as transforms
+
+
+########################################################################################################################
+# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
+# returns, and we'll see how to convert it to a format that is compatible with our new transforms.
+
+
+def load_example_coco_detection_dataset(**kwargs):
+    # This loads fake data for illustration purposes of this example. In practice, you'll have
+    # to replace this with the proper data
+    root = pathlib.Path("assets") / "coco"
+    return datasets.CocoDetection(str(root / "images"), str(root / "instances.json"), **kwargs)
+
+
+dataset = load_example_coco_detection_dataset()
+
+sample = dataset[0]
+image, target = sample
+print(type(image))
+print(type(target), type(target[0]), list(target[0].keys()))
+
+
+########################################################################################################################
+# The dataset returns a two-tuple, with the first item being a :class:`PIL.Image.Image` and the second one a list of
+# dictionaries, each containing the annotations for a single object instance. As is, this format is compatible neither
+# with ``torchvision.transforms.v2`` nor with the models. To overcome that, we provide the
+# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
+# :class:`~torchvision.datasets.CocoDetection`, this changes the target structure to a single dictionary of lists. It
+# also adds the key-value-pairs ``"boxes"``, ``"masks"``, and ``"labels"`` wrapped in the corresponding
+# ``torchvision.datapoints``.
+
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+sample = dataset[0]
+image, target = sample
+print(type(image))
+print(type(target), list(target.keys()))
+print(type(target["boxes"]), type(target["masks"]), type(target["labels"]))
+
+########################################################################################################################
+# As a baseline, let's have a look at a sample without transformations:

+show(sample)
+
+
+########################################################################################################################
+# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
+# ``torchvision.transforms`` v1, but it now handles bounding boxes and masks without any extra configuration.
+
+transform = transforms.Compose(
+    [
+        transforms.RandomPhotometricDistort(),
+        transforms.RandomZoomOut(
+            fill=defaultdict(lambda: 0, {PIL.Image.Image: (123, 117, 104)})
+        ),
+        transforms.RandomIoUCrop(),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToImageTensor(),
+        transforms.ConvertImageDtype(torch.float32),
+        transforms.SanitizeBoundingBoxes(),
+    ]
+)
+
+########################################################################################################################
+# .. note::
+#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, it
+#    should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
+#    the corresponding labels and optionally masks. It is particularly critical to add it if
+#    :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's see how the sample looks with our augmentation pipeline in place:
+
+dataset = load_example_coco_detection_dataset(transforms=transform)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+torch.manual_seed(3141)
+sample = dataset[0]
+
+show(sample)
+
+
+########################################################################################################################
+# We can see that the colors of the image were distorted, that we zoomed out on it (off-center), and that it was flipped
+# horizontally. In all of this, the bounding box was transformed accordingly. Without any further ado, we can start training.
+
+data_loader = torch.utils.data.DataLoader(
+    dataset,
+    batch_size=2,
+    # We need a custom collation function here, since the object detection models expect a
+    # sequence of images and target dictionaries. The default collation function tries to
+    # `torch.stack` the individual elements, which fails in general for object detection,
+    # because the number of object instances varies between the samples. This is the same for
+    # `torchvision.transforms` v1
+    collate_fn=lambda batch: tuple(zip(*batch)),
+)
+
+model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train()
+
+for images, targets in data_loader:
+    loss_dict = model(images, targets)
+    print(loss_dict)
+    # Put your training logic here
+    break
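A note on the custom ``collate_fn`` in the gallery example above: a tiny standalone sketch (with made-up tensor shapes) shows what ``tuple(zip(*batch))`` produces, and why the default ``torch.stack``-based collation would fail for detection samples:

.. code-block:: python

    import torch

    # Two hypothetical detection samples with different image sizes and box counts.
    batch = [
        (torch.rand(3, 256, 256), {"boxes": torch.rand(2, 4), "labels": torch.tensor([1, 2])}),
        (torch.rand(3, 512, 384), {"boxes": torch.rand(5, 4), "labels": torch.arange(5)}),
    ]

    images, targets = tuple(zip(*batch))
    print(len(images))                       # 2 -- a tuple of images with different sizes
    print(images[0].shape, images[1].shape)  # torch.Size([3, 256, 256]) torch.Size([3, 512, 384])
    print(targets[1]["boxes"].shape)         # torch.Size([5, 4])
    # torch.stack(images) would raise a RuntimeError here, because the image sizes differ;
    # the detection models instead accept a plain sequence of images and a sequence of dicts.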
This is the same for + # `torchvision.transforms` v1 + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train() + +for images, targets in data_loader: + loss_dict = model(images, targets) + print(loss_dict) + # Put your training logic here + break From dd5cec3557a760d2f634754e2783304952a26ff5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 16:25:44 +0000 Subject: [PATCH 08/27] Add v2 docs for color transforms (#7310) --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_color.py | 43 +++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 00d929d0675..c2e9855d9e8 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -131,6 +131,7 @@ Color ColorJitter v2.ColorJitter + v2.RandomPhotometricDistort Grayscale v2.Grayscale RandomGrayscale diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 785a3965e60..2a581bf5640 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -46,7 +46,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): - """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). .. betastatus:: RandomGrayscale transform @@ -85,7 +85,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): - """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. .. betastatus:: ColorJitter transform @@ -190,6 +190,31 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: # TODO: This class seems to be untested class RandomPhotometricDistort(Transform): + """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot + MultiBox Detector `_. + + .. betastatus:: RandomPhotometricDistort transform + + This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` + under the hood to adjust the contrast, saturation, hue, brightness, and also + randomly permutes channels. + + Args: + brightness (tuple of float (min, max), optional): How much to jitter brightness. + brightness_factor is chosen uniformly from [min, max]. Should be non negative numbers. + contrast tuple of float (min, max), optional): How much to jitter contrast. + contrast_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + saturation (tuple of float (min, max), optional): How much to jitter saturation. + saturation_factor is chosen uniformly from [min, max]. Should be non negative numbers. + hue (tuple of float (min, max), optional): How much to jitter hue. + hue_factor is chosen uniformly from [min, max]. Should have -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + p (float, optional) probability each distortion operation (contrast, saturation, ...) to be applied. + Default is 0.5. 
+ """ + _transformed_types = ( datapoints.Image, PIL.Image.Image, @@ -199,10 +224,10 @@ class RandomPhotometricDistort(Transform): def __init__( self, + brightness: Tuple[float, float] = (0.875, 1.125), contrast: Tuple[float, float] = (0.5, 1.5), saturation: Tuple[float, float] = (0.5, 1.5), hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), p: float = 0.5, ): super().__init__() @@ -266,7 +291,7 @@ def _transform( class RandomEqualize(_RandomApplyTransform): - """[BETA] Equalize the histogram of the given image randomly with a given probability. + """[BETA] Equalize the histogram of the given image or video with a given probability. .. betastatus:: RandomEqualize transform @@ -285,7 +310,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): - """[BETA] Inverts the colors of the given image randomly with a given probability. + """[BETA] Inverts the colors of the given image or video with a given probability. .. betastatus:: RandomInvert transform @@ -304,7 +329,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): - """[BETA] Posterize the image randomly with a given probability by reducing the + """[BETA] Posterize the image or video with a given probability by reducing the number of bits for each color channel. .. betastatus:: RandomPosterize transform @@ -329,7 +354,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): - """[BETA] Solarize the image randomly with a given probability by inverting all pixel + """[BETA] Solarize the image or video with a given probability by inverting all pixel values above a threshold. .. betastatus:: RandomSolarize transform @@ -354,7 +379,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): - """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + """[BETA] Autocontrast the pixels of the given image or video with a given probability. .. betastatus:: RandomAutocontrast transform @@ -373,7 +398,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): - """[BETA] Adjust the sharpness of the image randomly with a given probability. + """[BETA] Adjust the sharpness of the image or video with a given probability. .. 
betastatus:: RandomAdjustSharpness transform From 4fb043e09278c4fb053aaf8db710b72247fd16b5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 16:26:25 +0000 Subject: [PATCH 09/27] Add docs for containers and undeprecate p for RandomChoice (#7311) Co-authored-by: vfdev --- test/test_transforms_v2.py | 5 +--- torchvision/transforms/v2/_container.py | 37 +++++++++++++------------ 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index a1e1cb720d5..9173ec14f2c 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1359,11 +1359,8 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2]) - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1]) + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) class TestRandomIoUCrop: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 08282962ffd..27affc7100b 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Callable, Dict, List, Optional, Sequence, Union import torch @@ -78,7 +77,7 @@ class RandomApply(Transform): Args: transforms (sequence or torch.nn.Module): list of transformations - p (float): probability + p (float): probability of applying the list of transforms """ _v1_transform_cls = _transforms.RandomApply @@ -119,39 +118,38 @@ class RandomChoice(Transform): .. betastatus:: RandomChoice transform - This transform does not support torchscript.""" + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (list of floats or None, optional): probability of each transform being picked. + If ``p`` doesn't sum to 1, it is automatically normalized. If ``None`` + (default), all transforms have the same probability. + """ def __init__( self, transforms: Sequence[Callable], p: Optional[List[float]] = None, - probabilities: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") - if p is not None: - warnings.warn( - "Argument p is deprecated and will be removed in a future release. " - "Please use probabilities argument instead." 
- ) - probabilities = p - if probabilities is None: - probabilities = [1] * len(transforms) - elif len(probabilities) != len(transforms): + if p is None: + p = [1] * len(transforms) + elif len(p) != len(transforms): raise ValueError( - f"The number of probabilities doesn't match the number of transforms: " - f"{len(probabilities)} != {len(transforms)}" + f"The number of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" ) super().__init__() self.transforms = transforms - total = sum(probabilities) - self.probabilities = [prob / total for prob in probabilities] + total = sum(p) + self.p = [prob / total for prob in p] def forward(self, *inputs: Any) -> Any: - idx = int(torch.multinomial(torch.tensor(self.probabilities), 1)) + idx = int(torch.multinomial(torch.tensor(self.p), 1)) transform = self.transforms[idx] return transform(*inputs) @@ -162,6 +160,9 @@ class RandomOrder(Transform): .. betastatus:: RandomOrder transform This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations """ def __init__(self, transforms: Sequence[Callable]) -> None: From 684f8d24aafa812b281f4876d2f217cc43c10464 Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 23 Feb 2023 17:57:22 +0100 Subject: [PATCH 10/27] Updated geometric transforms v2 docstring (#7303) Co-authored-by: Nicolas Hug Co-authored-by: Philip Meier --- docs/source/transforms.rst | 12 +- torchvision/transforms/v2/_geometry.py | 402 ++++++++++++++++++++----- 2 files changed, 334 insertions(+), 80 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index c2e9855d9e8..ddd6f37d083 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -99,10 +99,14 @@ Geometry Resize v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize RandomCrop v2.RandomCrop RandomResizedCrop v2.RandomResizedCrop + v2.RandomIoUCrop CenterCrop v2.CenterCrop FiveCrop @@ -111,17 +115,21 @@ Geometry v2.TenCrop Pad v2.Pad + v2.RandomZoomOut + RandomRotation + v2.RandomRotation RandomAffine v2.RandomAffine RandomPerspective v2.RandomPerspective - RandomRotation - v2.RandomRotation + ElasticTransform + v2.ElasticTransform RandomHorizontalFlip v2.RandomHorizontalFlip RandomVerticalFlip v2.RandomVerticalFlip + Color ----- diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 4d7a5fca384..c3342eb9926 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,16 +26,17 @@ class RandomHorizontalFlip(_RandomApplyTransform): - """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + """[BETA] Horizontally flip the input with a given probability. .. betastatus:: RandomHorizontalFlip transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - p (float): probability of the image being flipped. Default value is 0.5 + p (float, optional): probability of the input being flipped. 
Default value is 0.5 """ _v1_transform_cls = _transforms.RandomHorizontalFlip @@ -45,16 +46,17 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): - """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + """[BETA] Vertically flip the input with a given probability. .. betastatus:: RandomVerticalFlip transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - p (float): probability of the image being flipped. Default value is 0.5 + p (float, optional): probability of the input being flipped. Default value is 0.5 """ _v1_transform_cls = _transforms.RandomVerticalFlip @@ -64,12 +66,14 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): - """[BETA] Resize the input image/box/mask to the given size. + """[BETA] Resize the input to the given size. .. betastatus:: Resize transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. .. warning:: The output image might be different depending on its type: when downsampling, the interpolation of PIL images @@ -87,7 +91,7 @@ class Resize(Transform): .. note:: In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. - interpolation (InterpolationMode): Desired interpolation enum defined by + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. @@ -156,12 +160,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): - """[BETA] Crops the given image/box/mask at the center. + """[BETA] Crop the input at the center. .. betastatus:: CenterCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. 
Args: @@ -181,14 +188,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): - """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + """[BETA] Crop a random portion of the input and resize it to a given size. .. betastatus:: RandomResizedCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. - A crop of the original image is made: the crop has a random area (H * W) + A crop of the original input is made: the crop has a random area (H * W) and a random aspect ratio. This crop is finally resized to the given size. This is popularly used to train the Inception networks. @@ -199,11 +208,11 @@ class RandomResizedCrop(Transform): .. note:: In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. - scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop, before resizing. The scale is defined with respect to the area of the original image. - ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before resizing. - interpolation (InterpolationMode): Desired interpolation enum defined by + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. @@ -305,13 +314,13 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """[BETA] Crop the given image/box/mask into four corners and the central crop. + """[BETA] Crop the image or video into four corners and the central crop. .. betastatus:: FiveCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. .. Note:: This transform returns a tuple of images and there may be a mismatch in the number of @@ -367,14 +376,14 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of these (horizontal flipping is used by default). .. betastatus:: TenCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions. 
+ If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. See :class:`~torchvision.transforms.v2.FiveCrop` for an example. @@ -387,7 +396,7 @@ class TenCrop(Transform): size (sequence or int): Desired output size of the crop. If size is an int instead of sequence like (h, w), a square crop (size, size) is made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). - vertical_flip (bool): Use vertical flipping instead of horizontal + vertical_flip (bool, optional): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -426,14 +435,14 @@ def _transform( class Pad(Transform): - """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + """[BETA] Pad the input on all sides with the given "pad" value. .. betastatus:: Pad transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, - at most 3 leading dimensions for mode edge, - and an arbitrary number of leading dimensions for mode constant + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: padding (int or sequence): Padding on each border. If a single int is provided this @@ -444,18 +453,17 @@ class Pad(Transform): .. note:: In torchscript mode padding as single int is not supported, use a sequence of length 1: ``[padding, ]``. - fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of - length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. - Only number is supported for torch Tensor. - Only int or tuple value is supported for PIL Image. - padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. - Default is constant. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is "constant". - constant: pads with a constant value, this value is specified with fill - edge: pads with the last value at the edge of the image. - If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode @@ -501,6 +509,37 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomZoomOut(_RandomApplyTransform): + """[BETA] "Zoom out" transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. 
betastatus:: RandomZoomOut transform
+
+    This transformation randomly pads images, videos, bounding boxes and masks, creating a zoom-out effect.
+    The output spatial size is randomly sampled from the original size up to a maximum size configured
+    with the ``side_range`` parameter:
+
+    .. code-block:: python
+
+        r = uniform_sample(side_range[0], side_range[1])
+        output_width = input_width * r
+        output_height = input_height * r
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        fill (number or tuple or dict, optional): Pixel fill value used for the padded area.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        side_range (sequence of floats, optional): tuple of two floats that defines the minimum and maximum
+            factors to scale the input size.
+        p (float, optional): probability that the zoom-out is applied. Default value is 0.5
+    """
+
    def __init__(
        self,
        fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0,
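The per-type dictionary form of ``fill`` documented above can be exercised directly. A minimal sketch (the tensor sizes, the ``side_range``, and the fill values below are arbitrary illustrations, not defaults):

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()
    from torchvision import datapoints
    from torchvision.transforms import v2

    img = datapoints.Image(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8))
    mask = datapoints.Mask(torch.zeros(64, 64, dtype=torch.int64))

    # Pad the image with gray (127) while keeping the mask padded with 0 (background).
    zoom_out = v2.RandomZoomOut(
        fill={datapoints.Image: 127, datapoints.Mask: 0},
        side_range=(1.0, 2.0),
        p=1.0,
    )
    out_img, out_mask = zoom_out(img, mask)
    print(out_img.shape, out_mask.shape)  # e.g. torch.Size([3, 101, 101]) torch.Size([101, 101])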
@@ -540,18 +579,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomRotation(Transform):
-    """[BETA] Rotate the image/box/mask by angle.
+    """[BETA] Rotate the input by angle.

     .. betastatus:: RandomRotation transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         degrees (sequence or number): Range of degrees to select from.
             If degrees is a number instead of sequence like (min, max), the range of degrees
             will be (-degrees, +degrees).
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
             :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
             If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
             The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
         expand (bool, optional): Optional expansion flag.
             If true, expands the output to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
             Note that the expand flag assumes rotation around the center and no translation.
         center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
             Default is the center of the image.
-        fill (sequence or number): Pixel fill value for the area outside the rotated
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the rotated image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.

     .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters

@@ -608,12 +652,14 @@


 class RandomAffine(Transform):
-    """[BETA] Random affine transformation of the image/box/mask keeping center invariant.
+    """[BETA] Random affine transformation of the input keeping center invariant.

     .. betastatus:: RandomAffine transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         degrees (sequence or number): Range of degrees to select from.
@@ -631,12 +677,15 @@ class RandomAffine(Transform):
             range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
             an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
             Will not apply shear by default.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
             :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
             If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
             The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
-        fill (sequence or number): Pixel fill value for the area outside the transformed
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
         center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
             Default is the center of the image.

@@ -724,13 +773,14 @@


 class RandomCrop(Transform):
-    """[BETA] Crop the given image/box/mask at a random location.
+    """[BETA] Crop the input at a random location.

     .. betastatus:: RandomCrop transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions,
-    but if non-constant padding is used, the input is expected to have at most 2 leading dimensions
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         size (sequence or int): Desired output size of the crop.
If size is an @@ -745,21 +795,20 @@ class RandomCrop(Transform): .. note:: In torchscript mode padding as single int is not supported, use a sequence of length 1: ``[padding, ]``. - pad_if_needed (boolean): It will pad the image if smaller than the + pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Since cropping is done after padding, the padding seems to be done at a random offset. - fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of - length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. - Only number is supported for torch Tensor. - Only int or tuple value is supported for PIL Image. - padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. - constant: pads with a constant value, this value is specified with fill - edge: pads with the last value at the edge of the image. - If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode @@ -879,23 +928,28 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): - """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + """[BETA] Perform a random perspective transformation of the input with a given probability. .. betastatus:: RandomPerspective transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1. Default is 0.5. - p (float): probability of the image being transformed. Default is 0.5. - interpolation (InterpolationMode): Desired interpolation enum defined by + p (float, optional): probability of the input being transformed. Default is 0.5. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. 
-        fill (sequence or number): Pixel fill value for the area outside the transformed
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
     """

    _v1_transform_cls = _transforms.RandomPerspective
@@ -960,6 +1014,46 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ElasticTransform(Transform):
+    """[BETA] Transform the input with elastic transformations.
+
+    .. betastatus:: ElasticTransform transform
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Given alpha and sigma, it will generate displacement
+    vectors for all pixels based on random offsets. Alpha controls the strength
+    and sigma controls the smoothness of the displacements.
+    The displacements are added to an identity grid and the resulting grid is
+    used to transform the input.
+
+    .. note::
+        The implementation that transforms bounding boxes is approximate (not exact).
+        We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
+        This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
+        Our assumption is that ``displacement * displacement`` is small and can be ignored.
+        Large displacements would lead to large errors in the approximation.
+
+    Applications:
+        Randomly transforms the morphology of objects in images and produces a
+        see-through-water-like effect.
+
+    Args:
+        alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
+        sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+    """
+
    _v1_transform_cls = _transforms.ElasticTransform

    def __init__(
@@ -1011,6 +1105,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomIoUCrop(Transform):
+    """[BETA] Random IoU crop transformation from
+    `"SSD: Single Shot MultiBox Detector" `_.
+
+    .. betastatus:: RandomIoUCrop transform
+
+    This transformation requires image or video data and ``datapoints.BoundingBox`` in the input.
+
+    .. 
warning:: + In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` + must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately + after or later in the transforms pipeline. + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_scale (float, optional): Minimum factors to scale the input size. + max_scale (float, optional): Maximum factors to scale the input size. + min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video. + max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video. + sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and + a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]`` + trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap. + Default, 40. + """ + def __init__( self, min_scale: float = 0.3, @@ -1107,6 +1229,45 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ScaleJitter(Transform): + """[BETA] Perform Large Scale Jitter on the input according to + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + .. betastatus:: ScaleJitter transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + target_size (tuple of int): Target size. This parameter defines base scale for jittering, + e.g. ``min(target_size[0] / width, target_size[1] / height)``. + scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. 
This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        target_size: Tuple[int, int],
@@ -1135,6 +1296,43 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomShortestSize(Transform):
+    """[BETA] Randomly resize the input.
+
+    .. betastatus:: RandomShortestSize transform
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have an arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
+        max_size (int, optional): Maximum spatial size. Default is ``None``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True``: will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support disabling antialiasing.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        min_size: Union[List[int], Tuple[int], int],
@@ -1166,6 +1364,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomResize(Transform):
+    """[BETA] Randomly resize the input.
+
+    .. betastatus:: RandomResize transform
+
+    This transformation can be used together with ``RandomCrop`` as a data augmentation to train
+    models on image segmentation tasks.
+
+    Output spatial size is randomly sampled from the interval ``[min_size, max_size]``:
+
+    .. code-block:: python
+
+        size = uniform_sample(min_size, max_size)
+        output_width = size
+        output_height = size
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have an arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int): Minimum output size for random sampling.
+        max_size (int): Maximum output size for random sampling.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True``: will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support disabling antialiasing.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        min_size: int,

From 15dfd27245acedd08b107083993f54ef766ef382 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 10:32:36 +0100
Subject: [PATCH 11/27] Cleanup for e2e gallery example for transforms v2
 (#7318)

---
 docs/source/conf.py               | 1 +
 gallery/plot_transforms_v2_e2e.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 304a1cc6e22..8b4ce17de9f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -62,6 +62,7 @@
    "gallery_dirs": "auto_examples",  # path to where to save gallery generated output
    "backreferences_dir": "gen_modules/backreferences",
    "doc_module": ("torchvision",),
+    "remove_config_comments": True,
 }

 napoleon_use_ivar = True
diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
index 938578e4af9..533a3d5d752 100644
--- a/gallery/plot_transforms_v2_e2e.py
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -1,6 +1,6 @@
 """
 ==================================================
-transforms v2: End-to-end object detection example
+Transforms v2: End-to-end object detection example
 ==================================================

 Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
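To make the resize-family docstrings above concrete, here is a minimal sketch of how these v2 transforms might be
combined on a dummy detection sample. The tensor sizes, box coordinates, and the choice of ``antialias=True`` are
illustrative assumptions of this example, not values taken from the patches:

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    from torchvision import datapoints
    from torchvision.transforms import v2 as transforms

    # Wrap the inputs as datapoints so the transforms also update the box coordinates.
    image = datapoints.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
    boxes = datapoints.BoundingBox([[10, 10, 100, 100]], format="XYXY", spatial_size=(480, 640))

    trans = transforms.Compose(
        [
            transforms.ScaleJitter(target_size=(480, 640), antialias=True),
            transforms.RandomShortestSize(min_size=[400, 500, 600], max_size=1000, antialias=True),
        ]
    )
    out_image, out_boxes = trans(image, boxes)
    print(out_image.shape, out_boxes.spatial_size)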
@@ -20,7 +20,6 @@

 import torchvision

-# sphinx_gallery_thumbnail_number = -1

 def show(sample):
    import matplotlib.pyplot as plt
@@ -125,6 +124,7 @@ def load_example_coco_detection_dataset(**kwargs):
 torch.manual_seed(3141)
 sample = dataset[0]

+# sphinx_gallery_thumbnail_number = 2
 show(sample)

From 1150f1cafb568c46950ff403308b41f7ab6a54bb Mon Sep 17 00:00:00 2001
From: mpearce25
Date: Fri, 24 Feb 2023 04:35:24 -0500
Subject: [PATCH 12/27] Singular Sanitize BoundingBox (#7316)

Co-authored-by: Nicolas Hug
---
 gallery/plot_transforms_v2_e2e.py      |  4 ++--
 test/test_transforms_v2.py             | 26 +++++++++++++-------------
 test/test_transforms_v2_consistency.py |  2 +-
 torchvision/transforms/v2/__init__.py  |  2 +-
 torchvision/transforms/v2/_geometry.py |  4 ++--
 torchvision/transforms/v2/_misc.py     |  6 +++---
 6 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
index 533a3d5d752..aa25d214f31 100644
--- a/gallery/plot_transforms_v2_e2e.py
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -105,13 +105,13 @@ def load_example_coco_detection_dataset(**kwargs):
        transforms.RandomHorizontalFlip(),
        transforms.ToImageTensor(),
        transforms.ConvertImageDtype(torch.float32),
-        transforms.SanitizeBoundingBoxes(),
+        transforms.SanitizeBoundingBox(),
    ]
 )

 ########################################################################################################################
 # .. note::
 #     Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBox` transform is a no-op in this example, it
 #     should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
 #     the corresponding labels and optionally masks. It is particularly critical to add it if
 #     :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.

diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 9173ec14f2c..93d5f17fcbe 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -275,7 +275,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device
            boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)),
            labels=torch.tensor([3]),
        )
-        assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4)
+        assert transforms.SanitizeBoundingBox()(sample)["boxes"].shape == (0, 4)

    @parametrize(
        [
@@ -1876,7 +1876,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
            transforms.ConvertImageDtype(torch.float),
        ]
    if sanitize:
-        t += [transforms.SanitizeBoundingBoxes()]
+        t += [transforms.SanitizeBoundingBox()]
    t = transforms.Compose(t)

    num_boxes = 5
@@ -1917,7 +1917,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
    # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It
    # doesn't remove them strictly speaking, it just marks some boxes as
    # degenerate and those boxes will be later removed by
-    # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize
+    # SanitizeBoundingBox(), which we add to the pipelines if the sanitize
    # param is True.
    # Note that the values below are probably specific to the random seed
    # set above (which is fine).
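As a sketch of how the renamed transform is meant to be used after ``RandomIoUCrop`` (the image size, boxes, and labels
below are made-up values for the example, not taken from the test suite):

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    from torchvision import datapoints
    from torchvision.transforms import v2 as transforms

    # RandomIoUCrop only marks boxes outside the crop as degenerate;
    # SanitizeBoundingBox then removes them together with their labels.
    pipeline = transforms.Compose(
        [
            transforms.RandomIoUCrop(),
            transforms.SanitizeBoundingBox(),
        ]
    )

    image = datapoints.Image(torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8))
    target = {
        "boxes": datapoints.BoundingBox(
            [[10, 10, 50, 50], [100, 100, 200, 200]], format="XYXY", spatial_size=(224, 224)
        ),
        "labels": torch.tensor([1, 2]),
    }
    out_image, out_target = pipeline(image, target)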
@@ -1989,7 +1989,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): img = sample.pop("image") sample = (img, sample) - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + out = transforms.SanitizeBoundingBox(min_size=min_size, labels_getter=labels_getter)(sample) if sample_type is tuple: out_image = out[0] @@ -2023,13 +2023,13 @@ def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): sample = {key: labels, "another_key": "whatever"} if sample_type is tuple: sample = (None, sample, "whatever_again") - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(sample) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), # it takes precedence over other keys which would otherwise be a match d = {key: "something_else", "labels": labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(d) is labels def test_sanitize_bounding_boxes_errors(): @@ -2041,25 +2041,25 @@ def test_sanitize_bounding_boxes_errors(): ) with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) + transforms.SanitizeBoundingBox(min_size=0) with pytest.raises(ValueError, match="labels_getter should either be a str"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + transforms.SanitizeBoundingBox(labels_getter=12) with pytest.raises(ValueError, match="Could not infer where the labels are"): bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + transforms.SanitizeBoundingBox()(bad_labels_key) with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"): not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0])) - transforms.SanitizeBoundingBoxes()(not_a_dict) + transforms.SanitizeBoundingBox()(not_a_dict) with pytest.raises(ValueError, match="must be a tensor"): not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + transforms.SanitizeBoundingBox()(not_a_tensor) with pytest.raises(ValueError, match="Number of boxes"): different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) with pytest.raises(ValueError, match="boxes must be of shape"): bad_bbox = datapoints.BoundingBox( # batch with 2 elements @@ -2071,7 +2071,7 @@ def test_sanitize_bounding_boxes_errors(): spatial_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) @pytest.mark.parametrize( diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 43f17c9b15a..059a230ee5c 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -1099,7 +1099,7 @@ def make_label(extra_dims, categories): v2_transforms.Compose( [ v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + v2_transforms.SanitizeBoundingBox(labels_getter=lambda sample: sample[1]["labels"]), ] ), {"with_mask": False}, diff --git 
a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py
index 7ad72c00934..6573446a33a 100644
--- a/torchvision/transforms/v2/__init__.py
+++ b/torchvision/transforms/v2/__init__.py
@@ -40,7 +40,7 @@
    TenCrop,
 )
 from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype
-from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBoxes, ToDtype
+from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBox, ToDtype
 from ._temporal import UniformTemporalSubsample
 from ._type_conversion import PILToTensor, ToImagePIL, ToImageTensor, ToPILImage
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
index c3342eb9926..b2618bb892f 100644
--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -1114,7 +1114,7 @@ class RandomIoUCrop(Transform):

    .. warning::
        In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
-        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
+        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBox`, either immediately
        after or later in the transforms pipeline.

    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
@@ -1222,7 +1222,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:

        if isinstance(output, datapoints.BoundingBox):
            # We "mark" the invalid boxes as degenerate, and they can be
-            # removed by a later call to SanitizeBoundingBoxes()
+            # removed by a later call to SanitizeBoundingBox()
            output[~params["is_within_crop_area"]] = 0
        return output
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 8cc4aa6a3db..53975a2ad2a 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -246,7 +246,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
    return inpt.to(dtype=dtype)


-class SanitizeBoundingBoxes(Transform):
+class SanitizeBoundingBox(Transform):
    # This removes boxes and their corresponding labels:
    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
@@ -269,7 +269,7 @@ def __init__(
        elif callable(labels_getter):
            self._labels_getter = labels_getter
        elif isinstance(labels_getter, str):
-            self._labels_getter = lambda inputs: SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)[
+            self._labels_getter = lambda inputs: SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)[
                labels_getter  # type: ignore[index]
            ]
        elif labels_getter is None:
@@ -300,7 +300,7 @@ def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]:
 def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
    # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive
    # Returns None if nothing is found
-    inputs = SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)
+    inputs = SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)
    candidate_key = None
    with suppress(StopIteration):
        candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
From 384162e0c86e15cf9965bed2450b90fcddcaca48 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 09:44:30 +0000
Subject: [PATCH 
13/27] Misc docs transforms v2(#7314) Co-authored-by: Philip Meier Co-authored-by: vfdev --- docs/source/transforms.rst | 2 ++ torchvision/transforms/v2/_color.py | 33 +++++++------------ torchvision/transforms/v2/_misc.py | 51 +++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 32 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index ddd6f37d083..1dec6bedf15 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -190,6 +190,7 @@ Miscellaneous v2.RandomErasing Lambda v2.Lambda + v2.SanitizeBoundingBox .. _conversion_transforms: @@ -210,6 +211,7 @@ Conversion ConvertImageDtype v2.ConvertImageDtype v2.ConvertDtype + v2.ToDtype Auto-Augmentation ----------------- diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 2a581bf5640..237e8d6181a 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -15,17 +15,11 @@ class Grayscale(Transform): .. betastatus:: Grayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions Args: num_output_channels (int): (1 or 3) number of channels desired for output image - - Returns: - PIL Image: Grayscale version of the input. - - - If ``num_output_channels == 1`` : returned image is single channel - - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b """ _v1_transform_cls = _transforms.Grayscale @@ -50,18 +44,13 @@ class RandomGrayscale(_RandomApplyTransform): .. betastatus:: RandomGrayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, + where ... means an arbitrary number of leading dimensions + + The output has the same number of channels as the input. Args: p (float): probability that image should be converted to grayscale. - - Returns: - PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged - with probability (1-p). - - If input image is 1 channel: grayscale version is 1 channel - - If input image is 3 channel: grayscale version is 3 channel with r == g == b - """ _v1_transform_cls = _transforms.RandomGrayscale @@ -89,7 +78,7 @@ class ColorJitter(Transform): .. betastatus:: ColorJitter transform - If the image is torch Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. @@ -295,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform): .. betastatus:: RandomEqualize transform - If the image is torch Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". @@ -334,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform): .. 
betastatus:: RandomPosterize transform

-    If the image is torch Tensor, it should be of type torch.uint8,
+    If the input is a :class:`torch.Tensor`, it should be of type torch.uint8,
    and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

@@ -383,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform):

    .. betastatus:: RandomAutocontrast transform

-    If the image is torch Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

@@ -402,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform):

    .. betastatus:: RandomAdjustSharpness transform

-    If the image is torch Tensor,
+    If the input is a :class:`torch.Tensor`,
    it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 53975a2ad2a..2237334f7a2 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -15,13 +15,14 @@
 from .utils import has_any, is_simple_tensor, query_bounding_box


+# TODO: do we want/need to expose this?
 class Identity(Transform):
    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return inpt


 class Lambda(Transform):
-    """[BETA] Apply a user-defined lambda as a transform.
+    """[BETA] Apply a user-defined function as a transform.

    .. betastatus:: Lambda transform

@@ -52,7 +53,7 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
-    """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.
+    """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.

    .. betastatus:: LinearTransformation transform

@@ -135,7 +136,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
-    """[BETA] Normalize a tensor image with mean and standard deviation.
+    """[BETA] Normalize a tensor image or video with mean and standard deviation.

    .. betastatus:: Normalize transform

@@ -179,7 +180,7 @@ class GaussianBlur(Transform):

    .. betastatus:: GaussianBlur transform

-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
@@ -188,9 +189,6 @@
        creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
        of float (min, max), sigma is chosen uniformly at random to lie in the
        given range.
-
-    Returns:
-        PIL Image or Tensor: Gaussian blurred version of the input image.
    """

    _v1_transform_cls = _transforms.GaussianBlur
@@ -225,6 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ToDtype(Transform):
+    """[BETA] Converts the input to a specific dtype.
+
+    .. betastatus:: ToDtype transform
+
+    Args:
+        dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify
+            per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``. 
+    """
+
    _transformed_types = (torch.Tensor,)

    def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None:
@@ -247,9 +254,33 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class SanitizeBoundingBox(Transform):
-    # This removes boxes and their corresponding labels:
-    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
-    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
+
+    .. betastatus:: SanitizeBoundingBox transform
+
+    This transform removes bounding boxes and their associated labels/masks that:
+
+    - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes, but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input.
+            It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies
+            the key whose value corresponds to the labels. It can also be a callable that takes the same input
+            as the transform, and returns the labels.
+            By default, this will try to find a "labels" key in the input, if
+            the input is a dict or it is a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+    """

    def __init__(
        self,
From a01e485eac0685548bdc5a63ec0691d0409b5b5e Mon Sep 17 00:00:00 2001
From: vfdev
Date: Fri, 24 Feb 2023 11:08:06 +0100
Subject: [PATCH 14/27] Minor updates in autoaugment, augment docstring v2
 (#7317)

Co-authored-by: Nicolas Hug
---
 torchvision/transforms/v2/_augment.py      | 12 +++---
 torchvision/transforms/v2/_auto_augment.py | 44 +++++++++++++---------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py
index b5aac9ca9a2..0df7e0f249a 100644
--- a/torchvision/transforms/v2/_augment.py
+++ b/torchvision/transforms/v2/_augment.py
@@ -13,7 +13,7 @@


 class RandomErasing(_RandomApplyTransform):
-    """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels.
+    """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels.

    .. betastatus:: RandomErasing transform

@@ -21,14 +21,14 @@
    'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896

    Args:
-        p: probability that the random erasing operation will be performed.
-        scale: range of proportion of erased area against input image.
-        ratio: range of aspect ratio of erased area.
-        value: erasing value. Default is 0. If a single int, it is used to
+        p (float, optional): probability that the random erasing operation will be performed. 
+        scale (tuple of float, optional): range of proportion of erased area against input image.
+        ratio (tuple of float, optional): range of aspect ratio of erased area.
+        value (number or tuple of numbers, optional): erasing value. Default is 0. If a single int, it is used to
            erase all pixels. If a tuple of length 3, it is used to erase
            R, G, B channels respectively.
            If the string ``'random'``, each pixel is erased with random values.
-        inplace: boolean to make this transform inplace. Default set to False.
+        inplace (bool, optional): whether to make this transform inplace. Default is ``False``.

    Returns:
        Erased input.
diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py
index 98e23b99796..2cd88c1a74d 100644
--- a/torchvision/transforms/v2/_auto_augment.py
+++ b/torchvision/transforms/v2/_auto_augment.py
@@ -167,14 +167,16 @@ class AutoAugment(_AutoAugmentBase):

    .. betastatus:: AutoAugment transform

-    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
-        policy (AutoAugmentPolicy): Desired policy enum defined by
+        policy (AutoAugmentPolicy, optional): Desired policy enum defined by
            :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
@@ -342,15 +344,17 @@ class RandAugment(_AutoAugmentBase):

    .. betastatus:: RandAugment transform

-    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
-        num_ops (int): Number of augmentation transformations to apply sequentially.
-        magnitude (int): Magnitude for all the transformations.
-        num_magnitude_bins (int): The number of different magnitude values.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        num_ops (int, optional): Number of augmentation transformations to apply sequentially.
+        magnitude (int, optional): Magnitude for all the transformations.
+        num_magnitude_bins (int, optional): The number of different magnitude values.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
@@ -423,13 +427,15 @@ class TrivialAugmentWide(_AutoAugmentBase):

    .. 
betastatus:: TrivialAugmentWide transform - If the image is torch Tensor, it should be of type torch.uint8, and it is expected + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". Args: - num_magnitude_bins (int): The number of different magnitude values. - interpolation (InterpolationMode): Desired interpolation enum defined by + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. fill (sequence or number, optional): Pixel fill value for the area outside the transformed @@ -492,18 +498,20 @@ class AugMix(_AutoAugmentBase): .. betastatus:: AugMix transform - If the image is torch Tensor, it should be of type torch.uint8, and it is expected + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". Args: - severity (int): The severity of base augmentation operators. Default is ``3``. - mixture_width (int): The number of augmentation chains. Default is ``3``. - chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + severity (int, optional): The severity of base augmentation operators. Default is ``3``. + mixture_width (int, optional): The number of augmentation chains. Default is ``3``. + chain_depth (int, optional): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. Default is ``-1``. - alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. - all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. - interpolation (InterpolationMode): Desired interpolation enum defined by + alpha (float, optional): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool, optional): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. 
fill (sequence or number, optional): Pixel fill value for the area outside the transformed From 0dfc317f20b76a44f93c5903aa56802272c6fe54 Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 11:57:14 +0100 Subject: [PATCH 15/27] Fixed broken test_random_choice (#7315) --- test/test_transforms_v2.py | 2 +- test/test_transforms_v2_consistency.py | 2 +- torchvision/transforms/v2/_container.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 93d5f17fcbe..9beded4c957 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1359,7 +1359,7 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 059a230ee5c..a8a87cd43dd 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -822,7 +822,7 @@ def test_random_choice(self, probabilities): v2_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - probabilities=probabilities, + p=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 27affc7100b..7f9df337352 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -139,7 +139,7 @@ def __init__( p = [1] * len(transforms) elif len(p) != len(transforms): raise ValueError( - f"The number of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" + f"Length of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" ) super().__init__() From 9d768dd1ef2c0858a889cb033eefd855aafd14f9 Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 12:11:29 +0100 Subject: [PATCH 16/27] Updated _meta.py docstrings (#7320) Co-authored-by: Nicolas Hug --- docs/source/transforms.rst | 2 ++ torchvision/transforms/v2/_meta.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 1dec6bedf15..8e3c60085de 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -191,6 +191,7 @@ Miscellaneous Lambda v2.Lambda v2.SanitizeBoundingBox + v2.ClampBoundingBox .. _conversion_transforms: @@ -212,6 +213,7 @@ Conversion v2.ConvertImageDtype v2.ConvertDtype v2.ToDtype + v2.ConvertBoundingBoxFormat Auto-Augmentation ----------------- diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 7d0f0ec39f9..94ec851d045 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,6 +9,15 @@ class ConvertBoundingBoxFormat(Transform): + """[BETA] Convert bounding box coordinates to the given ``format``, e.g. from "CXCYWH" to "XYXY". + + .. betastatus:: ConvertBoundingBoxFormat transform + + Args: + format (str or datapoints.BoundingBoxFormat): output bounding box format. + Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and + string values match the enums, e.g. "XYXY" or "XYWH" etc. 
+ """ _transformed_types = (datapoints.BoundingBox,) def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: @@ -22,7 +31,7 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): - """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly. .. betastatus:: ConvertDtype transform @@ -63,6 +72,13 @@ def _transform( class ClampBoundingBox(Transform): + """[BETA] Clamp bounding boxes to their corresponding image dimensions. + + The clamping is done according to the bounding boxes' ``spatial_size`` meta-data. + + .. betastatus:: ClampBoundingBox transform + + """ _transformed_types = (datapoints.BoundingBox,) def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox: From 4c0638b5ee1cbe759c3cb6aac95a450a480c2581 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 11:21:25 +0000 Subject: [PATCH 17/27] remove strEnum from BoundingBoxFormat (#7322) --- test/test_datapoints.py | 2 +- torchvision/datapoints/_bounding_box.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 5b875a6ef20..39c05123333 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -28,5 +28,5 @@ def test_bbox_instance(data, format): assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat.from_str(format.upper()) + format = datapoints.BoundingBoxFormat[(format.upper())] assert bboxes.format == format diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 1dc46f8f21a..75e779f0b21 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -1,18 +1,18 @@ from __future__ import annotations +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import torch -from torchvision._utils import StrEnum from torchvision.transforms import InterpolationMode # TODO: this needs to be moved out of transforms from ._datapoint import _FillTypeJIT, Datapoint -class BoundingBoxFormat(StrEnum): - XYXY = StrEnum.auto() - XYWH = StrEnum.auto() - CXCYWH = StrEnum.auto() +class BoundingBoxFormat(Enum): + XYXY = "XYXY" + XYWH = "XYWH" + CXCYWH = "CXCYWH" class BoundingBox(Datapoint): @@ -39,7 +39,7 @@ def __new__( tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): - format = BoundingBoxFormat.from_str(format.upper()) + format = BoundingBoxFormat[format.upper()] return cls._wrap(tensor, format=format, spatial_size=spatial_size) From d21e38a9a375077115176f73784fcd459a2cc83c Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 12:43:31 +0100 Subject: [PATCH 18/27] Updated _type_conversion.py docs (#7324) Co-authored-by: Nicolas Hug --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_type_conversion.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 8e3c60085de..0e9b053fb72 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -209,6 +209,7 @@ Conversion v2.ToTensor PILToTensor v2.PILToTensor + v2.ToImageTensor ConvertImageDtype v2.ConvertImageDtype v2.ConvertDtype diff --git 
a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index b0743feb10d..504c5cc3d70 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -27,6 +27,13 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImageTensor(Transform): + """[BETA] Convert a tensor or an ndarray or PIL Image to :class:`~torchvision.datapoints.Image`. + + .. betastatus:: ToImageTensor transform + + This transform does not support torchscript. + """ + _transformed_types = (is_simple_tensor, PIL.Image.Image, np.ndarray) def _transform( From 4e040ee0bd663fe5616aee3eaa901322da3c9c9c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 24 Feb 2023 13:05:56 +0100 Subject: [PATCH 19/27] add docs for datapoints (#7312) Co-authored-by: Nicolas Hug --- docs/source/datapoints.rst | 13 +++++++++ docs/source/index.rst | 1 + torchvision/datapoints/_bounding_box.py | 37 +++++++++++++++++++++++++ torchvision/datapoints/_image.py | 13 +++++++++ torchvision/datapoints/_mask.py | 13 +++++++++ torchvision/datapoints/_video.py | 12 ++++++++ 6 files changed, 89 insertions(+) create mode 100644 docs/source/datapoints.rst diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst new file mode 100644 index 00000000000..07e20b090e6 --- /dev/null +++ b/docs/source/datapoints.rst @@ -0,0 +1,13 @@ +Datapoints +========== + +.. currentmodule:: torchvision.datapoints +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBox + Mask diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..ac047ff5869 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,6 +31,7 @@ architectures, and common image transformations for computer vision. :maxdepth: 2 :caption: Package Reference + datapoints transforms models datasets diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 75e779f0b21..d8441823c3e 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -10,12 +10,35 @@ class BoundingBoxFormat(Enum): + """[BETA] Coordinate format of a bounding box. + + Available formats are + + * ``XYXY`` + * ``XYWH`` + * ``CXCYWH`` + """ + XYXY = "XYXY" XYWH = "XYWH" CXCYWH = "CXCYWH" class BoundingBox(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for bounding boxes. + + Args: + data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. + format (BoundingBoxFormat, str): Format of the bounding box. + spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + format: BoundingBoxFormat spatial_size: Tuple[int, int] @@ -52,6 +75,20 @@ def wrap_like( format: Optional[BoundingBoxFormat] = None, spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: + """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference. 
+
+        Args:
+            other (BoundingBox): Reference bounding box.
+            tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox`.
+            format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the
+                reference.
+            spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If
+                omitted, it is taken from the reference.
+
+        """
+        if isinstance(format, str):
+            format = BoundingBoxFormat.from_str(format.upper())
+
        return cls._wrap(
            tensor,
            format=format if format is not None else other.format,
diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py
index 21dfe5a5cd6..e47a6c10fc3 100644
--- a/torchvision/datapoints/_image.py
+++ b/torchvision/datapoints/_image.py
@@ -10,6 +10,19 @@


 class Image(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for images.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the image. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the image. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the image is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the image. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Image:
        image = tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py
index bb70ec12224..0135d793d32 100644
--- a/torchvision/datapoints/_mask.py
+++ b/torchvision/datapoints/_mask.py
@@ -10,6 +10,19 @@


 class Mask(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the mask. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the mask. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the mask is constructed on the CPU. 
+        requires_grad (bool, optional): Whether autograd should record operations on the mask. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Mask:
        return tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py
index ab51c10233d..a6fbe2bd473 100644
--- a/torchvision/datapoints/_video.py
+++ b/torchvision/datapoints/_video.py
@@ -9,6 +9,18 @@


 class Video(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for videos.
+
+    Args:
+        data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+        dtype (torch.dtype, optional): Desired data type of the video. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the video. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the video is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the video. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Video:
        video = tensor.as_subclass(cls)
From f62a045d8fabcbf941189b6f9e6673e164825015 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 13:46:18 +0000
Subject: [PATCH 20/27] Various doc enhancements (#7326)

Co-authored-by: Philip Meier
Co-authored-by: vfdev
---
 docs/source/conf.py                           |  1 +
 docs/source/datapoints.rst                    |  6 ++++++
 docs/source/index.rst                         |  2 +-
 docs/source/transforms.rst                    |  8 +++++++-
 torchvision/transforms/transforms.py          | 15 +++++++++++----
 torchvision/transforms/v2/_container.py       |  4 +---
 torchvision/transforms/v2/_deprecated.py      |  2 +-
 torchvision/transforms/v2/_meta.py            |  4 +++-
 torchvision/transforms/v2/_misc.py            |  8 +++++---
 torchvision/transforms/v2/_type_conversion.py |  7 ++++---
 10 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8b4ce17de9f..6d748f5b717 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -34,6 +34,7 @@
 sys.path.append(os.path.abspath("."))

 torchvision.disable_beta_transforms_warning()
+import torchvision.datapoints  # Don't remove, otherwise the docs for datapoints aren't linked properly

 # -- General configuration ------------------------------------------------

diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst
index 07e20b090e6..1cc62413e66 100644
--- a/docs/source/datapoints.rst
+++ b/docs/source/datapoints.rst
@@ -2,6 +2,12 @@ Datapoints
 ==========

 .. currentmodule:: torchvision.datapoints
+
+Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` transforms use under the hood to
+dispatch their inputs to the appropriate lower-level kernels. Most users do not
+need to manipulate datapoints directly and can simply rely on dataset wrapping -
+see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+
 .. autosummary::
    :toctree: generated/
    :template: class.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ac047ff5869..bc38fdb0307 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -31,8 +31,8 @@ architectures, and common image transformations for computer vision.
   :maxdepth: 2
   :caption: Package Reference

-   datapoints
   transforms
+   datapoints
   models
   datasets
   utils
diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 0e9b053fb72..1fe3e78f55f 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -198,6 +198,12 @@ Miscellaneous
 Conversion
 ----------

+.. note::
+    Beware, some of these conversion transforms below will scale the values
+    while performing the conversion, while some may not do any scaling. By
+    scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0,
+    255] range into [0, 1] (and vice-versa).
+
 .. autosummary::
    :toctree: generated/
    :template: class.rst
diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py
index 90cb0374eee..95eb9199ef3 100644
--- a/torchvision/transforms/transforms.py
+++ b/torchvision/transforms/transforms.py
@@ -105,7 +105,9 @@ def __repr__(self) -> str:


 class ToTensor:
-    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. 
This transform does not support torchscript. + """Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + This transform does not support torchscript. Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] @@ -139,7 +141,9 @@ def __repr__(self) -> str: class PILToTensor: - """Convert a ``PIL Image`` to a tensor of the same type. This transform does not support torchscript. + """Convert a PIL Image to a tensor of the same type - this does not scale values. + + This transform does not support torchscript. Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). """ @@ -166,7 +170,8 @@ def __repr__(self) -> str: class ConvertImageDtype(torch.nn.Module): - """Convert a tensor image to the given ``dtype`` and scale the values accordingly + """Convert a tensor image to the given ``dtype`` and scale the values accordingly. + This function does not support PIL Image. Args: @@ -194,7 +199,9 @@ def forward(self, image): class ToPILImage: - """Convert a tensor or an ndarray to PIL Image. This transform does not support torchscript. + """Convert a tensor or an ndarray to PIL Image - this does not scale values. + + This transform does not support torchscript. Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C to a PIL Image while preserving the value range. diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 7f9df337352..2f34a58902e 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -138,9 +138,7 @@ def __init__( if p is None: p = [1] * len(transforms) elif len(p) != len(transforms): - raise ValueError( - f"Length of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" - ) + raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}") super().__init__() diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index c44e6b08d11..b5544ecfd49 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,7 +10,7 @@ class ToTensor(Transform): - """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. .. betastatus:: ToTensor transform diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 94ec851d045..7f28e25c602 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,7 +9,7 @@ class ConvertBoundingBoxFormat(Transform): - """[BETA] Convert bounding box coordinates to the given ``format``, e.g. from "CXCYWH" to "XYXY". + """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY". .. betastatus:: ConvertBoundingBoxFormat transform @@ -18,6 +18,7 @@ class ConvertBoundingBoxFormat(Transform): Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and string values match the enums, e.g. "XYXY" or "XYWH" etc. """ + _transformed_types = (datapoints.BoundingBox,) def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: @@ -79,6 +80,7 @@ class ClampBoundingBox(Transform): .. 
betastatus:: ClampBoundingBox transform """ + _transformed_types = (datapoints.BoundingBox,) def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox: diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 2237334f7a2..40d57856292 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -223,13 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ToDtype(Transform): - """[BETA] Converts the input to a specific dtype. + """[BETA] Converts the input to a specific dtype - this does not scale values. .. betastatus:: ToDtype transform Args: - dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify - per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``. + dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to. + A dict can be passed to specify per-datapoint conversions, e.g. + ``dtype={datapoints.Image: torch.float32, datapoints.Video: + torch.float64}``. """ _transformed_types = (torch.Tensor,) diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 504c5cc3d70..92de314608c 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -11,7 +11,7 @@ class PILToTensor(Transform): - """[BETA] Convert a ``PIL Image`` to a tensor of the same type. + """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values. .. betastatus:: PILToTensor transform @@ -27,7 +27,8 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImageTensor(Transform): - """[BETA] Convert a tensor or an ndarray or PIL Image to :class:`~torchvision.datapoints.Image`. + """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` + ; this does not scale values. .. betastatus:: ToImageTensor transform @@ -43,7 +44,7 @@ def _transform( class ToImagePIL(Transform): - """[BETA] Convert a tensor or an ndarray to PIL Image. + """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values. .. betastatus:: ToImagePIL transform From 818b98904e5579a4e4d2335a5b5ab03be1fda3c3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 13:52:02 +0000 Subject: [PATCH 21/27] Add docs for UniformTemporalSubsample (#7325) Co-authored-by: Philip Meier Co-authored-by: vfdev --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_temporal.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 1fe3e78f55f..6957e79bbfa 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -192,6 +192,7 @@ Miscellaneous v2.Lambda v2.SanitizeBoundingBox v2.ClampBoundingBox + v2.UniformTemporalSubsample .. _conversion_transforms: diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py index b26d6b0450f..ad7526bc4a4 100644 --- a/torchvision/transforms/v2/_temporal.py +++ b/torchvision/transforms/v2/_temporal.py @@ -7,6 +7,19 @@ class UniformTemporalSubsample(Transform): + """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video. + + .. betastatus:: UniformTemporalSubsample transform + + Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension. 
+
+    When ``num_samples`` is larger than the size of the temporal dimension of the video, it
+    will sample frames based on nearest neighbor interpolation.
+
+    Args:
+        num_samples (int): The number of equispaced samples to be selected.
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Video)
 
     def __init__(self, num_samples: int):

From 198e6e49770171351ea554e5c0a3edbdcf181283 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 15:12:46 +0100
Subject: [PATCH 22/27] fix BoundingBox.wrap_like (#7327)

---
 torchvision/datapoints/_bounding_box.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py
index d8441823c3e..11d42f171e4 100644
--- a/torchvision/datapoints/_bounding_box.py
+++ b/torchvision/datapoints/_bounding_box.py
@@ -87,7 +87,7 @@ def wrap_like(
     """
 
     if isinstance(format, str):
-        format = BoundingBoxFormat.from_str(format.upper())
+        format = BoundingBoxFormat[format.upper()]
 
     return cls._wrap(
         tensor,

From d8083d5dbe6f63a0abfa348f403c0d0527eb9548 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 14:45:06 +0000
Subject: [PATCH 23/27] Add docs for functionals v2 (#7328)

Co-authored-by: Philip Meier
---
 docs/source/transforms.rst | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 6957e79bbfa..22e0889a480 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -5,6 +5,22 @@ Transforming and augmenting images
 
 .. currentmodule:: torchvision.transforms
 
+
+.. note::
+    In 0.15, we released a new set of transforms available in the
+    ``torchvision.transforms.v2`` namespace, which add support for transforming
+    not just images but also bounding boxes, masks, or videos. These transforms
+    are fully backward compatible with the current ones, and you'll see them
+    documented below with a `v2.` prefix. To get started with those new
+    transforms, you can check out
+    :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+    Note that these transforms are still BETA, and while we don't expect major
+    breaking changes in the future, some APIs may still change according to user
+    feedback. Please submit any feedback you may have in
+    https://github.com/pytorch/vision/issues/6753, and you can also check out
+    https://github.com/pytorch/vision/issues/7319 to learn more about the APIs
+    that we suspect might involve future changes.
+
 Transforms are common image transformations available in the
 ``torchvision.transforms`` module. They can be chained together using
 :class:`Compose`.
@@ -253,6 +269,14 @@ Functional Transforms
 
 .. currentmodule:: torchvision.transforms.functional
 
+
+.. note::
+    You'll find below the documentation for the existing
+    ``torchvision.transforms.functional`` namespace. The
+    ``torchvision.transforms.v2.functional`` namespace exists as well and can be
+    used! The same functionals are present, so you simply need to change your
+    import to rely on the ``v2`` namespace.
+
 Functional transforms give you fine-grained control of the transformation pipeline.
 As opposed to the transformations above, functional transforms don't contain a random number
 generator for their parameters.
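As a quick illustration of the import swap described in the note above (a minimal sketch, not part of the patch series, assuming a torchvision build that ships the beta ``torchvision.transforms.v2`` namespace):

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    # v1 code would do: from torchvision.transforms import functional as F
    from torchvision.transforms.v2 import functional as F  # only the import changes

    img = torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)
    out = F.resize(img, [224, 224], antialias=True)  # same call signature as the v1 functional
    print(out.shape)  # torch.Size([3, 224, 224])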
From 5ec46adb10bb309848ddffaa361c3c57956aa5fe Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 15:45:33 +0100
Subject: [PATCH 24/27] add gallery example for datapoints (#7321)

Co-authored-by: vfdev
Co-authored-by: Nicolas Hug
---
 gallery/plot_datapoints.py | 132 +++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 gallery/plot_datapoints.py

diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py
new file mode 100644
index 00000000000..83ca6793598
--- /dev/null
+++ b/gallery/plot_datapoints.py
@@ -0,0 +1,132 @@
+"""
+==============
+Datapoints FAQ
+==============
+
+The :mod:`torchvision.datapoints` namespace was introduced together with ``torchvision.transforms.v2``. This example
+showcases what these datapoints are and how they behave. This is a fairly low-level topic that most users will not need
+to worry about: you do not need to understand the internals of datapoints to efficiently rely on
+``torchvision.transforms.v2``. It may however be useful for advanced users trying to implement their own datasets
+or transforms, or to work directly with the datapoints.
+"""

+import PIL.Image
+
+import torch
+import torchvision
+
+# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that
+# some APIs may slightly change in the future
+torchvision.disable_beta_transforms_warning()
+
+from torchvision import datapoints
+
+
+########################################################################################################################
+# What are datapoints?
+# --------------------
+#
+# Datapoints are zero-copy tensor subclasses:

+tensor = torch.rand(3, 256, 256)
+image = datapoints.Image(tensor)
+
+assert isinstance(image, torch.Tensor)
+assert image.data_ptr() == tensor.data_ptr()


+########################################################################################################################
+# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
+# for the input data.
+#
+# What datapoints are supported?
+# ------------------------------
+#
+# So far :mod:`torchvision.datapoints` supports four types of datapoints:
+#
+# * :class:`~torchvision.datapoints.Image`
+# * :class:`~torchvision.datapoints.Video`
+# * :class:`~torchvision.datapoints.BoundingBox`
+# * :class:`~torchvision.datapoints.Mask`
+#
+# How do I construct a datapoint?
+# -------------------------------
+#
+# Each datapoint class takes any tensor-like data that can be turned into a :class:`~torch.Tensor`:

+image = datapoints.Image([[[[0, 1], [1, 0]]]])
+print(image)


+########################################################################################################################
+# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
+# parameters.
+ +float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +######################################################################################################################## +# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a +# :class:`PIL.Image.Image` directly: + +image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg")) +print(image.shape, image.dtype) + +######################################################################################################################## +# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example, +# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the spatial size of the +# corresponding image alongside the actual values: + +bounding_box = datapoints.BoundingBox( + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] +) +print(bounding_box) + + +######################################################################################################################## +# Do I have to wrap the output of the datasets myself? +# ---------------------------------------------------- +# +# Only if you are using custom datasets. For the built-in ones, you can use +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that the function also supports subclasses of the +# built-in datasets. Meaning, if your custom dataset subclasses from a built-in one and the output type is the same, you +# also don't have to wrap manually. +# +# How do the datapoints behave inside a computation? +# -------------------------------------------------- +# +# Datapoints look and feel just like regular tensors. Everything that is supported on a plain :class:`torch.Tensor` +# also works on datapoints. +# Since for most operations involving datapoints, it cannot be safely inferred whether the result should retain the +# datapoint type, we choose to return a plain tensor instead of a datapoint (this might change, see note below): + +assert isinstance(image, datapoints.Image) + +new_image = image + 0 + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) + +######################################################################################################################## +# .. note:: +# +# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# There are two exceptions to this rule: +# +# 1. The operations :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, and :meth:`~torch.Tensor.requires_grad_` +# retain the datapoint type. +# 2. Inplace operations on datapoints cannot change the type of the datapoint they are called on. 
However, if you use +# the flow style, the returned value will be unwrapped: + +image = datapoints.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +assert isinstance(image, torch.Tensor) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) +assert (new_image == image).all() From db6630ec37e90cb5248b748efcf053954afe7d83 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 16:16:34 +0000 Subject: [PATCH 25/27] Change betastatus doc warning and v2 import warning (#7329) --- docs/source/beta_status.py | 15 ++++++-- docs/source/transforms.rst | 8 ++--- torchvision/__init__.py | 9 ++--- torchvision/transforms/v2/_augment.py | 2 +- torchvision/transforms/v2/_auto_augment.py | 8 ++--- torchvision/transforms/v2/_color.py | 20 +++++------ torchvision/transforms/v2/_container.py | 8 ++--- torchvision/transforms/v2/_deprecated.py | 2 +- torchvision/transforms/v2/_geometry.py | 36 +++++++++---------- torchvision/transforms/v2/_meta.py | 6 ++-- torchvision/transforms/v2/_misc.py | 12 +++---- torchvision/transforms/v2/_temporal.py | 2 +- torchvision/transforms/v2/_type_conversion.py | 6 ++-- 13 files changed, 73 insertions(+), 61 deletions(-) diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..4a0fdc72c0f 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + text = self.text.format(api_name=" ".join(self.content)) return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect major breaking changes, " + "some APIs may still change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check " + "out https://github.com/pytorch/vision/issues/7319 to learn " + "more about the APIs that we suspect might involve future changes." + ) + + def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 22e0889a480..0d6961bbe79 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -16,10 +16,10 @@ Transforming and augmenting images :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. Note that these transforms are still BETA, and while we don't expect major breaking changes in the future, some APIs may still change according to user - feedback. Please submit any feedback you may have in - https://github.com/pytorch/vision/issues/6753, and you can also check out - https://github.com/pytorch/vision/issues/7319 to learn more about the APIs - that we suspect might involve future changes. + feedback. Please submit any feedback you may have `here + `_, and you can also check + out `this issue `_ to learn + more about the APIs that we suspect might involve future changes. Transforms are common image transformations available in the ``torchvision.transforms`` module. 
They can be chained together using diff --git a/torchvision/__init__.py b/torchvision/__init__.py index f29da9cf644..eed24091a52 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -100,10 +100,11 @@ def _is_tracing(): _WARN_ABOUT_BETA_TRANSFORMS = True _BETA_TRANSFORMS_WARNING = ( "The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. " - "While we will try our best to maintain backward compatibility, " - "some APIs or behaviors might change without a deprecation cycle. " - "To help us improve these new features, please provide your feedback " - "here: https://github.com/pytorch/vision/issues/6753." + "While we do not expect major breaking changes, some APIs may still change " + "according to user feedback. Please submit any feedback you may have in " + "this issue: https://github.com/pytorch/vision/issues/6753, and you can also " + "check out https://github.com/pytorch/vision/issues/7319 to learn more about " + "the APIs that we suspect might involve future changes. " "You can silence this warning by calling torchvision.disable_beta_transform_warning()." ) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 0df7e0f249a..937e3508a87 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -15,7 +15,7 @@ class RandomErasing(_RandomApplyTransform): """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels. - .. betastatus:: RandomErasing transform + .. v2betastatus:: RandomErasing transform This transform does not support PIL Image. 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 2cd88c1a74d..34c0ced43d2 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -165,7 +165,7 @@ class AutoAugment(_AutoAugmentBase): r"""[BETA] AutoAugment data augmentation method based on `"AutoAugment: Learning Augmentation Strategies from Data" `_. - .. betastatus:: AutoAugment transform + .. v2betastatus:: AutoAugment transform This transformation works on images and videos only. @@ -342,7 +342,7 @@ class RandAugment(_AutoAugmentBase): `"RandAugment: Practical automated data augmentation with a reduced search space" `_. - .. betastatus:: RandAugment transform + .. v2betastatus:: RandAugment transform This transformation works on images and videos only. @@ -425,7 +425,7 @@ class TrivialAugmentWide(_AutoAugmentBase): r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. - .. betastatus:: TrivialAugmentWide transform + .. v2betastatus:: TrivialAugmentWide transform This transformation works on images and videos only. @@ -496,7 +496,7 @@ class AugMix(_AutoAugmentBase): r"""[BETA] AugMix data augmentation method based on `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. - .. betastatus:: AugMix transform + .. v2betastatus:: AugMix transform This transformation works on images and videos only. diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 237e8d6181a..4ad534c988b 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -13,7 +13,7 @@ class Grayscale(Transform): """[BETA] Convert images or videos to grayscale. - .. 
betastatus:: Grayscale transform + .. v2betastatus:: Grayscale transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions @@ -42,7 +42,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). - .. betastatus:: RandomGrayscale transform + .. v2betastatus:: RandomGrayscale transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions @@ -76,7 +76,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. - .. betastatus:: ColorJitter transform + .. v2betastatus:: ColorJitter transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -182,7 +182,7 @@ class RandomPhotometricDistort(Transform): """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot MultiBox Detector `_. - .. betastatus:: RandomPhotometricDistort transform + .. v2betastatus:: RandomPhotometricDistort transform This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` under the hood to adjust the contrast, saturation, hue, brightness, and also @@ -282,7 +282,7 @@ def _transform( class RandomEqualize(_RandomApplyTransform): """[BETA] Equalize the histogram of the given image or video with a given probability. - .. betastatus:: RandomEqualize transform + .. v2betastatus:: RandomEqualize transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -301,7 +301,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): """[BETA] Inverts the colors of the given image or video with a given probability. - .. betastatus:: RandomInvert transform + .. v2betastatus:: RandomInvert transform If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, where ... means it can have an arbitrary number of leading dimensions. @@ -321,7 +321,7 @@ class RandomPosterize(_RandomApplyTransform): """[BETA] Posterize the image or video with a given probability by reducing the number of bits for each color channel. - .. betastatus:: RandomPosterize transform + .. v2betastatus:: RandomPosterize transform If the input is a :class:`torch.Tensor`, it should be of type torch.uint8, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -346,7 +346,7 @@ class RandomSolarize(_RandomApplyTransform): """[BETA] Solarize the image or video with a given probability by inverting all pixel values above a threshold. - .. betastatus:: RandomSolarize transform + .. v2betastatus:: RandomSolarize transform If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, where ... means it can have an arbitrary number of leading dimensions. @@ -370,7 +370,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): """[BETA] Autocontrast the pixels of the given image or video with a given probability. - .. betastatus:: RandomAutocontrast transform + .. 
v2betastatus:: RandomAutocontrast transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -389,7 +389,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): """[BETA] Adjust the sharpness of the image or video with a given probability. - .. betastatus:: RandomAdjustSharpness transform + .. v2betastatus:: RandomAdjustSharpness transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 2f34a58902e..fffef4157bd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -10,7 +10,7 @@ class Compose(Transform): """[BETA] Composes several transforms together. - .. betastatus:: Compose transform + .. v2betastatus:: Compose transform This transform does not support torchscript. Please, see the note below. @@ -61,7 +61,7 @@ def extra_repr(self) -> str: class RandomApply(Transform): """[BETA] Apply randomly a list of transformations with a given probability. - .. betastatus:: RandomApply transform + .. v2betastatus:: RandomApply transform .. note:: In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of @@ -116,7 +116,7 @@ def extra_repr(self) -> str: class RandomChoice(Transform): """[BETA] Apply single transformation randomly picked from a list. - .. betastatus:: RandomChoice transform + .. v2betastatus:: RandomChoice transform This transform does not support torchscript. @@ -155,7 +155,7 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): """[BETA] Apply a list of transformations in a random order. - .. betastatus:: RandomOrder transform + .. v2betastatus:: RandomOrder transform This transform does not support torchscript. diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index b5544ecfd49..e900e853d2b 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -12,7 +12,7 @@ class ToTensor(Transform): """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. - .. betastatus:: ToTensor transform + .. v2betastatus:: ToTensor transform .. warning:: :class:`v2.ToTensor` is deprecated and will be removed in a future release. diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index b2618bb892f..59791c30b9d 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -28,7 +28,7 @@ class RandomHorizontalFlip(_RandomApplyTransform): """[BETA] Horizontally flip the input with a given probability. - .. betastatus:: RandomHorizontalFlip transform + .. v2betastatus:: RandomHorizontalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -48,7 +48,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): """[BETA] Vertically flip the input with a given probability. - .. betastatus:: RandomVerticalFlip transform + .. 
v2betastatus:: RandomVerticalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -68,7 +68,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): """[BETA] Resize the input to the given size. - .. betastatus:: Resize transform + .. v2betastatus:: Resize transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -162,7 +162,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): """[BETA] Crop the input at the center. - .. betastatus:: CenterCrop transform + .. v2betastatus:: CenterCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -190,7 +190,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): """[BETA] Crop a random portion of the input and resize it to a given size. - .. betastatus:: RandomResizedCrop transform + .. v2betastatus:: RandomResizedCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -316,7 +316,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): """[BETA] Crop the image or video into four corners and the central crop. - .. betastatus:: FiveCrop transform + .. v2betastatus:: FiveCrop transform If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. @@ -379,7 +379,7 @@ class TenCrop(Transform): """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of these (horizontal flipping is used by default). - .. betastatus:: TenCrop transform + .. v2betastatus:: TenCrop transform If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. @@ -437,7 +437,7 @@ def _transform( class Pad(Transform): """[BETA] Pad the input on all sides with the given "pad" value. - .. betastatus:: Pad transform + .. v2betastatus:: Pad transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -512,7 +512,7 @@ class RandomZoomOut(_RandomApplyTransform): """[BETA] "Zoom out" transformation from `"SSD: Single Shot MultiBox Detector" `_. - .. betastatus:: RandomZoomOut transform + .. v2betastatus:: RandomZoomOut transform This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect. Output spatial size is randomly sampled from original size up to a maximum size configured @@ -581,7 +581,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): """[BETA] Rotate the input by angle. - .. betastatus:: RandomRotation transform + .. 
v2betastatus:: RandomRotation transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -654,7 +654,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomAffine(Transform):
     """[BETA] Random affine transformation the input keeping center invariant.
 
-    .. betastatus:: RandomAffine transform
+    .. v2betastatus:: RandomAffine transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -775,7 +775,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomCrop(Transform):
     """[BETA] Crop the input at a random location.
 
-    .. betastatus:: RandomCrop transform
+    .. v2betastatus:: RandomCrop transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -930,7 +930,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomPerspective(_RandomApplyTransform):
     """[BETA] Perform a random perspective transformation of the input with a given probability.
 
-    .. betastatus:: RandomPerspective transform
+    .. v2betastatus:: RandomPerspective transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1016,7 +1016,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class ElasticTransform(Transform):
     """[BETA] Transform the input with elastic transformations.
 
-    .. betastatus:: RandomPerspective transform
+    .. v2betastatus:: ElasticTransform transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1108,7 +1108,7 @@ class RandomIoUCrop(Transform):
     """[BETA] Random IoU crop transformation from
     `"SSD: Single Shot MultiBox Detector" `_.
 
-    .. betastatus:: RandomIoUCrop transform
+    .. v2betastatus:: RandomIoUCrop transform
 
     This transformation requires an image or video data and ``datapoints.BoundingBox`` in the input.
@@ -1232,7 +1232,7 @@ class ScaleJitter(Transform):
     """[BETA] Perform Large Scale Jitter on the input according to
     `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_.
 
-    .. betastatus:: ScaleJitter transform
+    .. v2betastatus:: ScaleJitter transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1298,7 +1298,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomShortestSize(Transform):
     """[BETA] Randomly resize the input.
 
-    .. betastatus:: RandomShortestSize transform
+    .. v2betastatus:: RandomShortestSize transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1366,7 +1366,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomResize(Transform):
     """[BETA] Randomly resize the input.
 
-    .. 
betastatus:: RandomResize transform
+    .. v2betastatus:: RandomResize transform
 
     This transformation can be used together with ``RandomCrop`` as data augmentations to train
     models on image segmentation task.
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
index 7f28e25c602..b7e2a42259f 100644
--- a/torchvision/transforms/v2/_meta.py
+++ b/torchvision/transforms/v2/_meta.py
@@ -11,7 +11,7 @@ class ConvertBoundingBoxFormat(Transform):
     """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY".
 
-    .. betastatus:: ConvertBoundingBoxFormat transform
+    .. v2betastatus:: ConvertBoundingBoxFormat transform
 
     Args:
         format (str or datapoints.BoundingBoxFormat): output bounding box format.
@@ -34,7 +34,7 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da
 class ConvertDtype(Transform):
     """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly.
 
-    .. betastatus:: ConvertDtype transform
+    .. v2betastatus:: ConvertDtype transform
 
     This function does not support PIL Image.
 
@@ -77,7 +77,7 @@ class ClampBoundingBox(Transform):
 
     The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
 
-    .. betastatus:: ClampBoundingBox transform
+    .. v2betastatus:: ClampBoundingBox transform
 
     """
 
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 40d57856292..c9b9025ebd9 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -24,7 +24,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class Lambda(Transform):
     """[BETA] Apply a user-defined function as a transform.
 
-    .. betastatus:: Lambda transform
+    .. v2betastatus:: Lambda transform
 
     This transform does not support torchscript.
 
@@ -55,7 +55,7 @@ def extra_repr(self) -> str:
 class LinearTransformation(Transform):
     """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
 
-    .. betastatus:: LinearTransformation transform
+    .. v2betastatus:: LinearTransformation transform
 
     This transform does not support PIL Image.
     Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
@@ -138,7 +138,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class Normalize(Transform):
     """[BETA] Normalize a tensor image or video with mean and standard deviation.
 
-    .. betastatus:: Normalize transform
+    .. v2betastatus:: Normalize transform
 
     This transform does not support PIL Image.
     Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
@@ -178,7 +178,7 @@ def _transform(
 class GaussianBlur(Transform):
     """[BETA] Blurs image with randomly chosen Gaussian blur.
 
-    .. betastatus:: GausssianBlur transform
+    .. v2betastatus:: GaussianBlur transform
 
     If the input is a Tensor, it is expected
     to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
@@ -225,7 +225,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class ToDtype(Transform):
     """[BETA] Converts the input to a specific dtype - this does not scale values.
 
-    .. betastatus:: ToDtype transform
+    .. v2betastatus:: ToDtype transform
 
     Args:
        dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to. 
@@ -258,7 +258,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class SanitizeBoundingBox(Transform): """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks. - .. betastatus:: SanitizeBoundingBox transform + .. v2betastatus:: SanitizeBoundingBox transform This transform removes bounding boxes and their associated labels/masks that: diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py index ad7526bc4a4..df4ad66643a 100644 --- a/torchvision/transforms/v2/_temporal.py +++ b/torchvision/transforms/v2/_temporal.py @@ -9,7 +9,7 @@ class UniformTemporalSubsample(Transform): """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video. - .. betastatus:: UniformTemporalSubsample transform + .. v2betastatus:: UniformTemporalSubsample transform Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension. diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 92de314608c..60f44c5d3db 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -13,7 +13,7 @@ class PILToTensor(Transform): """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values. - .. betastatus:: PILToTensor transform + .. v2betastatus:: PILToTensor transform This transform does not support torchscript. @@ -30,7 +30,7 @@ class ToImageTensor(Transform): """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` ; this does not scale values. - .. betastatus:: ToImageTensor transform + .. v2betastatus:: ToImageTensor transform This transform does not support torchscript. """ @@ -46,7 +46,7 @@ def _transform( class ToImagePIL(Transform): """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values. - .. betastatus:: ToImagePIL transform + .. v2betastatus:: ToImagePIL transform This transform does not support torchscript. From 9de95667f506818e41d135bc18fc48351e6a72e2 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 24 Feb 2023 17:40:42 +0100 Subject: [PATCH 26/27] add gallery for transforms v2 (#7331) --- gallery/plot_transforms_v2.py | 109 ++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 gallery/plot_transforms_v2.py diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py new file mode 100644 index 00000000000..d1096bec1e7 --- /dev/null +++ b/gallery/plot_transforms_v2.py @@ -0,0 +1,109 @@ +""" +================================== +Getting started with transforms v2 +================================== + +Most computer vision tasks are not supported out of the box by ``torchvision.transforms`` v1, since it only supports +images. ``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This +example showcases the core functionality of the new ``torchvision.transforms.v2`` API. 
+""" + +import pathlib + +import torch +import torchvision + + +def load_data(): + from torchvision.io import read_image + from torchvision import datapoints + from torchvision.ops import masks_to_boxes + + assets_directory = pathlib.Path("assets") + + path = assets_directory / "FudanPed00054.png" + image = datapoints.Image(read_image(str(path))) + merged_masks = read_image(str(assets_directory / "FudanPed00054_mask.png")) + + labels = torch.unique(merged_masks)[1:] + + masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) + + bounding_boxes = datapoints.BoundingBox( + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + ) + + return path, image, bounding_boxes, masks, labels + + +######################################################################################################################## +# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation +# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object +# detection or instance and semantic segmentation. Still, the interface is the same, making +# :mod:`torchvision.transforms.v2` a drop-in replacement for the existing :mod:`torchvision.transforms` API, aka v1. + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() +import torchvision.transforms.v2 as transforms + +transform = transforms.Compose( + [ + transforms.ColorJitter(contrast=0.5), + transforms.RandomRotation(30), + transforms.CenterCrop(480), + ] +) + +######################################################################################################################## +# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that +# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or +# order. + +path, image, bounding_boxes, masks, labels = load_data() + +torch.manual_seed(0) +new_image = transform(image) # Image Classification +new_image, new_bounding_boxes, new_labels = transform(image, bounding_boxes, labels) # Object Detection +new_image, new_bounding_boxes, new_masks, new_labels = transform( + image, bounding_boxes, masks, labels +) # Instance Segmentation +new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels})) # Arbitrary Structure + +######################################################################################################################## +# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the +# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note however, that as +# regular user, you likely don't have to touch this yourself. See +# :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. 
+#
+# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing you to store extra
+# information directly with the sample:

+sample = {"path": path, "image": image}
+new_sample = transform(sample)
+
+assert new_sample["path"] is sample["path"]

+########################################################################################################################
+# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
+# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
+# simple heuristic:
+#
+# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`,
+#   or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through.
+# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or
+#   video, while all others will be passed through.

+plain_tensor_image = torch.rand(image.shape)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor together with an explicit image will not transform the former
+plain_tensor_image, image = transform(plain_tensor_image, image)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor without an explicit image will transform the former
+plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes)
+
+print(image.shape, plain_tensor_image.shape)

From fdf72de88d508e9cf06848dec00ada3e223cf265 Mon Sep 17 00:00:00 2001
From: vfdev
Date: Fri, 24 Feb 2023 18:18:12 +0100
Subject: [PATCH 27/27] Fixed uncaught warnings in tests v2 (#7330)

Co-authored-by: Nicolas Hug
---
 test/test_transforms_v2.py | 41 +++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 9beded4c957..f5ca976963a 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -136,14 +136,14 @@ class TestSmoke:
         (transforms.RandomCrop([16, 16], pad_if_needed=True), None),
         (transforms.RandomHorizontalFlip(p=1.0), None),
         (transforms.RandomPerspective(p=1.0), None),
-        (transforms.RandomResize(min_size=10, max_size=20), None),
-        (transforms.RandomResizedCrop([16, 16]), None),
+        (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None),
+        (transforms.RandomResizedCrop([16, 16], antialias=True), None),
         (transforms.RandomRotation(degrees=30), None),
-        (transforms.RandomShortestSize(min_size=10), None),
+        (transforms.RandomShortestSize(min_size=10, antialias=True), None),
         (transforms.RandomVerticalFlip(p=1.0), None),
         (transforms.RandomZoomOut(p=1.0), None),
         (transforms.Resize([16, 16], antialias=True), None),
-        (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
+        (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None),
         (transforms.ClampBoundingBox(), None),
         (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
         (transforms.ConvertDtype(), None),
@@ -1514,7 +1514,7 @@ class TestRandomShortestSize:
     def test__get_params(self, min_size, max_size, mocker):
         spatial_size = (3, 10)
 
-        transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)
+        transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True)
 
         sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
        params = 
transform._get_params([sample]) @@ -1595,7 +1595,7 @@ def test__get_params(self): min_size = 3 max_size = 6 - transform = transforms.RandomResize(min_size=min_size, max_size=max_size) + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) for _ in range(10): params = transform._get_params([]) @@ -1791,15 +1791,21 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): else: sample = image, label + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + t = transforms.Compose( [ - transforms.RandomResizedCrop((224, 224)), + transforms.RandomResizedCrop((224, 224), antialias=True), transforms.RandomHorizontalFlip(p=1), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix(), transforms.AutoAugment(), - to_tensor(), + to_tensor, # TODO: ConvertImageDtype is a pass-through on PIL images, is that # intended? This results in a failure if we convert to tensor after # it, because the image would still be uint8 which make Normalize @@ -1830,10 +1836,17 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): @pytest.mark.parametrize("sanitize", (True, False)) def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + if data_augmentation == "hflip": t = [ transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "lsj": @@ -1847,7 +1860,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # ), transforms.RandomCrop((1024, 1024), pad_if_needed=True), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "multiscale": @@ -1856,7 +1869,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True ), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssd": @@ -1865,14 +1878,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})), transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssdlite": t = [ transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] if sanitize: @@ -1907,7 +1920,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) - if to_tensor is transforms.ToTensor and image_type is not datapoints.Image: + if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: assert is_simple_tensor(out["image"]) else: assert isinstance(out["image"], datapoints.Image)
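The ``antialias=True`` arguments threaded through the tests above silence the resize antialias warnings that the v2 transforms can emit for tensor inputs. A minimal sketch of the behavior being avoided (not part of the patch series; the exact warning text and trigger conditions are assumptions inferred from the commit's intent):

    import warnings

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()
    import torchvision.transforms.v2 as transforms

    img = torch.rand(3, 64, 64)

    # Leaving antialias unset on a tensor input may emit a UserWarning about the
    # antialias default.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        transforms.Resize([32, 32])(img)
    print(any("antialias" in str(w.message) for w in caught))

    # Passing antialias=True explicitly, as the updated tests do, keeps the run warning-free.
    transforms.Resize([32, 32], antialias=True)(img)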