From cead4bf08f9920f48be07b4b85bd73105d149134 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Tue, 21 Feb 2023 11:20:33 +0100
Subject: [PATCH 01/27] add ffmpeg to Linux CPU and GPU unittest workflows
 (#7295)

---
 .github/workflows/test-linux-cpu.yml | 2 +-
 .github/workflows/test-linux-gpu.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml
index 19521cdd011..769ee4f841b 100644
--- a/.github/workflows/test-linux-cpu.yml
+++ b/.github/workflows/test-linux-cpu.yml
@@ -39,7 +39,7 @@ jobs:
           fi

           # Create Conda Env
-          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy
+          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3'
           conda activate /work/ci_env

           # Install PyTorch, Torchvision, and testing libraries
diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
index 831de27e350..95d06402db1 100644
--- a/.github/workflows/test-linux-gpu.yml
+++ b/.github/workflows/test-linux-gpu.yml
@@ -43,7 +43,7 @@ jobs:
           fi

           # Create Conda Env
-          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy
+          conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3'
           conda activate /work/ci_env

           # Install PyTorch, Torchvision, and testing libraries

From df1f2d6ed3de69f3984797024f5caf8eb11a086b Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 21 Feb 2023 11:02:29 +0000
Subject: [PATCH 02/27] Update transforms docs sub-structure (#7291)

Co-authored-by: Philip Meier
---
 docs/source/transforms.rst | 87 ++++++++++++++----------------------
 1 file changed, 36 insertions(+), 51 deletions(-)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 5909b68966b..d831b81e37f 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -14,11 +14,10 @@ transformations.
 This is useful if you have to build a more complex transformation
 pipeline (e.g. in the case of segmentation tasks).

-Most transformations accept both `PIL `_
-images and tensor images, although some transformations are :ref:`PIL-only
-<transforms_pil_only>` and some are :ref:`tensor-only
-<transforms_tensor_only>`. The :ref:`conversion_transforms` may be used to
-convert to and from PIL images.
+Most transformations accept both `PIL `_ images
+and tensor images, although some transformations are PIL-only and some are
+tensor-only. The :ref:`conversion_transforms` may be used to convert to and from
+PIL images, or for converting dtypes and ranges.

 The transformations that accept tensor images also accept batches of tensor
 images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a
@@ -70,8 +69,10 @@ The following examples illustrate the use of the available transforms:
 produce the same results.

-Scriptable transforms
---------------------
+Transforms scriptability
+------------------------
+
+.. TODO: Add note about v2 scriptability (in next PR)

 In order to script the transformations, please use ``torch.nn.Sequential``
 instead of :class:`Compose`.

 .. code:: python

     transforms = torch.nn.Sequential(
         transforms.CenterCrop(10),
         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
     )
     scripted_transforms = torch.jit.script(transforms)

 Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``,
 does not require `lambda` functions or ``PIL.Image``.

 For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``.

-Compositions of transforms
--------------------------
+Geometry
+--------

 ..
autosummary:: :toctree: generated/ :template: class.rst - Compose - + Resize + RandomCrop + RandomResizedCrop + CenterCrop + FiveCrop + TenCrop + Pad + RandomAffine + RandomPerspective + RandomRotation + RandomHorizontalFlip + RandomVerticalFlip -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +Color +----- .. autosummary:: :toctree: generated/ :template: class.rst - CenterCrop ColorJitter - FiveCrop Grayscale - Pad - RandomAffine - RandomApply - RandomCrop RandomGrayscale - RandomHorizontalFlip - RandomPerspective - RandomResizedCrop - RandomRotation - RandomVerticalFlip - Resize - TenCrop GaussianBlur RandomInvert RandomPosterize @@ -130,23 +128,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +----------- .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +------------- .. autosummary:: :toctree: generated/ @@ -155,12 +150,12 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype + Lambda .. _conversion_transforms: -Conversion Transforms ---------------------- +Conversion +---------- .. autosummary:: :toctree: generated/ @@ -169,20 +164,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +----------------- `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
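A minimal invocation sketch (illustrative only; the policy choice and input are placeholders, not part of this patch):

.. code:: python

    import torch
    from torchvision import transforms

    img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8 image tensor
    augmenter = transforms.AutoAugment(transforms.AutoAugmentPolicy.IMAGENET)
    augmented = augmenter(img)  # same shape as the input; ops are re-sampled per call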
Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that From c7a20ba5f60991b6169c639b613b3c128c339446 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:38:12 +0000 Subject: [PATCH 03/27] Added docs for v2 transforms (part 1) (#7297) Co-authored-by: vfdev Co-authored-by: Philip Meier --- docs/source/conf.py | 2 + docs/source/transforms.rst | 40 ++ torchvision/transforms/v2/_augment.py | 32 ++ torchvision/transforms/v2/_auto_augment.py | 80 ++++ torchvision/transforms/v2/_color.py | 139 +++++++ torchvision/transforms/v2/_container.py | 65 ++++ torchvision/transforms/v2/_deprecated.py | 25 ++ torchvision/transforms/v2/_geometry.py | 350 +++++++++++++++++- torchvision/transforms/v2/_meta.py | 21 ++ torchvision/transforms/v2/_misc.py | 68 ++++ torchvision/transforms/v2/_type_conversion.py | 30 ++ 11 files changed, 850 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..304a1cc6e22 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,8 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index d831b81e37f..00d929d0675 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -98,17 +98,29 @@ Geometry :template: class.rst Resize + v2.Resize RandomCrop + v2.RandomCrop RandomResizedCrop + v2.RandomResizedCrop CenterCrop + v2.CenterCrop FiveCrop + v2.FiveCrop TenCrop + v2.TenCrop Pad + v2.Pad RandomAffine + v2.RandomAffine RandomPerspective + v2.RandomPerspective RandomRotation + v2.RandomRotation RandomHorizontalFlip + v2.RandomHorizontalFlip RandomVerticalFlip + v2.RandomVerticalFlip Color ----- @@ -118,15 +130,25 @@ Color :template: class.rst ColorJitter + v2.ColorJitter Grayscale + v2.Grayscale RandomGrayscale + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize Composition ----------- @@ -136,9 +158,13 @@ Composition :template: class.rst Compose + v2.Compose RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder Miscellaneous ------------- @@ -148,9 +174,13 @@ Miscellaneous :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing + v2.RandomErasing Lambda + v2.Lambda .. _conversion_transforms: @@ -162,9 +192,15 @@ Conversion :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor ConvertImageDtype + v2.ConvertImageDtype + v2.ConvertDtype Auto-Augmentation ----------------- @@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. 
_functional_transforms: diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..b5aac9ca9a2 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -13,6 +13,38 @@ class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. + + .. betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p: probability that the random erasing operation will be performed. + scale: range of proportion of erased area against input image. + ratio: range of aspect ratio of erased area. + value: erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace: boolean to make this transform inplace. Default set to False. + + Returns: + Erased input. + + Example: + >>> from torchvision.transforms import v2 as transforms + >>> + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..98e23b99796 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,24 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. betastatus:: AutoAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +336,27 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. betastatus:: RandAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. 
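As a hedged usage sketch for the AutoAugment transform documented above (the input tensor and policy are illustrative placeholders, not part of the patch):

    >>> import torch
    >>> import torchvision
    >>> torchvision.disable_beta_transforms_warning()  # silences the beta notice, cf. conf.py above
    >>> from torchvision.transforms import AutoAugmentPolicy
    >>> from torchvision.transforms.v2 import AutoAugment
    >>> img = torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8)  # uint8, [..., 3, H, W]
    >>> out = AutoAugment(policy=AutoAugmentPolicy.IMAGENET)(img)
    >>> out.shape  # spatial size and dtype are preserved
    torch.Size([3, 224, 224])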
+ If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int): Number of augmentation transformations to apply sequentially. + magnitude (int): Magnitude for all the transformations. + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +418,24 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. betastatus:: TrivialAugmentWide transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +487,29 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. betastatus:: AugMix transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int): The severity of base augmentation operators. Default is ``3``. + mixture_width (int): The number of augmentation chains. Default is ``3``. + chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. 
If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..785a3965e60 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,23 @@ class Grayscale(Transform): + """[BETA] Convert images or videos to grayscale. + + .. betastatus:: Grayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + + Returns: + PIL Image: Grayscale version of the input. + + - If ``num_output_channels == 1`` : returned image is single channel + - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +46,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + + .. betastatus:: RandomGrayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +85,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + + .. betastatus:: ColorJitter transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. + contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. 
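A sketch of calling the transform documented above (the jitter factors are arbitrary values, not defaults):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> img = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
    >>> jitter = transforms.ColorJitter(brightness=0.5, contrast=0.3, saturation=0.3, hue=0.1)
    >>> out = jitter(img)  # same shape and dtype; factors are re-sampled on every call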
+ """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -205,6 +266,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image randomly with a given probability. + + .. betastatus:: RandomEqualize transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +285,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image randomly with a given probability. + + .. betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +304,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image randomly with a given probability by reducing the + number of bits for each color channel. + + .. betastatus:: RandomPosterize transform + + If the image is torch Tensor, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +329,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image randomly with a given probability by inverting all pixel + values above a threshold. + + .. betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +354,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + + .. betastatus:: RandomAutocontrast transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". 
+ + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +373,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image randomly with a given probability. + + .. betastatus:: RandomAdjustSharpness transform + + If the image is torch Tensor, + it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..66da9c187c0 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -9,6 +9,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +60,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,6 +115,12 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. 
betastatus:: RandomChoice transform + + This transform does not support torchscript.""" + def __init__( self, transforms: Sequence[Callable], @@ -99,6 +157,13 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. betastatus:: RandomOrder transform + + This transform does not support torchscript. + """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..c44e6b08d11 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + + .. betastatus:: ToTensor transform + + .. warning:: + :class:`v2.ToTensor` is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..af8ca4b6471 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,18 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomHorizontalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +45,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomVerticalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +64,62 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input image/box/mask to the given size. + + .. 
betastatus:: Resize transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences + in the performance of a network. Therefore, it is preferable to train and serve a model with the same input + types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors + closer. + + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + max_size (int, optional): The maximum allowed for the longer edge of + the resized image: if the longer edge of the image is greater + than ``max_size`` after being resized according to ``size``, then + the image is resized again so that the longer edge is equal to + ``max_size``. As a result, ``size`` might be overruled, i.e. the + smaller edge may be shorter than ``size``. This is only supported + if ``size`` is an int (or a sequence of length 1 in torchscript + mode). + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.Resize def __init__( @@ -76,6 +156,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): + """[BETA] Crops the given image/box/mask at the center. + + .. betastatus:: CenterCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. 
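Given the ``antialias`` default change flagged above, a sketch that passes it explicitly (sizes are illustrative):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> resize = transforms.Resize(size=(224, 224), antialias=True)  # explicit value, unaffected by the v0.17 change
    >>> out = resize(torch.rand(3, 480, 640))  # float tensor, bilinear interpolation by default
    >>> out.shape
    torch.Size([3, 224, 224])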
+ + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +181,53 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + + .. betastatus:: RandomResizedCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + A crop of the original image is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,7 +305,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ + """[BETA] Crop the given image/box/mask into four corners and the central crop. + + .. betastatus:: FiveCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. 
Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + Example: >>> class BatchMultiCrop(transforms.Transform): ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): @@ -209,8 +367,27 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ + """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. betastatus:: TenCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions. + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +426,46 @@ def _transform( class Pad(Transform): + """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + + .. betastatus:: Pad transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -323,6 +540,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the image/box/mask by angle. + + .. betastatus:: RandomRotation transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number): Pixel fill value for the area outside the rotated + image. Default is ``0``. If given a number, the value is used for all bands respectively. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -363,6 +608,42 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation of the image/box/mask keeping center invariant. + + .. betastatus:: RandomAffine transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. 
+ interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +724,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the given image/box/mask at a random location. + + .. betastatus:: RandomCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions, + but if non-constant padding is used, the input is expected to have at most 2 leading dimensions + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,6 +879,25 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + + .. betastatus:: RandomPerspective transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float): probability of the image being transformed. Default is 0.5. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..7d0f0ec39f9 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -22,6 +22,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): + """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + + .. betastatus:: ConvertDtype transform + + This function does not support PIL Image. + + Args: + dtype (torch.dtype): Desired data type of the output + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as + well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + _v1_transform_cls = _transforms.ConvertImageDtype _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6dd0755cfbb..6998d416c91 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -21,6 +21,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Lambda(Transform): + """[BETA] Apply a user-defined lambda as a transform. + + .. betastatus:: Lambda transform + + This transform does not support torchscript. + + Args: + lambd (function): Lambda/function to be used for transform. 
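An illustrative use of Lambda, restricting it to tensors via the ``*types`` argument of ``__init__`` below (the clamp function is a made-up example):

    >>> import torch
    >>> from torchvision.transforms import v2 as transforms
    >>> clamp = transforms.Lambda(lambda x: x.clamp(0, 1), torch.Tensor)
    >>> clamp(torch.tensor([-0.5, 0.5, 1.5]))
    tensor([0.0000, 0.5000, 1.0000])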
+ """ + def __init__(self, lambd: Callable[[Any], Any], *types: Type): super().__init__() self.lambd = lambd @@ -42,6 +52,26 @@ def extra_repr(self) -> str: class LinearTransformation(Transform): + """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline. + + .. betastatus:: LinearTransformation transform + + This transform does not support PIL Image. + Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and + subtract mean_vector from it which is then followed by computing the dot + product with the transformation matrix and then reshaping the tensor to its + original shape. + + Applications: + whitening transformation: Suppose X is a column vector zero-centered data. + Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X), + perform SVD on this matrix and pass it as transformation_matrix. + + Args: + transformation_matrix (Tensor): tensor [D x D], D = C x H x W + mean_vector (Tensor): tensor [D], D = C x H x W + """ + _v1_transform_cls = _transforms.LinearTransformation _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) @@ -105,6 +135,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Normalize(Transform): + """[BETA] Normalize a tensor image with mean and standard deviation. + + .. betastatus:: Normalize transform + + This transform does not support PIL Image. + Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n`` + channels, this transform will normalize each channel of the input + ``torch.*Tensor`` i.e., + ``output[channel] = (input[channel] - mean[channel]) / std[channel]`` + + .. note:: + This transform acts out of place, i.e., it does not mutate the input tensor. + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + inplace(bool,optional): Bool to make this operation in-place. + + """ + _v1_transform_cls = _transforms.Normalize _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video) @@ -125,6 +175,24 @@ def _transform( class GaussianBlur(Transform): + """[BETA] Blurs image with randomly chosen Gaussian blur. + + .. betastatus:: GausssianBlur transform + + If the image is torch Tensor, it is expected + to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + kernel_size (int or sequence): Size of the Gaussian kernel. + sigma (float or tuple of float (min, max)): Standard deviation to be used for + creating kernel to perform blurring. If float, sigma is fixed. If it is tuple + of float (min, max), sigma is chosen uniformly at random to lie in the + given range. + + Returns: + PIL Image or Tensor: Gaussian blurred version of the input image. + """ + _v1_transform_cls = _transforms.GaussianBlur def __init__( diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 984d5ba50c0..b0743feb10d 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -11,6 +11,15 @@ class PILToTensor(Transform): + """[BETA] Convert a ``PIL Image`` to a tensor of the same type. + + .. betastatus:: PILToTensor transform + + This transform does not support torchscript. + + Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). 
+ """ + _transformed_types = (PIL.Image.Image,) def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor: @@ -27,6 +36,27 @@ def _transform( class ToImagePIL(Transform): + """[BETA] Convert a tensor or an ndarray to PIL Image. + + .. betastatus:: ToImagePIL transform + + This transform does not support torchscript. + + Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape + H x W x C to a PIL Image while preserving the value range. + + Args: + mode (`PIL.Image mode`_): color space and pixel depth of input data (optional). + If ``mode`` is ``None`` (default) there are some assumptions made about the input data: + - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``. + - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``. + - If the input has 2 channels, the ``mode`` is assumed to be ``LA``. + - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, + ``short``). + + .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes + """ + _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray) def __init__(self, mode: Optional[str] = None) -> None: From 07023255c664f10f6ab442f3c65325cb1a81eae0 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 22 Feb 2023 11:34:07 +0100 Subject: [PATCH 04/27] reduce GHA log output (#7267) Co-authored-by: vfdev --- .github/workflows/test-linux-cpu.yml | 6 +++--- .github/workflows/test-linux-gpu.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml index 769ee4f841b..8a9f7d33b49 100644 --- a/.github/workflows/test-linux-cpu.yml +++ b/.github/workflows/test-linux-cpu.yml @@ -39,7 +39,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -50,8 +50,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 95d06402db1..d1275071bf7 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -43,7 +43,7 @@ jobs: fi # Create Conda Env - conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' + conda create -yp ci_env --quiet python="${PYTHON_VERSION}" numpy libpng jpeg scipy 'ffmpeg<4.3' conda activate /work/ci_env # Install PyTorch, Torchvision, and testing libraries @@ -54,8 +54,8 @@ jobs: -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' # Run Tests python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -m pytest --junitxml=test-results/junit.xml --durations 20 From 
011ebd7478ae273a164e62157c0ed6c459eb2fc5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 22 Feb 2023 13:27:17 +0100 Subject: [PATCH 05/27] align transforms v2 signatures with v1 (#7301) Co-authored-by: vfdev --- test/test_transforms_v2_consistency.py | 9 ++++++--- torchvision/transforms/v2/_container.py | 2 +- torchvision/transforms/v2/_geometry.py | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 125d7ec7a3f..43f17c9b15a 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -540,9 +540,12 @@ def test_signature_consistency(config): f"not. Please add a default value." ) - legacy_kinds = {name: param.kind for name, param in legacy_params.items()} - prototype_kinds = {name: prototype_params[name].kind for name in legacy_kinds.keys()} - assert prototype_kinds == legacy_kinds + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature def check_call_consistency( diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 66da9c187c0..08282962ffd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -124,8 +124,8 @@ class RandomChoice(Transform): def __init__( self, transforms: Sequence[Callable], - probabilities: Optional[List[float]] = None, p: Optional[List[float]] = None, + probabilities: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index af8ca4b6471..4d7a5fca384 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -575,8 +575,8 @@ def __init__( degrees: Union[numbers.Number, Sequence], interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST, expand: bool = False, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, center: Optional[List[float]] = None, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) @@ -903,9 +903,9 @@ class RandomPerspective(_RandomApplyTransform): def __init__( self, distortion_scale: float = 0.5, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, - interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, p: float = 0.5, + interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__(p=p) @@ -966,8 +966,8 @@ def __init__( self, alpha: Union[float, Sequence[float]] = 50.0, sigma: Union[float, Sequence[float]] = 5.0, - fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR, + fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0, ) -> None: super().__init__() self.alpha = _setup_float_or_seq(alpha, "alpha", 2) From 72d48e277837516564c1ceee77669595c71f27cc Mon Sep 17 
00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 12:05:31 +0000 Subject: [PATCH 06/27] Extend default heuristic of SanitizeBoundingBoxes to support tuples (#7304) Co-authored-by: Philip Meier --- test/test_transforms_v2.py | 54 +++++++++++++++++++++++------- torchvision/transforms/v2/_misc.py | 31 ++++++++++++----- 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 2e43c86f91d..a1e1cb720d5 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1935,7 +1935,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): @pytest.mark.parametrize( "labels_getter", ("default", "labels", lambda inputs: inputs["labels"], None, lambda inputs: None) ) -def test_sanitize_bounding_boxes(min_size, labels_getter): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + H, W = 256, 128 boxes_and_validity = [ @@ -1970,35 +1977,56 @@ def test_sanitize_bounding_boxes(min_size, labels_getter): ) masks = datapoints.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) - + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) sample = { - "image": torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8), + "image": input_img, "labels": labels, "boxes": boxes, - "whatever": torch.rand(10), + "whatever": whatever, "None": None, "masks": masks, } + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) - assert out["image"] is sample["image"] - assert out["whatever"] is sample["whatever"] + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): - assert out["labels"] is sample["labels"] + assert out_labels is labels else: - assert isinstance(out["labels"], torch.Tensor) - assert out["boxes"].shape[0] == out["labels"].shape[0] == out["masks"].shape[0] + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] # This works because we conveniently set labels to arange(num_boxes) - assert out["labels"].tolist() == valid_indices + assert out_labels.tolist() == valid_indices @pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) -def test_sanitize_bounding_boxes_default_heuristic(key): +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): labels = torch.arange(10) - d = {key: labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + sample = {key: labels, "another_key": "whatever"} + if sample_type is tuple: + sample = (None, sample, "whatever_again") + assert 
transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6998d416c91..8cc4aa6a3db 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -1,7 +1,7 @@ import collections import warnings from contextlib import suppress -from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Type, Union +from typing import Any, Callable, cast, Dict, List, Mapping, Optional, Sequence, Type, Union import PIL.Image @@ -269,7 +269,9 @@ def __init__( elif callable(labels_getter): self._labels_getter = labels_getter elif isinstance(labels_getter, str): - self._labels_getter = lambda inputs: inputs[labels_getter] + self._labels_getter = lambda inputs: SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)[ + labels_getter # type: ignore[index] + ] elif labels_getter is None: self._labels_getter = None else: @@ -278,10 +280,27 @@ def __init__( f"Got {labels_getter} of type {type(labels_getter)}." ) + @staticmethod + def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]: + # datasets outputs may be plain dicts like {"img": ..., "labels": ..., "bbox": ...} + # or tuples like (img, {"labels":..., "bbox": ...}) + # This hacky helper accounts for both structures. + if isinstance(inputs, tuple): + inputs = inputs[1] + + if not isinstance(inputs, collections.abc.Mapping): + raise ValueError( + f"If labels_getter is a str or 'default', " + f"then the input to forward() must be a dict or a tuple whose second element is a dict." + f" Got {type(inputs)} instead." + ) + return inputs + @staticmethod def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]: - # Tries to find a "label" key, otherwise tries for the first key that contains "label" - case insensitive + # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive # Returns None if nothing is found + inputs = SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs) candidate_key = None with suppress(StopIteration): candidate_key = next(key for key in inputs.keys() if key.lower() == "labels") @@ -298,12 +317,6 @@ def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Ten def forward(self, *inputs: Any) -> Any: inputs = inputs if len(inputs) > 1 else inputs[0] - if isinstance(self.labels_getter, str) and not isinstance(inputs, collections.abc.Mapping): - raise ValueError( - f"If labels_getter is a str or 'default' (got {self.labels_getter}), " - f"then the input to forward() must be a dict. Got {type(inputs)} instead." 
- ) - if self._labels_getter is None: labels = None else: From b598de48d3e6b8293ba8eb0315da5e6504026d44 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 23 Feb 2023 14:46:35 +0100 Subject: [PATCH 07/27] add end-to-end example gallery for transforms v2 (#7302) Co-authored-by: Nicolas Hug --- docs/requirements.txt | 1 + gallery/assets/coco/images/000000000001.jpg | 1 + gallery/assets/coco/images/000000000002.jpg | 1 + gallery/assets/coco/instances.json | 1 + gallery/plot_transforms_v2_e2e.py | 152 ++++++++++++++++++++ 5 files changed, 156 insertions(+) create mode 120000 gallery/assets/coco/images/000000000001.jpg create mode 120000 gallery/assets/coco/images/000000000002.jpg create mode 100644 gallery/assets/coco/instances.json create mode 100644 gallery/plot_transforms_v2_e2e.py diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae7..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py new file mode 100644 index 00000000000..938578e4af9 --- /dev/null +++ 
b/gallery/plot_transforms_v2_e2e.py
@@ -0,0 +1,152 @@
+"""
+==================================================
+transforms v2: End-to-end object detection example
+==================================================
+
+Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
+``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This example
+showcases end-to-end object detection training using the stable ``torchvision.datasets`` and ``torchvision.models`` as
+well as the new ``torchvision.transforms.v2`` API.
+"""
+
+import pathlib
+from collections import defaultdict
+
+import PIL.Image
+
+import torch
+import torch.utils.data
+
+import torchvision
+
+
+# sphinx_gallery_thumbnail_number = -1
+def show(sample):
+    import matplotlib.pyplot as plt
+
+    from torchvision.transforms.v2 import functional as F
+    from torchvision.utils import draw_bounding_boxes
+
+    image, target = sample
+    if isinstance(image, PIL.Image.Image):
+        image = F.to_image_tensor(image)
+    image = F.convert_dtype(image, torch.uint8)
+    annotated_image = draw_bounding_boxes(image, target["boxes"], colors="yellow", width=3)
+
+    fig, ax = plt.subplots()
+    ax.imshow(annotated_image.permute(1, 2, 0).numpy())
+    ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
+    fig.tight_layout()
+
+    fig.show()
+
+
+# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that
+# some APIs may slightly change in the future
+torchvision.disable_beta_transforms_warning()
+
+from torchvision import models, datasets
+import torchvision.transforms.v2 as transforms
+
+
+########################################################################################################################
+# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
+# returns, and we'll see how to convert it to a format that is compatible with our new transforms.
+
+
+def load_example_coco_detection_dataset(**kwargs):
+    # This loads fake data for illustration purposes of this example. In practice, you'll have
+    # to replace this with the proper data
+    root = pathlib.Path("assets") / "coco"
+    return datasets.CocoDetection(str(root / "images"), str(root / "instances.json"), **kwargs)
+
+
+dataset = load_example_coco_detection_dataset()
+
+sample = dataset[0]
+image, target = sample
+print(type(image))
+print(type(target), type(target[0]), list(target[0].keys()))
+
+
+########################################################################################################################
+# The dataset returns a two-tuple, with the first item being a :class:`PIL.Image.Image` and the second one a list of
+# dictionaries, each containing the annotations for a single object instance. As is, this format is compatible neither
+# with ``torchvision.transforms.v2`` nor with the models. To overcome that, we provide the
+# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
+# :class:`~torchvision.datasets.CocoDetection`, this changes the target structure to a single dictionary of lists. It
+# also adds the key-value-pairs ``"boxes"``, ``"masks"``, and ``"labels"`` wrapped in the corresponding
+# ``torchvision.datapoints``.
+
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+sample = dataset[0]
+image, target = sample
+print(type(image))
+print(type(target), list(target.keys()))
+print(type(target["boxes"]), type(target["masks"]), type(target["labels"]))
+
+########################################################################################################################
+# As a baseline, let's have a look at a sample without transformations:

+show(sample)
+
+
+########################################################################################################################
+# With the dataset properly set up, we can now define the augmentation pipeline. This is done the same way it is done in
+# ``torchvision.transforms`` v1, but it now handles bounding boxes and masks without any extra configuration.
+
+transform = transforms.Compose(
+    [
+        transforms.RandomPhotometricDistort(),
+        transforms.RandomZoomOut(
+            fill=defaultdict(lambda: 0, {PIL.Image.Image: (123, 117, 104)})
+        ),
+        transforms.RandomIoUCrop(),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToImageTensor(),
+        transforms.ConvertImageDtype(torch.float32),
+        transforms.SanitizeBoundingBoxes(),
+    ]
+)
+
+########################################################################################################################
+# .. note::
+#    Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` transform is a no-op in this example, it
+#    should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
+#    the corresponding labels and optionally masks. It is particularly critical to add it if
+#    :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's see how the sample looks with our augmentation pipeline in place:
+
+dataset = load_example_coco_detection_dataset(transforms=transform)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset)
+
+torch.manual_seed(3141)
+sample = dataset[0]
+
+show(sample)
+
+
+########################################################################################################################
+# We can see that the colors of the image were distorted, that we zoomed out on it (off-center), and that it was flipped
+# horizontally. In all of this, the bounding box was transformed accordingly. Without any further ado, we can start training.
+
+data_loader = torch.utils.data.DataLoader(
+    dataset,
+    batch_size=2,
+    # We need a custom collation function here, since the object detection models expect a
+    # sequence of images and target dictionaries. The default collation function tries to
+    # `torch.stack` the individual elements, which fails in general for object detection,
+    # because the number of object instances varies between the samples. This is the same for
+    # `torchvision.transforms` v1
+    collate_fn=lambda batch: tuple(zip(*batch)),
+)
+
+model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train()
+
+for images, targets in data_loader:
+    loss_dict = model(images, targets)
+    print(loss_dict)
+    # Put your training logic here
+    break
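A note on the custom ``collate_fn`` in the gallery example above: a tiny standalone sketch (with made-up tensor shapes) shows what ``tuple(zip(*batch))`` produces, and why the default ``torch.stack``-based collation would fail for detection samples:

.. code-block:: python

    import torch

    # Two hypothetical detection samples with different image sizes and box counts.
    batch = [
        (torch.rand(3, 256, 256), {"boxes": torch.rand(2, 4), "labels": torch.tensor([1, 2])}),
        (torch.rand(3, 512, 384), {"boxes": torch.rand(5, 4), "labels": torch.arange(5)}),
    ]

    images, targets = tuple(zip(*batch))
    print(len(images))                       # 2 -- a tuple of images with different sizes
    print(images[0].shape, images[1].shape)  # torch.Size([3, 256, 256]) torch.Size([3, 512, 384])
    print(targets[1]["boxes"].shape)         # torch.Size([5, 4])
    # torch.stack(images) would raise a RuntimeError here, because the image sizes differ;
    # the detection models instead accept a plain sequence of images and a sequence of dicts.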
This is the same for + # `torchvision.transforms` v1 + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("ssd300_vgg16", weights=None, weights_backbone=None).train() + +for images, targets in data_loader: + loss_dict = model(images, targets) + print(loss_dict) + # Put your training logic here + break From dd5cec3557a760d2f634754e2783304952a26ff5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 16:25:44 +0000 Subject: [PATCH 08/27] Add v2 docs for color transforms (#7310) --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_color.py | 43 +++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 00d929d0675..c2e9855d9e8 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -131,6 +131,7 @@ Color ColorJitter v2.ColorJitter + v2.RandomPhotometricDistort Grayscale v2.Grayscale RandomGrayscale diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 785a3965e60..2a581bf5640 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -46,7 +46,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): - """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). .. betastatus:: RandomGrayscale transform @@ -85,7 +85,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): - """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. .. betastatus:: ColorJitter transform @@ -190,6 +190,31 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: # TODO: This class seems to be untested class RandomPhotometricDistort(Transform): + """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot + MultiBox Detector `_. + + .. betastatus:: RandomPhotometricDistort transform + + This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` + under the hood to adjust the contrast, saturation, hue, brightness, and also + randomly permutes channels. + + Args: + brightness (tuple of float (min, max), optional): How much to jitter brightness. + brightness_factor is chosen uniformly from [min, max]. Should be non negative numbers. + contrast tuple of float (min, max), optional): How much to jitter contrast. + contrast_factor is chosen uniformly from [min, max]. Should be non-negative numbers. + saturation (tuple of float (min, max), optional): How much to jitter saturation. + saturation_factor is chosen uniformly from [min, max]. Should be non negative numbers. + hue (tuple of float (min, max), optional): How much to jitter hue. + hue_factor is chosen uniformly from [min, max]. Should have -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + p (float, optional) probability each distortion operation (contrast, saturation, ...) to be applied. + Default is 0.5. 
+ """ + _transformed_types = ( datapoints.Image, PIL.Image.Image, @@ -199,10 +224,10 @@ class RandomPhotometricDistort(Transform): def __init__( self, + brightness: Tuple[float, float] = (0.875, 1.125), contrast: Tuple[float, float] = (0.5, 1.5), saturation: Tuple[float, float] = (0.5, 1.5), hue: Tuple[float, float] = (-0.05, 0.05), - brightness: Tuple[float, float] = (0.875, 1.125), p: float = 0.5, ): super().__init__() @@ -266,7 +291,7 @@ def _transform( class RandomEqualize(_RandomApplyTransform): - """[BETA] Equalize the histogram of the given image randomly with a given probability. + """[BETA] Equalize the histogram of the given image or video with a given probability. .. betastatus:: RandomEqualize transform @@ -285,7 +310,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): - """[BETA] Inverts the colors of the given image randomly with a given probability. + """[BETA] Inverts the colors of the given image or video with a given probability. .. betastatus:: RandomInvert transform @@ -304,7 +329,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): - """[BETA] Posterize the image randomly with a given probability by reducing the + """[BETA] Posterize the image or video with a given probability by reducing the number of bits for each color channel. .. betastatus:: RandomPosterize transform @@ -329,7 +354,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): - """[BETA] Solarize the image randomly with a given probability by inverting all pixel + """[BETA] Solarize the image or video with a given probability by inverting all pixel values above a threshold. .. betastatus:: RandomSolarize transform @@ -354,7 +379,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): - """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + """[BETA] Autocontrast the pixels of the given image or video with a given probability. .. betastatus:: RandomAutocontrast transform @@ -373,7 +398,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): - """[BETA] Adjust the sharpness of the image randomly with a given probability. + """[BETA] Adjust the sharpness of the image or video with a given probability. .. 
betastatus:: RandomAdjustSharpness transform From 4fb043e09278c4fb053aaf8db710b72247fd16b5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 16:26:25 +0000 Subject: [PATCH 09/27] Add docs for containers and undeprecate p for RandomChoice (#7311) Co-authored-by: vfdev --- test/test_transforms_v2.py | 5 +--- torchvision/transforms/v2/_container.py | 37 +++++++++++++------------ 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index a1e1cb720d5..9173ec14f2c 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1359,11 +1359,8 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.warns(UserWarning, match="Argument p is deprecated and will be removed"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1, 2]) - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): - transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], probabilities=[1]) + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) class TestRandomIoUCrop: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 08282962ffd..27affc7100b 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Callable, Dict, List, Optional, Sequence, Union import torch @@ -78,7 +77,7 @@ class RandomApply(Transform): Args: transforms (sequence or torch.nn.Module): list of transformations - p (float): probability + p (float): probability of applying the list of transforms """ _v1_transform_cls = _transforms.RandomApply @@ -119,39 +118,38 @@ class RandomChoice(Transform): .. betastatus:: RandomChoice transform - This transform does not support torchscript.""" + This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (list of floats or None, optional): probability of each transform being picked. + If ``p`` doesn't sum to 1, it is automatically normalized. If ``None`` + (default), all transforms have the same probability. + """ def __init__( self, transforms: Sequence[Callable], p: Optional[List[float]] = None, - probabilities: Optional[List[float]] = None, ) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") - if p is not None: - warnings.warn( - "Argument p is deprecated and will be removed in a future release. " - "Please use probabilities argument instead." 
- ) - probabilities = p - if probabilities is None: - probabilities = [1] * len(transforms) - elif len(probabilities) != len(transforms): + if p is None: + p = [1] * len(transforms) + elif len(p) != len(transforms): raise ValueError( - f"The number of probabilities doesn't match the number of transforms: " - f"{len(probabilities)} != {len(transforms)}" + f"The number of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" ) super().__init__() self.transforms = transforms - total = sum(probabilities) - self.probabilities = [prob / total for prob in probabilities] + total = sum(p) + self.p = [prob / total for prob in p] def forward(self, *inputs: Any) -> Any: - idx = int(torch.multinomial(torch.tensor(self.probabilities), 1)) + idx = int(torch.multinomial(torch.tensor(self.p), 1)) transform = self.transforms[idx] return transform(*inputs) @@ -162,6 +160,9 @@ class RandomOrder(Transform): .. betastatus:: RandomOrder transform This transform does not support torchscript. + + Args: + transforms (sequence or torch.nn.Module): list of transformations """ def __init__(self, transforms: Sequence[Callable]) -> None: From 684f8d24aafa812b281f4876d2f217cc43c10464 Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 23 Feb 2023 17:57:22 +0100 Subject: [PATCH 10/27] Updated geometric transforms v2 docstring (#7303) Co-authored-by: Nicolas Hug Co-authored-by: Philip Meier --- docs/source/transforms.rst | 12 +- torchvision/transforms/v2/_geometry.py | 402 ++++++++++++++++++++----- 2 files changed, 334 insertions(+), 80 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index c2e9855d9e8..ddd6f37d083 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -99,10 +99,14 @@ Geometry Resize v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize RandomCrop v2.RandomCrop RandomResizedCrop v2.RandomResizedCrop + v2.RandomIoUCrop CenterCrop v2.CenterCrop FiveCrop @@ -111,17 +115,21 @@ Geometry v2.TenCrop Pad v2.Pad + v2.RandomZoomOut + RandomRotation + v2.RandomRotation RandomAffine v2.RandomAffine RandomPerspective v2.RandomPerspective - RandomRotation - v2.RandomRotation + ElasticTransform + v2.ElasticTransform RandomHorizontalFlip v2.RandomHorizontalFlip RandomVerticalFlip v2.RandomVerticalFlip + Color ----- diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 4d7a5fca384..c3342eb9926 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,16 +26,17 @@ class RandomHorizontalFlip(_RandomApplyTransform): - """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + """[BETA] Horizontally flip the input with a given probability. .. betastatus:: RandomHorizontalFlip transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - p (float): probability of the image being flipped. Default value is 0.5 + p (float, optional): probability of the input being flipped. 
Default value is 0.5 """ _v1_transform_cls = _transforms.RandomHorizontalFlip @@ -45,16 +46,17 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): - """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + """[BETA] Vertically flip the input with a given probability. .. betastatus:: RandomVerticalFlip transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - p (float): probability of the image being flipped. Default value is 0.5 + p (float, optional): probability of the input being flipped. Default value is 0.5 """ _v1_transform_cls = _transforms.RandomVerticalFlip @@ -64,12 +66,14 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): - """[BETA] Resize the input image/box/mask to the given size. + """[BETA] Resize the input to the given size. .. betastatus:: Resize transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. .. warning:: The output image might be different depending on its type: when downsampling, the interpolation of PIL images @@ -87,7 +91,7 @@ class Resize(Transform): .. note:: In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. - interpolation (InterpolationMode): Desired interpolation enum defined by + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. @@ -156,12 +160,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): - """[BETA] Crops the given image/box/mask at the center. + """[BETA] Crop the input at the center. .. betastatus:: CenterCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. 
Args: @@ -181,14 +188,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): - """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + """[BETA] Crop a random portion of the input and resize it to a given size. .. betastatus:: RandomResizedCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. - A crop of the original image is made: the crop has a random area (H * W) + A crop of the original input is made: the crop has a random area (H * W) and a random aspect ratio. This crop is finally resized to the given size. This is popularly used to train the Inception networks. @@ -199,11 +208,11 @@ class RandomResizedCrop(Transform): .. note:: In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. - scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop, before resizing. The scale is defined with respect to the area of the original image. - ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before resizing. - interpolation (InterpolationMode): Desired interpolation enum defined by + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. @@ -305,13 +314,13 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """[BETA] Crop the given image/box/mask into four corners and the central crop. + """[BETA] Crop the image or video into four corners and the central crop. .. betastatus:: FiveCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. .. Note:: This transform returns a tuple of images and there may be a mismatch in the number of @@ -367,14 +376,14 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of these (horizontal flipping is used by default). .. betastatus:: TenCrop transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions. 
+ If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a + :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. + For example, the image can have ``[..., C, H, W]`` shape. See :class:`~torchvision.transforms.v2.FiveCrop` for an example. @@ -387,7 +396,7 @@ class TenCrop(Transform): size (sequence or int): Desired output size of the crop. If size is an int instead of sequence like (h, w), a square crop (size, size) is made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). - vertical_flip (bool): Use vertical flipping instead of horizontal + vertical_flip (bool, optional): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -426,14 +435,14 @@ def _transform( class Pad(Transform): - """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + """[BETA] Pad the input on all sides with the given "pad" value. .. betastatus:: Pad transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, - at most 3 leading dimensions for mode edge, - and an arbitrary number of leading dimensions for mode constant + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: padding (int or sequence): Padding on each border. If a single int is provided this @@ -444,18 +453,17 @@ class Pad(Transform): .. note:: In torchscript mode padding as single int is not supported, use a sequence of length 1: ``[padding, ]``. - fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of - length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. - Only number is supported for torch Tensor. - Only int or tuple value is supported for PIL Image. - padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. - Default is constant. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is "constant". - constant: pads with a constant value, this value is specified with fill - edge: pads with the last value at the edge of the image. - If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode @@ -501,6 +509,37 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomZoomOut(_RandomApplyTransform): + """[BETA] "Zoom out" transformation from + `"SSD: Single Shot MultiBox Detector" `_. + + .. 
betastatus:: RandomZoomOut transform
+
+    This transformation randomly pads images, videos, bounding boxes and masks, creating a zoom-out effect.
+    The output spatial size is randomly sampled from the original size up to a maximum size configured
+    with the ``side_range`` parameter:
+
+    .. code-block:: python
+
+        r = uniform_sample(side_range[0], side_range[1])
+        output_width = input_width * r
+        output_height = input_height * r
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        fill (number or tuple or dict, optional): Pixel fill value used for the padded area.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+        side_range (sequence of floats, optional): tuple of two floats that defines the minimum and maximum
+            factors to scale the input size.
+        p (float, optional): probability that the zoom-out is applied. Default value is 0.5
+    """
+
    def __init__(
        self,
        fill: Union[datapoints._FillType, Dict[Type, datapoints._FillType]] = 0,
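The per-type dictionary form of ``fill`` documented above can be exercised directly. A minimal sketch (the tensor sizes, the ``side_range``, and the fill values below are arbitrary illustrations, not defaults):

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()
    from torchvision import datapoints
    from torchvision.transforms import v2

    img = datapoints.Image(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8))
    mask = datapoints.Mask(torch.zeros(64, 64, dtype=torch.int64))

    # Pad the image with gray (127) while keeping the mask padded with 0 (background).
    zoom_out = v2.RandomZoomOut(
        fill={datapoints.Image: 127, datapoints.Mask: 0},
        side_range=(1.0, 2.0),
        p=1.0,
    )
    out_img, out_mask = zoom_out(img, mask)
    print(out_img.shape, out_mask.shape)  # e.g. torch.Size([3, 101, 101]) torch.Size([101, 101])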
@@ -540,18 +579,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomRotation(Transform):
-    """[BETA] Rotate the image/box/mask by angle.
+    """[BETA] Rotate the input by angle.

     .. betastatus:: RandomRotation transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         degrees (sequence or number): Range of degrees to select from.
             If degrees is a number instead of sequence like (min, max), the range of degrees
             will be (-degrees, +degrees).
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
             :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
             If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
             The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
         expand (bool, optional): Optional expansion flag.
             If true, expands the output to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
             Note that the expand flag assumes rotation around the center and no translation.
         center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
             Default is the center of the image.
-        fill (sequence or number): Pixel fill value for the area outside the rotated
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the rotated image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.

     .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters

@@ -608,12 +652,14 @@


 class RandomAffine(Transform):
-    """[BETA] Random affine transformation of the image/box/mask keeping center invariant.
+    """[BETA] Random affine transformation of the input keeping center invariant.

     .. betastatus:: RandomAffine transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         degrees (sequence or number): Range of degrees to select from.
@@ -631,12 +677,15 @@ class RandomAffine(Transform):
             range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
             an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
             Will not apply shear by default.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
             :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
             If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
             The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
-        fill (sequence or number): Pixel fill value for the area outside the transformed
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
         center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
             Default is the center of the image.

@@ -724,13 +773,14 @@


 class RandomCrop(Transform):
-    """[BETA] Crop the given image/box/mask at a random location.
+    """[BETA] Crop the input at a random location.

     .. betastatus:: RandomCrop transform

-    If the image is torch Tensor, it is expected
-    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions,
-    but if non-constant padding is used, the input is expected to have at most 2 leading dimensions
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.

     Args:
         size (sequence or int): Desired output size of the crop.
If size is an @@ -745,21 +795,20 @@ class RandomCrop(Transform): .. note:: In torchscript mode padding as single int is not supported, use a sequence of length 1: ``[padding, ]``. - pad_if_needed (boolean): It will pad the image if smaller than the + pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Since cropping is done after padding, the padding seems to be done at a random offset. - fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of - length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant. - Only number is supported for torch Tensor. - Only int or tuple value is supported for PIL Image. - padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant. + Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively. + Fill value can be also a dictionary mapping data type to the fill value, e.g. + ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and + ``Mask`` will be filled with 0. + padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. - constant: pads with a constant value, this value is specified with fill - edge: pads with the last value at the edge of the image. - If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode @@ -879,23 +928,28 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): - """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + """[BETA] Perform a random perspective transformation of the input with a given probability. .. betastatus:: RandomPerspective transform - If the image is torch Tensor, it is expected - to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. Args: - distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1. Default is 0.5. - p (float): probability of the image being transformed. Default is 0.5. - interpolation (InterpolationMode): Desired interpolation enum defined by + p (float, optional): probability of the input being transformed. Default is 0.5. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. 
-        fill (sequence or number): Pixel fill value for the area outside the transformed
-            image. Default is ``0``. If given a number, the value is used for all bands respectively.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
     """

    _v1_transform_cls = _transforms.RandomPerspective
@@ -960,6 +1014,46 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ElasticTransform(Transform):
+    """[BETA] Transform the input with elastic transformations.
+
+    .. betastatus:: ElasticTransform transform
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Given alpha and sigma, it will generate displacement
+    vectors for all pixels based on random offsets. Alpha controls the strength
+    and sigma controls the smoothness of the displacements.
+    The displacements are added to an identity grid and the resulting grid is
+    used to transform the input.
+
+    .. note::
+        The implementation that transforms bounding boxes is approximate (not exact).
+        We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
+        This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
+        Our assumption is that ``displacement * displacement`` is small and can be ignored.
+        Large displacements would lead to large errors in the approximation.
+
+    Applications:
+        Randomly transforms the morphology of objects in images and produces a
+        see-through-water-like effect.
+
+    Args:
+        alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
+        sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        fill (number or tuple or dict, optional): Pixel fill value for the area outside the transformed image.
+            Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
+            Fill value can be also a dictionary mapping data type to the fill value, e.g.
+            ``fill={datapoints.Image: 127, datapoints.Mask: 0}`` where ``Image`` will be filled with 127 and
+            ``Mask`` will be filled with 0.
+    """
+
    _v1_transform_cls = _transforms.ElasticTransform

    def __init__(
@@ -1011,6 +1105,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomIoUCrop(Transform):
+    """[BETA] Random IoU crop transformation from
+    `"SSD: Single Shot MultiBox Detector" `_.
+
+    .. betastatus:: RandomIoUCrop transform
+
+    This transformation requires image or video data and ``datapoints.BoundingBox`` in the input.
+
+    .. 
warning:: + In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop` + must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately + after or later in the transforms pipeline. + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + min_scale (float, optional): Minimum factors to scale the input size. + max_scale (float, optional): Maximum factors to scale the input size. + min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video. + max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video. + sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and + a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]`` + trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap. + Default, 40. + """ + def __init__( self, min_scale: float = 0.3, @@ -1107,6 +1229,45 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ScaleJitter(Transform): + """[BETA] Perform Large Scale Jitter on the input according to + `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_. + + .. betastatus:: ScaleJitter transform + + If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, + :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) + it can have arbitrary number of leading batch dimensions. For example, + the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape. + + Args: + target_size (tuple of int): Target size. This parameter defines base scale for jittering, + e.g. ``min(target_size[0] / width, target_size[1] / height)``. + scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. 
This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        target_size: Tuple[int, int],
@@ -1135,6 +1296,43 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomShortestSize(Transform):
+    """[BETA] Randomly resize the input.
+
+    .. betastatus:: RandomShortestSize transform
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have an arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
+        max_size (int, optional): Maximum spatial size. Default is ``None``.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True``: will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support disabling antialiasing.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        min_size: Union[List[int], Tuple[int], int],
@@ -1166,6 +1364,54 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class RandomResize(Transform):
+    """[BETA] Randomly resize the input.
+
+    .. betastatus:: RandomResize transform
+
+    This transformation can be used together with ``RandomCrop`` as a data augmentation to train
+    models on image segmentation tasks.
+
+    Output spatial size is randomly sampled from the interval ``[min_size, max_size]``:
+
+    .. code-block:: python
+
+        size = uniform_sample(min_size, max_size)
+        output_width = size
+        output_height = size
+
+    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
+    :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
+    it can have an arbitrary number of leading batch dimensions. For example,
+    the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
+
+    Args:
+        min_size (int): Minimum output size for random sampling.
+        max_size (int): Maximum output size for random sampling.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
+            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
+            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
+            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
+            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
+        antialias (bool, optional): Whether to apply antialiasing.
+            It only affects **tensors** with bilinear or bicubic modes and it is
+            ignored otherwise: on PIL images, antialiasing is always applied on
+            bilinear or bicubic modes; on other modes (for PIL images and
+            tensors), antialiasing makes no sense and this parameter is ignored.
+            Possible values are:
+
+            - ``True``: will apply antialiasing for bilinear or bicubic modes.
+              Other modes aren't affected. This is probably what you want to use.
+            - ``False``: will not apply antialiasing for tensors on any mode. PIL
+              images are still antialiased on bilinear or bicubic modes, because
+              PIL doesn't support disabling antialiasing.
+            - ``None``: equivalent to ``False`` for tensors and ``True`` for
+              PIL images. This value exists for legacy reasons and you probably
+              don't want to use it unless you really know what you are doing.
+
+            The current default is ``None`` **but will change to** ``True`` **in
+            v0.17** for the PIL and Tensor backends to be consistent.
+    """
+
    def __init__(
        self,
        min_size: int,

From 15dfd27245acedd08b107083993f54ef766ef382 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 10:32:36 +0100
Subject: [PATCH 11/27] Cleanup for e2e gallery example for transforms v2
 (#7318)

---
 docs/source/conf.py               | 1 +
 gallery/plot_transforms_v2_e2e.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 304a1cc6e22..8b4ce17de9f 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -62,6 +62,7 @@
    "gallery_dirs": "auto_examples",  # path to where to save gallery generated output
    "backreferences_dir": "gen_modules/backreferences",
    "doc_module": ("torchvision",),
+    "remove_config_comments": True,
 }

 napoleon_use_ivar = True
diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
index 938578e4af9..533a3d5d752 100644
--- a/gallery/plot_transforms_v2_e2e.py
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -1,6 +1,6 @@
 """
 ==================================================
-transforms v2: End-to-end object detection example
+Transforms v2: End-to-end object detection example
 ==================================================

 Object detection is not supported out of the box by ``torchvision.transforms`` v1, since it only supports images.
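To make the resize-family docstrings above concrete, here is a minimal sketch of how these v2 transforms might be
combined on a dummy detection sample. The tensor sizes, box coordinates, and the choice of ``antialias=True`` are
illustrative assumptions of this example, not values taken from the patches:

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    from torchvision import datapoints
    from torchvision.transforms import v2 as transforms

    # Wrap the inputs as datapoints so the transforms also update the box coordinates.
    image = datapoints.Image(torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8))
    boxes = datapoints.BoundingBox([[10, 10, 100, 100]], format="XYXY", spatial_size=(480, 640))

    trans = transforms.Compose(
        [
            transforms.ScaleJitter(target_size=(480, 640), antialias=True),
            transforms.RandomShortestSize(min_size=[400, 500, 600], max_size=1000, antialias=True),
        ]
    )
    out_image, out_boxes = trans(image, boxes)
    print(out_image.shape, out_boxes.spatial_size)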
@@ -20,7 +20,6 @@

 import torchvision

-# sphinx_gallery_thumbnail_number = -1

 def show(sample):
    import matplotlib.pyplot as plt
@@ -125,6 +124,7 @@ def load_example_coco_detection_dataset(**kwargs):
 torch.manual_seed(3141)
 sample = dataset[0]

+# sphinx_gallery_thumbnail_number = 2
 show(sample)

From 1150f1cafb568c46950ff403308b41f7ab6a54bb Mon Sep 17 00:00:00 2001
From: mpearce25
Date: Fri, 24 Feb 2023 04:35:24 -0500
Subject: [PATCH 12/27] Singular Sanitize BoundingBox (#7316)

Co-authored-by: Nicolas Hug
---
 gallery/plot_transforms_v2_e2e.py      |  4 ++--
 test/test_transforms_v2.py             | 26 +++++++++++++-------------
 test/test_transforms_v2_consistency.py |  2 +-
 torchvision/transforms/v2/__init__.py  |  2 +-
 torchvision/transforms/v2/_geometry.py |  4 ++--
 torchvision/transforms/v2/_misc.py     |  6 +++---
 6 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/gallery/plot_transforms_v2_e2e.py b/gallery/plot_transforms_v2_e2e.py
index 533a3d5d752..aa25d214f31 100644
--- a/gallery/plot_transforms_v2_e2e.py
+++ b/gallery/plot_transforms_v2_e2e.py
@@ -105,13 +105,13 @@ def load_example_coco_detection_dataset(**kwargs):
        transforms.RandomHorizontalFlip(),
        transforms.ToImageTensor(),
        transforms.ConvertImageDtype(torch.float32),
-        transforms.SanitizeBoundingBoxes(),
+        transforms.SanitizeBoundingBox(),
    ]
 )

 ########################################################################################################################
 # .. note::
 #     Although the :class:`~torchvision.transforms.v2.SanitizeBoundingBox` transform is a no-op in this example, it
 #     should be placed at least once at the end of a detection pipeline to remove degenerate bounding boxes as well as
 #     the corresponding labels and optionally masks. It is particularly critical to add it if
 #     :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.

diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 9173ec14f2c..93d5f17fcbe 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -275,7 +275,7 @@ def test_common(self, transform, adapter, container_type, image_or_video, device
            boxes=datapoints.BoundingBox([[0, 0, 0, 0]], format=format, spatial_size=(224, 244)),
            labels=torch.tensor([3]),
        )
-        assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4)
+        assert transforms.SanitizeBoundingBox()(sample)["boxes"].shape == (0, 4)

    @parametrize(
        [
@@ -1876,7 +1876,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
            transforms.ConvertImageDtype(torch.float),
        ]
    if sanitize:
-        t += [transforms.SanitizeBoundingBoxes()]
+        t += [transforms.SanitizeBoundingBox()]
    t = transforms.Compose(t)

    num_boxes = 5
@@ -1917,7 +1917,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize):
    # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It
    # doesn't remove them strictly speaking, it just marks some boxes as
    # degenerate and those boxes will be later removed by
-    # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize
+    # SanitizeBoundingBox(), which we add to the pipelines if the sanitize
    # param is True.
    # Note that the values below are probably specific to the random seed
    # set above (which is fine).
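As a sketch of how the renamed transform is meant to be used after ``RandomIoUCrop`` (the image size, boxes, and labels
below are made-up values for the example, not taken from the test suite):

.. code-block:: python

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    from torchvision import datapoints
    from torchvision.transforms import v2 as transforms

    # RandomIoUCrop only marks boxes outside the crop as degenerate;
    # SanitizeBoundingBox then removes them together with their labels.
    pipeline = transforms.Compose(
        [
            transforms.RandomIoUCrop(),
            transforms.SanitizeBoundingBox(),
        ]
    )

    image = datapoints.Image(torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8))
    target = {
        "boxes": datapoints.BoundingBox(
            [[10, 10, 50, 50], [100, 100, 200, 200]], format="XYXY", spatial_size=(224, 224)
        ),
        "labels": torch.tensor([1, 2]),
    }
    out_image, out_target = pipeline(image, target)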
@@ -1989,7 +1989,7 @@ def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): img = sample.pop("image") sample = (img, sample) - out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + out = transforms.SanitizeBoundingBox(min_size=min_size, labels_getter=labels_getter)(sample) if sample_type is tuple: out_image = out[0] @@ -2023,13 +2023,13 @@ def test_sanitize_bounding_boxes_default_heuristic(key, sample_type): sample = {key: labels, "another_key": "whatever"} if sample_type is tuple: sample = (None, sample, "whatever_again") - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(sample) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(sample) is labels if key.lower() != "labels": # If "labels" is in the dict (case-insensitive), # it takes precedence over other keys which would otherwise be a match d = {key: "something_else", "labels": labels} - assert transforms.SanitizeBoundingBoxes._find_labels_default_heuristic(d) is labels + assert transforms.SanitizeBoundingBox._find_labels_default_heuristic(d) is labels def test_sanitize_bounding_boxes_errors(): @@ -2041,25 +2041,25 @@ def test_sanitize_bounding_boxes_errors(): ) with pytest.raises(ValueError, match="min_size must be >= 1"): - transforms.SanitizeBoundingBoxes(min_size=0) + transforms.SanitizeBoundingBox(min_size=0) with pytest.raises(ValueError, match="labels_getter should either be a str"): - transforms.SanitizeBoundingBoxes(labels_getter=12) + transforms.SanitizeBoundingBox(labels_getter=12) with pytest.raises(ValueError, match="Could not infer where the labels are"): bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(bad_labels_key) + transforms.SanitizeBoundingBox()(bad_labels_key) with pytest.raises(ValueError, match="If labels_getter is a str or 'default'"): not_a_dict = (good_bbox, torch.arange(good_bbox.shape[0])) - transforms.SanitizeBoundingBoxes()(not_a_dict) + transforms.SanitizeBoundingBox()(not_a_dict) with pytest.raises(ValueError, match="must be a tensor"): not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} - transforms.SanitizeBoundingBoxes()(not_a_tensor) + transforms.SanitizeBoundingBox()(not_a_tensor) with pytest.raises(ValueError, match="Number of boxes"): different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) with pytest.raises(ValueError, match="boxes must be of shape"): bad_bbox = datapoints.BoundingBox( # batch with 2 elements @@ -2071,7 +2071,7 @@ def test_sanitize_bounding_boxes_errors(): spatial_size=(20, 20), ) different_sizes = {"bbox": bad_bbox, "labels": torch.arange(bad_bbox.shape[0])} - transforms.SanitizeBoundingBoxes()(different_sizes) + transforms.SanitizeBoundingBox()(different_sizes) @pytest.mark.parametrize( diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 43f17c9b15a..059a230ee5c 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -1099,7 +1099,7 @@ def make_label(extra_dims, categories): v2_transforms.Compose( [ v2_transforms.RandomIoUCrop(), - v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + v2_transforms.SanitizeBoundingBox(labels_getter=lambda sample: sample[1]["labels"]), ] ), {"with_mask": False}, diff --git 
a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py
index 7ad72c00934..6573446a33a 100644
--- a/torchvision/transforms/v2/__init__.py
+++ b/torchvision/transforms/v2/__init__.py
@@ -40,7 +40,7 @@
    TenCrop,
 )
 from ._meta import ClampBoundingBox, ConvertBoundingBoxFormat, ConvertDtype, ConvertImageDtype
-from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBoxes, ToDtype
+from ._misc import GaussianBlur, Identity, Lambda, LinearTransformation, Normalize, SanitizeBoundingBox, ToDtype
 from ._temporal import UniformTemporalSubsample
 from ._type_conversion import PILToTensor, ToImagePIL, ToImageTensor, ToPILImage
diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py
index c3342eb9926..b2618bb892f 100644
--- a/torchvision/transforms/v2/_geometry.py
+++ b/torchvision/transforms/v2/_geometry.py
@@ -1114,7 +1114,7 @@ class RandomIoUCrop(Transform):

    .. warning::
        In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
-        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
+        must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBox`, either immediately
        after or later in the transforms pipeline.

    If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
@@ -1222,7 +1222,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:

        if isinstance(output, datapoints.BoundingBox):
            # We "mark" the invalid boxes as degenerate, and they can be
-            # removed by a later call to SanitizeBoundingBoxes()
+            # removed by a later call to SanitizeBoundingBox()
            output[~params["is_within_crop_area"]] = 0
        return output
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 8cc4aa6a3db..53975a2ad2a 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -246,7 +246,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
    return inpt.to(dtype=dtype)


-class SanitizeBoundingBoxes(Transform):
+class SanitizeBoundingBox(Transform):
    # This removes boxes and their corresponding labels:
    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
@@ -269,7 +269,7 @@ def __init__(
        elif callable(labels_getter):
            self._labels_getter = labels_getter
        elif isinstance(labels_getter, str):
-            self._labels_getter = lambda inputs: SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)[
+            self._labels_getter = lambda inputs: SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)[
                labels_getter  # type: ignore[index]
            ]
        elif labels_getter is None:
@@ -300,7 +300,7 @@ def _get_dict_or_second_tuple_entry(inputs: Any) -> Mapping[str, Any]:
 def _find_labels_default_heuristic(inputs: Dict[str, Any]) -> Optional[torch.Tensor]:
    # Tries to find a "labels" key, otherwise tries for the first key that contains "label" - case insensitive
    # Returns None if nothing is found
-    inputs = SanitizeBoundingBoxes._get_dict_or_second_tuple_entry(inputs)
+    inputs = SanitizeBoundingBox._get_dict_or_second_tuple_entry(inputs)
    candidate_key = None
    with suppress(StopIteration):
        candidate_key = next(key for key in inputs.keys() if key.lower() == "labels")
From 384162e0c86e15cf9965bed2450b90fcddcaca48 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 09:44:30 +0000
Subject: [PATCH 
13/27] Misc docs transforms v2(#7314) Co-authored-by: Philip Meier Co-authored-by: vfdev --- docs/source/transforms.rst | 2 ++ torchvision/transforms/v2/_color.py | 33 +++++++------------ torchvision/transforms/v2/_misc.py | 51 +++++++++++++++++++++++------ 3 files changed, 54 insertions(+), 32 deletions(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index ddd6f37d083..1dec6bedf15 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -190,6 +190,7 @@ Miscellaneous v2.RandomErasing Lambda v2.Lambda + v2.SanitizeBoundingBox .. _conversion_transforms: @@ -210,6 +211,7 @@ Conversion ConvertImageDtype v2.ConvertImageDtype v2.ConvertDtype + v2.ToDtype Auto-Augmentation ----------------- diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 2a581bf5640..237e8d6181a 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -15,17 +15,11 @@ class Grayscale(Transform): .. betastatus:: Grayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor`, it is expected + to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions Args: num_output_channels (int): (1 or 3) number of channels desired for output image - - Returns: - PIL Image: Grayscale version of the input. - - - If ``num_output_channels == 1`` : returned image is single channel - - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b """ _v1_transform_cls = _transforms.Grayscale @@ -50,18 +44,13 @@ class RandomGrayscale(_RandomApplyTransform): .. betastatus:: RandomGrayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, + where ... means an arbitrary number of leading dimensions + + The output has the same number of channels as the input. Args: p (float): probability that image should be converted to grayscale. - - Returns: - PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged - with probability (1-p). - - If input image is 1 channel: grayscale version is 1 channel - - If input image is 3 channel: grayscale version is 3 channel with r == g == b - """ _v1_transform_cls = _transforms.RandomGrayscale @@ -89,7 +78,7 @@ class ColorJitter(Transform): .. betastatus:: ColorJitter transform - If the image is torch Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. @@ -295,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform): .. betastatus:: RandomEqualize transform - If the image is torch Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". @@ -334,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform): .. 
betastatus:: RandomPosterize transform

-    If the image is torch Tensor, it should be of type torch.uint8,
+    If the input is a :class:`torch.Tensor`, it should be of type torch.uint8,
    and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

@@ -383,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform):

    .. betastatus:: RandomAutocontrast transform

-    If the image is torch Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

@@ -402,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform):

    .. betastatus:: RandomAdjustSharpness transform

-    If the image is torch Tensor,
+    If the input is a :class:`torch.Tensor`,
    it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 53975a2ad2a..2237334f7a2 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -15,13 +15,14 @@
 from .utils import has_any, is_simple_tensor, query_bounding_box


+# TODO: do we want/need to expose this?
 class Identity(Transform):
    def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
        return inpt


 class Lambda(Transform):
-    """[BETA] Apply a user-defined lambda as a transform.
+    """[BETA] Apply a user-defined function as a transform.

    .. betastatus:: Lambda transform

@@ -52,7 +53,7 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
-    """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.
+    """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.

    .. betastatus:: LinearTransformation transform

@@ -135,7 +136,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
-    """[BETA] Normalize a tensor image with mean and standard deviation.
+    """[BETA] Normalize a tensor image or video with mean and standard deviation.

    .. betastatus:: Normalize transform

@@ -179,7 +180,7 @@ class GaussianBlur(Transform):

    .. betastatus:: GaussianBlur transform

-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
@@ -188,9 +189,6 @@
        creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
        of float (min, max), sigma is chosen uniformly at random to lie in the
        given range.
-
-    Returns:
-        PIL Image or Tensor: Gaussian blurred version of the input image.
    """

    _v1_transform_cls = _transforms.GaussianBlur
@@ -225,6 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class ToDtype(Transform):
+    """[BETA] Converts the input to a specific dtype.
+
+    .. betastatus:: ToDtype transform
+
+    Args:
+        dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify
+            per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``. 
+    """
+
    _transformed_types = (torch.Tensor,)

    def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None:
@@ -247,9 +254,33 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class SanitizeBoundingBox(Transform):
-    # This removes boxes and their corresponding labels:
-    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
-    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
+
+    .. betastatus:: SanitizeBoundingBox transform
+
+    This transform removes bounding boxes and their associated labels/masks that:
+
+    - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes, but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input.
+            It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies
+            the key whose value corresponds to the labels. It can also be a callable that takes the same input
+            as the transform, and returns the labels.
+            By default, this will try to find a "labels" key in the input, if
+            the input is a dict or it is a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+    """

    def __init__(
        self,
From a01e485eac0685548bdc5a63ec0691d0409b5b5e Mon Sep 17 00:00:00 2001
From: vfdev
Date: Fri, 24 Feb 2023 11:08:06 +0100
Subject: [PATCH 14/27] Minor updates in autoaugment, augment docstring v2
 (#7317)

Co-authored-by: Nicolas Hug
---
 torchvision/transforms/v2/_augment.py      | 12 +++---
 torchvision/transforms/v2/_auto_augment.py | 44 +++++++++++++---------
 2 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py
index b5aac9ca9a2..0df7e0f249a 100644
--- a/torchvision/transforms/v2/_augment.py
+++ b/torchvision/transforms/v2/_augment.py
@@ -13,7 +13,7 @@


 class RandomErasing(_RandomApplyTransform):
-    """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels.
+    """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels.

    .. betastatus:: RandomErasing transform

@@ -21,14 +21,14 @@
    'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896

    Args:
-        p: probability that the random erasing operation will be performed.
-        scale: range of proportion of erased area against input image.
-        ratio: range of aspect ratio of erased area.
-        value: erasing value. Default is 0. If a single int, it is used to
+        p (float, optional): probability that the random erasing operation will be performed. 
+        scale (tuple of float, optional): range of proportion of erased area against input image.
+        ratio (tuple of float, optional): range of aspect ratio of erased area.
+        value (number or tuple of numbers, optional): erasing value. Default is 0. If a single int, it is used to
            erase all pixels. If a tuple of length 3, it is used to erase
            R, G, B channels respectively.
            If the string ``'random'``, each pixel is erased with random values.
-        inplace: boolean to make this transform inplace. Default set to False.
+        inplace (bool, optional): whether to make this transform inplace. Default is ``False``.

    Returns:
        Erased input.
diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py
index 98e23b99796..2cd88c1a74d 100644
--- a/torchvision/transforms/v2/_auto_augment.py
+++ b/torchvision/transforms/v2/_auto_augment.py
@@ -167,14 +167,16 @@ class AutoAugment(_AutoAugmentBase):

    .. betastatus:: AutoAugment transform

-    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
-        policy (AutoAugmentPolicy): Desired policy enum defined by
+        policy (AutoAugmentPolicy, optional): Desired policy enum defined by
            :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
@@ -342,15 +344,17 @@ class RandAugment(_AutoAugmentBase):

    .. betastatus:: RandAugment transform

-    If the image is torch Tensor, it should be of type torch.uint8, and it is expected
+    This transformation works on images and videos only.
+
+    If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected
    to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
    If img is PIL Image, it is expected to be in mode "L" or "RGB".

    Args:
-        num_ops (int): Number of augmentation transformations to apply sequentially.
-        magnitude (int): Magnitude for all the transformations.
-        num_magnitude_bins (int): The number of different magnitude values.
-        interpolation (InterpolationMode): Desired interpolation enum defined by
+        num_ops (int, optional): Number of augmentation transformations to apply sequentially.
+        magnitude (int, optional): Magnitude for all the transformations.
+        num_magnitude_bins (int, optional): The number of different magnitude values.
+        interpolation (InterpolationMode, optional): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
        fill (sequence or number, optional): Pixel fill value for the area outside the transformed
@@ -423,13 +427,15 @@ class TrivialAugmentWide(_AutoAugmentBase):

    .. 
betastatus:: TrivialAugmentWide transform - If the image is torch Tensor, it should be of type torch.uint8, and it is expected + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". Args: - num_magnitude_bins (int): The number of different magnitude values. - interpolation (InterpolationMode): Desired interpolation enum defined by + num_magnitude_bins (int, optional): The number of different magnitude values. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. fill (sequence or number, optional): Pixel fill value for the area outside the transformed @@ -492,18 +498,20 @@ class AugMix(_AutoAugmentBase): .. betastatus:: AugMix transform - If the image is torch Tensor, it should be of type torch.uint8, and it is expected + This transformation works on images and videos only. + + If the input is :class:`torch.Tensor`, it should be of type ``torch.uint8``, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". Args: - severity (int): The severity of base augmentation operators. Default is ``3``. - mixture_width (int): The number of augmentation chains. Default is ``3``. - chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + severity (int, optional): The severity of base augmentation operators. Default is ``3``. + mixture_width (int, optional): The number of augmentation chains. Default is ``3``. + chain_depth (int, optional): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. Default is ``-1``. - alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. - all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. - interpolation (InterpolationMode): Desired interpolation enum defined by + alpha (float, optional): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool, optional): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode, optional): Desired interpolation enum defined by :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. 
fill (sequence or number, optional): Pixel fill value for the area outside the transformed From 0dfc317f20b76a44f93c5903aa56802272c6fe54 Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 11:57:14 +0100 Subject: [PATCH 15/27] Fixed broken test_random_choice (#7315) --- test/test_transforms_v2.py | 2 +- test/test_transforms_v2_consistency.py | 2 +- torchvision/transforms/v2/_container.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 93d5f17fcbe..9beded4c957 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -1359,7 +1359,7 @@ def test_ctor(self, transform_cls, trfms): class TestRandomChoice: def test_assertions(self): - with pytest.raises(ValueError, match="The number of probabilities doesn't match the number of transforms"): + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py index 059a230ee5c..a8a87cd43dd 100644 --- a/test/test_transforms_v2_consistency.py +++ b/test/test_transforms_v2_consistency.py @@ -822,7 +822,7 @@ def test_random_choice(self, probabilities): v2_transforms.Resize(256), legacy_transforms.CenterCrop(224), ], - probabilities=probabilities, + p=probabilities, ) legacy_transform = legacy_transforms.RandomChoice( [ diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 27affc7100b..7f9df337352 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -139,7 +139,7 @@ def __init__( p = [1] * len(transforms) elif len(p) != len(transforms): raise ValueError( - f"The number of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" + f"Length of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" ) super().__init__() From 9d768dd1ef2c0858a889cb033eefd855aafd14f9 Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 12:11:29 +0100 Subject: [PATCH 16/27] Updated _meta.py docstrings (#7320) Co-authored-by: Nicolas Hug --- docs/source/transforms.rst | 2 ++ torchvision/transforms/v2/_meta.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 1dec6bedf15..8e3c60085de 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -191,6 +191,7 @@ Miscellaneous Lambda v2.Lambda v2.SanitizeBoundingBox + v2.ClampBoundingBox .. _conversion_transforms: @@ -212,6 +213,7 @@ Conversion v2.ConvertImageDtype v2.ConvertDtype v2.ToDtype + v2.ConvertBoundingBoxFormat Auto-Augmentation ----------------- diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 7d0f0ec39f9..94ec851d045 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,6 +9,15 @@ class ConvertBoundingBoxFormat(Transform): + """[BETA] Convert bounding box coordinates to the given ``format``, e.g. from "CXCYWH" to "XYXY". + + .. betastatus:: ConvertBoundingBoxFormat transform + + Args: + format (str or datapoints.BoundingBoxFormat): output bounding box format. + Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and + string values match the enums, e.g. "XYXY" or "XYWH" etc. 
+ """ _transformed_types = (datapoints.BoundingBox,) def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: @@ -22,7 +31,7 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): - """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly. .. betastatus:: ConvertDtype transform @@ -63,6 +72,13 @@ def _transform( class ClampBoundingBox(Transform): + """[BETA] Clamp bounding boxes to their corresponding image dimensions. + + The clamping is done according to the bounding boxes' ``spatial_size`` meta-data. + + .. betastatus:: ClampBoundingBox transform + + """ _transformed_types = (datapoints.BoundingBox,) def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox: From 4c0638b5ee1cbe759c3cb6aac95a450a480c2581 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 11:21:25 +0000 Subject: [PATCH 17/27] remove strEnum from BoundingBoxFormat (#7322) --- test/test_datapoints.py | 2 +- torchvision/datapoints/_bounding_box.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_datapoints.py b/test/test_datapoints.py index 5b875a6ef20..39c05123333 100644 --- a/test/test_datapoints.py +++ b/test/test_datapoints.py @@ -28,5 +28,5 @@ def test_bbox_instance(data, format): assert isinstance(bboxes, torch.Tensor) assert bboxes.ndim == 2 and bboxes.shape[1] == 4 if isinstance(format, str): - format = datapoints.BoundingBoxFormat.from_str(format.upper()) + format = datapoints.BoundingBoxFormat[(format.upper())] assert bboxes.format == format diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 1dc46f8f21a..75e779f0b21 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -1,18 +1,18 @@ from __future__ import annotations +from enum import Enum from typing import Any, List, Optional, Sequence, Tuple, Union import torch -from torchvision._utils import StrEnum from torchvision.transforms import InterpolationMode # TODO: this needs to be moved out of transforms from ._datapoint import _FillTypeJIT, Datapoint -class BoundingBoxFormat(StrEnum): - XYXY = StrEnum.auto() - XYWH = StrEnum.auto() - CXCYWH = StrEnum.auto() +class BoundingBoxFormat(Enum): + XYXY = "XYXY" + XYWH = "XYWH" + CXCYWH = "CXCYWH" class BoundingBox(Datapoint): @@ -39,7 +39,7 @@ def __new__( tensor = cls._to_tensor(data, dtype=dtype, device=device, requires_grad=requires_grad) if isinstance(format, str): - format = BoundingBoxFormat.from_str(format.upper()) + format = BoundingBoxFormat[format.upper()] return cls._wrap(tensor, format=format, spatial_size=spatial_size) From d21e38a9a375077115176f73784fcd459a2cc83c Mon Sep 17 00:00:00 2001 From: vfdev Date: Fri, 24 Feb 2023 12:43:31 +0100 Subject: [PATCH 18/27] Updated _type_conversion.py docs (#7324) Co-authored-by: Nicolas Hug --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_type_conversion.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 8e3c60085de..0e9b053fb72 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -209,6 +209,7 @@ Conversion v2.ToTensor PILToTensor v2.PILToTensor + v2.ToImageTensor ConvertImageDtype v2.ConvertImageDtype v2.ConvertDtype diff --git 
a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index b0743feb10d..504c5cc3d70 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -27,6 +27,13 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImageTensor(Transform): + """[BETA] Convert a tensor or an ndarray or PIL Image to :class:`~torchvision.datapoints.Image`. + + .. betastatus:: ToImageTensor transform + + This transform does not support torchscript. + """ + _transformed_types = (is_simple_tensor, PIL.Image.Image, np.ndarray) def _transform( From 4e040ee0bd663fe5616aee3eaa901322da3c9c9c Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 24 Feb 2023 13:05:56 +0100 Subject: [PATCH 19/27] add docs for datapoints (#7312) Co-authored-by: Nicolas Hug --- docs/source/datapoints.rst | 13 +++++++++ docs/source/index.rst | 1 + torchvision/datapoints/_bounding_box.py | 37 +++++++++++++++++++++++++ torchvision/datapoints/_image.py | 13 +++++++++ torchvision/datapoints/_mask.py | 13 +++++++++ torchvision/datapoints/_video.py | 12 ++++++++ 6 files changed, 89 insertions(+) create mode 100644 docs/source/datapoints.rst diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst new file mode 100644 index 00000000000..07e20b090e6 --- /dev/null +++ b/docs/source/datapoints.rst @@ -0,0 +1,13 @@ +Datapoints +========== + +.. currentmodule:: torchvision.datapoints +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBox + Mask diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..ac047ff5869 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -31,6 +31,7 @@ architectures, and common image transformations for computer vision. :maxdepth: 2 :caption: Package Reference + datapoints transforms models datasets diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py index 75e779f0b21..d8441823c3e 100644 --- a/torchvision/datapoints/_bounding_box.py +++ b/torchvision/datapoints/_bounding_box.py @@ -10,12 +10,35 @@ class BoundingBoxFormat(Enum): + """[BETA] Coordinate format of a bounding box. + + Available formats are + + * ``XYXY`` + * ``XYWH`` + * ``CXCYWH`` + """ + XYXY = "XYXY" XYWH = "XYWH" CXCYWH = "CXCYWH" class BoundingBox(Datapoint): + """[BETA] :class:`torch.Tensor` subclass for bounding boxes. + + Args: + data: Any data that can be turned into a tensor with :func:`torch.as_tensor`. + format (BoundingBoxFormat, str): Format of the bounding box. + spatial_size (two-tuple of ints): Height and width of the corresponding image or video. + dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from + ``data``. + device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a + :class:`torch.Tensor`, the device is taken from it. Otherwise, the bounding box is constructed on the CPU. + requires_grad (bool, optional): Whether autograd should record operations on the bounding box. If omitted and + ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``. + """ + format: BoundingBoxFormat spatial_size: Tuple[int, int] @@ -52,6 +75,20 @@ def wrap_like( format: Optional[BoundingBoxFormat] = None, spatial_size: Optional[Tuple[int, int]] = None, ) -> BoundingBox: + """Wrap a :class:`torch.Tensor` as :class:`BoundingBox` from a reference. 
+
+        Args:
+            other (BoundingBox): Reference bounding box.
+            tensor (Tensor): Tensor to be wrapped as :class:`BoundingBox`.
+            format (BoundingBoxFormat, str, optional): Format of the bounding box. If omitted, it is taken from the
+                reference.
+            spatial_size (two-tuple of ints, optional): Height and width of the corresponding image or video. If
+                omitted, it is taken from the reference.
+
+        """
+        if isinstance(format, str):
+            format = BoundingBoxFormat.from_str(format.upper())
+
        return cls._wrap(
            tensor,
            format=format if format is not None else other.format,
diff --git a/torchvision/datapoints/_image.py b/torchvision/datapoints/_image.py
index 21dfe5a5cd6..e47a6c10fc3 100644
--- a/torchvision/datapoints/_image.py
+++ b/torchvision/datapoints/_image.py
@@ -10,6 +10,19 @@


 class Image(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for images.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the image. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the image. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the image is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the image. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Image:
        image = tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_mask.py b/torchvision/datapoints/_mask.py
index bb70ec12224..0135d793d32 100644
--- a/torchvision/datapoints/_mask.py
+++ b/torchvision/datapoints/_mask.py
@@ -10,6 +10,19 @@


 class Mask(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for segmentation and detection masks.
+
+    Args:
+        data (tensor-like, PIL.Image.Image): Any data that can be turned into a tensor with :func:`torch.as_tensor` as
+            well as PIL images.
+        dtype (torch.dtype, optional): Desired data type of the mask. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the mask. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the mask is constructed on the CPU. 
+        requires_grad (bool, optional): Whether autograd should record operations on the mask. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Mask:
        return tensor.as_subclass(cls)
diff --git a/torchvision/datapoints/_video.py b/torchvision/datapoints/_video.py
index ab51c10233d..a6fbe2bd473 100644
--- a/torchvision/datapoints/_video.py
+++ b/torchvision/datapoints/_video.py
@@ -9,6 +9,18 @@


 class Video(Datapoint):
+    """[BETA] :class:`torch.Tensor` subclass for videos.
+
+    Args:
+        data (tensor-like): Any data that can be turned into a tensor with :func:`torch.as_tensor`.
+        dtype (torch.dtype, optional): Desired data type of the video. If omitted, will be inferred from
+            ``data``.
+        device (torch.device, optional): Desired device of the video. If omitted and ``data`` is a
+            :class:`torch.Tensor`, the device is taken from it. Otherwise, the video is constructed on the CPU.
+        requires_grad (bool, optional): Whether autograd should record operations on the video. If omitted and
+            ``data`` is a :class:`torch.Tensor`, the value is taken from it. Otherwise, defaults to ``False``.
+    """
+
    @classmethod
    def _wrap(cls, tensor: torch.Tensor) -> Video:
        video = tensor.as_subclass(cls)
From f62a045d8fabcbf941189b6f9e6673e164825015 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 13:46:18 +0000
Subject: [PATCH 20/27] Various doc enhancements (#7326)

Co-authored-by: Philip Meier
Co-authored-by: vfdev
---
 docs/source/conf.py                           |  1 +
 docs/source/datapoints.rst                    |  6 ++++++
 docs/source/index.rst                         |  2 +-
 docs/source/transforms.rst                    |  8 +++++++-
 torchvision/transforms/transforms.py          | 15 +++++++++++----
 torchvision/transforms/v2/_container.py       |  4 +---
 torchvision/transforms/v2/_deprecated.py      |  2 +-
 torchvision/transforms/v2/_meta.py            |  4 +++-
 torchvision/transforms/v2/_misc.py            |  8 +++++---
 torchvision/transforms/v2/_type_conversion.py |  7 ++++---
 10 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8b4ce17de9f..6d748f5b717 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -34,6 +34,7 @@
 sys.path.append(os.path.abspath("."))

 torchvision.disable_beta_transforms_warning()
+import torchvision.datapoints  # Don't remove, otherwise the docs for datapoints aren't linked properly

 # -- General configuration ------------------------------------------------

diff --git a/docs/source/datapoints.rst b/docs/source/datapoints.rst
index 07e20b090e6..1cc62413e66 100644
--- a/docs/source/datapoints.rst
+++ b/docs/source/datapoints.rst
@@ -2,6 +2,12 @@ Datapoints
 ==========

 .. currentmodule:: torchvision.datapoints
+
+Datapoints are tensor subclasses which the :mod:`~torchvision.transforms.v2` transforms use under the hood to
+dispatch their inputs to the appropriate lower-level kernels. Most users do not
+need to manipulate datapoints directly and can simply rely on dataset wrapping -
+see e.g. :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+
 .. autosummary::
    :toctree: generated/
    :template: class.rst
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ac047ff5869..bc38fdb0307 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -31,8 +31,8 @@ architectures, and common image transformations for computer vision.
   :maxdepth: 2
   :caption: Package Reference

-   datapoints
   transforms
+   datapoints
   models
   datasets
   utils
diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 0e9b053fb72..1fe3e78f55f 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -198,6 +198,12 @@ Miscellaneous
 Conversion
 ----------

+.. note::
+    Beware, some of these conversion transforms below will scale the values
+    while performing the conversion, while some may not do any scaling. By
+    scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0,
+    255] range into [0, 1] (and vice-versa).
+
 .. autosummary::
    :toctree: generated/
    :template: class.rst
diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py
index 90cb0374eee..95eb9199ef3 100644
--- a/torchvision/transforms/transforms.py
+++ b/torchvision/transforms/transforms.py
@@ -105,7 +105,9 @@ def __repr__(self) -> str:


 class ToTensor:
-    """Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. 
This transform does not support torchscript. + """Convert a PIL Image or ndarray to tensor and scale the values accordingly. + + This transform does not support torchscript. Converts a PIL Image or numpy.ndarray (H x W x C) in the range [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] @@ -139,7 +141,9 @@ def __repr__(self) -> str: class PILToTensor: - """Convert a ``PIL Image`` to a tensor of the same type. This transform does not support torchscript. + """Convert a PIL Image to a tensor of the same type - this does not scale values. + + This transform does not support torchscript. Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W). """ @@ -166,7 +170,8 @@ def __repr__(self) -> str: class ConvertImageDtype(torch.nn.Module): - """Convert a tensor image to the given ``dtype`` and scale the values accordingly + """Convert a tensor image to the given ``dtype`` and scale the values accordingly. + This function does not support PIL Image. Args: @@ -194,7 +199,9 @@ def forward(self, image): class ToPILImage: - """Convert a tensor or an ndarray to PIL Image. This transform does not support torchscript. + """Convert a tensor or an ndarray to PIL Image - this does not scale values. + + This transform does not support torchscript. Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape H x W x C to a PIL Image while preserving the value range. diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 7f9df337352..2f34a58902e 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -138,9 +138,7 @@ def __init__( if p is None: p = [1] * len(transforms) elif len(p) != len(transforms): - raise ValueError( - f"Length of p doesn't match the number of transforms: " f"{len(p)} != {len(transforms)}" - ) + raise ValueError(f"Length of p doesn't match the number of transforms: {len(p)} != {len(transforms)}") super().__init__() diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index c44e6b08d11..b5544ecfd49 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,7 +10,7 @@ class ToTensor(Transform): - """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. .. betastatus:: ToTensor transform diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 94ec851d045..7f28e25c602 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -9,7 +9,7 @@ class ConvertBoundingBoxFormat(Transform): - """[BETA] Convert bounding box coordinates to the given ``format``, e.g. from "CXCYWH" to "XYXY". + """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY". .. betastatus:: ConvertBoundingBoxFormat transform @@ -18,6 +18,7 @@ class ConvertBoundingBoxFormat(Transform): Possible values are defined by :class:`~torchvision.datapoints.BoundingBoxFormat` and string values match the enums, e.g. "XYXY" or "XYWH" etc. """ + _transformed_types = (datapoints.BoundingBox,) def __init__(self, format: Union[str, datapoints.BoundingBoxFormat]) -> None: @@ -79,6 +80,7 @@ class ClampBoundingBox(Transform): .. 
betastatus:: ClampBoundingBox transform """ + _transformed_types = (datapoints.BoundingBox,) def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> datapoints.BoundingBox: diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 2237334f7a2..40d57856292 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -223,13 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ToDtype(Transform): - """[BETA] Converts the input to a specific dtype. + """[BETA] Converts the input to a specific dtype - this does not scale values. .. betastatus:: ToDtype transform Args: - dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify - per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``. + dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to. + A dict can be passed to specify per-datapoint conversions, e.g. + ``dtype={datapoints.Image: torch.float32, datapoints.Video: + torch.float64}``. """ _transformed_types = (torch.Tensor,) diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 504c5cc3d70..92de314608c 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -11,7 +11,7 @@ class PILToTensor(Transform): - """[BETA] Convert a ``PIL Image`` to a tensor of the same type. + """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values. .. betastatus:: PILToTensor transform @@ -27,7 +27,8 @@ def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Ten class ToImageTensor(Transform): - """[BETA] Convert a tensor or an ndarray or PIL Image to :class:`~torchvision.datapoints.Image`. + """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` + ; this does not scale values. .. betastatus:: ToImageTensor transform @@ -43,7 +44,7 @@ def _transform( class ToImagePIL(Transform): - """[BETA] Convert a tensor or an ndarray to PIL Image. + """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values. .. betastatus:: ToImagePIL transform From 818b98904e5579a4e4d2335a5b5ab03be1fda3c3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 13:52:02 +0000 Subject: [PATCH 21/27] Add docs for UniformTemporalSubsample (#7325) Co-authored-by: Philip Meier Co-authored-by: vfdev --- docs/source/transforms.rst | 1 + torchvision/transforms/v2/_temporal.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 1fe3e78f55f..6957e79bbfa 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -192,6 +192,7 @@ Miscellaneous v2.Lambda v2.SanitizeBoundingBox v2.ClampBoundingBox + v2.UniformTemporalSubsample .. _conversion_transforms: diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py index b26d6b0450f..ad7526bc4a4 100644 --- a/torchvision/transforms/v2/_temporal.py +++ b/torchvision/transforms/v2/_temporal.py @@ -7,6 +7,19 @@ class UniformTemporalSubsample(Transform): + """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video. + + .. betastatus:: UniformTemporalSubsample transform + + Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension. 
+
+    When ``num_samples`` is larger than the size of the temporal dimension of the video, it
+    will sample frames based on nearest neighbor interpolation.
+
+    Args:
+        num_samples (int): The number of equispaced samples to be selected.
+    """
+
     _transformed_types = (is_simple_tensor, datapoints.Video)
 
     def __init__(self, num_samples: int):

From 198e6e49770171351ea554e5c0a3edbdcf181283 Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 15:12:46 +0100
Subject: [PATCH 22/27] fix BoundingBox.wrap_like (#7327)

---
 torchvision/datapoints/_bounding_box.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/datapoints/_bounding_box.py b/torchvision/datapoints/_bounding_box.py
index d8441823c3e..11d42f171e4 100644
--- a/torchvision/datapoints/_bounding_box.py
+++ b/torchvision/datapoints/_bounding_box.py
@@ -87,7 +87,7 @@ def wrap_like(
     """
 
     if isinstance(format, str):
-        format = BoundingBoxFormat.from_str(format.upper())
+        format = BoundingBoxFormat[format.upper()]
 
     return cls._wrap(
         tensor,

From d8083d5dbe6f63a0abfa348f403c0d0527eb9548 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Fri, 24 Feb 2023 14:45:06 +0000
Subject: [PATCH 23/27] Add docs for functionals v2 (#7328)

Co-authored-by: Philip Meier
---
 docs/source/transforms.rst | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 6957e79bbfa..22e0889a480 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -5,6 +5,22 @@ Transforming and augmenting images
 
 .. currentmodule:: torchvision.transforms
 
+
+.. note::
+    In 0.15, we released a new set of transforms available in the
+    ``torchvision.transforms.v2`` namespace, which add support for transforming
+    not just images but also bounding boxes, masks, or videos. These transforms
+    are fully backward compatible with the current ones, and you'll see them
+    documented below with a `v2.` prefix. To get started with those new
+    transforms, you can check out
+    :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`.
+    Note that these transforms are still BETA, and while we don't expect major
+    breaking changes in the future, some APIs may still change according to user
+    feedback. Please submit any feedback you may have in
+    https://github.com/pytorch/vision/issues/6753, and you can also check out
+    https://github.com/pytorch/vision/issues/7319 to learn more about the APIs
+    that we suspect might involve future changes.
+
 Transforms are common image transformations available in the
 ``torchvision.transforms`` module. They can be chained together using
 :class:`Compose`.
@@ -253,6 +269,14 @@ Functional Transforms
 
 .. currentmodule:: torchvision.transforms.functional
 
+
+.. note::
+    You'll find below the documentation for the existing
+    ``torchvision.transforms.functional`` namespace. The
+    ``torchvision.transforms.v2.functional`` namespace exists as well and can be
+    used! The same functionals are present, so you simply need to change your
+    import to rely on the ``v2`` namespace.
+
 Functional transforms give you fine-grained control of the transformation pipeline.
 As opposed to the transformations above, functional transforms don't contain a random number
 generator for their parameters.
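As a quick illustration of the import swap described in the note above (a minimal sketch, not part of the patch series, assuming a torchvision build that ships the beta ``torchvision.transforms.v2`` namespace):

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()

    # v1 code would do: from torchvision.transforms import functional as F
    from torchvision.transforms.v2 import functional as F  # only the import changes

    img = torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)
    out = F.resize(img, [224, 224], antialias=True)  # same call signature as the v1 functional
    print(out.shape)  # torch.Size([3, 224, 224])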
From 5ec46adb10bb309848ddffaa361c3c57956aa5fe Mon Sep 17 00:00:00 2001
From: Philip Meier
Date: Fri, 24 Feb 2023 15:45:33 +0100
Subject: [PATCH 24/27] add gallery example for datapoints (#7321)

Co-authored-by: vfdev
Co-authored-by: Nicolas Hug
---
 gallery/plot_datapoints.py | 132 +++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 gallery/plot_datapoints.py

diff --git a/gallery/plot_datapoints.py b/gallery/plot_datapoints.py
new file mode 100644
index 00000000000..83ca6793598
--- /dev/null
+++ b/gallery/plot_datapoints.py
@@ -0,0 +1,132 @@
+"""
+==============
+Datapoints FAQ
+==============
+
+The :mod:`torchvision.datapoints` namespace was introduced together with ``torchvision.transforms.v2``. This example
+showcases what these datapoints are and how they behave. This is a fairly low-level topic that most users will not need
+to worry about: you do not need to understand the internals of datapoints to efficiently rely on
+``torchvision.transforms.v2``. It may however be useful for advanced users trying to implement their own datasets
+or transforms, or to work directly with the datapoints.
+"""

+import PIL.Image
+
+import torch
+import torchvision
+
+# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that
+# some APIs may slightly change in the future
+torchvision.disable_beta_transforms_warning()
+
+from torchvision import datapoints
+
+
+########################################################################################################################
+# What are datapoints?
+# --------------------
+#
+# Datapoints are zero-copy tensor subclasses:

+tensor = torch.rand(3, 256, 256)
+image = datapoints.Image(tensor)
+
+assert isinstance(image, torch.Tensor)
+assert image.data_ptr() == tensor.data_ptr()


+########################################################################################################################
+# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
+# for the input data.
+#
+# What datapoints are supported?
+# ------------------------------
+#
+# So far :mod:`torchvision.datapoints` supports four types of datapoints:
+#
+# * :class:`~torchvision.datapoints.Image`
+# * :class:`~torchvision.datapoints.Video`
+# * :class:`~torchvision.datapoints.BoundingBox`
+# * :class:`~torchvision.datapoints.Mask`
+#
+# How do I construct a datapoint?
+# -------------------------------
+#
+# Each datapoint class takes any tensor-like data that can be turned into a :class:`~torch.Tensor`:

+image = datapoints.Image([[[[0, 1], [1, 0]]]])
+print(image)


+########################################################################################################################
+# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
+# parameters.
+ +float_image = datapoints.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +######################################################################################################################## +# In addition, :class:`~torchvision.datapoints.Image` and :class:`~torchvision.datapoints.Mask` also take a +# :class:`PIL.Image.Image` directly: + +image = datapoints.Image(PIL.Image.open("assets/astronaut.jpg")) +print(image.shape, image.dtype) + +######################################################################################################################## +# In general, the datapoints can also store additional metadata that complements the underlying tensor. For example, +# :class:`~torchvision.datapoints.BoundingBox` stores the coordinate format as well as the spatial size of the +# corresponding image alongside the actual values: + +bounding_box = datapoints.BoundingBox( + [17, 16, 344, 495], format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] +) +print(bounding_box) + + +######################################################################################################################## +# Do I have to wrap the output of the datasets myself? +# ---------------------------------------------------- +# +# Only if you are using custom datasets. For the built-in ones, you can use +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2`. Note that the function also supports subclasses of the +# built-in datasets. Meaning, if your custom dataset subclasses from a built-in one and the output type is the same, you +# also don't have to wrap manually. +# +# How do the datapoints behave inside a computation? +# -------------------------------------------------- +# +# Datapoints look and feel just like regular tensors. Everything that is supported on a plain :class:`torch.Tensor` +# also works on datapoints. +# Since for most operations involving datapoints, it cannot be safely inferred whether the result should retain the +# datapoint type, we choose to return a plain tensor instead of a datapoint (this might change, see note below): + +assert isinstance(image, datapoints.Image) + +new_image = image + 0 + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) + +######################################################################################################################## +# .. note:: +# +# This "unwrapping" behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# There are two exceptions to this rule: +# +# 1. The operations :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, and :meth:`~torch.Tensor.requires_grad_` +# retain the datapoint type. +# 2. Inplace operations on datapoints cannot change the type of the datapoint they are called on. 
However, if you use +# the flow style, the returned value will be unwrapped: + +image = datapoints.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +assert isinstance(image, torch.Tensor) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, datapoints.Image) +assert (new_image == image).all() From db6630ec37e90cb5248b748efcf053954afe7d83 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 16:16:34 +0000 Subject: [PATCH 25/27] Change betastatus doc warning and v2 import warning (#7329) --- docs/source/beta_status.py | 15 ++++++-- docs/source/transforms.rst | 8 ++--- torchvision/__init__.py | 9 ++--- torchvision/transforms/v2/_augment.py | 2 +- torchvision/transforms/v2/_auto_augment.py | 8 ++--- torchvision/transforms/v2/_color.py | 20 +++++------ torchvision/transforms/v2/_container.py | 8 ++--- torchvision/transforms/v2/_deprecated.py | 2 +- torchvision/transforms/v2/_geometry.py | 36 +++++++++---------- torchvision/transforms/v2/_meta.py | 6 ++-- torchvision/transforms/v2/_misc.py | 12 +++---- torchvision/transforms/v2/_temporal.py | 2 +- torchvision/transforms/v2/_type_conversion.py | 6 ++-- 13 files changed, 73 insertions(+), 61 deletions(-) diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..4a0fdc72c0f 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + text = self.text.format(api_name=" ".join(self.content)) return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect major breaking changes, " + "some APIs may still change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check " + "out https://github.com/pytorch/vision/issues/7319 to learn " + "more about the APIs that we suspect might involve future changes." + ) + + def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 22e0889a480..0d6961bbe79 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -16,10 +16,10 @@ Transforming and augmenting images :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. Note that these transforms are still BETA, and while we don't expect major breaking changes in the future, some APIs may still change according to user - feedback. Please submit any feedback you may have in - https://github.com/pytorch/vision/issues/6753, and you can also check out - https://github.com/pytorch/vision/issues/7319 to learn more about the APIs - that we suspect might involve future changes. + feedback. Please submit any feedback you may have `here + `_, and you can also check + out `this issue `_ to learn + more about the APIs that we suspect might involve future changes. Transforms are common image transformations available in the ``torchvision.transforms`` module. 
They can be chained together using diff --git a/torchvision/__init__.py b/torchvision/__init__.py index f29da9cf644..eed24091a52 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -100,10 +100,11 @@ def _is_tracing(): _WARN_ABOUT_BETA_TRANSFORMS = True _BETA_TRANSFORMS_WARNING = ( "The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. " - "While we will try our best to maintain backward compatibility, " - "some APIs or behaviors might change without a deprecation cycle. " - "To help us improve these new features, please provide your feedback " - "here: https://github.com/pytorch/vision/issues/6753." + "While we do not expect major breaking changes, some APIs may still change " + "according to user feedback. Please submit any feedback you may have in " + "this issue: https://github.com/pytorch/vision/issues/6753, and you can also " + "check out https://github.com/pytorch/vision/issues/7319 to learn more about " + "the APIs that we suspect might involve future changes. " "You can silence this warning by calling torchvision.disable_beta_transform_warning()." ) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 0df7e0f249a..937e3508a87 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -15,7 +15,7 @@ class RandomErasing(_RandomApplyTransform): """[BETA] Randomly select a rectangle region in the input image or video and erase its pixels. - .. betastatus:: RandomErasing transform + .. v2betastatus:: RandomErasing transform This transform does not support PIL Image. 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 2cd88c1a74d..34c0ced43d2 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -165,7 +165,7 @@ class AutoAugment(_AutoAugmentBase): r"""[BETA] AutoAugment data augmentation method based on `"AutoAugment: Learning Augmentation Strategies from Data" `_. - .. betastatus:: AutoAugment transform + .. v2betastatus:: AutoAugment transform This transformation works on images and videos only. @@ -342,7 +342,7 @@ class RandAugment(_AutoAugmentBase): `"RandAugment: Practical automated data augmentation with a reduced search space" `_. - .. betastatus:: RandAugment transform + .. v2betastatus:: RandAugment transform This transformation works on images and videos only. @@ -425,7 +425,7 @@ class TrivialAugmentWide(_AutoAugmentBase): r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. - .. betastatus:: TrivialAugmentWide transform + .. v2betastatus:: TrivialAugmentWide transform This transformation works on images and videos only. @@ -496,7 +496,7 @@ class AugMix(_AutoAugmentBase): r"""[BETA] AugMix data augmentation method based on `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. - .. betastatus:: AugMix transform + .. v2betastatus:: AugMix transform This transformation works on images and videos only. diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 237e8d6181a..4ad534c988b 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -13,7 +13,7 @@ class Grayscale(Transform): """[BETA] Convert images or videos to grayscale. - .. 
betastatus:: Grayscale transform + .. v2betastatus:: Grayscale transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions @@ -42,7 +42,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): """[BETA] Randomly convert image or videos to grayscale with a probability of p (default 0.1). - .. betastatus:: RandomGrayscale transform + .. v2betastatus:: RandomGrayscale transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions @@ -76,7 +76,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): """[BETA] Randomly change the brightness, contrast, saturation and hue of an image or video. - .. betastatus:: ColorJitter transform + .. v2betastatus:: ColorJitter transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -182,7 +182,7 @@ class RandomPhotometricDistort(Transform): """[BETA] Randomly distorts the image or video as used in `SSD: Single Shot MultiBox Detector `_. - .. betastatus:: RandomPhotometricDistort transform + .. v2betastatus:: RandomPhotometricDistort transform This transform relies on :class:`~torchvision.transforms.v2.ColorJitter` under the hood to adjust the contrast, saturation, hue, brightness, and also @@ -282,7 +282,7 @@ def _transform( class RandomEqualize(_RandomApplyTransform): """[BETA] Equalize the histogram of the given image or video with a given probability. - .. betastatus:: RandomEqualize transform + .. v2betastatus:: RandomEqualize transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -301,7 +301,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): """[BETA] Inverts the colors of the given image or video with a given probability. - .. betastatus:: RandomInvert transform + .. v2betastatus:: RandomInvert transform If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, where ... means it can have an arbitrary number of leading dimensions. @@ -321,7 +321,7 @@ class RandomPosterize(_RandomApplyTransform): """[BETA] Posterize the image or video with a given probability by reducing the number of bits for each color channel. - .. betastatus:: RandomPosterize transform + .. v2betastatus:: RandomPosterize transform If the input is a :class:`torch.Tensor`, it should be of type torch.uint8, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -346,7 +346,7 @@ class RandomSolarize(_RandomApplyTransform): """[BETA] Solarize the image or video with a given probability by inverting all pixel values above a threshold. - .. betastatus:: RandomSolarize transform + .. v2betastatus:: RandomSolarize transform If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, where ... means it can have an arbitrary number of leading dimensions. @@ -370,7 +370,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): """[BETA] Autocontrast the pixels of the given image or video with a given probability. - .. betastatus:: RandomAutocontrast transform + .. 
v2betastatus:: RandomAutocontrast transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. @@ -389,7 +389,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): """[BETA] Adjust the sharpness of the image or video with a given probability. - .. betastatus:: RandomAdjustSharpness transform + .. v2betastatus:: RandomAdjustSharpness transform If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 2f34a58902e..fffef4157bd 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -10,7 +10,7 @@ class Compose(Transform): """[BETA] Composes several transforms together. - .. betastatus:: Compose transform + .. v2betastatus:: Compose transform This transform does not support torchscript. Please, see the note below. @@ -61,7 +61,7 @@ def extra_repr(self) -> str: class RandomApply(Transform): """[BETA] Apply randomly a list of transformations with a given probability. - .. betastatus:: RandomApply transform + .. v2betastatus:: RandomApply transform .. note:: In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of @@ -116,7 +116,7 @@ def extra_repr(self) -> str: class RandomChoice(Transform): """[BETA] Apply single transformation randomly picked from a list. - .. betastatus:: RandomChoice transform + .. v2betastatus:: RandomChoice transform This transform does not support torchscript. @@ -155,7 +155,7 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): """[BETA] Apply a list of transformations in a random order. - .. betastatus:: RandomOrder transform + .. v2betastatus:: RandomOrder transform This transform does not support torchscript. diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index b5544ecfd49..e900e853d2b 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -12,7 +12,7 @@ class ToTensor(Transform): """[BETA] Convert a PIL Image or ndarray to tensor and scale the values accordingly. - .. betastatus:: ToTensor transform + .. v2betastatus:: ToTensor transform .. warning:: :class:`v2.ToTensor` is deprecated and will be removed in a future release. diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index b2618bb892f..59791c30b9d 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -28,7 +28,7 @@ class RandomHorizontalFlip(_RandomApplyTransform): """[BETA] Horizontally flip the input with a given probability. - .. betastatus:: RandomHorizontalFlip transform + .. v2betastatus:: RandomHorizontalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -48,7 +48,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): """[BETA] Vertically flip the input with a given probability. - .. betastatus:: RandomVerticalFlip transform + .. 
v2betastatus:: RandomVerticalFlip transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -68,7 +68,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): """[BETA] Resize the input to the given size. - .. betastatus:: Resize transform + .. v2betastatus:: Resize transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -162,7 +162,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class CenterCrop(Transform): """[BETA] Crop the input at the center. - .. betastatus:: CenterCrop transform + .. v2betastatus:: CenterCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -190,7 +190,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): """[BETA] Crop a random portion of the input and resize it to a given size. - .. betastatus:: RandomResizedCrop transform + .. v2betastatus:: RandomResizedCrop transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -316,7 +316,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): """[BETA] Crop the image or video into four corners and the central crop. - .. betastatus:: FiveCrop transform + .. v2betastatus:: FiveCrop transform If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. @@ -379,7 +379,7 @@ class TenCrop(Transform): """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of these (horizontal flipping is used by default). - .. betastatus:: TenCrop transform + .. v2betastatus:: TenCrop transform If the input is a :class:`torch.Tensor` or a :class:`~torchvision.datapoints.Image` or a :class:`~torchvision.datapoints.Video` it can have arbitrary number of leading batch dimensions. @@ -437,7 +437,7 @@ def _transform( class Pad(Transform): """[BETA] Pad the input on all sides with the given "pad" value. - .. betastatus:: Pad transform + .. v2betastatus:: Pad transform If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`, :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.) @@ -512,7 +512,7 @@ class RandomZoomOut(_RandomApplyTransform): """[BETA] "Zoom out" transformation from `"SSD: Single Shot MultiBox Detector" `_. - .. betastatus:: RandomZoomOut transform + .. v2betastatus:: RandomZoomOut transform This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect. Output spatial size is randomly sampled from original size up to a maximum size configured @@ -581,7 +581,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): """[BETA] Rotate the input by angle. - .. betastatus:: RandomRotation transform + .. 
v2betastatus:: RandomRotation transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -654,7 +654,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomAffine(Transform):
     """[BETA] Random affine transformation the input keeping center invariant.
 
-    .. betastatus:: RandomAffine transform
+    .. v2betastatus:: RandomAffine transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -775,7 +775,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomCrop(Transform):
     """[BETA] Crop the input at a random location.
 
-    .. betastatus:: RandomCrop transform
+    .. v2betastatus:: RandomCrop transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -930,7 +930,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomPerspective(_RandomApplyTransform):
     """[BETA] Perform a random perspective transformation of the input with a given probability.
 
-    .. betastatus:: RandomPerspective transform
+    .. v2betastatus:: RandomPerspective transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1016,7 +1016,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class ElasticTransform(Transform):
     """[BETA] Transform the input with elastic transformations.
 
-    .. betastatus:: RandomPerspective transform
+    .. v2betastatus:: ElasticTransform transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1108,7 +1108,7 @@ class RandomIoUCrop(Transform):
     """[BETA] Random IoU crop transformation from
     `"SSD: Single Shot MultiBox Detector" `_.
 
-    .. betastatus:: RandomIoUCrop transform
+    .. v2betastatus:: RandomIoUCrop transform
 
     This transformation requires an image or video data and ``datapoints.BoundingBox`` in the input.
@@ -1232,7 +1232,7 @@ class ScaleJitter(Transform):
     """[BETA] Perform Large Scale Jitter on the input according to
     `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" `_.
 
-    .. betastatus:: ScaleJitter transform
+    .. v2betastatus:: ScaleJitter transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1298,7 +1298,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomShortestSize(Transform):
     """[BETA] Randomly resize the input.
 
-    .. betastatus:: RandomShortestSize transform
+    .. v2betastatus:: RandomShortestSize transform
 
     If the input is a :class:`torch.Tensor` or a ``Datapoint`` (e.g. :class:`~torchvision.datapoints.Image`,
     :class:`~torchvision.datapoints.Video`, :class:`~torchvision.datapoints.BoundingBox` etc.)
@@ -1366,7 +1366,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class RandomResize(Transform):
     """[BETA] Randomly resize the input.
 
-    .. 
betastatus:: RandomResize transform
+    .. v2betastatus:: RandomResize transform
 
     This transformation can be used together with ``RandomCrop`` as data augmentations to train
     models on image segmentation task.
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
index 7f28e25c602..b7e2a42259f 100644
--- a/torchvision/transforms/v2/_meta.py
+++ b/torchvision/transforms/v2/_meta.py
@@ -11,7 +11,7 @@ class ConvertBoundingBoxFormat(Transform):
     """[BETA] Convert bounding box coordinates to the given ``format``, eg from "CXCYWH" to "XYXY".
 
-    .. betastatus:: ConvertBoundingBoxFormat transform
+    .. v2betastatus:: ConvertBoundingBoxFormat transform
 
     Args:
         format (str or datapoints.BoundingBoxFormat): output bounding box format.
@@ -34,7 +34,7 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da
 class ConvertDtype(Transform):
     """[BETA] Convert input image or video to the given ``dtype`` and scale the values accordingly.
 
-    .. betastatus:: ConvertDtype transform
+    .. v2betastatus:: ConvertDtype transform
 
     This function does not support PIL Image.
 
@@ -77,7 +77,7 @@ class ClampBoundingBox(Transform):
 
     The clamping is done according to the bounding boxes' ``spatial_size`` meta-data.
 
-    .. betastatus:: ClampBoundingBox transform
+    .. v2betastatus:: ClampBoundingBox transform
 
     """
 
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 40d57856292..c9b9025ebd9 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -24,7 +24,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class Lambda(Transform):
     """[BETA] Apply a user-defined function as a transform.
 
-    .. betastatus:: Lambda transform
+    .. v2betastatus:: Lambda transform
 
     This transform does not support torchscript.
 
@@ -55,7 +55,7 @@ def extra_repr(self) -> str:
 class LinearTransformation(Transform):
     """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
 
-    .. betastatus:: LinearTransformation transform
+    .. v2betastatus:: LinearTransformation transform
 
     This transform does not support PIL Image.
     Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
@@ -138,7 +138,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class Normalize(Transform):
     """[BETA] Normalize a tensor image or video with mean and standard deviation.
 
-    .. betastatus:: Normalize transform
+    .. v2betastatus:: Normalize transform
 
     This transform does not support PIL Image.
     Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
@@ -178,7 +178,7 @@ def _transform(
 class GaussianBlur(Transform):
     """[BETA] Blurs image with randomly chosen Gaussian blur.
 
-    .. betastatus:: GausssianBlur transform
+    .. v2betastatus:: GaussianBlur transform
 
     If the input is a Tensor, it is expected
     to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
@@ -225,7 +225,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 class ToDtype(Transform):
     """[BETA] Converts the input to a specific dtype - this does not scale values.
 
-    .. betastatus:: ToDtype transform
+    .. v2betastatus:: ToDtype transform
 
     Args:
        dtype (``torch.dtype`` or dict of ``Datapoint`` -> ``torch.dtype``): The dtype to convert to. 
@@ -258,7 +258,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class SanitizeBoundingBox(Transform): """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks. - .. betastatus:: SanitizeBoundingBox transform + .. v2betastatus:: SanitizeBoundingBox transform This transform removes bounding boxes and their associated labels/masks that: diff --git a/torchvision/transforms/v2/_temporal.py b/torchvision/transforms/v2/_temporal.py index ad7526bc4a4..df4ad66643a 100644 --- a/torchvision/transforms/v2/_temporal.py +++ b/torchvision/transforms/v2/_temporal.py @@ -9,7 +9,7 @@ class UniformTemporalSubsample(Transform): """[BETA] Uniformly subsample ``num_samples`` indices from the temporal dimension of the video. - .. betastatus:: UniformTemporalSubsample transform + .. v2betastatus:: UniformTemporalSubsample transform Videos are expected to be of shape ``[..., T, C, H, W]`` where ``T`` denotes the temporal dimension. diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py index 92de314608c..60f44c5d3db 100644 --- a/torchvision/transforms/v2/_type_conversion.py +++ b/torchvision/transforms/v2/_type_conversion.py @@ -13,7 +13,7 @@ class PILToTensor(Transform): """[BETA] Convert a PIL Image to a tensor of the same type - this does not scale values. - .. betastatus:: PILToTensor transform + .. v2betastatus:: PILToTensor transform This transform does not support torchscript. @@ -30,7 +30,7 @@ class ToImageTensor(Transform): """[BETA] Convert a tensor, ndarray, or PIL Image to :class:`~torchvision.datapoints.Image` ; this does not scale values. - .. betastatus:: ToImageTensor transform + .. v2betastatus:: ToImageTensor transform This transform does not support torchscript. """ @@ -46,7 +46,7 @@ def _transform( class ToImagePIL(Transform): """[BETA] Convert a tensor or an ndarray to PIL Image - this does not scale values. - .. betastatus:: ToImagePIL transform + .. v2betastatus:: ToImagePIL transform This transform does not support torchscript. From 9de95667f506818e41d135bc18fc48351e6a72e2 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Fri, 24 Feb 2023 17:40:42 +0100 Subject: [PATCH 26/27] add gallery for transforms v2 (#7331) --- gallery/plot_transforms_v2.py | 109 ++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 gallery/plot_transforms_v2.py diff --git a/gallery/plot_transforms_v2.py b/gallery/plot_transforms_v2.py new file mode 100644 index 00000000000..d1096bec1e7 --- /dev/null +++ b/gallery/plot_transforms_v2.py @@ -0,0 +1,109 @@ +""" +================================== +Getting started with transforms v2 +================================== + +Most computer vision tasks are not supported out of the box by ``torchvision.transforms`` v1, since it only supports +images. ``torchvision.transforms.v2`` enables jointly transforming images, videos, bounding boxes, and masks. This +example showcases the core functionality of the new ``torchvision.transforms.v2`` API. 
+""" + +import pathlib + +import torch +import torchvision + + +def load_data(): + from torchvision.io import read_image + from torchvision import datapoints + from torchvision.ops import masks_to_boxes + + assets_directory = pathlib.Path("assets") + + path = assets_directory / "FudanPed00054.png" + image = datapoints.Image(read_image(str(path))) + merged_masks = read_image(str(assets_directory / "FudanPed00054_mask.png")) + + labels = torch.unique(merged_masks)[1:] + + masks = datapoints.Mask(merged_masks == labels.view(-1, 1, 1)) + + bounding_boxes = datapoints.BoundingBox( + masks_to_boxes(masks), format=datapoints.BoundingBoxFormat.XYXY, spatial_size=image.shape[-2:] + ) + + return path, image, bounding_boxes, masks, labels + + +######################################################################################################################## +# The :mod:`torchvision.transforms.v2` API supports images, videos, bounding boxes, and instance and segmentation +# masks. Thus, it offers native support for many Computer Vision tasks, like image and video classification, object +# detection or instance and semantic segmentation. Still, the interface is the same, making +# :mod:`torchvision.transforms.v2` a drop-in replacement for the existing :mod:`torchvision.transforms` API, aka v1. + +# We are using BETA APIs, so we deactivate the associated warning, thereby acknowledging that +# some APIs may slightly change in the future +torchvision.disable_beta_transforms_warning() +import torchvision.transforms.v2 as transforms + +transform = transforms.Compose( + [ + transforms.ColorJitter(contrast=0.5), + transforms.RandomRotation(30), + transforms.CenterCrop(480), + ] +) + +######################################################################################################################## +# :mod:`torchvision.transforms.v2` natively supports jointly transforming multiple inputs while making sure that +# potential random behavior is consistent across all inputs. However, it doesn't enforce a specific input structure or +# order. + +path, image, bounding_boxes, masks, labels = load_data() + +torch.manual_seed(0) +new_image = transform(image) # Image Classification +new_image, new_bounding_boxes, new_labels = transform(image, bounding_boxes, labels) # Object Detection +new_image, new_bounding_boxes, new_masks, new_labels = transform( + image, bounding_boxes, masks, labels +) # Instance Segmentation +new_image, new_target = transform((image, {"boxes": bounding_boxes, "labels": labels})) # Arbitrary Structure + +######################################################################################################################## +# Under the hood, :mod:`torchvision.transforms.v2` relies on :mod:`torchvision.datapoints` for the dispatch to the +# appropriate function for the input data: :ref:`sphx_glr_auto_examples_plot_datapoints.py`. Note however, that as +# regular user, you likely don't have to touch this yourself. See +# :ref:`sphx_glr_auto_examples_plot_transforms_v2_e2e.py`. 
+#
+# All "foreign" types like :class:`str`'s or :class:`pathlib.Path`'s are passed through, allowing you to store extra
+# information directly with the sample:

+sample = {"path": path, "image": image}
+new_sample = transform(sample)
+
+assert new_sample["path"] is sample["path"]

+########################################################################################################################
+# As stated above, :mod:`torchvision.transforms.v2` is a drop-in replacement for :mod:`torchvision.transforms` and thus
+# also supports transforming plain :class:`torch.Tensor`'s as image or video if applicable. This is achieved with a
+# simple heuristic:
+#
+# * If we find an explicit image or video (:class:`torchvision.datapoints.Image`, :class:`torchvision.datapoints.Video`,
+#   or :class:`PIL.Image.Image`) in the input, all other plain tensors are passed through.
+# * If there is no explicit image or video, only the first plain :class:`torch.Tensor` will be transformed as image or
+#   video, while all others will be passed through.

+plain_tensor_image = torch.rand(image.shape)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor together with an explicit image will not transform the former
+plain_tensor_image, image = transform(plain_tensor_image, image)
+
+print(image.shape, plain_tensor_image.shape)
+
+# passing a plain tensor without an explicit image will transform the former
+plain_tensor_image, _ = transform(plain_tensor_image, bounding_boxes)
+
+print(image.shape, plain_tensor_image.shape)

From fdf72de88d508e9cf06848dec00ada3e223cf265 Mon Sep 17 00:00:00 2001
From: vfdev
Date: Fri, 24 Feb 2023 18:18:12 +0100
Subject: [PATCH 27/27] Fixed uncaught warnings in tests v2 (#7330)

Co-authored-by: Nicolas Hug
---
 test/test_transforms_v2.py | 41 +++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py
index 9beded4c957..f5ca976963a 100644
--- a/test/test_transforms_v2.py
+++ b/test/test_transforms_v2.py
@@ -136,14 +136,14 @@ class TestSmoke:
         (transforms.RandomCrop([16, 16], pad_if_needed=True), None),
         (transforms.RandomHorizontalFlip(p=1.0), None),
         (transforms.RandomPerspective(p=1.0), None),
-        (transforms.RandomResize(min_size=10, max_size=20), None),
-        (transforms.RandomResizedCrop([16, 16]), None),
+        (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None),
+        (transforms.RandomResizedCrop([16, 16], antialias=True), None),
         (transforms.RandomRotation(degrees=30), None),
-        (transforms.RandomShortestSize(min_size=10), None),
+        (transforms.RandomShortestSize(min_size=10, antialias=True), None),
         (transforms.RandomVerticalFlip(p=1.0), None),
         (transforms.RandomZoomOut(p=1.0), None),
         (transforms.Resize([16, 16], antialias=True), None),
-        (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2)), None),
+        (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None),
         (transforms.ClampBoundingBox(), None),
         (transforms.ConvertBoundingBoxFormat(datapoints.BoundingBoxFormat.CXCYWH), None),
         (transforms.ConvertDtype(), None),
@@ -1514,7 +1514,7 @@ class TestRandomShortestSize:
     def test__get_params(self, min_size, max_size, mocker):
         spatial_size = (3, 10)
 
-        transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size)
+        transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True)
 
         sample = mocker.MagicMock(spec=datapoints.Image, num_channels=3, spatial_size=spatial_size)
        params = 
transform._get_params([sample]) @@ -1595,7 +1595,7 @@ def test__get_params(self): min_size = 3 max_size = 6 - transform = transforms.RandomResize(min_size=min_size, max_size=max_size) + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) for _ in range(10): params = transform._get_params([]) @@ -1791,15 +1791,21 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): else: sample = image, label + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + t = transforms.Compose( [ - transforms.RandomResizedCrop((224, 224)), + transforms.RandomResizedCrop((224, 224), antialias=True), transforms.RandomHorizontalFlip(p=1), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix(), transforms.AutoAugment(), - to_tensor(), + to_tensor, # TODO: ConvertImageDtype is a pass-through on PIL images, is that # intended? This results in a failure if we convert to tensor after # it, because the image would still be uint8 which make Normalize @@ -1830,10 +1836,17 @@ def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): @pytest.mark.parametrize("sanitize", (True, False)) def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + if data_augmentation == "hflip": t = [ transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "lsj": @@ -1847,7 +1860,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): # ), transforms.RandomCrop((1024, 1024), pad_if_needed=True), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "multiscale": @@ -1856,7 +1869,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True ), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssd": @@ -1865,14 +1878,14 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): transforms.RandomZoomOut(fill=defaultdict(lambda: (123.0, 117.0, 104.0), {datapoints.Mask: 0})), transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] elif data_augmentation == "ssdlite": t = [ transforms.RandomIoUCrop(), transforms.RandomHorizontalFlip(p=1), - to_tensor(), + to_tensor, transforms.ConvertImageDtype(torch.float), ] if sanitize: @@ -1907,7 +1920,7 @@ def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): out = t(sample) - if to_tensor is transforms.ToTensor and image_type is not datapoints.Image: + if isinstance(to_tensor, transforms.ToTensor) and image_type is not datapoints.Image: assert is_simple_tensor(out["image"]) else: assert isinstance(out["image"], datapoints.Image)
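The ``antialias=True`` arguments threaded through the tests above silence the resize antialias warnings that the v2 transforms can emit for tensor inputs. A minimal sketch of the behavior being avoided (not part of the patch series; the exact warning text and trigger conditions are assumptions inferred from the commit's intent):

    import warnings

    import torch
    import torchvision

    torchvision.disable_beta_transforms_warning()
    import torchvision.transforms.v2 as transforms

    img = torch.rand(3, 64, 64)

    # Leaving antialias unset on a tensor input may emit a UserWarning about the
    # antialias default.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        transforms.Resize([32, 32])(img)
    print(any("antialias" in str(w.message) for w in caught))

    # Passing antialias=True explicitly, as the updated tests do, keeps the run warning-free.
    transforms.Resize([32, 32], antialias=True)(img)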