From 53cd51619820f9c868e8c0779609a2b83c929d75 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 12 May 2021 17:37:26 +0100
Subject: [PATCH 01/19] WIP

---
 gallery/plot_visualization_utils.py |  69 +++++++++----
 test/test_utils.py                  | 153 +++++++++++++++++++---------
 torchvision/utils.py                |  91 ++++++++++-------
 3 files changed, 212 insertions(+), 101 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 58788437a28..06f7162d2a5 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -24,7 +24,8 @@ def show(imgs):
         imgs = [imgs]
     fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
     for i, img in enumerate(imgs):
-        img = F.to_pil_image(img.to('cpu'))
+        img = img.detach()
+        img = F.to_pil_image(img)
         axs[0, i].imshow(np.asarray(img))
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
 
@@ -50,9 +51,8 @@ def show(imgs):
 # Visualizing bounding boxes
 # --------------------------
 # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
-# image. We can set the colors, labels, width as well as font and font size !
-# The boxes are in ``(xmin, ymin, xmax, ymax)`` format
-# from torchvision.utils import draw_bounding_boxes
+# image. We can set the colors, labels, width as well as font and font size.
+# The boxes are in ``(xmin, ymin, xmax, ymax)`` format.
 
 from torchvision.utils import draw_bounding_boxes
 
@@ -99,19 +99,17 @@ def show(imgs):
 # Visualizing segmentation masks
 # ------------------------------
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# draw segmentation amasks on images. We can set the colors as well as
-# transparency of masks.
+# draw segmentation amasks on images.
 #
-# Here is demo with torchvision's FCN Resnet-50, loaded with
+# We will see how to use it with torchvision's FCN Resnet-50, loaded with
 # :func:`~torchvision.models.segmentation.fcn_resnet50`.
 # You can also try using
 # DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`)
 # or lraspp mobilenet models
 # (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
 #
-# Like :func:`~torchvision.utils.draw_bounding_boxes`,
-# :func:`~torchvision.utils.draw_segmentation_masks` requires a single RGB image
-# of dtype `uint8`.
+# Let's start by looking at the ouput of the model. Remember that in general,
+# images must be normalized before they're passed to the model.
 
 from torchvision.models.segmentation import fcn_resnet50
 from torchvision.utils import draw_segmentation_masks
@@ -120,12 +118,49 @@ def show(imgs):
 model = fcn_resnet50(pretrained=True, progress=False)
 model = model.eval()
 
-# The model expects the batch to be normalized
-batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
-outputs = model(batch)
+normalized_batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+output = model(normalized_batch)['out']
+print(output.shape, output.min().item(), output.max().item())
+
+#####################################
+# As we can see above, the output of the segmentation model is a tensor of shape
+# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score
+# and can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
+# we can interpret each value as a probability indicating how likely a given
+# pixel is to belong to a given class. 
+# 
+# Let's plot the masks that have been detected for the dog class and for the
+# boat class:
+
+seg_classes = [
+    '__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
+    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
+    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+]
+seg_class_to_idx = {cls: idx for (idx, cls) in enumerate(seg_classes)}
+
+# We normalize the masks of each image in the batch independently
+normalized_masks = torch.stack([torch.nn.Softmax(dim=0)(masks) for masks in output])
 
-dogs_with_masks = [
-    draw_segmentation_masks(dog_int, masks=masks, alpha=0.6)
-    for dog_int, masks in zip((dog1_int, dog2_int), outputs['out'])
+dog_and_boat_masks = [
+    normalized_masks[img_idx, seg_class_to_idx[cls]]
+    for img_idx in range(batch.shape[0])
+    for cls in ('dog', 'boat')
 ]
-show(dogs_with_masks)
+
+show(dog_and_boat_masks)
+
+#####################################
+# As expected, the model is confident about the dog class, but not so much for
+# the boat class.
+#
+# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
+# plots those masks on top of the original image. This function expects the
+# masks to be boolean masks, but our masks above contain probabilities in ``[0,
+# 1]``. To get boolean masks, we can do the following:
+
+# dogs_with_dog_masks = [
+#     draw_segmentation_masks(dog_int, masks=output[img_idx, seg_class_to_idx['dog']], alpha=0.6)
+#     for img_idx, dog_int in enumerate(dog1_int, dog2_int)
+# ]
+# show(dogs_with_masks)
diff --git a/test/test_utils.py b/test/test_utils.py
index 8c4cc620229..7d85c0cb1af 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,3 +1,4 @@
+import pytest
 import numpy as np
 import os
 import sys
@@ -7,7 +8,7 @@
 import unittest
 from io import BytesIO
 import torchvision.transforms.functional as F
-from PIL import Image, __version__ as PILLOW_VERSION
+from PIL import Image, __version__ as PILLOW_VERSION, ImageColor
 
 
 PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split('.'))
@@ -159,55 +160,109 @@ def test_draw_invalid_boxes(self):
         self.assertRaises(ValueError, utils.draw_bounding_boxes, img_wrong1, boxes)
         self.assertRaises(ValueError, utils.draw_bounding_boxes, img_wrong2, boxes)
 
-    def test_draw_segmentation_masks_colors(self):
-        img = torch.full((3, 5, 5), 255, dtype=torch.uint8)
-        img_cp = img.clone()
-        masks_cp = masks.clone()
-        colors = ["#FF00FF", (0, 255, 0), "red"]
-        result = utils.draw_segmentation_masks(img, masks, colors=colors)
-
-        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
-                            "fakedata", "draw_segm_masks_colors_util.png")
-
-        if not os.path.exists(path):
-            res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy())
-            res.save(path)
-
-        expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1)
-        self.assertTrue(torch.equal(result, expected))
-        # Check if modification is not in place
-        self.assertTrue(torch.all(torch.eq(img, img_cp)).item())
-        self.assertTrue(torch.all(torch.eq(masks, masks_cp)).item())
-
-    def test_draw_segmentation_masks_no_colors(self):
-        img = torch.full((3, 20, 20), 255, dtype=torch.uint8)
-        img_cp = img.clone()
-        masks_cp = masks.clone()
-        result = utils.draw_segmentation_masks(img, masks, colors=None)
-
-        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets",
-                            "fakedata", "draw_segm_masks_no_colors_util.png")
-
-        if not os.path.exists(path):
-            res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy())
-            res.save(path)
-
-        expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1)
-        self.assertTrue(torch.equal(result, expected))
-        # Check if modification is not in place
-        self.assertTrue(torch.all(torch.eq(img, img_cp)).item())
-        self.assertTrue(torch.all(torch.eq(masks, masks_cp)).item())
-
-    def test_draw_invalid_masks(self):
-        img_tp = ((1, 1, 1), (1, 2, 3))
-        img_wrong1 = torch.full((3, 5, 5), 255, dtype=torch.float)
-        img_wrong2 = torch.full((1, 3, 5, 5), 255, dtype=torch.uint8)
-        img_wrong3 = torch.full((4, 5, 5), 255, dtype=torch.uint8)
 
-        self.assertRaises(TypeError, utils.draw_segmentation_masks, img_tp, masks)
-        self.assertRaises(ValueError, utils.draw_segmentation_masks, img_wrong1, masks)
-        self.assertRaises(ValueError, utils.draw_segmentation_masks, img_wrong2, masks)
-        self.assertRaises(ValueError, utils.draw_segmentation_masks, img_wrong3, masks)
+@pytest.mark.parametrize('dtype', (torch.float, torch.uint8))
+@pytest.mark.parametrize('colors', [
+    # None,
+    ['red', 'blue'],
+    ['#FF00FF', (1, 34, 122)],
+])
+@pytest.mark.parametrize('alpha', (0, 1))
+def test_draw_segmentation_masks(dtype, colors, alpha):
+    """This test makes sure that masks draw their corresponding color where they should"""
+    num_masks, h, w = 2, 100, 100
+    img = torch.randint(0, 256, size=(3, h, w), dtype=dtype)
+    masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool)
+    
+    # For testing we enforce that there's no overlap between the masks. The
+    # current behaviour is that the last mask's color will take priority when
+    # masks overlap, but this makes testing slightly harder so we don't really
+    # care
+    overlap = masks[0] & masks[1]
+    masks[:, overlap] = False
+
+    out = utils.draw_segmentation_masks(img, masks, colors=colors, alpha=alpha)
+    assert out.dtype == dtype
+    assert out is not img
+
+    if dtype == torch.float:
+        # makes comparisons below easier
+        img = F.convert_image_dtype(img, torch.uint8)
+        out = F.convert_image_dtype(out, torch.uint8)
+    img, out = img.float(), out.float()  # avoids underflows etc.
+
+    # Make sure the image didn't change where there's no mask
+    masked_pixels = masks[0] | masks[1]
+    assert (img[:, ~masked_pixels] == out[:, ~masked_pixels]).all()
+
+    if colors is None:
+        colors = utils._generate_color_palette(num_masks)
+
+    # Make sure each mask draws with its own color
+    for mask, color in zip(masks, colors):
+        if isinstance(color, str):
+            color = ImageColor.getrgb(color)
+        color = torch.tensor(color, dtype=dtype)
+
+        if alpha == 0:
+            assert (out[:, mask] == color[:, None]).all()
+        else:
+            assert (out[:, mask] == img[:, mask]).all()
+        
+
+def test_draw_segmentation_masks_int_vs_float():
+    """Make sure float and uint8 dtypes produce similar images"""
+    h, w = 100, 100
+    masks = torch.randint(0, 2, size=(2, h, w), dtype=torch.bool)
+    img_int = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8)
+    img_float = F.convert_image_dtype(img_int, torch.float)
+
+    out_int = utils.draw_segmentation_masks(image=img_int, masks=masks, colors=['red', 'blue'])
+    out_float = utils.draw_segmentation_masks(image=img_float, masks=masks, colors=['red', 'blue'])
+
+    assert out_int.dtype == img_int.dtype
+    assert out_float.dtype == img_float.dtype
+
+    out_float_int = F.convert_image_dtype(out_float, torch.uint8).int()
+    out_int = out_int.int()
+
+    assert (out_int - out_float_int).abs().max() <= 1
+
+
+def test_draw_segmentation_masks_errors():
+    h, w = 10, 10
+
+    masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool)
+    img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8)
+
+    with pytest.raises(TypeError, match="The image must be a tensor"):
+        utils.draw_segmentation_masks(image="Not A Tensor Image", masks=masks)
+    with pytest.raises(ValueError, match="The image dtype must be"):
+        img_bad_dtype = torch.randint(0, 256, size=(3, h, w), dtype=torch.int64)
+        utils.draw_segmentation_masks(image=img_bad_dtype, masks=masks)
+    with pytest.raises(ValueError, match="Pass individual images, not batches"):
+        batch = torch.randint(0, 256, size=(10, 3, h, w), dtype=torch.uint8)
+        utils.draw_segmentation_masks(image=batch, masks=masks)
+    with pytest.raises(ValueError, match="Pass an RGB image"):
+        one_channel = torch.randint(0, 256, size=(1, h, w), dtype=torch.uint8)
+        utils.draw_segmentation_masks(image=one_channel, masks=masks)
+    with pytest.raises(ValueError, match="The masks must be of dtype bool"):
+        masks_bad_dtype = torch.randint(0, 2, size=(h, w), dtype=torch.float)
+        utils.draw_segmentation_masks(image=img, masks=masks_bad_dtype)
+    with pytest.raises(ValueError, match="masks must be of shape"):
+        masks_bad_shape = torch.randint(0, 2, size=(3, 2, h, w), dtype=torch.bool)
+        utils.draw_segmentation_masks(image=img, masks=masks_bad_shape)
+    with pytest.raises(ValueError, match="must have the same height and width"):
+        masks_bad_shape = torch.randint(0, 2, size=(h + 4, w), dtype=torch.bool)
+        utils.draw_segmentation_masks(image=img, masks=masks_bad_shape)
+    with pytest.raises(ValueError, match="There are more masks"):
+        utils.draw_segmentation_masks(image=img, masks=masks, colors=[])
+    with pytest.raises(ValueError, match="colors must be a tuple or a string, or a list thereof"):
+        bad_colors = np.array(['red', 'blue'])  # should be a list
+        utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors)
+    with pytest.raises(ValueError, match="It seems that you passed a tuple of colors instead of"):
+        bad_colors = ('red', 'blue')  # should be a list
+        utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors)
 
 
 if __name__ == '__main__':
diff --git a/torchvision/utils.py b/torchvision/utils.py
index 9d9bbdb3c80..0bbea6391cb 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -216,7 +216,7 @@ def draw_bounding_boxes(
     return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
 
 
-@torch.no_grad()
+# @torch.no_grad()
 def draw_segmentation_masks(
     image: torch.Tensor,
     masks: torch.Tensor,
@@ -229,49 +229,70 @@ def draw_segmentation_masks(
     The values of the input image should be uint8 between 0 and 255.
 
     Args:
-        image (Tensor): Tensor of shape (3 x H x W) and dtype uint8.
-        masks (Tensor): Tensor of shape (num_masks, H, W). Each containing probability of predicted class.
-        alpha (float): Float number between 0 and 1 denoting factor of transparency of masks.
-        colors (List[Union[str, Tuple[int, int, int]]]): List containing the colors of masks. The colors can
-            be represented as `str` or `Tuple[int, int, int]`.
+        image (Tensor): Tensor of shape (3, H, W) and dtype uint8 or float.
+        masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool.
+        alpha (float): Float number between 0 and 1 denoting the transparency of the masks.
+        colors (list or None): List containing the colors of the masks. The colors can
+            be represented as PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
+            When ``masks`` has a single entry of shape (H, W), you can pass a single color instead of a list
+            with one element. By default, random colors are generated for each mask.
 
     Returns:
-        img (Tensor[C, H, W]): Image Tensor of dtype uint8 with segmentation masks plotted.
+        img (Tensor[C, H, W]): Image Tensor with the same dtype as the input image, with segmentation masks
+        drawn on top.
     """
 
     if not isinstance(image, torch.Tensor):
-        raise TypeError(f"Tensor expected, got {type(image)}")
-    elif image.dtype != torch.uint8:
-        raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
+        raise TypeError(f"The image must be a tensor, got {type(image)}")
+    elif image.dtype not in (torch.uint8, torch.float):
+        raise ValueError(f"The image dtype must be uint8 or float, got {image.dtype}")
     elif image.dim() != 3:
         raise ValueError("Pass individual images, not batches")
     elif image.size()[0] != 3:
         raise ValueError("Pass an RGB image. Other Image formats are not supported")
+    if masks.ndim == 2:
+        masks = masks[None, :, :]
+    if masks.ndim != 3:
+        raise ValueError("masks must be of shape (H, W) or (batch_size, H, W)")
+    if masks.dtype != torch.bool:
+        raise ValueError(f"The masks must be of dtype bool. Got {masks.dtype}")
+    if masks.shape[-2:] != image.shape[-2:]:
+        raise ValueError(f"The image and the masks must have the same height and width")
 
     num_masks = masks.size()[0]
-    masks = masks.argmax(0)
-
+    if colors is not None and num_masks > len(colors):
+        raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})")
+    
     if colors is None:
-        palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
-        colors_t = torch.as_tensor([i for i in range(num_masks)])[:, None] * palette
-        color_arr = (colors_t % 255).numpy().astype("uint8")
-    else:
-        color_list = []
-        for color in colors:
-            if isinstance(color, str):
-                # This will automatically raise Error if rgb cannot be parsed.
-                fill_color = ImageColor.getrgb(color)
-                color_list.append(fill_color)
-            elif isinstance(color, tuple):
-                color_list.append(color)
-
-        color_arr = np.array(color_list).astype("uint8")
-
-    _, h, w = image.size()
-    img_to_draw = Image.fromarray(masks.byte().cpu().numpy()).resize((w, h))
-    img_to_draw.putpalette(color_arr)
-
-    img_to_draw = torch.from_numpy(np.array(img_to_draw.convert('RGB')))
-    img_to_draw = img_to_draw.permute((2, 0, 1))
-
-    return (image.float() * alpha + img_to_draw.float() * (1.0 - alpha)).to(dtype=torch.uint8)
+        colors = _generate_color_palette(num_masks)
+ 
+    if not isinstance(colors, list):
+        colors = [colors]
+    if not isinstance(colors[0], (tuple, str)):
+        raise ValueError("colors must be a tuple or a string, or a list thereof")
+    if isinstance(colors[0], tuple) and len(colors[0]) != 3:
+        raise ValueError("It seems that you passed a tuple of colors instead of a list of colors")
+        
+    out_dtype = image.dtype
+
+    colors_ = []
+    for color in colors:
+        if isinstance(color, str):
+            color = ImageColor.getrgb(color)
+        color = torch.tensor(color, dtype=out_dtype)
+        if out_dtype == torch.float:
+            color /= 255
+        colors_.append(color)
+    
+    img_to_draw = image.detach().clone()
+    # TODO: There might be a way to vectorize this
+    for mask, color in zip(masks, colors_):
+        img_to_draw[:, mask] = color[:, None]
+
+    out = image * alpha + img_to_draw * (1 - alpha)
+    return out.to(out_dtype)
+
+
+def _generate_color_palette(num_masks):
+    palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
+    return [tuple((i * palette) % 255) for i in range(num_masks)]

From 9cf6247cbb0a97ca13712980721d80d3f8fb7dce Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 12 May 2021 17:40:16 +0100
Subject: [PATCH 02/19] rm images

---
 gallery/plot_visualization_utils.py           |  75 +++++-------------
 .../fakedata/draw_segm_masks_colors_util.png  | Bin 88 -> 0 bytes
 .../draw_segm_masks_no_colors_util.png        | Bin 106 -> 0 bytes
 3 files changed, 20 insertions(+), 55 deletions(-)
 delete mode 100644 test/assets/fakedata/draw_segm_masks_colors_util.png
 delete mode 100644 test/assets/fakedata/draw_segm_masks_no_colors_util.png

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 06f7162d2a5..f196f0cdd6d 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -24,8 +24,7 @@ def show(imgs):
         imgs = [imgs]
     fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
     for i, img in enumerate(imgs):
-        img = img.detach()
-        img = F.to_pil_image(img)
+        img = F.to_pil_image(img.to('cpu'))
         axs[0, i].imshow(np.asarray(img))
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
 
@@ -51,8 +50,9 @@ def show(imgs):
 # Visualizing bounding boxes
 # --------------------------
 # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
-# image. We can set the colors, labels, width as well as font and font size.
-# The boxes are in ``(xmin, ymin, xmax, ymax)`` format.
+# image. We can set the colors, labels, width as well as font and font size !
+# The boxes are in ``(xmin, ymin, xmax, ymax)`` format
+# from torchvision.utils import draw_bounding_boxes
 
 from torchvision.utils import draw_bounding_boxes
 
@@ -99,68 +99,33 @@ def show(imgs):
 # Visualizing segmentation masks
 # ------------------------------
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# draw segmentation amasks on images.
+# draw segmentation amasks on images. We can set the colors as well as
+# transparency of masks.
 #
-# We will see how to use it with torchvision's FCN Resnet-50, loaded with
+# Here is demo with torchvision's FCN Resnet-50, loaded with
 # :func:`~torchvision.models.segmentation.fcn_resnet50`.
 # You can also try using
 # DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`)
 # or lraspp mobilenet models
 # (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
 #
-# Let's start by looking at the ouput of the model. Remember that in general,
-# images must be normalized before they're passed to the model.
+# Like :func:`~torchvision.utils.draw_bounding_boxes`,
+# :func:`~torchvision.utils.draw_segmentation_masks` requires a single RGB image
+# of dtype `uint8`.
 
-from torchvision.models.segmentation import fcn_resnet50
-from torchvision.utils import draw_segmentation_masks
+# from torchvision.models.segmentation import fcn_resnet50
+# from torchvision.utils import draw_segmentation_masks
 
 
-model = fcn_resnet50(pretrained=True, progress=False)
-model = model.eval()
-
-normalized_batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
-output = model(normalized_batch)['out']
-print(output.shape, output.min().item(), output.max().item())
-
-#####################################
-# As we can see above, the output of the segmentation model is a tensor of shape
-# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score
-# and can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
-# we can interpret each value as a probability indicating how likely a given
-# pixel is to belong to a given class. 
-# 
-# Let's plot the masks that have been detected for the dog class and for the
-# boat class:
-
-seg_classes = [
-    '__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
-    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
-    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
-]
-seg_class_to_idx = {cls: idx for (idx, cls) in enumerate(seg_classes)}
-
-# We normalize the masks of each image in the batch independently
-normalized_masks = torch.stack([torch.nn.Softmax(dim=0)(masks) for masks in output])
+# model = fcn_resnet50(pretrained=True, progress=False)
+# model = model.eval()
 
-dog_and_boat_masks = [
-    normalized_masks[img_idx, seg_class_to_idx[cls]]
-    for img_idx in range(batch.shape[0])
-    for cls in ('dog', 'boat')
-]
-
-show(dog_and_boat_masks)
-
-#####################################
-# As expected, the model is confident about the dog class, but not so much for
-# the boat class.
-#
-# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# plots those masks on top of the original image. This function expects the
-# masks to be boolean masks, but our masks above contain probabilities in ``[0,
-# 1]``. To get boolean masks, we can do the following:
+# # The model expects the batch to be normalized
+# batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+# outputs = model(batch)
 
-# dogs_with_dog_masks = [
-#     draw_segmentation_masks(dog_int, masks=output[img_idx, seg_class_to_idx['dog']], alpha=0.6)
-#     for img_idx, dog_int in enumerate(dog1_int, dog2_int)
+# dogs_with_masks = [
+#     draw_segmentation_masks(dog_int, masks=masks, alpha=0.6)
+#     for dog_int, masks in zip((dog1_int, dog2_int), outputs['out'])
 # ]
 # show(dogs_with_masks)
diff --git a/test/assets/fakedata/draw_segm_masks_colors_util.png b/test/assets/fakedata/draw_segm_masks_colors_util.png
deleted file mode 100644
index 454b35556317dc1da1707fb234cf8563c1e8c707..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 88
zcmeAS@N?(olHy`uVBq!ia0vp^tRT$61SFYwH*Nw_@}4e^Ar*6yP5$MdX<(7~Fa6*B
l;o_6Vh6|XE{XeGhiNULzE&Y{;O%+fngQu&X%Q~loCIFN+8JPe8

diff --git a/test/assets/fakedata/draw_segm_masks_no_colors_util.png b/test/assets/fakedata/draw_segm_masks_no_colors_util.png
deleted file mode 100644
index f048d2469d2414d6e1e864111a6117a30a7d210b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 106
zcmeAS@N?(olHy`uVBq!ia0vp^A|TAc1SFYWcSQjyLr)jSkcv6UCi5Pib<j&tihkaZ
zm6kZ;Z&E^zoQM5#FP@&Wi)X!DIkVN!w+JX?ahAb4pZSR5#q5bd%?zHdelF{r5}E+c
CZ6N6Y


From d7de81e1a6272685f40aad2c7dea651c49a2c4cf Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 12 May 2021 17:41:56 +0100
Subject: [PATCH 03/19] cleanup

---
 test/test_utils.py   |  4 ++--
 torchvision/utils.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 7d85c0cb1af..e0689dc5342 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -173,7 +173,7 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
     num_masks, h, w = 2, 100, 100
     img = torch.randint(0, 256, size=(3, h, w), dtype=dtype)
     masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool)
-    
+
     # For testing we enforce that there's no overlap between the masks. The
     # current behaviour is that the last mask's color will take priority when
     # masks overlap, but this makes testing slightly harder so we don't really
@@ -208,7 +208,7 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
             assert (out[:, mask] == color[:, None]).all()
         else:
             assert (out[:, mask] == img[:, mask]).all()
-        
+
 
 def test_draw_segmentation_masks_int_vs_float():
     """Make sure float and uint8 dtypes produce similar images"""
diff --git a/torchvision/utils.py b/torchvision/utils.py
index 0bbea6391cb..51a1f346036 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -216,7 +216,7 @@ def draw_bounding_boxes(
     return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
 
 
-# @torch.no_grad()
+@torch.no_grad()
 def draw_segmentation_masks(
     image: torch.Tensor,
     masks: torch.Tensor,
@@ -262,17 +262,17 @@ def draw_segmentation_masks(
     num_masks = masks.size()[0]
     if colors is not None and num_masks > len(colors):
         raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})")
-    
+
     if colors is None:
         colors = _generate_color_palette(num_masks)
- 
+
     if not isinstance(colors, list):
         colors = [colors]
     if not isinstance(colors[0], (tuple, str)):
         raise ValueError("colors must be a tuple or a string, or a list thereof")
     if isinstance(colors[0], tuple) and len(colors[0]) != 3:
         raise ValueError("It seems that you passed a tuple of colors instead of a list of colors")
-        
+
     out_dtype = image.dtype
 
     colors_ = []
@@ -283,7 +283,7 @@ def draw_segmentation_masks(
         if out_dtype == torch.float:
             color /= 255
         colors_.append(color)
-    
+
     img_to_draw = image.detach().clone()
     # TODO: There might be a way to vectorize this
     for mask, color in zip(masks, colors_):

From e52fb082a1a926b82d5fa0ed6bda5cf32a5f4264 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Wed, 12 May 2021 18:07:54 +0100
Subject: [PATCH 04/19] pep

---
 torchvision/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/utils.py b/torchvision/utils.py
index 51a1f346036..949aa3b6406 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -257,7 +257,7 @@ def draw_segmentation_masks(
     if masks.dtype != torch.bool:
         raise ValueError(f"The masks must be of dtype bool. Got {masks.dtype}")
     if masks.shape[-2:] != image.shape[-2:]:
-        raise ValueError(f"The image and the masks must have the same height and width")
+        raise ValueError("The image and the masks must have the same height and width")
 
     num_masks = masks.size()[0]
     if colors is not None and num_masks > len(colors):

From 131f1a444e08bfa7ee3874e43c907cda92b26a8d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 08:30:01 +0100
Subject: [PATCH 05/19] temporarily remove mask example

---
 gallery/plot_visualization_utils.py | 36 +----------------------------
 1 file changed, 1 insertion(+), 35 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index f196f0cdd6d..260b92270fd 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -77,6 +77,7 @@ def show(imgs):
 dog1_float = convert_image_dtype(dog1_int, dtype=torch.float)
 dog2_float = convert_image_dtype(dog2_int, dtype=torch.float)
 batch = torch.stack([dog1_float, dog2_float])
+batch = torch.stack([dog1_int, dog2_int])
 
 model = fasterrcnn_resnet50_fpn(pretrained=True, progress=False)
 model = model.eval()
@@ -94,38 +95,3 @@ def show(imgs):
     for dog_int, output in zip((dog1_int, dog2_int), outputs)
 ]
 show(dogs_with_boxes)
-
-#####################################
-# Visualizing segmentation masks
-# ------------------------------
-# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# draw segmentation amasks on images. We can set the colors as well as
-# transparency of masks.
-#
-# Here is demo with torchvision's FCN Resnet-50, loaded with
-# :func:`~torchvision.models.segmentation.fcn_resnet50`.
-# You can also try using
-# DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`)
-# or lraspp mobilenet models
-# (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
-#
-# Like :func:`~torchvision.utils.draw_bounding_boxes`,
-# :func:`~torchvision.utils.draw_segmentation_masks` requires a single RGB image
-# of dtype `uint8`.
-
-# from torchvision.models.segmentation import fcn_resnet50
-# from torchvision.utils import draw_segmentation_masks
-
-
-# model = fcn_resnet50(pretrained=True, progress=False)
-# model = model.eval()
-
-# # The model expects the batch to be normalized
-# batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
-# outputs = model(batch)
-
-# dogs_with_masks = [
-#     draw_segmentation_masks(dog_int, masks=masks, alpha=0.6)
-#     for dog_int, masks in zip((dog1_int, dog2_int), outputs['out'])
-# ]
-# show(dogs_with_masks)

From 7876d47cb93f68a338e5af974490ee98e1ce3e9d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 08:31:00 +0100
Subject: [PATCH 06/19] Add comment about float images expected in 0-1

---
 torchvision/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/utils.py b/torchvision/utils.py
index 949aa3b6406..c8e333b5767 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -226,7 +226,7 @@ def draw_segmentation_masks(
 
     """
     Draws segmentation masks on given RGB image.
-    The values of the input image should be uint8 between 0 and 255.
+    The values of the input image should be uint8 between 0 and 255, or float values between 0 and 1.
 
     Args:
         image (Tensor): Tensor of shape (3, H, W) and dtype uint8 or float.

From e0e0fc679365ed03a620380dec93c7563c37ba7d Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 08:51:26 +0100
Subject: [PATCH 07/19] remove debug stuff

---
 gallery/plot_visualization_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 260b92270fd..76e32bc586e 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -77,7 +77,6 @@ def show(imgs):
 dog1_float = convert_image_dtype(dog1_int, dtype=torch.float)
 dog2_float = convert_image_dtype(dog2_int, dtype=torch.float)
 batch = torch.stack([dog1_float, dog2_float])
-batch = torch.stack([dog1_int, dog2_int])
 
 model = fasterrcnn_resnet50_fpn(pretrained=True, progress=False)
 model = model.eval()

From 5d287db8a20b727e5dc3e54903cdccf5bf9706a7 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 09:11:11 +0100
Subject: [PATCH 08/19] Put back None testing

---
 test/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index e0689dc5342..a223088121c 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -163,7 +163,7 @@ def test_draw_invalid_boxes(self):
 
 @pytest.mark.parametrize('dtype', (torch.float, torch.uint8))
 @pytest.mark.parametrize('colors', [
-    # None,
+    None,
     ['red', 'blue'],
     ['#FF00FF', (1, 34, 122)],
 ])

From f91d0709058b0a5acad20d8636b5209efb6004db Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 09:46:28 +0100
Subject: [PATCH 09/19] more robust test

---
 test/test_utils.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index a223088121c..7644915f2cb 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -167,7 +167,7 @@ def test_draw_invalid_boxes(self):
     ['red', 'blue'],
     ['#FF00FF', (1, 34, 122)],
 ])
-@pytest.mark.parametrize('alpha', (0, 1))
+@pytest.mark.parametrize('alpha', (0, .5, .7, 1))
 def test_draw_segmentation_masks(dtype, colors, alpha):
     """This test makes sure that masks draw their corresponding color where they should"""
     num_masks, h, w = 2, 100, 100
@@ -185,12 +185,6 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
     assert out.dtype == dtype
     assert out is not img
 
-    if dtype == torch.float:
-        # makes comparisons below easier
-        img = F.convert_image_dtype(img, torch.uint8)
-        out = F.convert_image_dtype(out, torch.uint8)
-    img, out = img.float(), out.float()  # avoids underflows etc.
-
     # Make sure the image didn't change where there's no mask
     masked_pixels = masks[0] | masks[1]
     assert (img[:, ~masked_pixels] == out[:, ~masked_pixels]).all()
@@ -203,12 +197,21 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
         if isinstance(color, str):
             color = ImageColor.getrgb(color)
         color = torch.tensor(color, dtype=dtype)
+        if dtype == torch.float:
+            color /= 255
 
         if alpha == 0:
             assert (out[:, mask] == color[:, None]).all()
-        else:
+        elif alpha == 1:
             assert (out[:, mask] == img[:, mask]).all()
 
+        interpolated_color = (img[:, mask] * alpha + color[:, None] * (1 - alpha))
+        max_diff = (out[:, mask] - interpolated_color).abs().max()
+        if dtype == torch.uint8:
+            assert max_diff <= 1
+        else:
+            assert max_diff <= 1e-5
+
 
 def test_draw_segmentation_masks_int_vs_float():
     """Make sure float and uint8 dtypes produce similar images"""

From a59c9d74ab0c4741ae78def2e9c2ae4de54d36cb Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 12:34:19 +0100
Subject: [PATCH 10/19] WIP

---
 gallery/plot_visualization_utils.py | 153 +++++++++++++++++++++++++++-
 1 file changed, 149 insertions(+), 4 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 76e32bc586e..904cc9187c7 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -24,7 +24,8 @@ def show(imgs):
         imgs = [imgs]
     fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
     for i, img in enumerate(imgs):
-        img = F.to_pil_image(img.to('cpu'))
+        img = img.detach()
+        img = F.to_pil_image(img)
         axs[0, i].imshow(np.asarray(img))
         axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
 
@@ -50,9 +51,8 @@ def show(imgs):
 # Visualizing bounding boxes
 # --------------------------
 # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
-# image. We can set the colors, labels, width as well as font and font size !
-# The boxes are in ``(xmin, ymin, xmax, ymax)`` format
-# from torchvision.utils import draw_bounding_boxes
+# image. We can set the colors, labels, width as well as font and font size.
+# The boxes are in ``(xmin, ymin, xmax, ymax)`` format.
 
 from torchvision.utils import draw_bounding_boxes
 
@@ -94,3 +94,148 @@ def show(imgs):
     for dog_int, output in zip((dog1_int, dog2_int), outputs)
 ]
 show(dogs_with_boxes)
+
+#####################################
+# Visualizing segmentation masks
+# ------------------------------
+# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
+# draw segmentation amasks on images.
+# 
+# Semantic segmentation models
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# 
+# We will see how to use it with torchvision's FCN Resnet-50, loaded with
+# :func:`~torchvision.models.segmentation.fcn_resnet50`.  You can also try using
+# DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`) or
+# lraspp mobilenet models
+# (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
+# 
+# Let's start by looking at the ouput of the model. Remember that in general,
+# images must be normalized before they're passed to the model.
+
+from torchvision.models.segmentation import fcn_resnet50
+from torchvision.utils import draw_segmentation_masks
+
+
+model = fcn_resnet50(pretrained=True, progress=False)
+model = model.eval()
+
+normalized_batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+output = model(normalized_batch)['out']
+print(output.shape, output.min().item(), output.max().item())
+
+#####################################
+# As we can see above, the output of the segmentation model is a tensor of shape
+# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score and
+# can normalize them into ``[0, 1]`` by using a softmax. After the softmax, we
+# can interpret each value as a probability indicating how likely a given pixel
+# is to belong to a given class. 
+# 
+# Let's plot the masks that have been detected for the dog class and for the
+# boat class:
+
+seg_classes = [
+    '__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
+    'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
+    'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
+]
+seg_class_to_idx = {cls: idx for (idx, cls) in enumerate(seg_classes)}
+
+# We normalize the masks of each image in the batch independently
+normalized_masks = torch.stack([torch.nn.Softmax(dim=0)(masks) for masks in output])
+
+dog_and_boat_masks = [
+    normalized_masks[img_idx, seg_class_to_idx[cls]]
+    for img_idx in range(batch.shape[0])
+    for cls in ('dog', 'boat')
+]
+
+show(dog_and_boat_masks)
+
+#####################################
+# As expected, the model is confident about the dog class, but not so much for
+# the boat class.
+# 
+# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
+# plots those masks on top of the original image. This function expects the
+# masks to be boolean masks, but our masks above contain probabilities in ``[0,
+# 1]``. To get boolean masks, we can do the following:
+
+class_dim = 1
+boolean_dog_masks = (normalized_masks.argmax(class_dim) == seg_class_to_idx['dog'])
+print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
+show([m.float() for m in boolean_dog_masks])
+
+#####################################
+# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
+# can read it as the following query: "For which pixels is 'dog' the most likely
+# class?"
+# 
+# .. note::
+#   While we're using the ``normalized_masks`` here, we would have
+#   gotten the same result by using the non-normalized scores of the model
+#   directly (as the softmax operation perserves the order).
+# 
+# Now that we have boolean masks, we can use them with
+# :func:~torchvision.utils.draw_segmentation_masks to plot them on top of the
+# original images:
+
+dogs_with_masks = [
+    draw_segmentation_masks(img, masks=mask, alpha=0.3)
+    for img, mask in zip(batch, boolean_dog_masks)
+]
+show(dogs_with_masks)
+
+#####################################
+# We can plot more than one mask per image! Remember that the model returned as
+# many masks as there are classes. Let's ask the same query as above, but this
+# time for *all* classes, not just the dog class: "For each pixel and each class
+# C, is class C the most most likely class?"
+# 
+# This one is a bit more involved, so we'll first show how to do it with a
+# single image, and then we'll generalize to the batch
+
+num_classes = normalized_masks.shape[1]
+dog1_masks = normalized_masks[0]
+class_dim = 0
+dog1_all_classes_masks = dog1_masks.argmax(class_dim) == torch.arange(num_classes)[:, None, None]
+
+print(f"dog1_masks shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}")
+print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_all_classes_masks.dtype}")
+
+dog_with_all_masks = draw_segmentation_masks(dog1_float, masks=dog1_all_classes_masks, alpha=.4)
+show(dog_with_all_masks)
+
+#####################################
+# We can see in the image above that only 2 masks were drawn: the mask for the
+# background and the mask for the dog. This is because the model thinkgs that
+# only these 2 classes are the most likely ones across all the pixels. It the
+# model had detected another class as the most likely among other pixels, we
+# would have seen its mask above.
+# 
+# Removing the background mask is as simple as passing
+# ``masks=dog1_all_classes_masks[1:]``.
+# 
+# Let's now do the same but for an entire batch of images. The code is similar
+# but involves a bit more juggling with the dimensions.
+
+class_dim = 1
+all_classes_masks = normalized_masks.argmax(class_dim) == torch.arange(num_classes)[:, None, None, None]
+print(f"shape = {all_classes_masks.shape}, dtype = {all_classes_masks.dtype}")
+# The first dimension is the classes now, so we need to swap it
+all_classes_masks = all_classes_masks.swapaxes(0, 1)
+
+dogs_with_masks = [
+    draw_segmentation_masks(img, masks=mask, alpha=.4)
+    for img, mask in zip(batch, all_classes_masks)
+]
+show(dogs_with_masks)
+
+
+#####################################
+# Instance segmentation models
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Instance segmentation models have a significantly different output from the
+# semantic segmentation models. We will see here blahblah TODO
+# 
\ No newline at end of file

From d77f0e5dacdf161775d9c90fdfa01188922f3a0f Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 15:27:17 +0100
Subject: [PATCH 11/19] some more

---
 gallery/plot_visualization_utils.py | 126 ++++++++++++++++++++++++++--
 1 file changed, 118 insertions(+), 8 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 904cc9187c7..1c7c8030664 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -99,7 +99,9 @@ def show(imgs):
 # Visualizing segmentation masks
 # ------------------------------
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# draw segmentation amasks on images.
+# draw segmentation amasks on images. Semantic segmentation and instance
+# segmentation models have different outputs, so we will treat each
+# independently.
 # 
 # Semantic segmentation models
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -114,7 +116,6 @@ def show(imgs):
 # images must be normalized before they're passed to the model.
 
 from torchvision.models.segmentation import fcn_resnet50
-from torchvision.utils import draw_segmentation_masks
 
 
 model = fcn_resnet50(pretrained=True, progress=False)
@@ -134,18 +135,18 @@ def show(imgs):
 # Let's plot the masks that have been detected for the dog class and for the
 # boat class:
 
-seg_classes = [
+sem_classes = [
     '__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
     'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
     'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
 ]
-seg_class_to_idx = {cls: idx for (idx, cls) in enumerate(seg_classes)}
+sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(sem_classes)}
 
 # We normalize the masks of each image in the batch independently
 normalized_masks = torch.stack([torch.nn.Softmax(dim=0)(masks) for masks in output])
 
 dog_and_boat_masks = [
-    normalized_masks[img_idx, seg_class_to_idx[cls]]
+    normalized_masks[img_idx, sem_class_to_idx[cls]]
     for img_idx in range(batch.shape[0])
     for cls in ('dog', 'boat')
 ]
@@ -162,10 +163,11 @@ def show(imgs):
 # 1]``. To get boolean masks, we can do the following:
 
 class_dim = 1
-boolean_dog_masks = (normalized_masks.argmax(class_dim) == seg_class_to_idx['dog'])
+boolean_dog_masks = (normalized_masks.argmax(class_dim) == sem_class_to_idx['dog'])
 print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
 show([m.float() for m in boolean_dog_masks])
 
+
 #####################################
 # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
 # can read it as the following query: "For which pixels is 'dog' the most likely
@@ -180,6 +182,8 @@ def show(imgs):
 # :func:~torchvision.utils.draw_segmentation_masks to plot them on top of the
 # original images:
 
+from torchvision.utils import draw_segmentation_masks
+
 dogs_with_masks = [
     draw_segmentation_masks(img, masks=mask, alpha=0.3)
     for img, mask in zip(batch, boolean_dog_masks)
@@ -237,5 +241,111 @@ def show(imgs):
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
 # Instance segmentation models have a significantly different output from the
-# semantic segmentation models. We will see here blahblah TODO
-# 
\ No newline at end of file
+# semantic segmentation models. We will see here how to plot the masks for such
+# models. Let's start by analyzing the output of a Mask-RCNN model. Note that
+# these models don't require the images to be normalized, so we don't need to
+# use the normalized batch.
+
+from torchvision.models.detection import maskrcnn_resnet50_fpn
+model = maskrcnn_resnet50_fpn(pretrained=True, progress=False)
+model = model.eval()
+
+output = model(batch)
+print(output)
+
+#####################################
+# Let's break this down. For each image in the batch, the model outputs some
+# detections (or instances). The number of detection varies for each input
+# image. Each instance is described by its bounding box, its label, its score
+# and its mask.
+# 
+# The way the output is organized is as follows: the output is a list of length
+# ``batch_size``. Each entry in the list corresponds to an input image, and it
+# is a dict with keys 'boxes', 'labels', 'scores', and 'masks'. Each value
+# associated to those keys has ``num_instances`` elements in it.  In our case
+# above there are 3 instances detected in the first image, and 2 instances in
+# the second one.
+# 
+# The boxes can be plotted with :func:`~torchvision.utils.draw_bounding_boxes`
+# as above, but here we're more interested in the masks. These masks are quite
+# different from the masks that we saw above for the semantic segmentation
+# models.
+
+dog1_output = output[0]
+dog1_masks = dog1_output['masks']
+print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype},
+        min = {dog1_masks.min()}, max = {dog1_masks.max()}")
+
+#####################################
+# Here the masks corresponds to probabilities indicating, for each pixel, how
+# likely it is to belong to the predicted label of that instance. Those
+# predicted labels correspond to the 'labels' element in the same output dict.
+
+inst_classes = [
+    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
+    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
+    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
+    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
+    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
+    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
+    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
+    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
+    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
+    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+]
+
+inst_class_to_idx = {cls: idx for (idx, cls) in enumerate(inst_classes)}
+
+print("For the first dog, the following instances were detected:")
+print([inst_classes[label] for label in dog1_output['labels']])
+
+#####################################
+# Interestingly, the models detects two persons in the image. Let's go ahead and
+# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
+# expects boolean mask, we need to convert those probabilities into boolean
+# values. Remember that the semantic of those masks is "How likely is this pixel
+# to belong to this predicted class?". As a result, a natural way of converting
+# those masks into boolean values is to threshold them with the 0.5 probability
+# (one could also choose a different threshold).
+
+proba_threshold = 0.5
+dog1_bool_masks = dog1_output['masks'] > proba_threshold
+print(f"shape = {dog1_bool_masks.shape}, dtype = {dog1_bool_masks.dtype}")
+
+# There's an extra dimension (1) to the masks. We need to remove it
+dog1_bool_masks = dog1_bool_masks.squeeze(1)
+
+show(draw_segmentation_masks(dog1_float, dog1_bool_masks, alpha=0.1))
+
+#####################################
+# The model seems to have properly detected the dog, but it also confused trees
+# with people. Looking more closely at the scores will help us plotting more
+# relevant masks:
+
+print(dog1_output['scores'])
+
+#####################################
+# Clearly the model is less confident about the dog detection than it is about
+# the people detections. That's good news. When plotting the masks, we can ask
+# for only those that have a good score. Let's use a score threshold of .75
+# here, and also plot the mask of the second dog.
+
+score_threshold = .75
+
+boolean_masks = [
+    out['masks'][out['scores'] > score_threshold] > proba_threshold
+    for out in output
+]
+
+dogs_with_masks = [
+    draw_segmentation_masks(img, mask.squeeze(1))
+    for img, mask in zip(batch, boolean_masks)
+]
+show(dogs_with_masks)
+
+#####################################
+# The two 'people' masks in the first image where not selected because they have
+# a lower score than the score threshold. Similarly in the second image, the
+# instance with class 15 (which corresponds to 'bench') was not selected.

From 079feef979e3a47abebe8d92dd210ab196801385 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 15:32:46 +0100
Subject: [PATCH 12/19] some flake8

---
 gallery/plot_visualization_utils.py | 30 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 1c7c8030664..d5c7ca0e8f3 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -102,16 +102,16 @@ def show(imgs):
 # draw segmentation amasks on images. Semantic segmentation and instance
 # segmentation models have different outputs, so we will treat each
 # independently.
-# 
+#
 # Semantic segmentation models
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-# 
+#
 # We will see how to use it with torchvision's FCN Resnet-50, loaded with
 # :func:`~torchvision.models.segmentation.fcn_resnet50`.  You can also try using
 # DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`) or
 # lraspp mobilenet models
 # (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
-# 
+#
 # Let's start by looking at the ouput of the model. Remember that in general,
 # images must be normalized before they're passed to the model.
 
@@ -130,8 +130,8 @@ def show(imgs):
 # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score and
 # can normalize them into ``[0, 1]`` by using a softmax. After the softmax, we
 # can interpret each value as a probability indicating how likely a given pixel
-# is to belong to a given class. 
-# 
+# is to belong to a given class.
+#
 # Let's plot the masks that have been detected for the dog class and for the
 # boat class:
 
@@ -156,7 +156,7 @@ def show(imgs):
 #####################################
 # As expected, the model is confident about the dog class, but not so much for
 # the boat class.
-# 
+#
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
 # plots those masks on top of the original image. This function expects the
 # masks to be boolean masks, but our masks above contain probabilities in ``[0,
@@ -172,12 +172,12 @@ def show(imgs):
 # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
 # can read it as the following query: "For which pixels is 'dog' the most likely
 # class?"
-# 
+#
 # .. note::
 #   While we're using the ``normalized_masks`` here, we would have
 #   gotten the same result by using the non-normalized scores of the model
 #   directly (as the softmax operation perserves the order).
-# 
+#
 # Now that we have boolean masks, we can use them with
 # :func:~torchvision.utils.draw_segmentation_masks to plot them on top of the
 # original images:
@@ -195,7 +195,7 @@ def show(imgs):
 # many masks as there are classes. Let's ask the same query as above, but this
 # time for *all* classes, not just the dog class: "For each pixel and each class
 # C, is class C the most most likely class?"
-# 
+#
 # This one is a bit more involved, so we'll first show how to do it with a
 # single image, and then we'll generalize to the batch
 
@@ -216,10 +216,10 @@ def show(imgs):
 # only these 2 classes are the most likely ones across all the pixels. It the
 # model had detected another class as the most likely among other pixels, we
 # would have seen its mask above.
-# 
+#
 # Removing the background mask is as simple as passing
 # ``masks=dog1_all_classes_masks[1:]``.
-# 
+#
 # Let's now do the same but for an entire batch of images. The code is similar
 # but involves a bit more juggling with the dimensions.
 
@@ -258,14 +258,14 @@ def show(imgs):
 # detections (or instances). The number of detection varies for each input
 # image. Each instance is described by its bounding box, its label, its score
 # and its mask.
-# 
+#
 # The way the output is organized is as follows: the output is a list of length
 # ``batch_size``. Each entry in the list corresponds to an input image, and it
 # is a dict with keys 'boxes', 'labels', 'scores', and 'masks'. Each value
 # associated to those keys has ``num_instances`` elements in it.  In our case
 # above there are 3 instances detected in the first image, and 2 instances in
 # the second one.
-# 
+#
 # The boxes can be plotted with :func:`~torchvision.utils.draw_bounding_boxes`
 # as above, but here we're more interested in the masks. These masks are quite
 # different from the masks that we saw above for the semantic segmentation
@@ -273,8 +273,8 @@ def show(imgs):
 
 dog1_output = output[0]
 dog1_masks = dog1_output['masks']
-print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype},
-        min = {dog1_masks.min()}, max = {dog1_masks.max()}")
+print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
+      f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")
 
 #####################################
 # Here the masks corresponds to probabilities indicating, for each pixel, how

From efb12bc2a5260cd81cce38190886b6f48ae098ea Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Thu, 13 May 2021 15:42:27 +0100
Subject: [PATCH 13/19] fix typos etc

---
 gallery/plot_visualization_utils.py | 31 ++++++++++++++++-------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index d5c7ca0e8f3..ee6c30ae336 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -113,7 +113,8 @@ def show(imgs):
 # (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
 #
 # Let's start by looking at the ouput of the model. Remember that in general,
-# images must be normalized before they're passed to the model.
+# images must be normalized before they're passed to a semantic segmentation
+# model.
 
 from torchvision.models.segmentation import fcn_resnet50
 
@@ -127,10 +128,10 @@ def show(imgs):
 
 #####################################
 # As we can see above, the output of the segmentation model is a tensor of shape
-# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score and
-# can normalize them into ``[0, 1]`` by using a softmax. After the softmax, we
-# can interpret each value as a probability indicating how likely a given pixel
-# is to belong to a given class.
+# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
+# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
+# we can interpret each value as a probability indicating how likely a given
+# pixel is to belong to a given class.
 #
 # Let's plot the masks that have been detected for the dog class and for the
 # boat class:
@@ -176,10 +177,10 @@ def show(imgs):
 # .. note::
 #   While we're using the ``normalized_masks`` here, we would have
 #   gotten the same result by using the non-normalized scores of the model
-#   directly (as the softmax operation perserves the order).
+#   directly (as the softmax operation preserves the order).
 #
 # Now that we have boolean masks, we can use them with
-# :func:~torchvision.utils.draw_segmentation_masks to plot them on top of the
+# :func:`~torchvision.utils.draw_segmentation_masks` to plot them on top of the
 # original images:
 
 from torchvision.utils import draw_segmentation_masks
@@ -212,13 +213,14 @@ def show(imgs):
 
 #####################################
 # We can see in the image above that only 2 masks were drawn: the mask for the
-# background and the mask for the dog. This is because the model thinkgs that
-# only these 2 classes are the most likely ones across all the pixels. It the
+# background and the mask for the dog. This is because the model thinks that
+# only these 2 classes are the most likely ones across all the pixels. If the
 # model had detected another class as the most likely among other pixels, we
 # would have seen its mask above.
 #
 # Removing the background mask is as simple as passing
-# ``masks=dog1_all_classes_masks[1:]``.
+# ``masks=dog1_all_classes_masks[1:]``, because the background class is the
+# class with index 0.
 #
 # Let's now do the same but for an entire batch of images. The code is similar
 # but involves a bit more juggling with the dimensions.
@@ -280,6 +282,7 @@ def show(imgs):
 # Here the masks corresponds to probabilities indicating, for each pixel, how
 # likely it is to belong to the predicted label of that instance. Those
 # predicted labels correspond to the 'labels' element in the same output dict.
+# Let's see which labels were predicted for the instances of the first image.
 
 inst_classes = [
     '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
@@ -302,11 +305,11 @@ def show(imgs):
 print([inst_classes[label] for label in dog1_output['labels']])
 
 #####################################
-# Interestingly, the models detects two persons in the image. Let's go ahead and
+# Interestingly, the model detects two persons in the image. Let's go ahead and
 # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
-# expects boolean mask, we need to convert those probabilities into boolean
+# expects boolean masks, we need to convert those probabilities into boolean
 # values. Remember that the semantic of those masks is "How likely is this pixel
-# to belong to this predicted class?". As a result, a natural way of converting
+# to belong to the predicted class?". As a result, a natural way of converting
 # those masks into boolean values is to threshold them with the 0.5 probability
 # (one could also choose a different threshold).
 
@@ -330,7 +333,7 @@ def show(imgs):
 # Clearly the model is less confident about the dog detection than it is about
 # the people detections. That's good news. When plotting the masks, we can ask
 # for only those that have a good score. Let's use a score threshold of .75
-# here, and also plot the mask of the second dog.
+# here, and also plot the masks of the second dog.
 
 score_threshold = .75
 

From 1885d953525b41209ff3ffb18b77ebc68f197daf Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 11:10:14 +0100
Subject: [PATCH 14/19] remove float images support

---
 gallery/plot_visualization_utils.py | 17 ++++++++--------
 test/test_utils.py                  | 30 +++--------------------------
 torchvision/utils.py                | 15 ++++++---------
 3 files changed, 17 insertions(+), 45 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index ee6c30ae336..4fc2c8c1233 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -74,9 +74,8 @@ def show(imgs):
 from torchvision.transforms.functional import convert_image_dtype
 
 
-dog1_float = convert_image_dtype(dog1_int, dtype=torch.float)
-dog2_float = convert_image_dtype(dog2_int, dtype=torch.float)
-batch = torch.stack([dog1_float, dog2_float])
+batch_int = torch.stack([dog1_int, dog2_int])
+batch = convert_image_dtype(batch_int, dtype=torch.float)
 
 model = fasterrcnn_resnet50_fpn(pretrained=True, progress=False)
 model = model.eval()
@@ -91,7 +90,7 @@ def show(imgs):
 threshold = .8
 dogs_with_boxes = [
     draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > threshold], width=4)
-    for dog_int, output in zip((dog1_int, dog2_int), outputs)
+    for dog_int, output in zip(batch_int, outputs)
 ]
 show(dogs_with_boxes)
 
@@ -187,7 +186,7 @@ def show(imgs):
 
 dogs_with_masks = [
     draw_segmentation_masks(img, masks=mask, alpha=0.3)
-    for img, mask in zip(batch, boolean_dog_masks)
+    for img, mask in zip(batch_int, boolean_dog_masks)
 ]
 show(dogs_with_masks)
 
@@ -208,7 +207,7 @@ def show(imgs):
 print(f"dog1_masks shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}")
 print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_all_classes_masks.dtype}")
 
-dog_with_all_masks = draw_segmentation_masks(dog1_float, masks=dog1_all_classes_masks, alpha=.4)
+dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.4)
 show(dog_with_all_masks)
 
 #####################################
@@ -233,7 +232,7 @@ def show(imgs):
 
 dogs_with_masks = [
     draw_segmentation_masks(img, masks=mask, alpha=.4)
-    for img, mask in zip(batch, all_classes_masks)
+    for img, mask in zip(batch_int, all_classes_masks)
 ]
 show(dogs_with_masks)
 
@@ -320,7 +319,7 @@ def show(imgs):
 # There's an extra dimension (1) to the masks. We need to remove it
 dog1_bool_masks = dog1_bool_masks.squeeze(1)
 
-show(draw_segmentation_masks(dog1_float, dog1_bool_masks, alpha=0.1))
+show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.1))
 
 #####################################
 # The model seems to have properly detected the dog, but it also confused trees
@@ -344,7 +343,7 @@ def show(imgs):
 
 dogs_with_masks = [
     draw_segmentation_masks(img, mask.squeeze(1))
-    for img, mask in zip(batch, boolean_masks)
+    for img, mask in zip(batch_int, boolean_masks)
 ]
 show(dogs_with_masks)
 
diff --git a/test/test_utils.py b/test/test_utils.py
index 7644915f2cb..0edcd5b59da 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -161,16 +161,16 @@ def test_draw_invalid_boxes(self):
         self.assertRaises(ValueError, utils.draw_bounding_boxes, img_wrong2, boxes)
 
 
-@pytest.mark.parametrize('dtype', (torch.float, torch.uint8))
 @pytest.mark.parametrize('colors', [
     None,
     ['red', 'blue'],
     ['#FF00FF', (1, 34, 122)],
 ])
 @pytest.mark.parametrize('alpha', (0, .5, .7, 1))
-def test_draw_segmentation_masks(dtype, colors, alpha):
+def test_draw_segmentation_masks(colors, alpha):
     """This test makes sure that masks draw their corresponding color where they should"""
     num_masks, h, w = 2, 100, 100
+    dtype = torch.uint8
     img = torch.randint(0, 256, size=(3, h, w), dtype=dtype)
     masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool)
 
@@ -197,8 +197,6 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
         if isinstance(color, str):
             color = ImageColor.getrgb(color)
         color = torch.tensor(color, dtype=dtype)
-        if dtype == torch.float:
-            color /= 255
 
         if alpha == 0:
             assert (out[:, mask] == color[:, None]).all()
@@ -207,29 +205,7 @@ def test_draw_segmentation_masks(dtype, colors, alpha):
 
         interpolated_color = (img[:, mask] * alpha + color[:, None] * (1 - alpha))
         max_diff = (out[:, mask] - interpolated_color).abs().max()
-        if dtype == torch.uint8:
-            assert max_diff <= 1
-        else:
-            assert max_diff <= 1e-5
-
-
-def test_draw_segmentation_masks_int_vs_float():
-    """Make sure float and uint8 dtypes produce similar images"""
-    h, w = 100, 100
-    masks = torch.randint(0, 2, size=(2, h, w), dtype=torch.bool)
-    img_int = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8)
-    img_float = F.convert_image_dtype(img_int, torch.float)
-
-    out_int = utils.draw_segmentation_masks(image=img_int, masks=masks, colors=['red', 'blue'])
-    out_float = utils.draw_segmentation_masks(image=img_float, masks=masks, colors=['red', 'blue'])
-
-    assert out_int.dtype == img_int.dtype
-    assert out_float.dtype == img_float.dtype
-
-    out_float_int = F.convert_image_dtype(out_float, torch.uint8).int()
-    out_int = out_int.int()
-
-    assert (out_int - out_float_int).abs().max() <= 1
+        assert max_diff <= 1
 
 
 def test_draw_segmentation_masks_errors():
diff --git a/torchvision/utils.py b/torchvision/utils.py
index c8e333b5767..ef28b14ec0b 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -226,10 +226,10 @@ def draw_segmentation_masks(
 
     """
     Draws segmentation masks on given RGB image.
-    The values of the input image should be uint8 between 0 and 255, or float values between 0 and 1.
+    The values of the input image should be uint8 between 0 and 255.
 
     Args:
-        image (Tensor): Tensor of shape (3, H, W) and dtype uint8 or float.
+        image (Tensor): Tensor of shape (3, H, W) and dtype uint8.
         masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool.
         alpha (float): Float number between 0 and 1 denoting the transparency of the masks.
         colors (list or None): List containing the colors of the masks. The colors can
@@ -238,14 +238,13 @@ def draw_segmentation_masks(
             with one element. By default, random colors are generated for each mask.
 
     Returns:
-        img (Tensor[C, H, W]): Image Tensor with the same dtype as the input image, with segmentation masks
-        drawn on top.
+        img (Tensor[C, H, W]): Image Tensor, with segmentation masks drawn on top.
     """
 
     if not isinstance(image, torch.Tensor):
         raise TypeError(f"The image must be a tensor, got {type(image)}")
-    elif image.dtype not in (torch.uint8, torch.float):
-        raise ValueError(f"The image dtype must be uint8 or float, got {image.dtype}")
+    elif image.dtype != torch.uint8:
+        raise ValueError(f"The image dtype must be uint8, got {image.dtype}")
     elif image.dim() != 3:
         raise ValueError("Pass individual images, not batches")
     elif image.size()[0] != 3:
@@ -273,15 +272,13 @@ def draw_segmentation_masks(
     if isinstance(colors[0], tuple) and len(colors[0]) != 3:
         raise ValueError("It seems that you passed a tuple of colors instead of a list of colors")
 
-    out_dtype = image.dtype
+    out_dtype = torch.uint8
 
     colors_ = []
     for color in colors:
         if isinstance(color, str):
             color = ImageColor.getrgb(color)
         color = torch.tensor(color, dtype=out_dtype)
-        if out_dtype == torch.float:
-            color /= 255
         colors_.append(color)
 
     img_to_draw = image.detach().clone()

From 5d53e9d14986617af52a61ce96f08293189cdebc Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 11:13:54 +0100
Subject: [PATCH 15/19] typo

---
 gallery/plot_visualization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 4fc2c8c1233..de26dbd3c17 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -98,7 +98,7 @@ def show(imgs):
 # Visualizing segmentation masks
 # ------------------------------
 # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
-# draw segmentation amasks on images. Semantic segmentation and instance
+# draw segmentation masks on images. Semantic segmentation and instance
 # segmentation models have different outputs, so we will treat each
 # independently.
 #

From 3a61f830d54ef7fa24d198fdd14a9dada3d23aa9 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 11:55:54 +0100
Subject: [PATCH 16/19] invert alpha meaning to follow PIL's convention

---
 test/test_utils.py   | 6 +++---
 torchvision/utils.py | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 0edcd5b59da..ee683b27ca4 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -198,12 +198,12 @@ def test_draw_segmentation_masks(colors, alpha):
             color = ImageColor.getrgb(color)
         color = torch.tensor(color, dtype=dtype)
 
-        if alpha == 0:
+        if alpha == 1:
             assert (out[:, mask] == color[:, None]).all()
-        elif alpha == 1:
+        elif alpha == 0:
             assert (out[:, mask] == img[:, mask]).all()
 
-        interpolated_color = (img[:, mask] * alpha + color[:, None] * (1 - alpha))
+        interpolated_color = (img[:, mask] * (1 - alpha) + color[:, None] * alpha)
         max_diff = (out[:, mask] - interpolated_color).abs().max()
         assert max_diff <= 1
 
diff --git a/torchvision/utils.py b/torchvision/utils.py
index ef28b14ec0b..8b23cae6eee 100644
--- a/torchvision/utils.py
+++ b/torchvision/utils.py
@@ -220,7 +220,7 @@ def draw_bounding_boxes(
 def draw_segmentation_masks(
     image: torch.Tensor,
     masks: torch.Tensor,
-    alpha: float = 0.2,
+    alpha: float = 0.8,
     colors: Optional[List[Union[str, Tuple[int, int, int]]]] = None,
 ) -> torch.Tensor:
 
@@ -232,6 +232,7 @@ def draw_segmentation_masks(
         image (Tensor): Tensor of shape (3, H, W) and dtype uint8.
         masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool.
         alpha (float): Float number between 0 and 1 denoting the transparency of the masks.
+            0 means full transparency, 1 means no transparency.
         colors (list or None): List containing the colors of the masks. The colors can
             be represented as PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
             When ``masks`` has a single entry of shape (H, W), you can pass a single color instead of a list
@@ -286,7 +287,7 @@ def draw_segmentation_masks(
     for mask, color in zip(masks, colors_):
         img_to_draw[:, mask] = color[:, None]
 
-    out = image * alpha + img_to_draw * (1 - alpha)
+    out = image * (1 - alpha) + img_to_draw * alpha
     return out.to(out_dtype)
 
 
From 99a0ebf6c67d00ca0203adfa1c98382456a38b15 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 12:07:36 +0100
Subject: [PATCH 17/19] use functional.softmax

---
 gallery/plot_visualization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index de26dbd3c17..422a262f46b 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -143,7 +143,7 @@ def show(imgs):
 sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(sem_classes)}
 
 # We normalize the masks of each image in the batch independently
-normalized_masks = torch.stack([torch.nn.Softmax(dim=0)(masks) for masks in output])
+normalized_masks = torch.stack([torch.nn.functional.softmax(masks, dim=0) for masks in output])
 
 dog_and_boat_masks = [
     normalized_masks[img_idx, sem_class_to_idx[cls]]

From b3341483e06b1abcb206c5e18f13be4ff47b314a Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 12:17:53 +0100
Subject: [PATCH 18/19] cleaner softmax computation

---
 gallery/plot_visualization_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 422a262f46b..0e31a60ac4c 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -142,8 +142,7 @@ def show(imgs):
 ]
 sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(sem_classes)}
 
-# We normalize the masks of each image in the batch independently
-normalized_masks = torch.stack([torch.nn.functional.softmax(masks, dim=0) for masks in output])
+normalized_masks = torch.nn.functional.softmax(output, dim=1)
 
 dog_and_boat_masks = [
     normalized_masks[img_idx, sem_class_to_idx[cls]]

From e137c1c6282b451cddc4b62f2c33d5f900711cad Mon Sep 17 00:00:00 2001
From: Nicolas Hug <nicolashug@fb.com>
Date: Mon, 17 May 2021 12:32:33 +0100
Subject: [PATCH 19/19] forgot to change alphas in example too

---
 gallery/plot_visualization_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 0e31a60ac4c..04c5e3dcb53 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -184,7 +184,7 @@ def show(imgs):
 from torchvision.utils import draw_segmentation_masks
 
 dogs_with_masks = [
-    draw_segmentation_masks(img, masks=mask, alpha=0.3)
+    draw_segmentation_masks(img, masks=mask, alpha=0.7)
     for img, mask in zip(batch_int, boolean_dog_masks)
 ]
 show(dogs_with_masks)
@@ -206,7 +206,7 @@ def show(imgs):
 print(f"dog1_masks shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}")
 print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_all_classes_masks.dtype}")
 
-dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.4)
+dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
 show(dog_with_all_masks)
 
 #####################################
@@ -230,7 +230,7 @@ def show(imgs):
 all_classes_masks = all_classes_masks.swapaxes(0, 1)
 
 dogs_with_masks = [
-    draw_segmentation_masks(img, masks=mask, alpha=.4)
+    draw_segmentation_masks(img, masks=mask, alpha=.6)
     for img, mask in zip(batch_int, all_classes_masks)
 ]
 show(dogs_with_masks)
@@ -318,7 +318,7 @@ def show(imgs):
 # There's an extra dimension (1) to the masks. We need to remove it
 dog1_bool_masks = dog1_bool_masks.squeeze(1)
 
-show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.1))
+show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))
 
 #####################################
 # The model seems to have properly detected the dog, but it also confused trees