@@ -2,9 +2,9 @@ TorchVision Object Detection Finetuning Tutorial
 ====================================================
 
 .. tip::
-   To get the most out of this tutorial, we suggest using this
-   `Colab Version <https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/torchvision_finetuning_instance_segmentation.ipynb>`__.
-   This will allow you to experiment with the information presented below.
+   To get the most out of this tutorial, we suggest using this
+   `Colab Version <https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/torchvision_finetuning_instance_segmentation.ipynb>`__.
+   This will allow you to experiment with the information presented below.
 
 For this tutorial, we will be finetuning a pre-trained `Mask
 R-CNN <https://arxiv.org/abs/1703.06870>`__ model in the `Penn-Fudan
@@ -57,11 +57,14 @@ training and evaluation, and will use the evaluation scripts from
 ``pycocotools`` which can be installed with ``pip install pycocotools``.
 
 .. note::
-   For Windows, please install ``pycocotools`` from `gautamchitnis <https://github.com/gautamchitnis/cocoapi>`__ with command
+   For Windows, please install ``pycocotools`` from `gautamchitnis <https://github.com/gautamchitnis/cocoapi>`__ with command
 
    ``pip install git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI``
 
-   One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class, you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both classes, your ``labels`` tensor should look like ``[1,2]``.
+   One note on the ``labels``. The model considers class ``0`` as background. If your dataset does not contain the background class,
+   you should not have ``0`` in your ``labels``. For example, assuming you have just two classes, *cat* and *dog*, you can
+   define ``1`` (not ``0``) to represent *cats* and ``2`` to represent *dogs*. So, for instance, if one of the images has both
+   classes, your ``labels`` tensor should look like ``[1,2]``.
 
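As a tiny illustration of that convention (the *cat*/*dog* classes here are just the hypothetical example from the note above, not classes from this dataset):

.. code:: python

    import torch

    # hypothetical two-class dataset: 1 = cat, 2 = dog; 0 stays reserved for background
    labels = torch.tensor([1, 2], dtype=torch.int64)  # an image containing one cat and one dog
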
 Additionally, if you want to use aspect ratio grouping during training
 (so that each batch only contains images with similar aspect ratios),
@@ -94,7 +97,7 @@ have the following folder structure:
        FudanPed00003.png
        FudanPed00004.png
 
-Here is one example of a pair of images and segmentation masks
+Here is one example of a pair of images and segmentation masks
 
 .. image:: ../../_static/img/tv_tutorial/tv_image01.png
 
@@ -103,13 +106,21 @@ Here is one example of a pair of images and segmentation masks
 So each image has a corresponding
 segmentation mask, where each color corresponds to a different instance.
 Let’s write a ``torch.utils.data.Dataset`` class for this dataset.
+In the code below, we wrap images, bounding boxes and masks into
+``torchvision.datapoints`` structures so that we will be able to apply torchvision's
+built-in transformations (`new Transforms API <https://pytorch.org/vision/stable/transforms.html>`_)
+that cover the object detection and segmentation tasks.
+For more on torchvision datapoints, see this `documentation <https://pytorch.org/vision/stable/datapoints.html>`_.
 
 .. code:: python
 
     import os
-    import numpy as np
     import torch
-    from PIL import Image
+
+    from torchvision.io import read_image
+    from torchvision.ops.boxes import masks_to_boxes
+    from torchvision import datapoints as dp
+    from torchvision.transforms.v2 import functional as F
 
 
     class PennFudanDataset(torch.utils.data.Dataset):
@@ -125,48 +136,36 @@ Let’s write a ``torch.utils.data.Dataset`` class for this dataset.
             # load images and masks
             img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
             mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
-            img = Image.open(img_path).convert("RGB")
-            # note that we haven't converted the mask to RGB,
-            # because each color corresponds to a different instance
-            # with 0 being background
-            mask = Image.open(mask_path)
-            # convert the PIL Image into a numpy array
-            mask = np.array(mask)
+            img = read_image(img_path)
+            mask = read_image(mask_path)
             # instances are encoded as different colors
-            obj_ids = np.unique(mask)
+            obj_ids = torch.unique(mask)
             # first id is the background, so remove it
             obj_ids = obj_ids[1:]
+            num_objs = len(obj_ids)
 
             # split the color-encoded mask into a set
             # of binary masks
-            masks = mask == obj_ids[:, None, None]
+            masks = (mask == obj_ids[:, None, None]).to(dtype=torch.uint8)
 
             # get bounding box coordinates for each mask
-            num_objs = len(obj_ids)
-            boxes = []
-            for i in range(num_objs):
-                pos = np.nonzero(masks[i])
-                xmin = np.min(pos[1])
-                xmax = np.max(pos[1])
-                ymin = np.min(pos[0])
-                ymax = np.max(pos[0])
-                boxes.append([xmin, ymin, xmax, ymax])
-
-            # convert everything into a torch.Tensor
-            boxes = torch.as_tensor(boxes, dtype=torch.float32)
+            boxes = masks_to_boxes(masks)
+
             # there is only one class
             labels = torch.ones((num_objs,), dtype=torch.int64)
-            masks = torch.as_tensor(masks, dtype=torch.uint8)
 
             image_id = torch.tensor([idx])
             area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
             # suppose all instances are not crowd
             iscrowd = torch.zeros((num_objs,), dtype=torch.int64)
 
+            # Wrap sample and targets into torchvision datapoints:
+            img = dp.Image(img)
+
             target = {}
-            target["boxes"] = boxes
+            target["boxes"] = dp.BoundingBoxes(boxes, format="XYXY", canvas_size=F.get_size(img))
+            target["masks"] = dp.Mask(masks)
             target["labels"] = labels
-            target["masks"] = masks
             target["image_id"] = image_id
             target["area"] = area
             target["iscrowd"] = iscrowd
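As a quick sanity check (not part of the diff), one might load a single sample and inspect it. This is a sketch that assumes the constructor signature ``PennFudanDataset(root, transforms)`` and the transform handling from the full tutorial, which sit outside this hunk:

.. code:: python

    # rough sketch: instantiate the dataset without transforms and look at one sample
    dataset = PennFudanDataset('PennFudanPed', transforms=None)
    img, target = dataset[0]
    print(img.shape, img.dtype)        # e.g. torch.Size([3, H, W]) torch.uint8
    print(target["boxes"].shape)       # (num_objs, 4), boxes in XYXY format
    print(target["labels"])            # all ones: person is the only class
    print(target["masks"].shape)       # (num_objs, H, W) binary masks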
@@ -189,7 +188,7 @@ In this tutorial, we will be using `Mask
 R-CNN <https://arxiv.org/abs/1703.06870>`__, which is based on top of
 `Faster R-CNN <https://arxiv.org/abs/1506.01497>`__. Faster R-CNN is a
 model that predicts both bounding boxes and class scores for potential
-objects in the image.
+objects in the image.
 
 .. image:: ../../_static/img/tv_tutorial/tv_image03.png
 
@@ -199,7 +198,7 @@ instance.
 
 .. image:: ../../_static/img/tv_tutorial/tv_image04.png
 
-There are two common
+There are two common
 situations where one might want
 to modify one of the available models in torchvision modelzoo. The first
 is when we want to start from a pre-trained model, and just finetune the
@@ -229,7 +228,7 @@ way of doing it:
     # get number of input features for the classifier
     in_features = model.roi_heads.box_predictor.cls_score.in_features
     # replace the pre-trained head with a new one
-    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
+    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
 
 2 - Modifying the model to add a different backbone
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -252,7 +251,7 @@ way of doing it:
     # location, with 5 different sizes and 3 different aspect
     # ratios. We have a Tuple[Tuple[int]] because each feature
     # map could potentially have different sizes and
-    # aspect ratios
+    # aspect ratios
     anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                        aspect_ratios=((0.5, 1.0, 2.0),))
 
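The remainder of that example falls outside this hunk. For context, here is a sketch of how these pieces are typically assembled into a detector; the mobilenet backbone, RoI pooler and ``num_classes=2`` below follow the full tutorial, but treat the exact values as illustrative:

.. code:: python

    import torchvision
    from torchvision.models.detection import FasterRCNN
    from torchvision.models.detection.rpn import AnchorGenerator
    from torchvision.ops import MultiScaleRoIAlign

    # a backbone returning a single feature map; FasterRCNN needs its channel count
    backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
    backbone.out_channels = 1280

    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                       aspect_ratios=((0.5, 1.0, 2.0),))

    # pool RoIs from the single feature map ("0") into 7x7 crops for the box head
    roi_pooler = MultiScaleRoIAlign(featmap_names=["0"], output_size=7, sampling_ratio=2)

    model = FasterRCNN(backbone,
                       num_classes=2,
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)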
@@ -316,9 +315,11 @@ Putting everything together
 
 In ``references/detection/``, we have a number of helper functions to
 simplify training and evaluating detection models. Here, we will use
-``references/detection/engine.py``, ``references/detection/utils.py``
-and ``references/detection/transforms.py``. Just copy everything under
-``references/detection`` to your folder and use them here.
+``references/detection/engine.py`` and ``references/detection/utils.py``.
+Just copy everything under ``references/detection`` to your folder and use them here.
+
+Since v0.15.0, torchvision provides the `new Transforms API <https://pytorch.org/vision/stable/transforms.html>`_
+to easily write data augmentation pipelines for Object Detection and Segmentation tasks.
 
 Let’s write some helper functions for data augmentation /
 transformation:
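The helper itself is not shown in this diff; a minimal sketch of what it might look like with the v2 API (the exact transform classes available vary slightly across torchvision versions) is:

.. code:: python

    import torch
    from torchvision.transforms import v2 as T

    def get_transform(train):
        transforms = []
        if train:
            # flip the image together with its boxes and masks
            transforms.append(T.RandomHorizontalFlip(0.5))
        # convert the uint8 image to float in [0, 1]
        transforms.append(T.ConvertImageDtype(torch.float))
        return T.Compose(transforms)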
@@ -339,25 +340,25 @@ transformation:
 Testing ``forward()`` method (Optional)
 ---------------------------------------
 
-Before iterating over the dataset, it's good to see what the model
+Before iterating over the dataset, it's good to see what the model
 expects during training and inference time on sample data.
 
 .. code:: python
 
     model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
     dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
     data_loader = torch.utils.data.DataLoader(
-        dataset, batch_size=2, shuffle=True, num_workers=4,
-        collate_fn=utils.collate_fn)
+        dataset, batch_size=2, shuffle=True, num_workers=4,
+        collate_fn=utils.collate_fn)
     # For Training
-    images,targets = next(iter(data_loader))
+    images, targets = next(iter(data_loader))
     images = list(image for image in images)
     targets = [{k: v for k, v in t.items()} for t in targets]
-    output = model(images,targets)   # Returns losses and detections
+    output = model(images, targets)  # Returns losses and detections
     # For inference
     model.eval()
     x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
-    predictions = model(x)           # Returns predictions
+    predictions = model(x)           # Returns predictions
 
 Let’s now write the main function which performs the training and the
 validation:
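That main function sits outside the hunks shown here. A condensed sketch along the lines of the full tutorial, assuming the helpers copied from ``references/detection`` (``train_one_epoch``, ``evaluate``, ``utils``) and a ``get_model_instance_segmentation(num_classes)`` builder like the one the tutorial defines earlier:

.. code:: python

    import torch
    import utils
    from engine import train_one_epoch, evaluate

    def main():
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        num_classes = 2  # background + person

        dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
        dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

        # split the dataset into train and test sets
        indices = torch.randperm(len(dataset)).tolist()
        dataset = torch.utils.data.Subset(dataset, indices[:-50])
        dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=2, shuffle=True, num_workers=4,
            collate_fn=utils.collate_fn)
        data_loader_test = torch.utils.data.DataLoader(
            dataset_test, batch_size=1, shuffle=False, num_workers=4,
            collate_fn=utils.collate_fn)

        model = get_model_instance_segmentation(num_classes)
        model.to(device)

        params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

        num_epochs = 10
        for epoch in range(num_epochs):
            # train for one epoch, printing every 10 iterations
            train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
            lr_scheduler.step()
            # evaluate on the test dataset after every epoch
            evaluate(model, data_loader_test, device=device)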
@@ -504,12 +505,12 @@ After training for 10 epochs, I got the following metrics
     Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.818
 
 But what do the predictions look like? Let’s take one image in the
-dataset and verify
+dataset and verify
 
 .. image:: ../../_static/img/tv_tutorial/tv_image05.png
 
 The trained model predicts 9
-instances of person in this image, let’s see a couple of them:
+instances of person in this image, let’s see a couple of them:
 
 .. image:: ../../_static/img/tv_tutorial/tv_image06.png
 
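A prediction like the one above can be reproduced with something along these lines; a rough sketch, reusing ``model``, ``dataset_test`` and ``device`` from the training sketch, with matplotlib purely for display:

.. code:: python

    import matplotlib.pyplot as plt

    model.eval()
    with torch.no_grad():
        img, _ = dataset_test[0]
        prediction = model([img.to(device)])[0]

    # visualize the mask of the first (highest-scoring) predicted instance
    plt.imshow(prediction["masks"][0, 0].cpu().numpy(), cmap="gray")
    plt.show()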
@@ -531,7 +532,7 @@ For a more complete example, which includes multi-machine / multi-gpu
 training, check ``references/detection/train.py``, which is present in
 the torchvision repo.
 
-You can download a full source file for this tutorial
-`here <https://pytorch.org/tutorials/_static/tv-training-code.py>`__.
-
+You can download a full source file for this tutorial
+`here <https://pytorch.org/tutorials/_static/tv-training-code.py>`__.
+
 