
Commit 5450e7c

🔴 🔴 🔴 Added segmentation maps support for DPT image processor (#34345)
* Added `segmentation_maps` support for DPT image processor
* Added tests for DPT image processor
* Moved preprocessing into separate functions
* Added # Copied from statements
* Fixed # Copied from statements
1 parent a50befa commit 5450e7c

2 files changed (+352, -37 lines)

src/transformers/models/dpt/image_processing_dpt.py

Lines changed: 196 additions & 36 deletions
@@ -139,6 +139,11 @@ class DPTImageProcessor(BaseImageProcessor):
         size_divisor (`int`, *optional*):
             If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
             DINOv2 paper, which uses the model in combination with DPT.
+        do_reduce_labels (`bool`, *optional*, defaults to `False`):
+            Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
+            used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
+            background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
+            `preprocess` method.
     """
 
     model_input_names = ["pixel_values"]
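
The remapping described in this docstring sends label 0 (background) to 255 and shifts every other id down by one; a minimal NumPy sketch of the same arithmetic the new `reduce_label` helper uses below (toy values, assuming a `uint8` ADE20k-style map):

```python
import numpy as np

# Toy ADE20k-style map: 0 = background, 1..150 = semantic classes.
label = np.array([[0, 1], [2, 150]], dtype=np.uint8)

# Same trick as the diff: park background at 255 first, subtract 1 everywhere
# (255 becomes 254), then restore the parked ids to 255.
label[label == 0] = 255
label = label - 1
label[label == 254] = 255

print(label)  # [[255   0] [  1 149]] -- background is 255, classes start at 0
```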
@@ -157,6 +162,7 @@ def __init__(
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: bool = False,
         size_divisor: int = None,
+        do_reduce_labels: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
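
The new flag defaults to `False`, so existing configurations keep their behavior; opting in is a one-liner (a minimal sketch using only what this diff adds):

```python
from transformers import DPTImageProcessor

# New in this commit: construct the processor with label reduction enabled.
processor = DPTImageProcessor(do_reduce_labels=True)

# The flag can still be overridden per call via `preprocess(..., do_reduce_labels=...)`.
assert processor.do_reduce_labels is True
```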
@@ -174,6 +180,7 @@ def __init__(
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
         self.do_pad = do_pad
         self.size_divisor = size_divisor
+        self.do_reduce_labels = do_reduce_labels
 
     def resize(
         self,
@@ -275,10 +282,160 @@ def _get_pad(size, size_divisor):
 
         return pad(image, ((pad_size_left, pad_size_right), (pad_size_top, pad_size_bottom)), data_format=data_format)
 
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
+    def reduce_label(self, label: ImageInput) -> np.ndarray:
+        label = to_numpy_array(label)
+        # Avoid using underflow conversion
+        label[label == 0] = 255
+        label = label - 1
+        label[label == 254] = 255
+        return label
+
+    def _preprocess(
+        self,
+        image: ImageInput,
+        do_reduce_labels: bool = None,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: bool = None,
+        ensure_multiple_of: int = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: bool = None,
+        size_divisor: int = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        if do_reduce_labels:
+            image = self.reduce_label(image)
+
+        if do_resize:
+            image = self.resize(
+                image=image,
+                size=size,
+                resample=resample,
+                keep_aspect_ratio=keep_aspect_ratio,
+                ensure_multiple_of=ensure_multiple_of,
+                input_data_format=input_data_format,
+            )
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+
+        if do_pad:
+            image = self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
+
+        return image
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: bool = None,
+        ensure_multiple_of: int = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_pad: bool = None,
+        size_divisor: int = None,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """Preprocesses a single image."""
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+        if do_rescale and is_scaled_image(image):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(image)
+
+        image = self._preprocess(
+            image,
+            do_reduce_labels=False,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            keep_aspect_ratio=keep_aspect_ratio,
+            ensure_multiple_of=ensure_multiple_of,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_pad=do_pad,
+            size_divisor=size_divisor,
+            input_data_format=input_data_format,
+        )
+        if data_format is not None:
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    def _preprocess_segmentation_map(
+        self,
+        segmentation_map: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        keep_aspect_ratio: bool = None,
+        ensure_multiple_of: int = None,
+        do_reduce_labels: bool = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """Preprocesses a single segmentation map."""
+        # All transformations expect numpy arrays.
+        segmentation_map = to_numpy_array(segmentation_map)
+        # Add an axis to the segmentation maps for transformations.
+        if segmentation_map.ndim == 2:
+            segmentation_map = segmentation_map[None, ...]
+            added_dimension = True
+            input_data_format = ChannelDimension.FIRST
+        else:
+            added_dimension = False
+            if input_data_format is None:
+                input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
+        segmentation_map = self._preprocess(
+            image=segmentation_map,
+            do_reduce_labels=do_reduce_labels,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            keep_aspect_ratio=keep_aspect_ratio,
+            ensure_multiple_of=ensure_multiple_of,
+            do_normalize=False,
+            do_rescale=False,
+            input_data_format=input_data_format,
+        )
+        # Remove extra axis if added
+        if added_dimension:
+            segmentation_map = np.squeeze(segmentation_map, axis=0)
+        segmentation_map = segmentation_map.astype(np.int64)
+        return segmentation_map
+
+    # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.__call__
+    def __call__(self, images, segmentation_maps=None, **kwargs):
+        # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both
+        # be passed in as positional arguments.
+        return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs)
+
     @filter_out_non_signature_kwargs()
     def preprocess(
         self,
         images: ImageInput,
+        segmentation_maps: Optional[ImageInput] = None,
         do_resize: bool = None,
         size: int = None,
         keep_aspect_ratio: bool = None,
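
With the helpers above in place, the overridden `__call__` accepts images and segmentation maps positionally, mirroring the BEiT processor it is copied from. A usage sketch with random arrays (the input shapes and the 384×384 output size are assumptions about the default config, not part of the diff):

```python
import numpy as np
from transformers import DPTImageProcessor

processor = DPTImageProcessor(do_reduce_labels=True)

image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # HWC RGB image
seg_map = np.random.randint(0, 151, (480, 640), dtype=np.uint8)   # 2D label map

# Both inputs can now be passed positionally; maps come back under "labels".
inputs = processor(image, seg_map, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 384, 384])
print(inputs["labels"].shape)        # e.g. torch.Size([1, 384, 384]), dtype int64
```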
@@ -291,6 +448,7 @@ def preprocess(
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: bool = None,
         size_divisor: int = None,
+        do_reduce_labels: Optional[bool] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         data_format: ChannelDimension = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -302,6 +460,8 @@ def preprocess(
             images (`ImageInput`):
                 Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            segmentation_maps (`ImageInput`, *optional*):
+                Segmentation map to preprocess.
             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -326,6 +486,10 @@ def preprocess(
                 Image mean.
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Image standard deviation.
+            do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+                Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+                is used for background, and background itself is not included in all classes of a dataset (e.g.
+                ADE20k). The background label will be replaced by 255.
             return_tensors (`str` or `TensorType`, *optional*):
                 The type of tensors to return. Can be one of:
                     - Unset: Return a list of `np.ndarray`.
@@ -357,9 +521,13 @@ def preprocess(
         image_std = image_std if image_std is not None else self.image_std
         do_pad = do_pad if do_pad is not None else self.do_pad
         size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels
 
         images = make_list_of_images(images)
 
+        if segmentation_maps is not None:
+            segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2)
+
         if not valid_images(images):
             raise ValueError(
                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
@@ -377,55 +545,47 @@ def preprocess(
             size=size,
             resample=resample,
         )
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
 
-        if do_rescale and is_scaled_image(images[0]):
-            logger.warning_once(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+        images = [
+            self._preprocess_image(
+                image=img,
+                do_resize=do_resize,
+                do_rescale=do_rescale,
+                do_normalize=do_normalize,
+                do_pad=do_pad,
+                size=size,
+                resample=resample,
+                keep_aspect_ratio=keep_aspect_ratio,
+                ensure_multiple_of=ensure_multiple_of,
+                rescale_factor=rescale_factor,
+                image_mean=image_mean,
+                image_std=image_std,
+                size_divisor=size_divisor,
+                data_format=data_format,
+                input_data_format=input_data_format,
             )
+            for img in images
+        ]
 
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
+        data = {"pixel_values": images}
 
-        if do_resize:
-            images = [
-                self.resize(
-                    image=image,
+        if segmentation_maps is not None:
+            segmentation_maps = [
+                self._preprocess_segmentation_map(
+                    segmentation_map=segmentation_map,
+                    do_reduce_labels=do_reduce_labels,
+                    do_resize=do_resize,
                     size=size,
                     resample=resample,
                     keep_aspect_ratio=keep_aspect_ratio,
                     ensure_multiple_of=ensure_multiple_of,
                     input_data_format=input_data_format,
                 )
-                for image in images
+                for segmentation_map in segmentation_maps
             ]
 
-        if do_rescale:
-            images = [
-                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
-                for image in images
-            ]
+            data["labels"] = segmentation_maps
 
-        if do_normalize:
-            images = [
-                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
-                for image in images
-            ]
-
-        if do_pad:
-            images = [
-                self.pad_image(image=image, size_divisor=size_divisor, input_data_format=input_data_format)
-                for image in images
-            ]
-
-        images = [
-            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
-        ]
-
-        data = {"pixel_values": images}
         return BatchFeature(data=data, tensor_type=return_tensors)
 
     # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
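
End-to-end, the new `labels` entry is shaped for loss computation; a hedged fine-tuning sketch (`DPTForSemanticSegmentation` and the `Intel/dpt-large-ade` checkpoint are existing library pieces assumed here, not touched by this commit):

```python
import numpy as np
from transformers import DPTForSemanticSegmentation, DPTImageProcessor

processor = DPTImageProcessor(do_reduce_labels=True)
model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")  # assumed checkpoint

image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
seg_map = np.random.randint(0, 151, (480, 640), dtype=np.uint8)

inputs = processor(image, seg_map, return_tensors="pt")
# `labels` (int64, background remapped to 255) plugs straight into the model's loss.
outputs = model(pixel_values=inputs["pixel_values"], labels=inputs["labels"])
print(outputs.loss)
```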
