@@ -139,6 +139,11 @@ class DPTImageProcessor(BaseImageProcessor):
139139 size_divisor (`int`, *optional*):
140140 If `do_pad` is `True`, pads the image dimensions to be divisible by this value. This was introduced in the
141141 DINOv2 paper, which uses the model in combination with DPT.
142+ do_reduce_labels (`bool`, *optional*, defaults to `False`):
143+ Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
144+ used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
145+ background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the
146+ `preprocess` method.
142147 """
143148
144149 model_input_names = ["pixel_values" ]
@@ -157,6 +162,7 @@ def __init__(
157162 image_std : Optional [Union [float , List [float ]]] = None ,
158163 do_pad : bool = False ,
159164 size_divisor : int = None ,
165+ do_reduce_labels : bool = False ,
160166 ** kwargs ,
161167 ) -> None :
162168 super ().__init__ (** kwargs )
@@ -174,6 +180,7 @@ def __init__(
174180 self .image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
175181 self .do_pad = do_pad
176182 self .size_divisor = size_divisor
183+ self .do_reduce_labels = do_reduce_labels
177184
178185 def resize (
179186 self ,
@@ -275,10 +282,160 @@ def _get_pad(size, size_divisor):
275282
276283 return pad (image , ((pad_size_left , pad_size_right ), (pad_size_top , pad_size_bottom )), data_format = data_format )
277284
# Adapted from transformers.models.beit.image_processing_beit.BeitImageProcessor.reduce_label
# (this variant works on a private copy instead of writing into the array it was given).
def reduce_label(self, label: ImageInput) -> np.ndarray:
    """
    Shift every label of a segmentation map down by one and send the background label to 255.

    Label `0` becomes `255` and every other label `k` becomes `k - 1`. A label that is already `255` is left at
    `255`.

    Args:
        label (`ImageInput`):
            Segmentation map to convert. The object passed in is not modified.

    Returns:
        `np.ndarray`: A new array holding the reduced labels.
    """
    # NOTE(review): `to_numpy_array` presumably hands back the very same object for inputs that already are
    # numpy arrays -- confirm. The masked assignment below would then write into the caller's data, so take an
    # explicit copy first.
    label = np.array(to_numpy_array(label), copy=True)
    # Move the background to 255 *before* subtracting so that the subtraction never has to go below zero.
    label[label == 0] = 255
    label = label - 1
    # Both the former 0 and the former 255 are 254 at this point; put them back on 255.
    label[label == 254] = 255
    return label
293+
def _preprocess(
    self,
    image: ImageInput,
    do_reduce_labels: bool = None,
    do_resize: bool = None,
    size: Dict[str, int] = None,
    resample: PILImageResampling = None,
    keep_aspect_ratio: bool = None,
    ensure_multiple_of: int = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: bool = None,
    size_divisor: int = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Run the transformation chain shared by images and segmentation maps on a single input.

    The enabled steps always run in the same order: label reduction, resizing, rescaling, normalization and
    padding. A step is skipped whenever its `do_*` flag is falsy, which includes the default `None`; the remaining
    arguments are only read by the step they belong to.

    Returns:
        The input after all enabled steps, or the input itself when no step is enabled.
    """
    result = image

    if do_reduce_labels:
        result = self.reduce_label(result)

    if do_resize:
        resize_options = {
            "size": size,
            "resample": resample,
            "keep_aspect_ratio": keep_aspect_ratio,
            "ensure_multiple_of": ensure_multiple_of,
            "input_data_format": input_data_format,
        }
        result = self.resize(image=result, **resize_options)

    if do_rescale:
        result = self.rescale(image=result, scale=rescale_factor, input_data_format=input_data_format)

    if do_normalize:
        result = self.normalize(
            image=result,
            mean=image_mean,
            std=image_std,
            input_data_format=input_data_format,
        )

    if do_pad:
        result = self.pad_image(image=result, size_divisor=size_divisor, input_data_format=input_data_format)

    return result
335+
def _preprocess_image(
    self,
    image: ImageInput,
    do_resize: bool = None,
    size: Dict[str, int] = None,
    resample: PILImageResampling = None,
    keep_aspect_ratio: bool = None,
    ensure_multiple_of: int = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: bool = None,
    size_divisor: int = None,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Preprocesses a single image.

    The image is converted to a numpy array, sent through `_preprocess` with label reduction switched off, and
    finally moved to `data_format` when one is given.
    """
    # Every transformation further down works on numpy arrays.
    pixels = to_numpy_array(image)

    rescaling_scaled_input = do_rescale and is_scaled_image(pixels)
    if rescaling_scaled_input:
        logger.warning_once(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )

    # Fall back to inferring the channel layout from the array when the caller gave no hint.
    source_format = input_data_format
    if source_format is None:
        source_format = infer_channel_dimension_format(pixels)

    transform_options = {
        "do_reduce_labels": False,  # label reduction only makes sense for segmentation maps
        "do_resize": do_resize,
        "size": size,
        "resample": resample,
        "keep_aspect_ratio": keep_aspect_ratio,
        "ensure_multiple_of": ensure_multiple_of,
        "do_rescale": do_rescale,
        "rescale_factor": rescale_factor,
        "do_normalize": do_normalize,
        "image_mean": image_mean,
        "image_std": image_std,
        "do_pad": do_pad,
        "size_divisor": size_divisor,
        "input_data_format": source_format,
    }
    pixels = self._preprocess(pixels, **transform_options)

    if data_format is None:
        return pixels
    return to_channel_dimension_format(pixels, data_format, input_channel_dim=source_format)
386+
def _preprocess_segmentation_map(
    self,
    segmentation_map: ImageInput,
    do_resize: bool = None,
    size: Dict[str, int] = None,
    resample: PILImageResampling = None,
    keep_aspect_ratio: bool = None,
    ensure_multiple_of: int = None,
    do_reduce_labels: bool = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Preprocesses a single segmentation map.

    The map is converted to a numpy array, optionally label-reduced and resized through `_preprocess` (it is never
    rescaled or normalized), and returned as an `np.int64` array.

    Args:
        segmentation_map (`ImageInput`):
            Segmentation map to preprocess. A 2D map temporarily gets a leading axis for the transformations.
        do_resize (`bool`, *optional*):
            Whether to resize the map.
        size (`Dict[str, int]`, *optional*):
            Target size forwarded to the resize step.
        resample (`PILImageResampling`, *optional*):
            Accepted for signature compatibility only and ignored. Label maps are always resized with
            `PILImageResampling.NEAREST`, because an interpolating filter would blend neighbouring class ids into
            values that correspond to no real label.
        keep_aspect_ratio (`bool`, *optional*):
            Forwarded to the resize step.
        ensure_multiple_of (`int`, *optional*):
            Forwarded to the resize step.
        do_reduce_labels (`bool`, *optional*):
            Whether to apply `reduce_label` before resizing.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            Channel layout of a map that already has a channel axis. Ignored for 2D maps.

    Returns:
        `np.ndarray`: The processed map with dtype `np.int64`; a map that came in as 2D is returned as 2D.
    """
    # All transformations expect numpy arrays.
    segmentation_map = to_numpy_array(segmentation_map)
    # Add an axis to the segmentation maps for transformations.
    if segmentation_map.ndim == 2:
        segmentation_map = segmentation_map[None, ...]
        added_dimension = True
        # The axis was inserted in front, so the layout is known regardless of what the caller passed.
        input_data_format = ChannelDimension.FIRST
    else:
        added_dimension = False
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
    # NOTE(review): padding is not forwarded here, so when images are padded the labels presumably keep their
    # unpadded size -- confirm whether callers rely on matching shapes.
    segmentation_map = self._preprocess(
        image=segmentation_map,
        do_reduce_labels=do_reduce_labels,
        do_resize=do_resize,
        size=size,
        # Class ids are categorical: only nearest-neighbour resampling keeps them valid.
        resample=PILImageResampling.NEAREST,
        keep_aspect_ratio=keep_aspect_ratio,
        ensure_multiple_of=ensure_multiple_of,
        do_normalize=False,
        do_rescale=False,
        input_data_format=input_data_format,
    )
    # Remove extra axis if added
    if added_dimension:
        segmentation_map = np.squeeze(segmentation_map, axis=0)
    segmentation_map = segmentation_map.astype(np.int64)
    return segmentation_map
427+
428+ # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.__call__
def __call__(self, images, segmentation_maps=None, **kwargs):
    """
    Forward the call to the parent class so that images and segmentation maps can both be given positionally.

    `segmentation_maps` is handed to the parent `__call__` as a keyword argument; every other keyword argument is
    passed through untouched.
    """
    parent_call = super().__call__
    return parent_call(images, segmentation_maps=segmentation_maps, **kwargs)
433+
278434 @filter_out_non_signature_kwargs ()
279435 def preprocess (
280436 self ,
281437 images : ImageInput ,
438+ segmentation_maps : Optional [ImageInput ] = None ,
282439 do_resize : bool = None ,
283440 size : int = None ,
284441 keep_aspect_ratio : bool = None ,
@@ -291,6 +448,7 @@ def preprocess(
291448 image_std : Optional [Union [float , List [float ]]] = None ,
292449 do_pad : bool = None ,
293450 size_divisor : int = None ,
451+ do_reduce_labels : Optional [bool ] = None ,
294452 return_tensors : Optional [Union [str , TensorType ]] = None ,
295453 data_format : ChannelDimension = ChannelDimension .FIRST ,
296454 input_data_format : Optional [Union [str , ChannelDimension ]] = None ,
@@ -302,6 +460,8 @@ def preprocess(
302460 images (`ImageInput`):
303461 Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
304462 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
463+ segmentation_maps (`ImageInput`, *optional*):
464+ Segmentation map to preprocess.
305465 do_resize (`bool`, *optional*, defaults to `self.do_resize`):
306466 Whether to resize the image.
307467 size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -326,6 +486,10 @@ def preprocess(
326486 Image mean.
327487 image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
328488 Image standard deviation.
489+ do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
490+ Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
491+ is used for background, and background itself is not included in all classes of a dataset (e.g.
492+ ADE20k). The background label will be replaced by 255.
329493 return_tensors (`str` or `TensorType`, *optional*):
330494 The type of tensors to return. Can be one of:
331495 - Unset: Return a list of `np.ndarray`.
@@ -357,9 +521,13 @@ def preprocess(
357521 image_std = image_std if image_std is not None else self .image_std
358522 do_pad = do_pad if do_pad is not None else self .do_pad
359523 size_divisor = size_divisor if size_divisor is not None else self .size_divisor
524+ do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self .do_reduce_labels
360525
361526 images = make_list_of_images (images )
362527
528+ if segmentation_maps is not None :
529+ segmentation_maps = make_list_of_images (segmentation_maps , expected_ndims = 2 )
530+
363531 if not valid_images (images ):
364532 raise ValueError (
365533 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
@@ -377,55 +545,47 @@ def preprocess(
377545 size = size ,
378546 resample = resample ,
379547 )
380- # All transformations expect numpy arrays.
381- images = [to_numpy_array (image ) for image in images ]
382548
383- if do_rescale and is_scaled_image (images [0 ]):
384- logger .warning_once (
385- "It looks like you are trying to rescale already rescaled images. If the input"
386- " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
549+ images = [
550+ self ._preprocess_image (
551+ image = img ,
552+ do_resize = do_resize ,
553+ do_rescale = do_rescale ,
554+ do_normalize = do_normalize ,
555+ do_pad = do_pad ,
556+ size = size ,
557+ resample = resample ,
558+ keep_aspect_ratio = keep_aspect_ratio ,
559+ ensure_multiple_of = ensure_multiple_of ,
560+ rescale_factor = rescale_factor ,
561+ image_mean = image_mean ,
562+ image_std = image_std ,
563+ size_divisor = size_divisor ,
564+ data_format = data_format ,
565+ input_data_format = input_data_format ,
387566 )
567+ for img in images
568+ ]
388569
389- if input_data_format is None :
390- # We assume that all images have the same channel dimension format.
391- input_data_format = infer_channel_dimension_format (images [0 ])
570+ data = {"pixel_values" : images }
392571
393- if do_resize :
394- images = [
395- self .resize (
396- image = image ,
572+ if segmentation_maps is not None :
573+ segmentation_maps = [
574+ self ._preprocess_segmentation_map (
575+ segmentation_map = segmentation_map ,
576+ do_reduce_labels = do_reduce_labels ,
577+ do_resize = do_resize ,
397578 size = size ,
398579 resample = resample ,
399580 keep_aspect_ratio = keep_aspect_ratio ,
400581 ensure_multiple_of = ensure_multiple_of ,
401582 input_data_format = input_data_format ,
402583 )
403- for image in images
584+ for segmentation_map in segmentation_maps
404585 ]
405586
406- if do_rescale :
407- images = [
408- self .rescale (image = image , scale = rescale_factor , input_data_format = input_data_format )
409- for image in images
410- ]
587+ data ["labels" ] = segmentation_maps
411588
412- if do_normalize :
413- images = [
414- self .normalize (image = image , mean = image_mean , std = image_std , input_data_format = input_data_format )
415- for image in images
416- ]
417-
418- if do_pad :
419- images = [
420- self .pad_image (image = image , size_divisor = size_divisor , input_data_format = input_data_format )
421- for image in images
422- ]
423-
424- images = [
425- to_channel_dimension_format (image , data_format , input_channel_dim = input_data_format ) for image in images
426- ]
427-
428- data = {"pixel_values" : images }
429589 return BatchFeature (data = data , tensor_type = return_tensors )
430590
431591 # Copied from transformers.models.beit.image_processing_beit.BeitImageProcessor.post_process_semantic_segmentation with Beit->DPT
0 commit comments