amyeroberts · amyeroberts · Sep 2, 2022 · Aug 5, 2022 · Aug 5, 2022 · Aug 11, 2022
diff --git a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -117,13 +117,15 @@ def __call__(
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
-                If set, will return tensors of a particular framework. Acceptable values are:
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `None`):
+                If set, will return a tensor of a particular framework.
 
+                Acceptable values are:
                 - `'tf'`: Return TensorFlow `tf.constant` objects.
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
                 - `'np'`: Return NumPy `np.ndarray` objects.
                 - `'jax'`: Return JAX `jnp.ndarray` objects.
+                - `None`: Return list of `np.ndarray` objects.
 
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
@@ -158,19 +160,25 @@ def __call__(
         if self.do_resize and self.size is not None:
             images = [self.resize(image, size=self.size, resample=self.resample) for image in images]
 
+        # if do_normalize=False, the casting to a numpy array won't happen, so we need to do it here
+        make_channel_first = True if isinstance(images[0], Image.Image) else images[0].shape[-1] in (1, 3)
+        images = [self.to_numpy_array(image, rescale=False, channel_first=make_channel_first) for image in images]
+
         if self.do_normalize:
             images = [self.normalize(image) for image in images]
 
-        # color quantize from (batch_size, height, width, 3) to (batch_size, height, width)
+        # color quantize each image from (height, width, 3) to (height, width)
-        images = np.array(images)
-        images = color_quantize(images, self.clusters).reshape(images.shape[:-1])
+        flattened_images = []
+        for image in images:
+            image = color_quantize(image, self.clusters).reshape(image.shape[:-1])
+
+            # flatten to (height*width)
+            image = image.reshape(-1)
 
-        # flatten to (batch_size, height*width)
-        batch_size = images.shape[0]
-        images = images.reshape(batch_size, -1)
+            flattened_images.append(image)
 
         # return as BatchFeature
-        data = {"input_ids": images}
+        data = {"input_ids": flattened_images}
         encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
 
         return encoded_inputs
diff --git a/tests/models/imagegpt/test_feature_extraction_imagegpt.py b/tests/models/imagegpt/test_feature_extraction_imagegpt.py
@@ -22,10 +22,11 @@
 import numpy as np
 from datasets import load_dataset
 
+from parameterized import parameterized
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin
+from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -137,6 +138,39 @@ def test_feat_extract_from_and_save_pretrained(self):
     def test_init_without_params(self):
         pass
 
+    @parameterized.expand(
+        [
+            ("do_resize_True_do_normalize_True", True, True),
+            ("do_resize_True_do_normalize_False", True, False),
+            ("do_resize_False_do_normalize_True", False, True),
+            ("do_resize_False_do_normalize_False", False, False),
+        ]
+    )
+    def test_call_flags(self, _, do_resize, do_normalize):
+        """
+        Check that calling the feature extractor with `return_tensors=None` yields `batch_size` 1-D `np.ndarray`s
+        of the expected length for every combination of the `do_resize` and `do_normalize` flags.
+        """
+        # Initialize feature_extractor
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        feature_extractor.do_resize = do_resize
+        feature_extractor.do_normalize = do_normalize
+        # create random PIL images
+        image_inputs = prepare_image_inputs(self.feature_extract_tester, equal_resolution=False)
+
+        expected_shapes = [(x.size[0] * x.size[1],) for x in image_inputs]
+        if do_resize:
+            expected_shapes = [
+                (self.feature_extract_tester.size * self.feature_extract_tester.size,)
+                for _ in range(self.feature_extract_tester.batch_size)
+            ]
+
+        input_ids = feature_extractor(image_inputs, return_tensors=None)["input_ids"]
+        self.assertEqual(len(input_ids), self.feature_extract_tester.batch_size)
+        for image, expected_shape in zip(input_ids, expected_shapes):
+            self.assertEqual(image.shape, expected_shape)
+            self.assertIsInstance(image, np.ndarray)
+
 
 def prepare_images():
     dataset = load_dataset("hf-internal-testing/fixtures_image_utils", split="test")