huggingface · echarlaix · May 27, 2024 · May 22, 2024 · May 22, 2024 · May 22, 2024
diff --git a/.github/workflows/test_dummy_inputs.yml b/.github/workflows/test_dummy_inputs.yml
diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml
@@ -0,0 +1,40 @@
+name: Utils / Python - Test  # runs the tests/utils pytest suite on pushes and PRs targeting main
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}  # one group per PR branch; run_id is the fallback when head_ref is empty
+  cancel-in-progress: true  # a newer run in the same group cancels the one still running
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false  # keep the other matrix jobs running when one combination fails
+      matrix:
+        os: [ubuntu-20.04, macos-13]
+        python-version: ["3.8", "3.9"]  # quoted: a bare 3.10 would be parsed as the float 3.1
+
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install ".[tests]"
+
+      - name: Test with pytest
+        working-directory: tests  # pytest is invoked from tests/, so "utils" below resolves to tests/utils
+        run: |
+          python -m pytest -s -vvvv utils
diff --git a/optimum/utils/preprocessing/task_processors_manager.py b/optimum/utils/preprocessing/task_processors_manager.py
@@ -23,7 +23,7 @@
 
 
 if TYPE_CHECKING:
- from .base import DatasetProcessing
+ from .base import TaskProcessor
 
 
 class TaskProcessorsManager:
@@ -35,7 +35,7 @@ class TaskProcessorsManager:
  }
 
  @classmethod
- def get_task_processor_class_for_task(cls, task: str) -> Type:
+ def get_task_processor_class_for_task(cls, task: str) -> Type["TaskProcessor"]:
  if task not in cls._TASK_TO_DATASET_PROCESSING_CLASS:
  supported_tasks = ", ".join(cls._TASK_TO_DATASET_PROCESSING_CLASS.keys())
  raise KeyError(
@@ -45,5 +45,5 @@ def get_task_processor_class_for_task(cls, task: str) -> Type:
  return cls._TASK_TO_DATASET_PROCESSING_CLASS[task]
 
  @classmethod
- def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "DatasetProcessing":
+ def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "TaskProcessor":
  return cls.get_task_processor_class_for_task(task)(*dataset_processing_args, **dataset_processing_kwargs)
diff --git a/tests/utils/test_dummpy_input_generators.py b/tests/utils/test_dummpy_input_generators.py
@@ -31,13 +31,13 @@
  from optimum.utils.input_generators import DummyInputGenerator
 
 
-TEXT_ENCODER_MODELS = {"distilbert": "distilbert-base-cased"}
+TEXT_ENCODER_MODELS = {"distilbert": "hf-internal-testing/tiny-random-DistilBertModel"}
 
 VISION_MODELS = {"resnet": "hf-internal-testing/tiny-random-resnet"}
 
-SEQ2SEQ_MODELS = {"t5": "t5-small"}
+SEQ2SEQ_MODELS = {"t5": "hf-internal-testing/tiny-random-T5Model"}
 
-AUDIO_MODELS = {"whisper": "openai/whisper-tiny.en"}
+AUDIO_MODELS = {"whisper": "hf-internal-testing/tiny-random-WhisperModel"}
 
 DUMMY_SHAPES = {
  "batch_size": [2, 4],
@@ -60,7 +60,7 @@ class GenerateDummy(TestCase):
  "np": tuple,
  }
  if is_tf_available():
- import tensorflow as tf
+ import tensorflow as tf # type: ignore[import]
 
  _FRAMEWORK_TO_SHAPE_CLS["tf"] = tf.TensorShape
 

diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py
@@ -55,6 +55,9 @@
  },
 }
 
+LOAD_SMALLEST_SPLIT = True
+NUM_SAMPLES = 10
+
 
 # Taken from https://pynative.com/python-generate-random-string/
 def get_random_string(length: int) -> str:
@@ -148,7 +151,11 @@ def _test_load_dataset(
  )
  dataset_with_all_columns = None
  if default_dataset:
- dataset = task_processor.load_default_dataset(only_keep_necessary_columns=only_keep_necessary_columns)
+ dataset = task_processor.load_default_dataset(
+ only_keep_necessary_columns=only_keep_necessary_columns,
+ load_smallest_split=LOAD_SMALLEST_SPLIT,
+ num_samples=NUM_SAMPLES,
+ )
  if only_keep_necessary_columns:
  dataset_with_all_columns = task_processor.load_default_dataset()
  else:
@@ -157,11 +164,17 @@ def _test_load_dataset(
  path,
  data_keys=data_keys,
  only_keep_necessary_columns=only_keep_necessary_columns,
+ load_smallest_split=LOAD_SMALLEST_SPLIT,
+ num_samples=NUM_SAMPLES,
  **load_dataset_kwargs,
  )
  if only_keep_necessary_columns:
  dataset_with_all_columns = task_processor.load_dataset(
- path, data_keys=data_keys, **load_dataset_kwargs
+ path,
+ data_keys=data_keys,
+ load_smallest_split=LOAD_SMALLEST_SPLIT,
+ num_samples=NUM_SAMPLES,
+ **load_dataset_kwargs,
  )
 
  # We only check if the column names of the dataset with the not necessary columns removed are a strict subset