openvinotoolkit · goodsong81 · Sep 20, 2023 · Sep 4, 2023 · Sep 7, 2023 · Sep 7, 2023
@@ -15,8 +15,9 @@ All notable changes to this project will be documented in this file.
 - Add a new object detector Lite-DINO(<https://github.com/openvinotoolkit/training_extensions/pull/2457>)
 - Add Semi-SL Mean Teacher algorithm for Instance Segmentation task(<https://github.com/openvinotoolkit/training_extensions/pull/2444>)
 - Official supports for YOLOX-X, YOLOX-L, YOLOX-S, ResNeXt101-ATSS (<https://github.com/openvinotoolkit/training_extensions/pull/2485>)
-- Add new argument to track resource usage in train command(<https://github.com/openvinotoolkit/training_extensions/pull/2500>)
+- Add new argument to track resource usage in train command (<https://github.com/openvinotoolkit/training_extensions/pull/2500>)
 - Add Self-SL for semantic segmentation of SegNext families (<https://github.com/openvinotoolkit/training_extensions/pull/2215>)
+- Adapt input size automatically based on dataset statistics (<https://github.com/openvinotoolkit/training_extensions/pull/2499>)
 
 ### Enhancements
 

@@ -120,3 +120,10 @@ OpenVINO™ Training Extensions will automatically recognize these types of task
 
 .. note::
     To use auto template configuration with the Self-SL training type, the `--task` option is required, since it is impossible to recognize the task type from a folder containing only images.
+
+Auto-adapt input size
+---------------------
+
+"Auto" input size feature tries to automatically select the right model input size
+based on the given dataset statistics.
+See :ref:`adaptive-input-size`.
@@ -21,10 +21,50 @@ The available input sizes are currently as follows:
 
 - 64x64 (only for classification)
 - 128x128 (only for classification)
+- 224x224 (only for classification)
 - 256x256
 - 384x384
 - 512x512
+- 768x768
 - 1024x1024
+- Default (per-model default input size)
+- Auto (adaptive to dataset statistics)
+
+.. _adaptive-input-size:
+
+Adaptive Input Size
+-------------------
+
+"Auto" mode tries to automatically select the right size
+based on the given dataset statistics.
+
+1. OTX analyzes the input dataset to get robust statistics.
+
+2. Input size is initially set to typical large image size.
+
+.. code-block::
+
+    input_size = large_image_size
+
+3. (Optionally) Input size is adjusted by object sizes in the dataset, if any.
+   The input size from image size is rescaled according to the ratio of
+   the minimum recognizable object size of models, which is typically 16x16 ~ 32x32,
+   and the typical small object size in the dataset.
+   In short, if objects are generally 64x64 in a 512x512 image,
+   the image will be down-scaled to 256x256, as 32x32 objects are enough to be detected.
+
+.. code-block::
+
+    input_size = input_size * MIN_RECOGNIZABLE_OBJECT_SIZE / small_object_size
+
+4. Select the closest size from standard preset sizes
+
+5. Restrict scale-up
+
+.. code-block::
+
+    input_size = min(input_size, default_model_input_size)
+
 
 .. Note::
     Using smaller input size with datasets having lower image resolutions or larger objects can yield a speed advantage with minimal impact on model performance.

@@ -19,7 +19,6 @@
 from otx.algorithms.common.adapters.mmcv.semisl_mixin import SemiSLConfigurerMixin
 from otx.algorithms.common.adapters.mmcv.utils.config_utils import (
     InputSizeManager,
-    get_configured_input_size,
     recursively_update_cfg,
     update_or_add_custom_hook,
 )
@@ -166,11 +165,17 @@ def configure_input_size(
         cfg, input_size_config: InputSizePreset = InputSizePreset.DEFAULT, model_ckpt_path: Optional[str] = None
     ):
         """Change input size if necessary."""
-        input_size = get_configured_input_size(input_size_config, model_ckpt_path)
-        if input_size is None:
+        manager = InputSizeManager(cfg)
+        input_size = manager.get_configured_input_size(input_size_config, model_ckpt_path)
+        if input_size is None:  # InputSizePreset.DEFAULT
             return
 
-        InputSizeManager(cfg.data).set_input_size(input_size)
+        if input_size == (0, 0):  # InputSizePreset.AUTO
+            input_size = BaseConfigurer.adapt_input_size_to_dataset(cfg, manager)
+            if input_size is None:
+                return
+
+        manager.set_input_size(input_size)
         logger.info("Input size is changed to {}".format(input_size))
 
 

@@ -635,7 +635,7 @@ def patch_input_preprocessing(deploy_cfg):
                 mo_options.flags = list(set(mo_options.flags))
 
             def patch_input_shape(deploy_cfg):
-                input_size_manager = InputSizeManager(cfg.data)
+                input_size_manager = InputSizeManager(cfg)
                 size = input_size_manager.get_input_size_from_cfg("test")
                 assert all(isinstance(i, int) and i > 0 for i in size)
                 # default is static shape to prevent an unexpected error

@@ -287,8 +287,10 @@ learning_parameters:
     header: Configure model input size.
     options:
       DEFAULT: "Default"
+      AUTO: "Auto"
       _64x64: "64x64"
       _128x128: "128x128"
+      _224x224: "224x224"
       _256x256: "256x256"
       _384x384: "384x384"
       _512x512: "512x512"

@@ -3,8 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+import json
 import os
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import torch
@@ -18,6 +19,7 @@
     patch_persistent_workers,
 )
 from otx.algorithms.common.adapters.mmcv.utils.config_utils import (
+    InputSizeManager,
     patch_color_conversion,
     patch_from_hyperparams,
     recursively_update_cfg,
@@ -26,6 +28,7 @@
 from otx.algorithms.common.configs.configuration_enums import InputSizePreset
 from otx.algorithms.common.tasks.base_task import OnHookInitialized
 from otx.algorithms.common.utils import UncopiableDefaultDict, append_dist_rank_suffix
+from otx.algorithms.common.utils.data import compute_robust_dataset_statistics
 from otx.algorithms.common.utils.logger import get_logger
 from otx.api.usecases.reporting.time_monitor_callback import TimeMonitorCallback
 from otx.core.data import caching
@@ -492,3 +495,41 @@ def get_data_cfg(cfg, subset):
                 dataset = dataset.dataset
             return dataset
         return cfg.data[subset]
+
+    @staticmethod
+    def adapt_input_size_to_dataset(
+        cfg, input_size_manager: InputSizeManager, downscale_only: bool = True, use_annotations: bool = False
+    ) -> Optional[Tuple[int, int]]:
+        """Compute appropriate model input size w.r.t. dataset statistics.
+
+        Robust statistics are computed on the "train" subset's ``otx_dataset`` and the
+        resulting image (and, optionally, object) sizes are handed to
+        ``input_size_manager.adapt_input_size_to_dataset`` which makes the final decision.
+
+        Args:
+            cfg (Dict): Global configuration.
+            input_size_manager (InputSizeManager): Pre-configured input size manager.
+            downscale_only (bool): Whether to allow only smaller size than default setting. Defaults to True.
+            use_annotations (bool): Whether to consider annotation shapes to compute input size. Defaults to False.
+
+        Returns:
+            Tuple[int, int]: (width, height), or None when the train data config has no
+                ``otx_dataset`` or when no dataset statistics could be computed.
+        """
+
+        # Statistics are taken from the training subset only.
+        data_cfg = BaseConfigurer.get_data_cfg(cfg, "train")
+        dataset = data_cfg.get("otx_dataset", None)
+        if dataset is None:
+            # Nothing to analyze -> caller keeps the current input size.
+            return None
+
+        stat = compute_robust_dataset_statistics(dataset, use_annotations)
+        if not stat:
+            # Empty statistics -> no basis for adapting the input size.
+            return None
+        logger.info(f"Dataset stat: {json.dumps(stat, indent=4)}")
+
+        # Fit to typical large image size (conservative)
+        # -> "avg" size might be preferable for efficiency
+        image_size = stat["image"]["robust_max"]
+        object_size = None
+        if use_annotations and stat["annotation"]:
+            # Refine using annotation shape size stat
+            # Fit to typical small object size (conservative)
+            # -> "avg" size might be preferable for efficiency
+            # object_size stays None if the "size_of_shape" / "robust_min" entries are absent.
+            object_size = stat["annotation"].get("size_of_shape", {}).get("robust_min", None)
+
+        return input_size_manager.adapt_input_size_to_dataset(image_size, object_size, downscale_only)