From eb7bb7c871d4dac3a5765bec35cbed69c6771497 Mon Sep 17 00:00:00 2001
From: Wonju Lee
Date: Wed, 28 Aug 2024 22:47:10 +0900
Subject: [PATCH 1/4] add rtmpose_tiny for single obj

---
 src/otx/recipe/_base_/data/keypoint_detection.yaml      | 2 +-
 tests/unit/core/data/transform_libs/test_torchvision.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index 756bddba960..0f1cc0f54ae 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -44,7 +44,7 @@ val_subset:
       mean: [123.675, 116.28, 103.53]
       std: [58.395, 57.12, 57.375]
 test_subset:
-  subset_name: test
+  subset_name: val
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale
diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py
index 1a1363d6821..6655c535935 100644
--- a/tests/unit/core/data/transform_libs/test_torchvision.py
+++ b/tests/unit/core/data/transform_libs/test_torchvision.py
@@ -915,6 +915,11 @@ def test_forward(self, keypoint_det_entity) -> None:
         )

         results = transform(deepcopy(keypoint_det_entity))
+<<<<<<< HEAD
         assert np.array_equal(results.bbox_info.center, np.array([3.5, 3.5]))
         assert np.array_equal(results.bbox_info.scale, np.array([8.75, 8.75]))
+=======
+        assert torch.all(results.bbox_info.center == torch.Tensor([[3.5, 3.5]]))
+        assert torch.all(results.bbox_info.scale == torch.Tensor([[8.75, 8.75]]))
+>>>>>>> add rtmpose_tiny for single obj

         assert results.keypoints.shape == (4, 2)

From ae6293a562c26ce4821dd289fcd133ffc6a88b4c Mon Sep 17 00:00:00 2001
From: Wonju Lee
Date: Wed, 28 Aug 2024 23:08:05 +0900
Subject: [PATCH 2/4] modify test subset name

---
 src/otx/recipe/_base_/data/keypoint_detection.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/otx/recipe/_base_/data/keypoint_detection.yaml b/src/otx/recipe/_base_/data/keypoint_detection.yaml
index 0f1cc0f54ae..756bddba960 100644
--- a/src/otx/recipe/_base_/data/keypoint_detection.yaml
+++ b/src/otx/recipe/_base_/data/keypoint_detection.yaml
@@ -44,7 +44,7 @@ val_subset:
       mean: [123.675, 116.28, 103.53]
       std: [58.395, 57.12, 57.375]
 test_subset:
-  subset_name: val
+  subset_name: test
   batch_size: 32
   transforms:
     - class_path: otx.core.data.transform_libs.torchvision.GetBBoxCenterScale

From b24ee2ebb60a7d43c704da4523bda2b2d606e29b Mon Sep 17 00:00:00 2001
From: Wonju Lee
Date: Wed, 28 Aug 2024 23:38:28 +0900
Subject: [PATCH 3/4] fix unit test

---
 tests/unit/core/data/transform_libs/test_torchvision.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py
index 6655c535935..1a1363d6821 100644
--- a/tests/unit/core/data/transform_libs/test_torchvision.py
+++ b/tests/unit/core/data/transform_libs/test_torchvision.py
@@ -915,11 +915,6 @@ def test_forward(self, keypoint_det_entity) -> None:
         )

         results = transform(deepcopy(keypoint_det_entity))
-<<<<<<< HEAD
         assert np.array_equal(results.bbox_info.center, np.array([3.5, 3.5]))
         assert np.array_equal(results.bbox_info.scale, np.array([8.75, 8.75]))
-=======
-        assert torch.all(results.bbox_info.center == torch.Tensor([[3.5, 3.5]]))
-        assert torch.all(results.bbox_info.scale == torch.Tensor([[8.75, 8.75]]))
->>>>>>> add rtmpose_tiny for single obj

         assert results.keypoints.shape == (4, 2)
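Net effect of patches 1-3: the base recipe's test subset name ends up back at "test", and the unit test keeps the numpy-based assertions once the accidentally committed conflict markers are removed. As a sanity check of the asserted values, here is a minimal, self-contained sketch of the center/scale math behind GetBBoxCenterScale; the [0, 0, 7, 7] box and the 1.25 padding factor are assumptions inferred from the expected (3.5, 3.5) center and (8.75, 8.75) scale, not values taken from the patch.

    # Sketch of a GetBBoxCenterScale-style computation (assumed inputs).
    import numpy as np

    def bbox_center_scale(bbox: np.ndarray, padding: float = 1.25) -> tuple[np.ndarray, np.ndarray]:
        """Return the center and padded (width, height) scale of an xyxy box."""
        x1, y1, x2, y2 = bbox
        center = np.array([(x1 + x2) / 2.0, (y1 + y2) / 2.0])
        scale = np.array([(x2 - x1) * padding, (y2 - y1) * padding])
        return center, scale

    center, scale = bbox_center_scale(np.array([0.0, 0.0, 7.0, 7.0]))
    assert np.array_equal(center, np.array([3.5, 3.5]))   # (0 + 7) / 2
    assert np.array_equal(scale, np.array([8.75, 8.75]))  # 7 * 1.25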
From e00276abf0b64a9cd2b262f0f54854240ba03074 Mon Sep 17 00:00:00 2001
From: Wonju Lee
Date: Fri, 30 Aug 2024 23:58:59 +0900
Subject: [PATCH 4/4] property for pck

---
 src/otx/core/metrics/pck.py              | 18 +++-
 src/otx/core/model/keypoint_detection.py |  5 +
 .../keypoint_detection/rtmpose_tiny.yaml  |  2 +-
 .../rtmpose_tiny_single_obj.yaml          | 93 ++++++++++---------
 tests/perf/test_keypoint_detection.py    | 90 ++++++++++++++++--
 5 files changed, 153 insertions(+), 55 deletions(-)

diff --git a/src/otx/core/metrics/pck.py b/src/otx/core/metrics/pck.py
index 61ab3909865..941ed679775 100644
--- a/src/otx/core/metrics/pck.py
+++ b/src/otx/core/metrics/pck.py
@@ -147,6 +147,22 @@ def __init__(
         self.label_info: LabelInfo = label_info
         self.reset()

+    @property
+    def input_size(self) -> tuple[int, int]:
+        """Getter for input_size."""
+        return self._input_size
+
+    @input_size.setter
+    def input_size(self, size: tuple[int, int]) -> None:
+        """Setter for input_size."""
+        if not isinstance(size, tuple) or len(size) != 2:
+            msg = "input_size must be a tuple of two integers."
+            raise ValueError(msg)
+        if not all(isinstance(dim, int) for dim in size):
+            msg = "input_size dimensions must be integers."
+            raise ValueError(msg)
+        self._input_size = size
+
     def reset(self) -> None:
         """Reset for every validation and test epoch.
@@ -177,7 +193,7 @@ def compute(self) -> dict:
         gt_kpts = np.stack([p[0] for p in self.targets])
         kpts_visible = np.stack([p[1] for p in self.targets])

-        normalize = np.tile(np.array([[256, 192]]), (pred_kpts.shape[0], 1))
+        normalize = np.tile(np.array([self.input_size]), (pred_kpts.shape[0], 1))
         _, avg_acc, _ = keypoint_pck_accuracy(
             pred_kpts,
             gt_kpts,
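The new property replaces the hard-coded (256, 192) normalization in compute() with a validated, configurable input size. A minimal, self-contained sketch of that validated-property pattern follows; Metric is a stand-in name, not the actual class in pck.py, and the normalization line mirrors the hunk above.

    # Stand-in illustration of the property/setter added to pck.py above.
    import numpy as np

    class Metric:
        """Toy metric exposing a validated input_size, mirroring the patch."""

        @property
        def input_size(self) -> tuple[int, int]:
            return self._input_size

        @input_size.setter
        def input_size(self, size: tuple[int, int]) -> None:
            if not isinstance(size, tuple) or len(size) != 2:
                raise ValueError("input_size must be a tuple of two integers.")
            if not all(isinstance(dim, int) for dim in size):
                raise ValueError("input_size dimensions must be integers.")
            self._input_size = size

    m = Metric()
    m.input_size = (512, 512)           # accepted: tuple of two ints
    # PCK normalization now follows the configured size, one row per sample:
    normalize = np.tile(np.array([m.input_size]), (3, 1))
    assert normalize.shape == (3, 2)
    try:
        m.input_size = [512, 512]       # rejected: list, not tuple
    except ValueError as err:
        print(err)                      # input_size must be a tuple of two integers.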
diff --git a/src/otx/core/model/keypoint_detection.py b/src/otx/core/model/keypoint_detection.py
index 406c6c8031e..ea91d00cd67 100644
--- a/src/otx/core/model/keypoint_detection.py
+++ b/src/otx/core/model/keypoint_detection.py
@@ -104,6 +104,11 @@ def _customize_outputs(
             bbox_info=[],
         )

+    def configure_metric(self) -> None:
+        """Configure the metric."""
+        super().configure_metric()
+        self._metric.input_size = self.input_size
+
     def _convert_pred_entity_to_compute_metric(
         self,
         preds: KeypointDetBatchPredEntity,
diff --git a/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml b/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml
index 1a25a2d39d4..447d4fd5218 100644
--- a/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml
+++ b/src/otx/recipe/keypoint_detection/rtmpose_tiny.yaml
@@ -6,7 +6,7 @@ model:
   optimizer:
     class_path: torch.optim.AdamW
     init_args:
-      lr: 0.004
+      lr: 0.001
       weight_decay: 0.0001

   scheduler:
diff --git a/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml
index 8b22c757330..8045bb5e85c 100644
--- a/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml
+++ b/src/otx/recipe/keypoint_detection/rtmpose_tiny_single_obj.yaml
@@ -2,6 +2,9 @@ model:
   class_path: otx.algo.keypoint_detection.rtmpose.RTMPoseTiny
   init_args:
     label_info: 17
+    input_size:
+      - 512
+      - 512

 optimizer:
   class_path: torch.optim.AdamW
@@ -35,47 +38,49 @@ overrides:
     - data.train_subset.transforms
     - data.val_subset.transforms
     - data.test_subset.transforms
-  input_size:
-    - 512
-    - 512
-  train_subset:
-    transforms:
-      - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-        init_args:
-          input_size: $(input_size)
-      - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
-        init_args:
-          is_numpy_to_tvtensor: true
-      - class_path: torchvision.transforms.v2.ToDtype
-        init_args:
-          dtype: ${as_torch_dtype:torch.float32}
-      - class_path: torchvision.transforms.v2.Normalize
-        init_args:
-          mean: [123.675, 116.28, 103.53]
-          std: [58.395, 57.12, 57.375]
-  val_subset:
-    transforms:
-      - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-        init_args:
-          input_size: $(input_size)
-          is_numpy_to_tvtensor: true
-      - class_path: torchvision.transforms.v2.ToDtype
-        init_args:
-          dtype: ${as_torch_dtype:torch.float32}
-      - class_path: torchvision.transforms.v2.Normalize
-        init_args:
-          mean: [123.675, 116.28, 103.53]
-          std: [58.395, 57.12, 57.375]
-  test_subset:
-    transforms:
-      - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
-        init_args:
-          input_size: $(input_size)
-          is_numpy_to_tvtensor: true
-      - class_path: torchvision.transforms.v2.ToDtype
-        init_args:
-          dtype: ${as_torch_dtype:torch.float32}
-      - class_path: torchvision.transforms.v2.Normalize
-        init_args:
-          mean: [123.675, 116.28, 103.53]
-          std: [58.395, 57.12, 57.375]
+  data:
+    input_size:
+      - 512
+      - 512
+    train_subset:
+      transforms:
+        - class_path: otx.core.data.transform_libs.torchvision.RandomBBoxTransform
+        - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
+          init_args:
+            input_size: $(input_size)
+        - class_path: otx.core.data.transform_libs.torchvision.YOLOXHSVRandomAug
+          init_args:
+            is_numpy_to_tvtensor: true
+        - class_path: torchvision.transforms.v2.ToDtype
+          init_args:
+            dtype: ${as_torch_dtype:torch.float32}
+        - class_path: torchvision.transforms.v2.Normalize
+          init_args:
+            mean: [123.675, 116.28, 103.53]
+            std: [58.395, 57.12, 57.375]
+    val_subset:
+      transforms:
+        - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
+          init_args:
+            input_size: $(input_size)
+            is_numpy_to_tvtensor: true
+        - class_path: torchvision.transforms.v2.ToDtype
+          init_args:
+            dtype: ${as_torch_dtype:torch.float32}
+        - class_path: torchvision.transforms.v2.Normalize
+          init_args:
+            mean: [123.675, 116.28, 103.53]
+            std: [58.395, 57.12, 57.375]
+    test_subset:
+      transforms:
+        - class_path: otx.core.data.transform_libs.torchvision.TopdownAffine
+          init_args:
+            input_size: $(input_size)
+            is_numpy_to_tvtensor: true
+        - class_path: torchvision.transforms.v2.ToDtype
+          init_args:
+            dtype: ${as_torch_dtype:torch.float32}
+        - class_path: torchvision.transforms.v2.Normalize
+          init_args:
+            mean: [123.675, 116.28, 103.53]
+            std: [58.395, 57.12, 57.375]
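configure_metric() is the glue between the recipe and the metric: after the base class builds the metric, the model pushes its own input_size into it, so the 512x512 resolution set in rtmpose_tiny_single_obj.yaml reaches PCK normalization without further plumbing. A self-contained sketch of that hand-off follows; FakeModel and FakeMetric are stand-ins for the real OTX classes, which are not shown in full in this patch.

    # Sketch of the configure_metric() wiring added above (assumed names).
    class FakeMetric:
        input_size = (256, 192)  # the old hard-coded normalization size

    class FakeModel:
        def __init__(self, input_size):
            self.input_size = input_size
            self._metric = FakeMetric()

        def configure_metric(self):
            # mirrors the override in src/otx/core/model/keypoint_detection.py
            self._metric.input_size = self.input_size

    model = FakeModel(input_size=(512, 512))  # as set in rtmpose_tiny_single_obj.yaml
    model.configure_metric()
    assert model._metric.input_size == (512, 512)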
diff --git a/tests/perf/test_keypoint_detection.py b/tests/perf/test_keypoint_detection.py
index 3a2f2a299c9..1ff150a03d6 100644
--- a/tests/perf/test_keypoint_detection.py
+++ b/tests/perf/test_keypoint_detection.py
@@ -5,6 +5,7 @@
 from __future__ import annotations

 from pathlib import Path
+from typing import ClassVar

 import pytest

@@ -19,26 +20,97 @@ class TestPerfKeypointDetection(PerfTestBase):
         Benchmark.Model(task="keypoint_detection", name="rtmpose_tiny", category="speed"),
     ]

-    DATASET_TEST_CASES = [
+    DATASET_TEST_CASES: ClassVar = [
         Benchmark.Dataset(
-            name=f"coco_person_keypoint_small_{idx}",
-            path=Path("keypoint_detection/coco_keypoint_small") / f"{idx}",
+            name="coco_person_keypoint_small",
+            path=Path("keypoint_detection/coco_keypoint/small"),
             group="small",
             num_repeat=5,
             extra_overrides={},
-        )
-        for idx in (1, 2, 3)
-    ] + [
+        ),
         Benchmark.Dataset(
             name="coco_person_keypoint_medium",
-            path=Path("keypoint_detection/coco_keypoint_medium"),
+            path=Path("keypoint_detection/coco_keypoint/medium"),
+            group="medium",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="coco_person_keypoint_large",
+            path=Path("keypoint_detection/coco_keypoint/large"),
+            group="large",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+    ]
+
+    BENCHMARK_CRITERIA = [  # noqa: RUF012
+        Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1),
+        Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="optimize/iter_time", summary="mean", compare="<", margin=0.1),
+        Benchmark.Criterion(name="test(train)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(export)/e2e_time", summary="max", compare=">", margin=0.1),
+        Benchmark.Criterion(name="test(optimize)/e2e_time", summary="max", compare=">", margin=0.1),
+    ]
+
+    @pytest.mark.parametrize(
+        "fxt_model",
+        MODEL_TEST_CASES,
+        ids=lambda model: model.name,
+        indirect=True,
+    )
+    @pytest.mark.parametrize(
+        "fxt_dataset",
+        DATASET_TEST_CASES,
+        ids=lambda dataset: dataset.name,
+        indirect=True,
+    )
+    def test_perf(
+        self,
+        fxt_model: Benchmark.Model,
+        fxt_dataset: Benchmark.Dataset,
+        fxt_benchmark: Benchmark,
+    ):
+        self._test_perf(
+            model=fxt_model,
+            dataset=fxt_dataset,
+            benchmark=fxt_benchmark,
+            criteria=self.BENCHMARK_CRITERIA,
+        )
+
+
+class TestPerfKeypointDetectionSingleObj(PerfTestBase):
+    """Benchmark keypoint detection (single object)."""
+
+    MODEL_TEST_CASES = [  # noqa: RUF012
+        Benchmark.Model(task="keypoint_detection", name="rtmpose_tiny_single_obj", category="speed"),
+    ]
+
+    DATASET_TEST_CASES: ClassVar = [
+        Benchmark.Dataset(
+            name="coco_person_keypoint_single_obj_small",
+            path=Path("keypoint_detection/coco_keypoint_single_obj/small"),
+            group="small",
+            num_repeat=5,
+            extra_overrides={},
+        ),
+        Benchmark.Dataset(
+            name="coco_person_keypoint_single_obj_medium",
+            path=Path("keypoint_detection/coco_keypoint_single_obj/medium"),
             group="medium",
             num_repeat=5,
             extra_overrides={},
         ),
         Benchmark.Dataset(
-            name="mpii_large",
-            path=Path("keypoint_detection/mpii_large"),
+            name="coco_person_keypoint_single_obj_large",
+            path=Path("keypoint_detection/coco_keypoint_single_obj/large"),
             group="large",
             num_repeat=5,
             extra_overrides={},