From e93d732219309c29035bf869d5058ef3573bd8c0 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 23 Nov 2023 11:41:09 +0900 Subject: [PATCH 01/25] Add parameterized perf test template --- tests/perf/__init__.py | 4 + tests/perf/conftest.py | 175 ++++++++++++++++++++++++++++++ tests/perf/test_classification.py | 47 ++++++++ 3 files changed, 226 insertions(+) create mode 100644 tests/perf/__init__.py create mode 100644 tests/perf/conftest.py create mode 100644 tests/perf/test_classification.py diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py new file mode 100644 index 00000000000..36a90a5e5f6 --- /dev/null +++ b/tests/perf/__init__.py @@ -0,0 +1,4 @@ +"""OTX Perfomance tests.""" + +# Copyright (C) 2021-2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py new file mode 100644 index 00000000000..e950bd0bf29 --- /dev/null +++ b/tests/perf/conftest.py @@ -0,0 +1,175 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest +import os +import subprocess +import yaml +from typing import List + +from otx.api.entities.model_template import ModelTemplate, ModelCategory + + +def pytest_addoption(parser): + """Add custom options for perf tests.""" + parser.addoption( + "--model-type", + action="store", + default="all", + help="Choose default|all. Defaults to all." + ) + parser.addoption( + "--data-size", + action="store", + default="all", + help="Choose small|medium|large|all. Defaults to all." + ) + parser.addoption( + "--num-repeat", + action="store", + default=0, + help="Overrides default per-data-size settings. Defaults to 0, which means no override." + ) + parser.addoption( + "--eval-upto", + action="store", + default="all", + help="Choose train|export|optimize. Defaults to train." + ) + parser.addoption( + "--data-root", + action="store", + default="data", + help="Dataset root directory." + ) + parser.addoption( + "--output-dir", + action="store", + default="exp/perf", + help="Output directory to save outputs." 
+ ) + + +@pytest.fixture +def fxt_template(request: pytest.FixtureRequest): + """Skip by model template.""" + model_type: str = request.config.getoption("--model-type") + template: ModelTemplate = request.param + if model_type == "default": + if template.model_category == ModelCategory.OTHER: + pytest.skip(f"{template.model_category} model") + return template + + +@pytest.fixture +def fxt_data_setting(request: pytest.FixtureRequest): + """Skip by dataset size.""" + data_size_option: str = request.config.getoption("--data-size") + data_size: str = request.param[0] + datasets: List[str] = request.param[1]["datasets"] + num_repeat: int = request.param[1]["num_repeat"] + num_repeat_override: int = request.config.getoption("--num-repeat") + if num_repeat_override > 0: + num_repeat = num_repeat_override + + if data_size_option != "all": + if data_size_option != data_size: + pytest.skip(f"{data_size} datasets") + return data_size, datasets, num_repeat + + +@pytest.fixture +def fxt_commit_hash(): + """Short commit hash in short form.""" + return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() + + +@pytest.fixture +def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_path_factory): + """Research framework command builder.""" + eval_upto = request.config.getoption("--eval-upto") + data_root = request.config.getoption("--data-root") + data_root = os.path.abspath(data_root) + output_dir = request.config.getoption("--output-dir") + output_dir = os.path.abspath(output_dir + "-" + fxt_commit_hash) + + def build_config( + tag: str, + model_template: ModelTemplate, + datasets: List[str], + num_repeat: int, + params: str = "", + ) -> dict: + cfg = {} + cfg["output_path"] = output_dir + cfg["constants"] = { + "dataroot": data_root, + } + cfg["variables"] = { + "model": [model_template.model_template_id], + "data": datasets, + } + cfg["repeat"] = num_repeat + cfg["command"] = [] + cfg["command"].append( + "otx train ${model}" + " --train-data-roots ${dataroot}/${data}" + " --val-data-roots ${dataroot}/${data}" + " --track-resource-usage all" + " --deterministic" + f" params {params}" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + if eval_upto == "train": + return cfg + + cfg["command"].append( + "otx export" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + if eval_upto == "export": + return cfg + + cfg["command"].append( + "otx optimize" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + return cfg + + def build_command( + tag: str, + model_template: ModelTemplate, + datasets: List[str], + num_repeat: int, + params: str = "", + ) -> List[str]: + cfg = build_config(tag, model_template, datasets, num_repeat, params) + cfg_path = tmp_path_factory.mktemp("exp")/"cfg.yaml" + print(cfg_path) + with open(cfg_path, "w") as cfg_file: + yaml.dump(cfg, cfg_file, indent=2,) + cmd = [ + "python", + "tools/experiment.py", + "-d", + "-f", + cfg_path, + ] + return cmd + + return build_command + + +class OTXBenchmark: + def __init__(self): + pass diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py new file mode 100644 index 00000000000..2ddb411647c --- /dev/null +++ b/tests/perf/test_classification.py @@ -0,0 +1,47 @@ +"""OTX Classification Perfomance tests.""" + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest + +from otx.cli.registry import Registry +from 
tests.test_suite.run_test_command import check_run + + +templates = Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates +templates_names = [template.name for template in templates] + + +class TestPerfMultiClassClassification: + data_settings = { + "small": { + "datasets": [ + "small_dataset/1", + "small_dataset/2", + "small_dataset/3", + ], + "num_repeat": 3, + }, + "medium": { + "datasets": [ + "medium_dataset", + ], + "num_repeat": 3, + }, + "large": { + "datasets": [ + "large_dataset", + ], + "num_repeat": 1, + }, + } + @pytest.mark.parametrize("fxt_template", templates, ids=templates_names, indirect=True) + @pytest.mark.parametrize("fxt_data_setting", data_settings.items(), ids=data_settings.keys(), indirect=True) + def test_benchmark(self, fxt_template, fxt_data_setting, fxt_build_command): + model_template = fxt_template + data_size, datasets, num_repeat = fxt_data_setting + tag = f"multiclass-classification-{data_size}" + command = fxt_build_command(tag, model_template, datasets, num_repeat) + check_run(command) From f17bf416a4519a29dfc62fdce401f988d687e53d Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 15 Dec 2023 11:58:21 +0900 Subject: [PATCH 02/25] Split acccuracy / perf tests --- tests/perf/conftest.py | 53 +++++++++++++++++++++------ tests/perf/test_classification.py | 61 +++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 26 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index e950bd0bf29..1c32a6f9019 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -29,7 +29,16 @@ def pytest_addoption(parser): "--num-repeat", action="store", default=0, - help="Overrides default per-data-size settings. Defaults to 0, which means no override." + help="Overrides default per-data-size number of repeat setting. " + "Random seeds are set to 0 ~ num_repeat-1 for the trials. " + "Defaults to 0 (small=3, medium=3, large=1)." + ) + parser.addoption( + "--num-epoch", + action="store", + default=0, + help="Overrides default per-model number of epoch setting. " + "Defaults to 0 (per-model epoch & early-stopping)." ) parser.addoption( "--eval-upto", @@ -49,6 +58,12 @@ def pytest_addoption(parser): default="exp/perf", help="Output directory to save outputs." ) + parser.addoption( + "--dry-run", + action="store_true", + default=False, + help="Print OTX commands without execution." 
+ ) @pytest.fixture @@ -63,20 +78,26 @@ def fxt_template(request: pytest.FixtureRequest): @pytest.fixture -def fxt_data_setting(request: pytest.FixtureRequest): - """Skip by dataset size.""" +def fxt_benchmark_config(request: pytest.FixtureRequest): + """Override benchmark config.""" data_size_option: str = request.config.getoption("--data-size") data_size: str = request.param[0] datasets: List[str] = request.param[1]["datasets"] - num_repeat: int = request.param[1]["num_repeat"] + if data_size_option != "all": + if data_size_option != data_size: + pytest.skip(f"{data_size} datasets") + + num_epoch: int = request.param[1].get("num_epoch", 0) # 0: per-model default + num_epoch_override: int = request.config.getoption("--num-epoch") + if num_epoch_override > 0: + num_epoch = num_epoch_override + + num_repeat: int = request.param[1].get("num_repeat", 1) num_repeat_override: int = request.config.getoption("--num-repeat") if num_repeat_override > 0: num_repeat = num_repeat_override - if data_size_option != "all": - if data_size_option != data_size: - pytest.skip(f"{data_size} datasets") - return data_size, datasets, num_repeat + return data_size, datasets, num_epoch, num_repeat @pytest.fixture @@ -93,12 +114,15 @@ def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_ data_root = os.path.abspath(data_root) output_dir = request.config.getoption("--output-dir") output_dir = os.path.abspath(output_dir + "-" + fxt_commit_hash) + dry_run = request.config.getoption("--dry-run") def build_config( tag: str, model_template: ModelTemplate, datasets: List[str], + num_epoch: int, num_repeat: int, + track_resources: bool = False, params: str = "", ) -> dict: cfg = {} @@ -112,12 +136,17 @@ def build_config( } cfg["repeat"] = num_repeat cfg["command"] = [] + if num_epoch > 0: + params = params + f" --learning_pararmeters.num_iters {num_epoch}" + resource_param = "" + if track_resources: + resource_param = " --track-resource-usage all" cfg["command"].append( "otx train ${model}" " --train-data-roots ${dataroot}/${data}" " --val-data-roots ${dataroot}/${data}" - " --track-resource-usage all" " --deterministic" + f"{resource_param}" f" params {params}" ) cfg["command"].append( @@ -150,10 +179,12 @@ def build_command( tag: str, model_template: ModelTemplate, datasets: List[str], + num_epoch: int, num_repeat: int, + track_resources: bool = False, params: str = "", ) -> List[str]: - cfg = build_config(tag, model_template, datasets, num_repeat, params) + cfg = build_config(tag, model_template, datasets, num_epoch, num_repeat, track_resources, params) cfg_path = tmp_path_factory.mktemp("exp")/"cfg.yaml" print(cfg_path) with open(cfg_path, "w") as cfg_file: @@ -161,9 +192,9 @@ def build_command( cmd = [ "python", "tools/experiment.py", - "-d", "-f", cfg_path, + "-d" if dry_run else "", ] return cmd diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 2ddb411647c..21cd390644c 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -10,38 +10,69 @@ from tests.test_suite.run_test_command import check_run -templates = Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates -templates_names = [template.name for template in templates] +TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates +TEMPLATE_NAMES = [template.name for template in TEMPLATES] -class TestPerfMultiClassClassification: - data_settings = { +class TestPerfSingleLabelClassification: + BENCHMARK_CONFIGS = { 
"small": { "datasets": [ - "small_dataset/1", - "small_dataset/2", - "small_dataset/3", + "classification/single_label/multiclass_CUB_small/1", + "classification/single_label/multiclass_CUB_small/2", + "classification/single_label/multiclass_CUB_small/3", ], "num_repeat": 3, }, "medium": { "datasets": [ - "medium_dataset", + "classification/single_label/multiclass_CUB_medium", ], "num_repeat": 3, }, "large": { "datasets": [ - "large_dataset", + "classification/single_label/multiclass_food101_large", ], "num_repeat": 1, }, } - @pytest.mark.parametrize("fxt_template", templates, ids=templates_names, indirect=True) - @pytest.mark.parametrize("fxt_data_setting", data_settings.items(), ids=data_settings.keys(), indirect=True) - def test_benchmark(self, fxt_template, fxt_data_setting, fxt_build_command): + + @pytest.mark.parametrize("fxt_template", TEMPLATES, ids=TEMPLATE_NAMES, indirect=True) + @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuarcy(self, fxt_template, fxt_benchmark_config, fxt_build_command): + """Benchmark accruacy metrics.""" + model_template = fxt_template + data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config + tag = f"singlelabel-classification-accuracy-{data_size}" + command = fxt_build_command( + tag, + model_template, + datasets, + num_epoch, + num_repeat, + ) + check_run(command) + + @pytest.mark.parametrize("fxt_template", TEMPLATES, ids=TEMPLATE_NAMES, indirect=True) + @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_template, fxt_benchmark_config, fxt_build_command): + """Benchmark train time per iter / infer time per image.""" model_template = fxt_template - data_size, datasets, num_repeat = fxt_data_setting - tag = f"multiclass-classification-{data_size}" - command = fxt_build_command(tag, model_template, datasets, num_repeat) + data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config + # Override default iteration setting, in case there's no user input + # "--data-size large -k speed" is recommended. 
+ if num_epoch == 0: + num_epoch = 2 + if num_repeat == 0: + num_repeat = 1 + tag = f"singlelabel-classification-speed-{data_size}" + command = fxt_build_command( + tag, + model_template, + datasets, + num_epoch, + num_repeat, + track_resources=True, # Measure CPU/GPU usages + ) check_run(command) From 8b5bfadf09d41b19fe46b28c5312735d9a8af8c2 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 15 Dec 2023 14:00:31 +0900 Subject: [PATCH 03/25] Apply datetime-based output directoy --- tests/perf/conftest.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 1c32a6f9019..803fb9c382b 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -7,6 +7,7 @@ import subprocess import yaml from typing import List +from datetime import datetime from otx.api.entities.model_template import ModelTemplate, ModelCategory @@ -88,12 +89,12 @@ def fxt_benchmark_config(request: pytest.FixtureRequest): pytest.skip(f"{data_size} datasets") num_epoch: int = request.param[1].get("num_epoch", 0) # 0: per-model default - num_epoch_override: int = request.config.getoption("--num-epoch") + num_epoch_override: int = int(request.config.getoption("--num-epoch")) if num_epoch_override > 0: num_epoch = num_epoch_override num_repeat: int = request.param[1].get("num_repeat", 1) - num_repeat_override: int = request.config.getoption("--num-repeat") + num_repeat_override: int = int(request.config.getoption("--num-repeat")) if num_repeat_override > 0: num_repeat = num_repeat_override @@ -114,6 +115,7 @@ def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_ data_root = os.path.abspath(data_root) output_dir = request.config.getoption("--output-dir") output_dir = os.path.abspath(output_dir + "-" + fxt_commit_hash) + output_dir = output_dir + "/" + datetime.now().strftime("%Y%m%d_%H%M%S") dry_run = request.config.getoption("--dry-run") def build_config( @@ -137,7 +139,7 @@ def build_config( cfg["repeat"] = num_repeat cfg["command"] = [] if num_epoch > 0: - params = params + f" --learning_pararmeters.num_iters {num_epoch}" + params = params + f" --learning_parameters.num_iters {num_epoch}" resource_param = "" if track_resources: resource_param = " --track-resource-usage all" @@ -194,8 +196,10 @@ def build_command( "tools/experiment.py", "-f", cfg_path, - "-d" if dry_run else "", ] + if dry_run: + cmd.append("-d") + return cmd return build_command From 4debeeb62443d0cb160865b27a962a41b49c9f12 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 15 Dec 2023 14:19:04 +0900 Subject: [PATCH 04/25] Fix choice options --- tests/perf/conftest.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 803fb9c382b..dc4514a3d6c 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -18,13 +18,15 @@ def pytest_addoption(parser): "--model-type", action="store", default="all", - help="Choose default|all. Defaults to all." + choices=("default", "all"), + help="Choose default|all. Defaults to all.", ) parser.addoption( "--data-size", action="store", default="all", - help="Choose small|medium|large|all. Defaults to all." + choices=("small", "medium", "large", "all"), + help="Choose small|medium|large|all. Defaults to all.", ) parser.addoption( "--num-repeat", @@ -45,6 +47,7 @@ def pytest_addoption(parser): "--eval-upto", action="store", default="all", + choices=("train", "export", "optimize"), help="Choose train|export|optimize. 
Defaults to train." ) parser.addoption( @@ -54,10 +57,10 @@ def pytest_addoption(parser): help="Dataset root directory." ) parser.addoption( - "--output-dir", + "--output-root", action="store", default="exp/perf", - help="Output directory to save outputs." + help="Output root directory." ) parser.addoption( "--dry-run", @@ -113,7 +116,7 @@ def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_ eval_upto = request.config.getoption("--eval-upto") data_root = request.config.getoption("--data-root") data_root = os.path.abspath(data_root) - output_dir = request.config.getoption("--output-dir") + output_dir = request.config.getoption("--output-root") output_dir = os.path.abspath(output_dir + "-" + fxt_commit_hash) output_dir = output_dir + "/" + datetime.now().strftime("%Y%m%d_%H%M%S") dry_run = request.config.getoption("--dry-run") From b9b35661fcf3944553fca4fdad1a46d3431d2ba9 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 15 Dec 2023 14:54:52 +0900 Subject: [PATCH 05/25] Exec based on model ID --- tests/perf/conftest.py | 20 ++++++++++---------- tests/perf/test_classification.py | 18 ++++++++---------- tools/experiment.py | 2 +- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index dc4514a3d6c..23272797196 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -71,14 +71,14 @@ def pytest_addoption(parser): @pytest.fixture -def fxt_template(request: pytest.FixtureRequest): - """Skip by model template.""" +def fxt_model_id(request: pytest.FixtureRequest): + """Skip by model category.""" model_type: str = request.config.getoption("--model-type") - template: ModelTemplate = request.param + model_template: ModelTemplate = request.param if model_type == "default": - if template.model_category == ModelCategory.OTHER: - pytest.skip(f"{template.model_category} model") - return template + if model_template.model_category == ModelCategory.OTHER: + pytest.skip(f"{model_template.model_category} category model") + return model_template.model_template_id @pytest.fixture @@ -123,7 +123,7 @@ def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_ def build_config( tag: str, - model_template: ModelTemplate, + model_id: str, datasets: List[str], num_epoch: int, num_repeat: int, @@ -136,7 +136,7 @@ def build_config( "dataroot": data_root, } cfg["variables"] = { - "model": [model_template.model_template_id], + "model": [model_id], "data": datasets, } cfg["repeat"] = num_repeat @@ -182,14 +182,14 @@ def build_config( def build_command( tag: str, - model_template: ModelTemplate, + model_id: str, datasets: List[str], num_epoch: int, num_repeat: int, track_resources: bool = False, params: str = "", ) -> List[str]: - cfg = build_config(tag, model_template, datasets, num_epoch, num_repeat, track_resources, params) + cfg = build_config(tag, model_id, datasets, num_epoch, num_repeat, track_resources, params) cfg_path = tmp_path_factory.mktemp("exp")/"cfg.yaml" print(cfg_path) with open(cfg_path, "w") as cfg_file: diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 21cd390644c..d350d18e9c1 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -10,8 +10,8 @@ from tests.test_suite.run_test_command import check_run -TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates -TEMPLATE_NAMES = [template.name for template in TEMPLATES] +MODEL_TEMPLATES = 
Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates +MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] class TestPerfSingleLabelClassification: @@ -38,27 +38,25 @@ class TestPerfSingleLabelClassification: }, } - @pytest.mark.parametrize("fxt_template", TEMPLATES, ids=TEMPLATE_NAMES, indirect=True) + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_accuarcy(self, fxt_template, fxt_benchmark_config, fxt_build_command): + def test_accuarcy(self, fxt_model_id, fxt_benchmark_config, fxt_build_command): """Benchmark accruacy metrics.""" - model_template = fxt_template data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config tag = f"singlelabel-classification-accuracy-{data_size}" command = fxt_build_command( tag, - model_template, + fxt_model_id, datasets, num_epoch, num_repeat, ) check_run(command) - @pytest.mark.parametrize("fxt_template", TEMPLATES, ids=TEMPLATE_NAMES, indirect=True) + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_template, fxt_benchmark_config, fxt_build_command): + def test_speed(self, fxt_model_id, fxt_benchmark_config, fxt_build_command): """Benchmark train time per iter / infer time per image.""" - model_template = fxt_template data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config # Override default iteration setting, in case there's no user input # "--data-size large -k speed" is recommended. @@ -69,7 +67,7 @@ def test_speed(self, fxt_template, fxt_benchmark_config, fxt_build_command): tag = f"singlelabel-classification-speed-{data_size}" command = fxt_build_command( tag, - model_template, + fxt_model_id, datasets, num_epoch, num_repeat, diff --git a/tools/experiment.py b/tools/experiment.py index 6d9a271e547..7b2d2745064 100644 --- a/tools/experiment.py +++ b/tools/experiment.py @@ -790,7 +790,7 @@ def run_experiment_recipe(recipe_file: Union[str, Path], dryrun: bool = False): """ exp_recipe = ExpRecipeParser(recipe_file) output_path = exp_recipe.output_path - output_path.mkdir(exist_ok=True) + output_path.mkdir(parents=True, exist_ok=True) current_dir = os.getcwd() os.chdir(output_path) From c733b25fbff48c3fcd6a3bc49757f45d1be3da42 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 19 Dec 2023 17:31:09 +0900 Subject: [PATCH 06/25] Refactor out Benchmark class --- tests/perf/conftest.py | 180 +++++++++++++++++------------- tests/perf/test_classification.py | 49 ++++---- 2 files changed, 128 insertions(+), 101 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 23272797196..d6f076f722a 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -6,6 +6,7 @@ import os import subprocess import yaml +from pathlib import Path from typing import List from datetime import datetime @@ -70,6 +71,12 @@ def pytest_addoption(parser): ) +@pytest.fixture +def fxt_commit_hash(): + """Short commit hash.""" + return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() + + @pytest.fixture def fxt_model_id(request: pytest.FixtureRequest): """Skip by model category.""" @@ -82,83 +89,136 @@ def fxt_model_id(request: pytest.FixtureRequest): @pytest.fixture -def fxt_benchmark_config(request: 
pytest.FixtureRequest): - """Override benchmark config.""" +def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str): + """Configure benchmark.""" + # Skip by dataset size data_size_option: str = request.config.getoption("--data-size") data_size: str = request.param[0] - datasets: List[str] = request.param[1]["datasets"] if data_size_option != "all": if data_size_option != data_size: pytest.skip(f"{data_size} datasets") - num_epoch: int = request.param[1].get("num_epoch", 0) # 0: per-model default + # Options + cfg: dict = request.param[1].copy() + num_epoch_override: int = int(request.config.getoption("--num-epoch")) - if num_epoch_override > 0: - num_epoch = num_epoch_override + if num_epoch_override > 0: # 0: use default + cfg["num_epoch"] = num_epoch_override - num_repeat: int = request.param[1].get("num_repeat", 1) num_repeat_override: int = int(request.config.getoption("--num-repeat")) - if num_repeat_override > 0: - num_repeat = num_repeat_override + if num_repeat_override > 0: # 0: use default + cfg["num_repeat"] = num_repeat_override + + cfg["eval_upto"] = request.config.getoption("--eval-upto") + cfg["data_root"] = request.config.getoption("--data-root") + output_root = request.config.getoption("--output-root") + output_dir = fxt_commit_hash + "-" + datetime.now().strftime("%Y%m%d_%H%M%S") + cfg["output_root"] = str(Path(output_root) / output_dir) + cfg["dry_run"] = request.config.getoption("--dry-run") + + tags = cfg.get("tags", {}) + tags["data_size"] = data_size + cfg["tags"] = tags + + # Create benchmark + benchmark = OTXBenchmark( + **cfg, + ) - return data_size, datasets, num_epoch, num_repeat + return benchmark -@pytest.fixture -def fxt_commit_hash(): - """Short commit hash in short form.""" - return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() +class OTXBenchmark: + def __init__( + self, + datasets: List[str], + data_root: str = "data", + num_epoch: int = 0, + num_repeat: int = 0, + train_params: dict = {}, + track_resources: bool = False, + eval_upto: str = "train", + output_root: str = "otx-benchmark", + dry_run: bool = False, + tags: dict = {}, + ): + self.datasets = datasets + self.data_root = data_root + self.num_epoch = num_epoch + self.num_repeat = num_repeat + self.train_params = train_params + self.track_resources = track_resources + self.eval_upto = eval_upto + self.output_root = output_root + self.dry_run = dry_run + self.tags = tags + def build_command( + self, + model_id: str, + train_params: dict = {}, + tags: dict = {}, + ) -> List[str]: + cfg = self._build_config(model_id, tags, train_params) + cfg_dir = Path(self.output_root) + cfg_dir.mkdir(parents=True, exist_ok=True) + cfg_path = cfg_dir / "cfg.yaml" + print(cfg_path) + with open(cfg_path, "w") as cfg_file: + yaml.dump(cfg, cfg_file, indent=2,) + cmd = [ + "python", + "tools/experiment.py", + "-f", + cfg_path, + ] + if self.dry_run: + cmd.append("-d") + return cmd -@pytest.fixture -def fxt_build_command(request: pytest.FixtureRequest, fxt_commit_hash: str, tmp_path_factory): - """Research framework command builder.""" - eval_upto = request.config.getoption("--eval-upto") - data_root = request.config.getoption("--data-root") - data_root = os.path.abspath(data_root) - output_dir = request.config.getoption("--output-root") - output_dir = os.path.abspath(output_dir + "-" + fxt_commit_hash) - output_dir = output_dir + "/" + datetime.now().strftime("%Y%m%d_%H%M%S") - dry_run = request.config.getoption("--dry-run") - - def build_config( - tag: str, + 
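# --- Editor's note (illustrative, not part of the patch) ---------------------
# Minimal usage sketch for OTXBenchmark as defined at this point in the series,
# assuming a valid model template id is substituted for the placeholder:
#
#     benchmark = OTXBenchmark(
#         datasets=["classification/single_label/multiclass_CUB_medium"],
#         data_root="data",
#         num_repeat=1,
#     )
#     cmd = benchmark.build_command(model_id="<MODEL_TEMPLATE_ID>", tags={"benchmark": "accuracy"})
#     # -> ["python", "tools/experiment.py", "-f", <output_root>/cfg.yaml]  (+ "-d" when dry_run)
#
# build_command() only serializes the experiment recipe to cfg.yaml and builds the
# tools/experiment.py invocation; the tests execute the returned command via check_run().
# ------------------------------------------------------------------------------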
def _build_config( + self, model_id: str, - datasets: List[str], - num_epoch: int, - num_repeat: int, - track_resources: bool = False, - params: str = "", + train_params: dict = {}, + tags: dict = {}, ) -> dict: + all_train_params = self.train_params.copy() + all_train_params.update(train_params) + all_tags = self.tags.copy() + all_tags.update(tags) + cfg = {} - cfg["output_path"] = output_dir + cfg["tags"] = all_tags # metadata + cfg["output_path"] = os.path.abspath(self.output_root) cfg["constants"] = { - "dataroot": data_root, + "dataroot": os.path.abspath(self.data_root), } cfg["variables"] = { "model": [model_id], - "data": datasets, + "data": self.datasets, + **{k: [v] for k, v in all_tags.items()}, # To be shown in result file } - cfg["repeat"] = num_repeat + cfg["repeat"] = self.num_repeat cfg["command"] = [] - if num_epoch > 0: - params = params + f" --learning_parameters.num_iters {num_epoch}" resource_param = "" - if track_resources: - resource_param = " --track-resource-usage all" + if self.track_resources: + resource_param = "--track-resource-usage all" + if self.num_epoch > 0: + all_train_params["learning_parameters.num_iters"] = self.num_epoch + params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()]) cfg["command"].append( "otx train ${model}" " --train-data-roots ${dataroot}/${data}" " --val-data-roots ${dataroot}/${data}" " --deterministic" - f"{resource_param}" - f" params {params}" + f" {resource_param}" + f" params {params_str}" ) cfg["command"].append( "otx eval" " --test-data-roots ${dataroot}/${data}" ) - if eval_upto == "train": + if self.eval_upto == "train": return cfg cfg["command"].append( @@ -168,7 +228,7 @@ def build_config( "otx eval" " --test-data-roots ${dataroot}/${data}" ) - if eval_upto == "export": + if self.eval_upto == "export": return cfg cfg["command"].append( @@ -179,35 +239,3 @@ def build_config( " --test-data-roots ${dataroot}/${data}" ) return cfg - - def build_command( - tag: str, - model_id: str, - datasets: List[str], - num_epoch: int, - num_repeat: int, - track_resources: bool = False, - params: str = "", - ) -> List[str]: - cfg = build_config(tag, model_id, datasets, num_epoch, num_repeat, track_resources, params) - cfg_path = tmp_path_factory.mktemp("exp")/"cfg.yaml" - print(cfg_path) - with open(cfg_path, "w") as cfg_file: - yaml.dump(cfg, cfg_file, indent=2,) - cmd = [ - "python", - "tools/experiment.py", - "-f", - cfg_path, - ] - if dry_run: - cmd.append("-d") - - return cmd - - return build_command - - -class OTXBenchmark: - def __init__(self): - pass diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index d350d18e9c1..78861d5a05d 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -17,6 +17,9 @@ class TestPerfSingleLabelClassification: BENCHMARK_CONFIGS = { "small": { + "tags": { + "task": "single-label-classification", + }, "datasets": [ "classification/single_label/multiclass_CUB_small/1", "classification/single_label/multiclass_CUB_small/2", @@ -25,12 +28,18 @@ class TestPerfSingleLabelClassification: "num_repeat": 3, }, "medium": { + "tags": { + "task": "single-label-classification", + }, "datasets": [ "classification/single_label/multiclass_CUB_medium", ], "num_repeat": 3, }, "large": { + "tags": { + "task": "single-label-classification", + }, "datasets": [ "classification/single_label/multiclass_food101_large", ], @@ -39,38 +48,28 @@ class TestPerfSingleLabelClassification: } @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, 
ids=MODEL_IDS, indirect=True) - @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_accuarcy(self, fxt_model_id, fxt_benchmark_config, fxt_build_command): + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuarcy(self, fxt_model_id, fxt_benchmark): """Benchmark accruacy metrics.""" - data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config - tag = f"singlelabel-classification-accuracy-{data_size}" - command = fxt_build_command( - tag, - fxt_model_id, - datasets, - num_epoch, - num_repeat, + command = fxt_benchmark.build_command( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, ) check_run(command) @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) - @pytest.mark.parametrize("fxt_benchmark_config", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id, fxt_benchmark_config, fxt_build_command): + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id, fxt_benchmark): """Benchmark train time per iter / infer time per image.""" - data_size, datasets, num_epoch, num_repeat = fxt_benchmark_config # Override default iteration setting, in case there's no user input # "--data-size large -k speed" is recommended. - if num_epoch == 0: - num_epoch = 2 - if num_repeat == 0: - num_repeat = 1 - tag = f"singlelabel-classification-speed-{data_size}" - command = fxt_build_command( - tag, - fxt_model_id, - datasets, - num_epoch, - num_repeat, - track_resources=True, # Measure CPU/GPU usages + if fxt_benchmark.num_epoch == 0: + fxt_benchmark.num_epoch = 2 + if fxt_benchmark.num_repeat == 0: + fxt_benchmark.num_repeat = 1 + fxt_benchmark.track_resources = True + command = fxt_benchmark.build_command( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, ) check_run(command) From 3996eb376d79eec2622468b2eb0e21060c4672d7 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 19 Dec 2023 17:59:09 +0900 Subject: [PATCH 07/25] Automate speed test setting --- tests/perf/conftest.py | 5 ++++- tests/perf/test_classification.py | 6 ------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index d6f076f722a..6495ba32fb9 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -104,6 +104,9 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str): num_epoch_override: int = int(request.config.getoption("--num-epoch")) if num_epoch_override > 0: # 0: use default cfg["num_epoch"] = num_epoch_override + if "test_speed" in request.node.name: + if cfg.get("num_epoch", 0) == 0: # No user options + cfg["num_epoch"] = 2 num_repeat_override: int = int(request.config.getoption("--num-repeat")) if num_repeat_override > 0: # 0: use default @@ -159,7 +162,7 @@ def build_command( train_params: dict = {}, tags: dict = {}, ) -> List[str]: - cfg = self._build_config(model_id, tags, train_params) + cfg = self._build_config(model_id, train_params, tags) cfg_dir = Path(self.output_root) cfg_dir.mkdir(parents=True, exist_ok=True) cfg_path = cfg_dir / "cfg.yaml" diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 78861d5a05d..88ca0767cd1 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -61,12 +61,6 @@ def 
test_accuarcy(self, fxt_model_id, fxt_benchmark): @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) def test_speed(self, fxt_model_id, fxt_benchmark): """Benchmark train time per iter / infer time per image.""" - # Override default iteration setting, in case there's no user input - # "--data-size large -k speed" is recommended. - if fxt_benchmark.num_epoch == 0: - fxt_benchmark.num_epoch = 2 - if fxt_benchmark.num_repeat == 0: - fxt_benchmark.num_repeat = 1 fxt_benchmark.track_resources = True command = fxt_benchmark.build_command( model_id=fxt_model_id, From 9d0e831eb06cdc588481498b6f27aab6a9d6f8ed Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Wed, 20 Dec 2023 09:50:22 +0900 Subject: [PATCH 08/25] Refacor OTXBenchmark --- tests/perf/benchmark.py | 127 ++++++++++++++++++++++++++++++ tests/perf/conftest.py | 120 +--------------------------- tests/perf/test_classification.py | 12 ++- 3 files changed, 136 insertions(+), 123 deletions(-) create mode 100644 tests/perf/benchmark.py diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py new file mode 100644 index 00000000000..b7326064d85 --- /dev/null +++ b/tests/perf/benchmark.py @@ -0,0 +1,127 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import os +import yaml +from pathlib import Path +from typing import List + +from tests.test_suite.run_test_command import check_run + + +class OTXBenchmark: + def __init__( + self, + datasets: List[str], + data_root: str = "data", + num_epoch: int = 0, + num_repeat: int = 0, + train_params: dict = {}, + track_resources: bool = False, + eval_upto: str = "train", + output_root: str = "otx-benchmark", + dry_run: bool = False, + tags: dict = {}, + ): + self.datasets = datasets + self.data_root = data_root + self.num_epoch = num_epoch + self.num_repeat = num_repeat + self.train_params = train_params + self.track_resources = track_resources + self.eval_upto = eval_upto + self.output_root = output_root + self.dry_run = dry_run + self.tags = tags + + def run( + self, + model_id: str, + train_params: dict = {}, + tags: dict = {}, + ) -> List[str]: + # Build config file + cfg = self._build_config(model_id, train_params, tags) + cfg_dir = Path(self.output_root) + cfg_dir.mkdir(parents=True, exist_ok=True) + cfg_path = cfg_dir / "cfg.yaml" + with open(cfg_path, "w") as cfg_file: + yaml.dump(cfg, cfg_file, indent=2,) + cmd = [ + "python", + "tools/experiment.py", + "-f", + cfg_path, + ] + if self.dry_run: + cmd.append("-d") + # Run benchmark + check_run(cmd) + # Load result + result = None + return result + + def _build_config( + self, + model_id: str, + train_params: dict = {}, + tags: dict = {}, + ) -> dict: + all_train_params = self.train_params.copy() + all_train_params.update(train_params) + all_tags = self.tags.copy() + all_tags.update(tags) + + cfg = {} + cfg["tags"] = all_tags # metadata + cfg["output_path"] = os.path.abspath(self.output_root) + cfg["constants"] = { + "dataroot": os.path.abspath(self.data_root), + } + cfg["variables"] = { + "model": [model_id], + "data": self.datasets, + **{k: [v] for k, v in all_tags.items()}, # To be shown in result file + } + cfg["repeat"] = self.num_repeat + cfg["command"] = [] + resource_param = "" + if self.track_resources: + resource_param = "--track-resource-usage all" + if self.num_epoch > 0: + all_train_params["learning_parameters.num_iters"] = self.num_epoch + params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()]) + 
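# --- Editor's note (illustrative, not part of the patch) ---------------------
# Worked example of the parameter string built just above: with
#     all_train_params == {"learning_parameters.num_iters": 2}
# the join produces
#     params_str == "--learning_parameters.num_iters 2"
# so the templated train command below expands to something like
#     otx train ${model} --train-data-roots ${dataroot}/${data}
#         --val-data-roots ${dataroot}/${data} --deterministic
#         --track-resource-usage all params --learning_parameters.num_iters 2
# (the --track-resource-usage part only appears when track_resources is True).
# ------------------------------------------------------------------------------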
cfg["command"].append( + "otx train ${model}" + " --train-data-roots ${dataroot}/${data}" + " --val-data-roots ${dataroot}/${data}" + " --deterministic" + f" {resource_param}" + f" params {params_str}" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + if self.eval_upto == "train": + return cfg + + cfg["command"].append( + "otx export" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + if self.eval_upto == "export": + return cfg + + cfg["command"].append( + "otx optimize" + ) + cfg["command"].append( + "otx eval" + " --test-data-roots ${dataroot}/${data}" + ) + return cfg diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 6495ba32fb9..00c2f6c0a1c 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -11,6 +11,7 @@ from datetime import datetime from otx.api.entities.model_template import ModelTemplate, ModelCategory +from .benchmark import OTXBenchmark def pytest_addoption(parser): @@ -72,13 +73,13 @@ def pytest_addoption(parser): @pytest.fixture -def fxt_commit_hash(): +def fxt_commit_hash() -> str: """Short commit hash.""" return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() @pytest.fixture -def fxt_model_id(request: pytest.FixtureRequest): +def fxt_model_id(request: pytest.FixtureRequest) -> str: """Skip by model category.""" model_type: str = request.config.getoption("--model-type") model_template: ModelTemplate = request.param @@ -89,7 +90,7 @@ def fxt_model_id(request: pytest.FixtureRequest): @pytest.fixture -def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str): +def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str) -> OTXBenchmark: """Configure benchmark.""" # Skip by dataset size data_size_option: str = request.config.getoption("--data-size") @@ -129,116 +130,3 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str): ) return benchmark - - -class OTXBenchmark: - def __init__( - self, - datasets: List[str], - data_root: str = "data", - num_epoch: int = 0, - num_repeat: int = 0, - train_params: dict = {}, - track_resources: bool = False, - eval_upto: str = "train", - output_root: str = "otx-benchmark", - dry_run: bool = False, - tags: dict = {}, - ): - self.datasets = datasets - self.data_root = data_root - self.num_epoch = num_epoch - self.num_repeat = num_repeat - self.train_params = train_params - self.track_resources = track_resources - self.eval_upto = eval_upto - self.output_root = output_root - self.dry_run = dry_run - self.tags = tags - - def build_command( - self, - model_id: str, - train_params: dict = {}, - tags: dict = {}, - ) -> List[str]: - cfg = self._build_config(model_id, train_params, tags) - cfg_dir = Path(self.output_root) - cfg_dir.mkdir(parents=True, exist_ok=True) - cfg_path = cfg_dir / "cfg.yaml" - print(cfg_path) - with open(cfg_path, "w") as cfg_file: - yaml.dump(cfg, cfg_file, indent=2,) - cmd = [ - "python", - "tools/experiment.py", - "-f", - cfg_path, - ] - if self.dry_run: - cmd.append("-d") - return cmd - - def _build_config( - self, - model_id: str, - train_params: dict = {}, - tags: dict = {}, - ) -> dict: - all_train_params = self.train_params.copy() - all_train_params.update(train_params) - all_tags = self.tags.copy() - all_tags.update(tags) - - cfg = {} - cfg["tags"] = all_tags # metadata - cfg["output_path"] = os.path.abspath(self.output_root) - cfg["constants"] = { - "dataroot": os.path.abspath(self.data_root), - } - cfg["variables"] = { - 
"model": [model_id], - "data": self.datasets, - **{k: [v] for k, v in all_tags.items()}, # To be shown in result file - } - cfg["repeat"] = self.num_repeat - cfg["command"] = [] - resource_param = "" - if self.track_resources: - resource_param = "--track-resource-usage all" - if self.num_epoch > 0: - all_train_params["learning_parameters.num_iters"] = self.num_epoch - params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()]) - cfg["command"].append( - "otx train ${model}" - " --train-data-roots ${dataroot}/${data}" - " --val-data-roots ${dataroot}/${data}" - " --deterministic" - f" {resource_param}" - f" params {params_str}" - ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) - if self.eval_upto == "train": - return cfg - - cfg["command"].append( - "otx export" - ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) - if self.eval_upto == "export": - return cfg - - cfg["command"].append( - "otx optimize" - ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) - return cfg diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 88ca0767cd1..4bc5ef6d4a8 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -7,7 +7,7 @@ import pytest from otx.cli.registry import Registry -from tests.test_suite.run_test_command import check_run +from .benchmark import OTXBenchmark MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="CLASSIFICATION").templates @@ -49,21 +49,19 @@ class TestPerfSingleLabelClassification: @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_accuarcy(self, fxt_model_id, fxt_benchmark): + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): """Benchmark accruacy metrics.""" - command = fxt_benchmark.build_command( + result = fxt_benchmark.run( model_id=fxt_model_id, tags={"benchmark": "accuracy"}, ) - check_run(command) @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) - def test_speed(self, fxt_model_id, fxt_benchmark): + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): """Benchmark train time per iter / infer time per image.""" fxt_benchmark.track_resources = True - command = fxt_benchmark.build_command( + result = fxt_benchmark.run( model_id=fxt_model_id, tags={"benchmark": "speed"}, ) - check_run(command) From 1abbca828bab3fde39e921dd5088b8224afb55ca Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Wed, 20 Dec 2023 10:43:50 +0900 Subject: [PATCH 09/25] Add API doc for OTXBenchmark --- tests/perf/benchmark.py | 56 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index b7326064d85..14f055cc80e 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -3,6 +3,7 @@ import os +import pandas as pd import yaml from pathlib import Path from typing import List @@ -11,12 +12,38 @@ class OTXBenchmark: + """Benchmark runner based on tools/experiment.py in OTX1.x. 
+ + Example: + >>> bm = OTXBenchmark(['random_sample1', 'random_sample'], data_root='./data/coco') + >>> atss_result = bm.run('MobileNetV2-ATSS') + >>> yolox_result = bm.run('YOLOX-TINY') + + Args: + datasets (List[str]): Paths to datasets relative to the data_root. + Intended for, but not restricted to different sampling based on same dataset. + data_root (str): Path to the root of dataset directories. Defaults to './data'. + num_epoch (int): Overrides the per-model default number of epoch settings. + Defaults to 0, which means no overriding. + num_repeat (int): Number for trials with different random seed, which would be set + as range(0, num_repeat). Defaults to 1. + train_params (dict): Additional training parameters. + e.x) {'learning_parameters.num_iters': 2}. Defaults to {}. + track_resources (bool): Whether to track CPU & GPU usage metrics. Defaults to False. + eval_upto (str): The last serial operation to evaluate. Choose on of ('train', 'export', 'optimize'). + Operations include the preceeding ones. + e.x) Eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval + Default to 'train'. + output_root (str): Output path for logs and results. Defaults to './otx-benchmark'. + dry_run (bool): Whether to just print the OTX command without execution. Defaults to False. + tags (dict): Key-values pair metadata for the experiment. Defaults to {}. + """ def __init__( self, datasets: List[str], data_root: str = "data", num_epoch: int = 0, - num_repeat: int = 0, + num_repeat: int = 1, train_params: dict = {}, track_resources: bool = False, eval_upto: str = "train", @@ -40,7 +67,18 @@ def run( model_id: str, train_params: dict = {}, tags: dict = {}, - ) -> List[str]: + ) -> pd.DataFrame + """Run benchmark and return the result. + + Args: + model_id (str): Target model identifier + train_params (dict): Overrides global benchmark train params + tags (dict): Overrides global benchmark tags + + Retruns: + pd.DataFrame: Table with benchmark metrics + """ + # Build config file cfg = self._build_config(model_id, train_params, tags) cfg_dir = Path(self.output_root) @@ -59,9 +97,21 @@ def run( # Run benchmark check_run(cmd) # Load result - result = None + result = self.load_result() return result + def load_result(self, result_path: str = None) -> pd.DataFrame: + """Load result as pd.DataFrame format. + + Args: + result_path (str): Result directory or speicific file. + Defaults to None to search the benchmark output root. + + Retruns: + pd.DataFrame: Table with benchmark metrics + """ + return None + def _build_config( self, model_id: str, From 752928b13d2e67db44e232e896c0bf25a8c9e8e2 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Wed, 20 Dec 2023 11:23:52 +0900 Subject: [PATCH 10/25] Add csv loading --- tests/perf/benchmark.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 14f055cc80e..ecbc1c51978 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -6,7 +6,7 @@ import pandas as pd import yaml from pathlib import Path -from typing import List +from typing import List, Optional from tests.test_suite.run_test_command import check_run @@ -67,7 +67,7 @@ def run( model_id: str, train_params: dict = {}, tags: dict = {}, - ) -> pd.DataFrame + ) -> pd.DataFrame: """Run benchmark and return the result. 
Args: @@ -100,7 +100,7 @@ def run( result = self.load_result() return result - def load_result(self, result_path: str = None) -> pd.DataFrame: + def load_result(self, result_path: Optional[str] = None) -> pd.DataFrame: """Load result as pd.DataFrame format. Args: @@ -110,7 +110,11 @@ def load_result(self, result_path: str = None) -> pd.DataFrame: Retruns: pd.DataFrame: Table with benchmark metrics """ - return None + if result_path is None: + csv_file_path = Path(self.output_root) / "exp_summary.csv" + elif os.path.isdir(result_path): + csv_file_path = Path(result_path) / "exp_summary.csv" + return pd.read_csv(csv_file_path) def _build_config( self, From 046a88291ce1632482e17691f3c327ac352498cd Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Wed, 20 Dec 2023 13:44:45 +0900 Subject: [PATCH 11/25] Add tags to benchmark result --- tests/perf/benchmark.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index ecbc1c51978..4cfff421d2d 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -110,11 +110,22 @@ def load_result(self, result_path: Optional[str] = None) -> pd.DataFrame: Retruns: pd.DataFrame: Table with benchmark metrics """ + # Load csv data if result_path is None: csv_file_path = Path(self.output_root) / "exp_summary.csv" elif os.path.isdir(result_path): csv_file_path = Path(result_path) / "exp_summary.csv" - return pd.read_csv(csv_file_path) + result = pd.read_csv(csv_file_path) + + # Append metadata if any + cfg_file_path: Path = csv_file_path.parent / "cfg.yaml" + if cfg_file_path.exists(): + with cfg_file_path.open("r") as cfg_file: + tags = yaml.safe_load(cfg_file).get("tags", {}) + for k, v in tags.items(): + result[k] = v + + return result def _build_config( self, From c5ff73e77b95d99dbf683c0a6c2cfe0e4243cf5a Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Wed, 20 Dec 2023 18:33:12 +0900 Subject: [PATCH 12/25] Add benchmark summary fixture --- tests/perf/benchmark.py | 53 +++++++++++++++++++++++------------------ tests/perf/conftest.py | 37 ++++++++++++++++++---------- 2 files changed, 55 insertions(+), 35 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 4cfff421d2d..7bb3b41087e 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -3,6 +3,7 @@ import os +import glob import pandas as pd import yaml from pathlib import Path @@ -34,7 +35,7 @@ class OTXBenchmark: Operations include the preceeding ones. e.x) Eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval Default to 'train'. - output_root (str): Output path for logs and results. Defaults to './otx-benchmark'. + output_root (str): Output root dirctory for logs and results. Defaults to './otx-benchmark'. dry_run (bool): Whether to just print the OTX command without execution. Defaults to False. tags (dict): Key-values pair metadata for the experiment. Defaults to {}. """ @@ -81,7 +82,7 @@ def run( # Build config file cfg = self._build_config(model_id, train_params, tags) - cfg_dir = Path(self.output_root) + cfg_dir = Path(cfg["output_path"]) cfg_dir.mkdir(parents=True, exist_ok=True) cfg_path = cfg_dir / "cfg.yaml" with open(cfg_path, "w") as cfg_file: @@ -97,35 +98,39 @@ def run( # Run benchmark check_run(cmd) # Load result - result = self.load_result() + result = self.load_result(cfg_dir) return result - def load_result(self, result_path: Optional[str] = None) -> pd.DataFrame: - """Load result as pd.DataFrame format. 
+ @staticmethod + def load_result(result_path: str) -> pd.DataFrame: + """Load benchmark results recursively and merge as pd.DataFrame. Args: result_path (str): Result directory or speicific file. - Defaults to None to search the benchmark output root. Retruns: - pd.DataFrame: Table with benchmark metrics + pd.DataFrame: Table with benchmark metrics & options """ + # Search csv files + if os.path.isdir(result_path): + csv_file_paths = glob.glob(f"{result_path}/**/exp_summary.csv", recursive=True) + else: + csv_file_paths = [result_path] + results = [] # Load csv data - if result_path is None: - csv_file_path = Path(self.output_root) / "exp_summary.csv" - elif os.path.isdir(result_path): - csv_file_path = Path(result_path) / "exp_summary.csv" - result = pd.read_csv(csv_file_path) - - # Append metadata if any - cfg_file_path: Path = csv_file_path.parent / "cfg.yaml" - if cfg_file_path.exists(): - with cfg_file_path.open("r") as cfg_file: - tags = yaml.safe_load(cfg_file).get("tags", {}) - for k, v in tags.items(): - result[k] = v - - return result + for csv_file_path in csv_file_paths: + result = pd.read_csv(csv_file_path) + # Append metadata if any + cfg_file_path = Path(csv_file_path).parent / "cfg.yaml" + if cfg_file_path.exists(): + with cfg_file_path.open("r") as cfg_file: + tags = yaml.safe_load(cfg_file).get("tags", {}) + for k, v in tags.items(): + result[k] = v + results.append(result) + if len(results) > 0: + results = pd.concat(results, ignore_index=True) + return results def _build_config( self, @@ -140,7 +145,9 @@ def _build_config( cfg = {} cfg["tags"] = all_tags # metadata - cfg["output_path"] = os.path.abspath(self.output_root) + cfg["output_path"] = os.path.abspath( + Path(self.output_root) / "-".join(list(all_tags.values()) + [model_id]) + ) cfg["constants"] = { "dataroot": os.path.abspath(self.data_root), } diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 00c2f6c0a1c..a505269c221 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -72,10 +72,13 @@ def pytest_addoption(parser): ) -@pytest.fixture -def fxt_commit_hash() -> str: - """Short commit hash.""" - return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() +@pytest.fixture(scope="session") +def fxt_output_root(request: pytest.FixtureRequest) -> Path: + """Output root + date + short commit hash.""" + output_root = request.config.getoption("--output-root") + data_str = datetime.now().strftime("%Y%m%d-%H%M%S") + commit_str = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() + return Path(output_root) / (data_str + "-" + commit_str) @pytest.fixture @@ -90,7 +93,7 @@ def fxt_model_id(request: pytest.FixtureRequest) -> str: @pytest.fixture -def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str) -> OTXBenchmark: +def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXBenchmark: """Configure benchmark.""" # Skip by dataset size data_size_option: str = request.config.getoption("--data-size") @@ -102,6 +105,10 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str) -> OTXBe # Options cfg: dict = request.param[1].copy() + tags = cfg.get("tags", {}) + tags["data_size"] = data_size + cfg["tags"] = tags + num_epoch_override: int = int(request.config.getoption("--num-epoch")) if num_epoch_override > 0: # 0: use default cfg["num_epoch"] = num_epoch_override @@ -115,18 +122,24 @@ def fxt_benchmark(request: pytest.FixtureRequest, fxt_commit_hash: str) -> OTXBe 
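# --- Editor's note (illustrative, not part of the patch) ---------------------
# With this patch the output layout becomes hierarchical: fxt_output_root
# combines --output-root with a timestamp and short git hash, e.g.
#     exp/perf/20231215-140031-1abbca8/
# and each OTXBenchmark run then writes its cfg.yaml (and the exp_summary.csv
# files that load_result() later collects) underneath it; the session-scoped
# fxt_benchmark_summary fixture added below merges everything found there into
# benchmark-summary.csv at the end of the test session.
# ------------------------------------------------------------------------------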
cfg["eval_upto"] = request.config.getoption("--eval-upto") cfg["data_root"] = request.config.getoption("--data-root") - output_root = request.config.getoption("--output-root") - output_dir = fxt_commit_hash + "-" + datetime.now().strftime("%Y%m%d_%H%M%S") - cfg["output_root"] = str(Path(output_root) / output_dir) + cfg["output_root"] = str(fxt_output_root) cfg["dry_run"] = request.config.getoption("--dry-run") - tags = cfg.get("tags", {}) - tags["data_size"] = data_size - cfg["tags"] = tags - # Create benchmark benchmark = OTXBenchmark( **cfg, ) return benchmark + + +@pytest.fixture(scope="session", autouse=True) +def fxt_benchmark_summary(fxt_output_root: Path): + """Summarize all results at the end of test session.""" + yield + all_results = OTXBenchmark.load_result(fxt_output_root) + print("="*20, "[Benchmark summary]") + print(all_results) + output_path = fxt_output_root / "benchmark-summary.csv" + all_results.to_csv(output_path, index=False) + print(f" -> Saved to {output_path}.") From ed553dfa25e086026a151b9ee3430f5d2e995d01 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 10:10:56 +0900 Subject: [PATCH 13/25] Add multi/h-label tests --- tests/perf/benchmark.py | 10 ++- tests/perf/conftest.py | 11 ++-- tests/perf/test_classification.py | 106 ++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 8 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 7bb3b41087e..ce8c720d7ab 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -1,3 +1,5 @@ +"""OTX Benchmark based on tools/experiment.py.""" + # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -102,7 +104,7 @@ def run( return result @staticmethod - def load_result(result_path: str) -> pd.DataFrame: + def load_result(result_path: str) -> pd.DataFrame | None: """Load benchmark results recursively and merge as pd.DataFrame. 
Args: @@ -129,8 +131,9 @@ def load_result(result_path: str) -> pd.DataFrame: result[k] = v results.append(result) if len(results) > 0: - results = pd.concat(results, ignore_index=True) - return results + return pd.concat(results, ignore_index=True) + else: + return None def _build_config( self, @@ -138,6 +141,7 @@ def _build_config( train_params: dict = {}, tags: dict = {}, ) -> dict: + """Build config for tools/expeirment.py.""" all_train_params = self.train_params.copy() all_train_params.update(train_params) all_tags = self.tags.copy() diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index a505269c221..88bd53b1d67 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -138,8 +138,9 @@ def fxt_benchmark_summary(fxt_output_root: Path): """Summarize all results at the end of test session.""" yield all_results = OTXBenchmark.load_result(fxt_output_root) - print("="*20, "[Benchmark summary]") - print(all_results) - output_path = fxt_output_root / "benchmark-summary.csv" - all_results.to_csv(output_path, index=False) - print(f" -> Saved to {output_path}.") + if all_results: + print("="*20, "[Benchmark summary]") + print(all_results) + output_path = fxt_output_root / "benchmark-summary.csv" + all_results.to_csv(output_path, index=False) + print(f" -> Saved to {output_path}.") diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index 4bc5ef6d4a8..ba2ccc49e40 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -65,3 +65,109 @@ def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): model_id=fxt_model_id, tags={"benchmark": "speed"}, ) + + +class TestPerfMultiLabelClassification: + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "multi-label-classification", + }, + "datasets": [ + "classification/multi_label/multilabel_CUB_small/1", + "classification/multi_label/multilabel_CUB_small/2", + "classification/multi_label/multilabel_CUB_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "multi-label-classification", + }, + "datasets": [ + "classification/multi_label/multilabel_CUB_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "multi-label-classification", + }, + "datasets": [ + "classification/multi_label/multilabel_food101_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) + + +class TestPerfHierarchicalLabelClassification: + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "h-label-classification", + }, + "datasets": [ + "classification/h_label/h_label_CUB_small/1", + "classification/h_label/h_label_CUB_small/2", + "classification/h_label/h_label_CUB_small/3", + ], + 
"num_repeat": 3, + }, + "medium": { + "tags": { + "task": "h-label-classification", + }, + "datasets": [ + "classification/h_label/h_label_CUB_medium", + ], + "num_repeat": 3, + }, + # TODO: Add large dataset + # "large": { + # "tags": { + # "task": "h-label-classification", + # }, + # "datasets": [ + # ], + # "num_repeat": 1, + # }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) From d6609f24b77cb5c9deb64e566bacd6106c7d4293 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 10:16:01 +0900 Subject: [PATCH 14/25] Fix pre-commit --- tests/perf/benchmark.py | 30 ++++++++---------------------- tests/perf/conftest.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index ce8c720d7ab..169b2b01072 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -41,6 +41,7 @@ class OTXBenchmark: dry_run (bool): Whether to just print the OTX command without execution. Defaults to False. tags (dict): Key-values pair metadata for the experiment. Defaults to {}. 
""" + def __init__( self, datasets: List[str], @@ -88,7 +89,7 @@ def run( cfg_dir.mkdir(parents=True, exist_ok=True) cfg_path = cfg_dir / "cfg.yaml" with open(cfg_path, "w") as cfg_file: - yaml.dump(cfg, cfg_file, indent=2,) + yaml.dump(cfg, cfg_file, indent=2) cmd = [ "python", "tools/experiment.py", @@ -149,9 +150,7 @@ def _build_config( cfg = {} cfg["tags"] = all_tags # metadata - cfg["output_path"] = os.path.abspath( - Path(self.output_root) / "-".join(list(all_tags.values()) + [model_id]) - ) + cfg["output_path"] = os.path.abspath(Path(self.output_root) / "-".join(list(all_tags.values()) + [model_id])) cfg["constants"] = { "dataroot": os.path.abspath(self.data_root), } @@ -176,28 +175,15 @@ def _build_config( f" {resource_param}" f" params {params_str}" ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") if self.eval_upto == "train": return cfg - cfg["command"].append( - "otx export" - ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) + cfg["command"].append("otx export") + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") if self.eval_upto == "export": return cfg - cfg["command"].append( - "otx optimize" - ) - cfg["command"].append( - "otx eval" - " --test-data-roots ${dataroot}/${data}" - ) + cfg["command"].append("otx optimize") + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") return cfg diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 88bd53b1d67..5a16751842d 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -36,39 +36,39 @@ def pytest_addoption(parser): default=0, help="Overrides default per-data-size number of repeat setting. " "Random seeds are set to 0 ~ num_repeat-1 for the trials. " - "Defaults to 0 (small=3, medium=3, large=1)." + "Defaults to 0 (small=3, medium=3, large=1).", ) parser.addoption( "--num-epoch", action="store", default=0, help="Overrides default per-model number of epoch setting. " - "Defaults to 0 (per-model epoch & early-stopping)." + "Defaults to 0 (per-model epoch & early-stopping).", ) parser.addoption( "--eval-upto", action="store", default="all", choices=("train", "export", "optimize"), - help="Choose train|export|optimize. Defaults to train." + help="Choose train|export|optimize. Defaults to train.", ) parser.addoption( "--data-root", action="store", default="data", - help="Dataset root directory." + help="Dataset root directory.", ) parser.addoption( "--output-root", action="store", default="exp/perf", - help="Output root directory." + help="Output root directory.", ) parser.addoption( "--dry-run", action="store_true", default=False, - help="Print OTX commands without execution." 
+ help="Print OTX commands without execution.", ) @@ -77,7 +77,7 @@ def fxt_output_root(request: pytest.FixtureRequest) -> Path: """Output root + date + short commit hash.""" output_root = request.config.getoption("--output-root") data_str = datetime.now().strftime("%Y%m%d-%H%M%S") - commit_str = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() + commit_str = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ascii").strip() return Path(output_root) / (data_str + "-" + commit_str) @@ -139,7 +139,7 @@ def fxt_benchmark_summary(fxt_output_root: Path): yield all_results = OTXBenchmark.load_result(fxt_output_root) if all_results: - print("="*20, "[Benchmark summary]") + print("=" * 20, "[Benchmark summary]") print(all_results) output_path = fxt_output_root / "benchmark-summary.csv" all_results.to_csv(output_path, index=False) From 519de9d5497b9fa43d35bd893728ceee379bf8fe Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 10:54:20 +0900 Subject: [PATCH 15/25] Add detection tests --- tests/perf/benchmark.py | 3 +- tests/perf/test_classification.py | 2 +- tests/perf/test_detection.py | 68 +++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 tests/perf/test_detection.py diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 169b2b01072..853e7ee33eb 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -72,7 +72,7 @@ def run( train_params: dict = {}, tags: dict = {}, ) -> pd.DataFrame: - """Run benchmark and return the result. + """Run configured benchmark with given model and return the result. Args: model_id (str): Target model identifier @@ -157,7 +157,6 @@ def _build_config( cfg["variables"] = { "model": [model_id], "data": self.datasets, - **{k: [v] for k, v in all_tags.items()}, # To be shown in result file } cfg["repeat"] = self.num_repeat cfg["command"] = [] diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index ba2ccc49e40..eae5a982835 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -1,4 +1,4 @@ -"""OTX Classification Perfomance tests.""" +"""OTX Classification perfomance tests.""" # Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py new file mode 100644 index 00000000000..45235f00ed2 --- /dev/null +++ b/tests/perf/test_detection.py @@ -0,0 +1,68 @@ +"""OTX Detection perfomance tests.""" + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest + +from otx.cli.registry import Registry +from .benchmark import OTXBenchmark + + +MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="DETECTION").templates +MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + +class TestPerfDetection: + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "detection", + }, + "datasets": [ + "detection/pothole_small/1", + "detection/pothole_small/2", + "detection/pothole_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "detection", + }, + "datasets": [ + "detection/pothole_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "detection", + }, + "datasets": [ + "detection/vitens_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", 
BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) + From e447a3c8229cf27b03f31bdd67b69057d2759e2f Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 13:46:56 +0900 Subject: [PATCH 16/25] Add instance segmentationt tests --- tests/perf/test_classification.py | 24 +++++--- tests/perf/test_detection.py | 3 +- tests/perf/test_instance_segmenatation.py | 69 +++++++++++++++++++++++ 3 files changed, 86 insertions(+), 10 deletions(-) create mode 100644 tests/perf/test_instance_segmenatation.py diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py index eae5a982835..15a9b4dd133 100644 --- a/tests/perf/test_classification.py +++ b/tests/perf/test_classification.py @@ -15,10 +15,12 @@ class TestPerfSingleLabelClassification: + """Benchmark single-label classification.""" + BENCHMARK_CONFIGS = { "small": { "tags": { - "task": "single-label-classification", + "task": "single_label_classification", }, "datasets": [ "classification/single_label/multiclass_CUB_small/1", @@ -29,7 +31,7 @@ class TestPerfSingleLabelClassification: }, "medium": { "tags": { - "task": "single-label-classification", + "task": "single_label_classification", }, "datasets": [ "classification/single_label/multiclass_CUB_medium", @@ -38,7 +40,7 @@ class TestPerfSingleLabelClassification: }, "large": { "tags": { - "task": "single-label-classification", + "task": "single_label_classification", }, "datasets": [ "classification/single_label/multiclass_food101_large", @@ -68,10 +70,12 @@ def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): class TestPerfMultiLabelClassification: + """Benchmark multi-label classification.""" + BENCHMARK_CONFIGS = { "small": { "tags": { - "task": "multi-label-classification", + "task": "multi_label_classification", }, "datasets": [ "classification/multi_label/multilabel_CUB_small/1", @@ -82,7 +86,7 @@ class TestPerfMultiLabelClassification: }, "medium": { "tags": { - "task": "multi-label-classification", + "task": "multi_label_classification", }, "datasets": [ "classification/multi_label/multilabel_CUB_medium", @@ -91,7 +95,7 @@ class TestPerfMultiLabelClassification: }, "large": { "tags": { - "task": "multi-label-classification", + "task": "multi_label_classification", }, "datasets": [ "classification/multi_label/multilabel_food101_large", @@ -121,10 +125,12 @@ def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): class TestPerfHierarchicalLabelClassification: + """Benchmark hierarchcial-label classification.""" + BENCHMARK_CONFIGS = { "small": { "tags": { - "task": "h-label-classification", + "task": "hierarchical_label_classification", }, "datasets": [ "classification/h_label/h_label_CUB_small/1", @@ -135,7 +141,7 @@ class TestPerfHierarchicalLabelClassification: }, "medium": { "tags": { - "task": "h-label-classification", + "task": 
"hierarchical_label_classification", }, "datasets": [ "classification/h_label/h_label_CUB_medium", @@ -145,7 +151,7 @@ class TestPerfHierarchicalLabelClassification: # TODO: Add large dataset # "large": { # "tags": { - # "task": "h-label-classification", + # "task": "hierarchical_label_classification", # }, # "datasets": [ # ], diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py index 45235f00ed2..81ed71c0bac 100644 --- a/tests/perf/test_detection.py +++ b/tests/perf/test_detection.py @@ -15,6 +15,8 @@ class TestPerfDetection: + """Benchmark basic object detection.""" + BENCHMARK_CONFIGS = { "small": { "tags": { @@ -65,4 +67,3 @@ def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): model_id=fxt_model_id, tags={"benchmark": "speed"}, ) - diff --git a/tests/perf/test_instance_segmenatation.py b/tests/perf/test_instance_segmenatation.py new file mode 100644 index 00000000000..1c649dc2e74 --- /dev/null +++ b/tests/perf/test_instance_segmenatation.py @@ -0,0 +1,69 @@ +"""OTX Instance Segmentation perfomance tests.""" + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest + +from otx.cli.registry import Registry +from .benchmark import OTXBenchmark + + +MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="INSTANCE_SEGMENTATION").templates +MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + +class TestPerfInstanceSegmentation: + """Benchmark basic instance segmentation.""" + + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "instance_seg/wgisd_small/1", + "instance_seg/wgisd_small/2", + "instance_seg/wgisd_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "instance_seg/coco_car_person_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "instance_seg/bdd_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) From 742e8cc489059f7930ba4ed42cceec0b4497ba41 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 14:06:01 +0900 Subject: [PATCH 17/25] Add tiling tests --- tests/perf/test_instance_segmenatation.py | 61 +++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/perf/test_instance_segmenatation.py b/tests/perf/test_instance_segmenatation.py index 1c649dc2e74..3257f690139 100644 --- a/tests/perf/test_instance_segmenatation.py +++ b/tests/perf/test_instance_segmenatation.py @@ -67,3 +67,64 @@ def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): model_id=fxt_model_id, 
tags={"benchmark": "speed"}, ) + + +class TestPerfTilingInstanceSegmentation: + """Benchmark tiling instance segmentation.""" + + TILING_PARAMS = { + "tiling_parameters.enable_tiling": 1, + } + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "tiling_instance_seg/vitens_aeromonas_small/1", + "tiling_instance_seg/vitens_aeromonas_small/2", + "tiling_instance_seg/vitens_aeromonas_small/3", + ], + "num_repeat": 3, + "train_params": TILING_PARAMS, + }, + "medium": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "tiling_instance_seg/vitens_aeromonas_medium", + ], + "num_repeat": 3, + "train_params": TILING_PARAMS, + }, + "large": { + "tags": { + "task": "instance_segmentation", + }, + "datasets": [ + "tiling_instance_seg/bdd_large", + ], + "num_repeat": 1, + "train_params": TILING_PARAMS, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) From c37c1444a125d23ae70e76c05e0dc2e8e5601ad3 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 15:10:18 +0900 Subject: [PATCH 18/25] Add semantic segmenation tests --- tests/perf/benchmark.py | 2 +- tests/perf/test_instance_segmenatation.py | 6 +- tests/perf/test_semantic_segmentation.py | 69 +++++++++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 tests/perf/test_semantic_segmentation.py diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 853e7ee33eb..36e0a39a0b4 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -18,7 +18,7 @@ class OTXBenchmark: """Benchmark runner based on tools/experiment.py in OTX1.x. 
Example: - >>> bm = OTXBenchmark(['random_sample1', 'random_sample'], data_root='./data/coco') + >>> bm = OTXBenchmark(['random_sample1', 'random_sample2'], data_root='./data/coco') >>> atss_result = bm.run('MobileNetV2-ATSS') >>> yolox_result = bm.run('YOLOX-TINY') diff --git a/tests/perf/test_instance_segmenatation.py b/tests/perf/test_instance_segmenatation.py index 3257f690139..6e4a1a9b275 100644 --- a/tests/perf/test_instance_segmenatation.py +++ b/tests/perf/test_instance_segmenatation.py @@ -78,7 +78,7 @@ class TestPerfTilingInstanceSegmentation: BENCHMARK_CONFIGS = { "small": { "tags": { - "task": "instance_segmentation", + "task": "tiling_instance_segmentation", }, "datasets": [ "tiling_instance_seg/vitens_aeromonas_small/1", @@ -90,7 +90,7 @@ class TestPerfTilingInstanceSegmentation: }, "medium": { "tags": { - "task": "instance_segmentation", + "task": "tiling_instance_segmentation", }, "datasets": [ "tiling_instance_seg/vitens_aeromonas_medium", @@ -100,7 +100,7 @@ class TestPerfTilingInstanceSegmentation: }, "large": { "tags": { - "task": "instance_segmentation", + "task": "tiling_instance_segmentation", }, "datasets": [ "tiling_instance_seg/bdd_large", diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py new file mode 100644 index 00000000000..4ec28f6726c --- /dev/null +++ b/tests/perf/test_semantic_segmentation.py @@ -0,0 +1,69 @@ +"""OTX Semantic Segmentation perfomance tests.""" + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest + +from otx.cli.registry import Registry +from .benchmark import OTXBenchmark + + +MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="SEGMENTATION").templates +MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + +class TestPerfSemanticSegmentation: + """Benchmark basic semantic segmentation.""" + + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "semantic_segmentation", + }, + "datasets": [ + "semantic_seg/kvasir_small/1", + "semantic_seg/kvasir_small/2", + "semantic_seg/kvasir_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "semantic_segmentation", + }, + "datasets": [ + "semantic_seg/kvasir_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "semantic_segmentation", + }, + "datasets": [ + "semantic_seg/kvasir_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) From 233b18cdbd170276a10306908835b366b35d4b1d Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 16:27:01 +0900 Subject: [PATCH 19/25] Add anomaly test --- tests/perf/test_anomaly.py | 184 +++++++++++++++++++++++++++++++++++++ 1 file 
changed, 184 insertions(+) create mode 100644 tests/perf/test_anomaly.py diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py new file mode 100644 index 00000000000..db16f7172ea --- /dev/null +++ b/tests/perf/test_anomaly.py @@ -0,0 +1,184 @@ +"""OTX Anomaly perfomance tests.""" + +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import pytest + +from otx.cli.registry import Registry +from .benchmark import OTXBenchmark + + +class TestPerfAnomalyClassification: + """Benchmark anomaly classification.""" + + MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ANOMALY_CLASSIFICATION").templates + MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "anomaly_classification", + }, + "datasets": [ + "anomaly/mvtec/bottle_small/1", + "anomaly/mvtec/bottle_small/2", + "anomaly/mvtec/bottle_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "anomaly_classification", + }, + "datasets": [ + "anomaly/mvtec/wood_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "anomaly_classification", + }, + "datasets": [ + "anomaly/mvtec/hazelnut_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) + + +class TestPerfAnomalyDetection: + """Benchmark anomaly detection.""" + + MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ANOMALY_DETECTION").templates + MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "anomaly_detection", + }, + "datasets": [ + "anomaly/mvtec/bottle_small/1", + "anomaly/mvtec/bottle_small/2", + "anomaly/mvtec/bottle_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "anomaly_detection", + }, + "datasets": [ + "anomaly/mvtec/wood_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "anomaly_detection", + }, + "datasets": [ + "anomaly/mvtec/hazelnut_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def 
test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) + + +class TestPerfAnomalySegmentation: + """Benchmark anomaly segmentation.""" + + MODEL_TEMPLATES = Registry(f"src/otx/algorithms").filter(task_type="ANOMALY_SEGMENTATION").templates + MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES] + + BENCHMARK_CONFIGS = { + "small": { + "tags": { + "task": "anomaly_segmentation", + }, + "datasets": [ + "anomaly/mvtec/bottle_small/1", + "anomaly/mvtec/bottle_small/2", + "anomaly/mvtec/bottle_small/3", + ], + "num_repeat": 3, + }, + "medium": { + "tags": { + "task": "anomaly_segmentation", + }, + "datasets": [ + "anomaly/mvtec/wood_medium", + ], + "num_repeat": 3, + }, + "large": { + "tags": { + "task": "anomaly_segmentation", + }, + "datasets": [ + "anomaly/mvtec/hazelnut_large", + ], + "num_repeat": 1, + }, + } + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark accruacy metrics.""" + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "accuracy"}, + ) + + @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True) + @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True) + def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark): + """Benchmark train time per iter / infer time per image.""" + fxt_benchmark.track_resources = True + result = fxt_benchmark.run( + model_id=fxt_model_id, + tags={"benchmark": "speed"}, + ) From 8db6c60afacf71f902c898261ff270ac6b337469 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Thu, 21 Dec 2023 17:52:32 +0900 Subject: [PATCH 20/25] Fix anomaly max_epochs setting --- tests/perf/benchmark.py | 12 +++++++++++- tests/perf/conftest.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 36e0a39a0b4..bba5e36d55d 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -164,7 +164,7 @@ def _build_config( if self.track_resources: resource_param = "--track-resource-usage all" if self.num_epoch > 0: - all_train_params["learning_parameters.num_iters"] = self.num_epoch + self._set_num_epoch(model_id, all_train_params, self.num_epoch) params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()]) cfg["command"].append( "otx train ${model}" @@ -186,3 +186,13 @@ def _build_config( cfg["command"].append("otx optimize") cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") return cfg + + @staticmethod + def _set_num_epoch(model_id:str, train_params: dict, num_epoch: int): + """Set model specific num_epoch parameter.""" + if "padim" in model_id: + return # No configurable parameter for num_epoch + elif "stfpm" in model_id: + train_params["learning_parameters.max_epochs"] = num_epoch + else: + train_params["learning_parameters.num_iters"] = num_epoch diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index 5a16751842d..b85d3e2b869 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -138,7 +138,7 @@ def fxt_benchmark_summary(fxt_output_root: Path): """Summarize 
all results at the end of test session.""" yield all_results = OTXBenchmark.load_result(fxt_output_root) - if all_results: + if all_results is not None: print("=" * 20, "[Benchmark summary]") print(all_results) output_path = fxt_output_root / "benchmark-summary.csv" From e6cd073fb522f84d935984f471f6227f0d72c44e Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 22 Dec 2023 09:54:59 +0900 Subject: [PATCH 21/25] Fix pre-commit --- tests/perf/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index bba5e36d55d..79178e63f1f 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -188,7 +188,7 @@ def _build_config( return cfg @staticmethod - def _set_num_epoch(model_id:str, train_params: dict, num_epoch: int): + def _set_num_epoch(model_id: str, train_params: dict, num_epoch: int): """Set model specific num_epoch parameter.""" if "padim" in model_id: return # No configurable parameter for num_epoch From 94bdc0036cdca34044d241247f5cbca67a8ca823 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 22 Dec 2023 15:08:59 +0900 Subject: [PATCH 22/25] Add subset_dir_name cfg for seg datasets --- tests/perf/benchmark.py | 12 +++++++----- tests/perf/test_semantic_segmentation.py | 3 +++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 79178e63f1f..18c94c0536d 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -54,6 +54,7 @@ def __init__( output_root: str = "otx-benchmark", dry_run: bool = False, tags: dict = {}, + **kwargs, ): self.datasets = datasets self.data_root = data_root @@ -65,6 +66,7 @@ def __init__( self.output_root = output_root self.dry_run = dry_run self.tags = tags + self.subset_dir_names = kwargs.get("subset_dir_names", {"train": "", "val": "", "test": ""}) def run( self, @@ -168,23 +170,23 @@ def _build_config( params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()]) cfg["command"].append( "otx train ${model}" - " --train-data-roots ${dataroot}/${data}" - " --val-data-roots ${dataroot}/${data}" + " --train-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['train']}" + " --val-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['val']}" " --deterministic" f" {resource_param}" f" params {params_str}" ) - cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}") if self.eval_upto == "train": return cfg cfg["command"].append("otx export") - cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}") if self.eval_upto == "export": return cfg cfg["command"].append("otx optimize") - cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}") + cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}") return cfg @staticmethod diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py index 4ec28f6726c..a5ca4086f83 100644 --- a/tests/perf/test_semantic_segmentation.py +++ b/tests/perf/test_semantic_segmentation.py @@ -27,6 +27,7 @@ class TestPerfSemanticSegmentation: "semantic_seg/kvasir_small/2", "semantic_seg/kvasir_small/3", ], + "subset_dir_names": {"train": "train", "val": "val", "test": "test"}, "num_repeat": 3, }, "medium": { 
@@ -36,6 +37,7 @@ class TestPerfSemanticSegmentation: "datasets": [ "semantic_seg/kvasir_medium", ], + "subset_dir_names": {"train": "train", "val": "val", "test": "test"}, "num_repeat": 3, }, "large": { @@ -45,6 +47,7 @@ class TestPerfSemanticSegmentation: "datasets": [ "semantic_seg/kvasir_large", ], + "subset_dir_names": {"train": "train", "val": "val", "test": "test"}, "num_repeat": 1, }, } From fab92516888eb7a89042bcdb3a22eedd14c38a0f Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Fri, 22 Dec 2023 15:11:34 +0900 Subject: [PATCH 23/25] Update changelog.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68b075dca53..30554245175 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,12 @@ All notable changes to this project will be documented in this file. ### New features -- Add zero-shot visual prompting (https://github.com/openvinotoolkit/training_extensions/pull/2616) +- Add zero-shot visual prompting () ### Enhancements - Upgrade NNCF to 2.7 and OpenVINO to 2023.2 () +- Automate performance benchmark () ## \[v1.5.0\] From 91bfec13541e70258511473e29b8b56dfed4c3d1 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 26 Dec 2023 12:09:29 +0900 Subject: [PATCH 24/25] Reflect review comments --- tests/perf/__init__.py | 2 +- tests/perf/benchmark.py | 18 +++++++++--------- tests/perf/conftest.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py index 36a90a5e5f6..9984d0cb25b 100644 --- a/tests/perf/__init__.py +++ b/tests/perf/__init__.py @@ -1,4 +1,4 @@ """OTX Perfomance tests.""" -# Copyright (C) 2021-2022 Intel Corporation +# Copyright (C) 2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 18c94c0536d..2a2c18a8146 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -33,7 +33,7 @@ class OTXBenchmark: train_params (dict): Additional training parameters. e.x) {'learning_parameters.num_iters': 2}. Defaults to {}. track_resources (bool): Whether to track CPU & GPU usage metrics. Defaults to False. - eval_upto (str): The last serial operation to evaluate. Choose on of ('train', 'export', 'optimize'). + eval_upto (str): The last serial operation to evaluate. Choose one of ('train', 'export', 'optimize'). Operations include the preceeding ones. e.x) Eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval Default to 'train'. 
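In other words, each later stage re-uses everything before it; a small sketch of how the command list grows with the three eval_upto settings (command strings abbreviated here, the full templates are assembled in _build_config()):

    stages = {
        "train":    ["otx train", "otx eval"],
        "export":   ["otx train", "otx eval", "otx export", "otx eval"],
        "optimize": ["otx train", "otx eval", "otx export", "otx eval", "otx optimize", "otx eval"],
    }
    for eval_upto, commands in stages.items():
        print(f"--eval-upto {eval_upto}: {' -> '.join(commands)}")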
@@ -48,32 +48,32 @@ def __init__( data_root: str = "data", num_epoch: int = 0, num_repeat: int = 1, - train_params: dict = {}, + train_params: dict | None = None, track_resources: bool = False, eval_upto: str = "train", output_root: str = "otx-benchmark", dry_run: bool = False, - tags: dict = {}, - **kwargs, + tags: dict | None = None, + subset_dir_names: dict | None = None, ): self.datasets = datasets self.data_root = data_root self.num_epoch = num_epoch self.num_repeat = num_repeat - self.train_params = train_params + self.train_params = train_params or {} self.track_resources = track_resources self.eval_upto = eval_upto self.output_root = output_root self.dry_run = dry_run - self.tags = tags - self.subset_dir_names = kwargs.get("subset_dir_names", {"train": "", "val": "", "test": ""}) + self.tags = tags or {} + self.subset_dir_names = subset_dir_names or {"train": "", "val": "", "test": ""} def run( self, model_id: str, train_params: dict = {}, tags: dict = {}, - ) -> pd.DataFrame: + ) -> pd.DataFrame | None: """Run configured benchmark with given model and return the result. Args: @@ -82,7 +82,7 @@ def run( tags (dict): Overrides global benchmark tags Retruns: - pd.DataFrame: Table with benchmark metrics + pd.DataFrame | None: Table with benchmark metrics """ # Build config file diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py index b85d3e2b869..0d831d50dd1 100644 --- a/tests/perf/conftest.py +++ b/tests/perf/conftest.py @@ -48,7 +48,7 @@ def pytest_addoption(parser): parser.addoption( "--eval-upto", action="store", - default="all", + default="train", choices=("train", "export", "optimize"), help="Choose train|export|optimize. Defaults to train.", ) From 5e88a6664b8044e1e64a913d8bd6b254ab909387 Mon Sep 17 00:00:00 2001 From: Songki Choi Date: Tue, 26 Dec 2023 13:48:37 +0900 Subject: [PATCH 25/25] Refine doc string --- tests/perf/benchmark.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py index 2a2c18a8146..f39ed806731 100644 --- a/tests/perf/benchmark.py +++ b/tests/perf/benchmark.py @@ -30,7 +30,7 @@ class OTXBenchmark: Defaults to 0, which means no overriding. num_repeat (int): Number for trials with different random seed, which would be set as range(0, num_repeat). Defaults to 1. - train_params (dict): Additional training parameters. + train_params (dict, optional): Additional training parameters. e.x) {'learning_parameters.num_iters': 2}. Defaults to {}. track_resources (bool): Whether to track CPU & GPU usage metrics. Defaults to False. eval_upto (str): The last serial operation to evaluate. Choose one of ('train', 'export', 'optimize'). @@ -39,7 +39,9 @@ class OTXBenchmark: Default to 'train'. output_root (str): Output root dirctory for logs and results. Defaults to './otx-benchmark'. dry_run (bool): Whether to just print the OTX command without execution. Defaults to False. - tags (dict): Key-values pair metadata for the experiment. Defaults to {}. + tags (dict, optional): Key-values pair metadata for the experiment. + subset_dir_names (dict, optional): Specify dataset subset directory names, if any. + e.x) {"train": "train_10percent", "val": "val_all", "test": "test"} """ def __init__(
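As a closing note on the new subset_dir_names option: when it is set (as in the semantic segmentation configs above), the subset directory is appended to each data-root argument, while the default empty strings leave the paths flat. A rough illustration of the resulting train command template, with placeholders still to be expanded by tools/experiment.py:

    subset_dir_names = {"train": "train", "val": "val", "test": "test"}  # illustrative values
    train_cmd = (
        "otx train ${model}"
        " --train-data-roots ${dataroot}/${data}/" + subset_dir_names["train"] +
        " --val-data-roots ${dataroot}/${data}/" + subset_dir_names["val"] +
        " --deterministic"
    )
    print(train_cmd)
    # -> otx train ${model} --train-data-roots ${dataroot}/${data}/train --val-data-roots ${dataroot}/${data}/val --deterministic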