diff --git a/tests/unit/algorithms/segmentation/adapters/mmseg/datasets/pipelines/test_transforms.py b/tests/unit/algorithms/segmentation/adapters/mmseg/datasets/pipelines/test_transforms.py
index facded59996..c1404deb76c 100644
--- a/tests/unit/algorithms/segmentation/adapters/mmseg/datasets/pipelines/test_transforms.py
+++ b/tests/unit/algorithms/segmentation/adapters/mmseg/datasets/pipelines/test_transforms.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict
+from typing import Any, Dict, List

 import numpy as np
 import pytest
@@ -110,11 +110,11 @@ class TestNormalize:
     @pytest.mark.parametrize(
         "mean,std,to_rgb,expected",
         [
-            (1.0, 1.0, True, np.array([[[1.0, 0.0, 0.0]]], dtype=np.float32)),
-            (1.0, 1.0, False, np.array([[[-1.0, 0.0, 0.0]]], dtype=np.float32)),
+            ([1.0 for _ in range(3)], [1.0 for _ in range(3)], True, np.array([[[1.0, 0.0, -1.0]]], dtype=np.float32)),
+            ([1.0 for _ in range(3)], [1.0 for _ in range(3)], False, np.array([[[-1.0, 0.0, 1.0]]], dtype=np.float32)),
         ],
     )
-    def test_call(self, mean: float, std: float, to_rgb: bool, expected: np.array) -> None:
+    def test_call(self, mean: List[float], std: List[float], to_rgb: bool, expected: np.array) -> None:
         """Test __call__."""
         normalize = Normalize(mean=mean, std=std, to_rgb=to_rgb)
         inputs = dict(img=np.arange(3).reshape(1, 1, 3))
diff --git a/tools/experiment.py b/tools/experiment.py
index f161a5a2372..e49e1c339c6 100644
--- a/tools/experiment.py
+++ b/tools/experiment.py
@@ -66,6 +66,48 @@ def find_latest_file(root_dir: Union[Path, str], file_name: str) -> Union[None,
     return train_record_files[0]


+class EvalResult:
+    """Class to store OTX eval output.
+
+    The OTX eval output contains different metrics depending on the task.
+    To deal with that, this class stores metrics under dynamic names.
+    Each metric can be set or read in a dict-like (ins["metric"]) or attribute-like (ins.metric) way.
+    "add" (only with an instance having the same metrics) and "true divide" are supported.
+    """
+
+    def __getitem__(self, key):
+        """Support getting an attribute in a dict-like way."""
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        """Support setting an attribute in a dict-like way."""
+        setattr(self, key, value)
+
+    def __add__(self, obj: "EvalResult"):
+        """Add with an instance having the same metrics."""
+        new_obj = deepcopy(self)
+        new_obj_metrics = vars(new_obj).keys()
+
+        if new_obj_metrics != vars(obj).keys():
+            raise KeyError(
+                "Two objects have different metrics. "
+                f"Left operand : {','.join(new_obj_metrics)} / Right operand : {','.join(vars(obj).keys())}"
+            )
+
+        for attr in new_obj_metrics:
+            new_obj[attr] += obj[attr]
+        return new_obj
+
+    def __truediv__(self, divisor: Union[int, float]):
+        """Divide each metric by the given divisor."""
+        new_obj = deepcopy(self)
+
+        for attr in vars(new_obj).keys():
+            new_obj[attr] /= divisor
+
+        return new_obj
+
+
 @dataclass
 class ExperimentResult:
     """Dataclass to manage experiment result.
@@ -76,44 +118,42 @@ class ExperimentResult:
     """

     val_score: Union[float, None] = None
-    test_score: Union[float, None] = None
+    train_eval_result: Union[EvalResult, None] = None
     train_e2e_time: Union[timedelta, None] = None
     avg_iter_time: Union[float, None] = None
     std_iter_time: Union[float, None] = None
     avg_data_time: Union[float, None] = None
     std_data_time: Union[float, None] = None
-    export_model_score: Union[float, None] = None
-    avg_ov_infer_time: Union[float, None] = None
+    export_eval_result: Union[EvalResult, None] = None
     max_cpu_mem: Union[float, None] = None
     avg_cpu_util: Union[float, None] = None
    max_gpu_mem: Union[float, None] = None
     avg_gpu_util: Union[float, None] = None
-    optimize_model_score: Union[float, None] = None
+    optimize_eval_result: Union[EvalResult, None] = None
     epoch: Union[int, None] = None

     def get_formatted_result(self) -> Dict:
         """Return dictionary format result."""
         result = dataclasses.asdict(self)
+        formatted_result = {}

-        for attr_name in ["max_cpu_mem", "max_gpu_mem"]:
-            max_mem = result.pop(attr_name)
-            result[f"{attr_name}(GiB)"] = max_mem
-
-        for attr_name in ["avg_cpu_util", "avg_gpu_util"]:
-            res_util = result.pop(attr_name)
-            result[f"{attr_name}(%)"] = res_util
-
-        if self.train_e2e_time is not None:
-            result["train_e2e_time"] = str(self.train_e2e_time).split(".")[0]
-
-        # delete None value
-        for key in list(result.keys()):
-            if result[key] is None:
-                del result[key]
-            elif isinstance(result[key], float):
-                result[key] = round(result[key], 4)
+        for key, val in result.items():
+            if val is None:
+                continue
+            elif key in ["max_cpu_mem", "max_gpu_mem"]:
+                formatted_result[f"{key}(GiB)"] = round(val, 2)
+            elif key in ["avg_cpu_util", "avg_gpu_util"]:
+                formatted_result[f"{key}(%)"] = round(val, 2)
+            elif key == "train_e2e_time":
+                formatted_result[key] = str(self.train_e2e_time).split(".")[0]
+            elif isinstance(val, EvalResult):
+                task = key.split('_')[0]
+                for metric, score in vars(val).items():
+                    formatted_result[f"{metric}({task})"] = round(score, 4)
+            elif isinstance(val, float):
+                formatted_result[key] = round(val, 4)

-        return result
+        return formatted_result

     def __add__(self, obj: "ExperimentResult"):
         """Add with same class. If None exists, it's skipped."""
@@ -152,15 +192,29 @@ def parse_formatted_dict(self, formatted_dict: Dict):
         """Parse a dictionary with same format."""
         max_mem_pat = re.compile(r"max_.*_mem")
         cpu_util_pat = re.compile(r"avg.*_util")
+        eval_result_pat = re.compile(r"(.*)\((.*)\)")
+
         for key, val in formatted_dict.items():
             max_mem_name = max_mem_pat.search(key)
             cpu_util_name = cpu_util_pat.search(key)
+            eval_result_name = eval_result_pat.search(key)
+
             if max_mem_name is not None:
                 max_mem_name = max_mem_name.group(0)
                 setattr(self, max_mem_name, val)
             elif cpu_util_name is not None:
                 cpu_util_name = cpu_util_name.group(0)
                 setattr(self, cpu_util_name, val)
+            elif eval_result_name is not None:
+                metric = eval_result_name.group(1)
+                task = eval_result_name.group(2)
+                eval_result = getattr(self, f"{task}_eval_result")
+                if eval_result is None:
+                    eval_result = EvalResult()
+                    eval_result[metric] = val
+                    setattr(self, f"{task}_eval_result", eval_result)
+                else:
+                    eval_result[metric] = val
             elif key == "train_e2e_time":
                 setattr(self, key, parse_time_delta_fmt(val, "%H:%M:%S"))
             else:
@@ -205,20 +259,21 @@ def _calculate_avg_std_per_iter(self):
         )

     def _parse_eval_output(self, file_path: Path):
-        # NOTE: It is assumed that performance.json has key named either score or avg_time_per_image
+        for task in ["train", "export", "optimize"]:
+            if task in str(file_path.parent.name):
+                break
+        else:
+            print(f"Cannot parse eval output in {file_path.parent.name}")
+            return
+
         with file_path.open("r") as f:
             eval_output: Dict = json.load(f)

-        if "train" in str(file_path.parent.name):
-            self._exp_result.test_score = list(eval_output.values())[0]
-        elif "export" in str(file_path.parent.name):
-            for key, val in eval_output.items():
-                if key == "avg_time_per_image":
-                    self._exp_result.avg_ov_infer_time = val
-                else:
-                    self._exp_result.export_model_score = val
-        elif "optimize" in str(file_path.parent.name):
-            self._exp_result.optimize_model_score = list(eval_output.values())[0]
+        eval_result = EvalResult()
+        for metric, score in eval_output.items():
+            eval_result[metric] = score
+
+        setattr(self._exp_result, f"{task}_eval_result", eval_result)

     def _parse_resource_usage(self, file_path: Path):
         with file_path.open("r") as f:
@@ -255,7 +310,7 @@ class MMCVExpParser(BaseExpParser):
     def parse_exp_log(self):
         """Parse experiment log."""
         for task_dir in (self._workspace / "outputs").iterdir():
-            if task_dir.is_symlink():
+            if task_dir.is_symlink():  # prevent duplicated parsing
                 continue

             if "train" in str(task_dir.name):
@@ -574,6 +629,7 @@ def _product_all_cases(
     if not found_keys:
         return []

+    found_keys = sorted(found_keys)
     values_of_found_key = []
     for key in found_keys:
         if isinstance(variable[key], list):
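For reference, the lines below are a minimal usage sketch (not part of the patch) of the EvalResult container introduced in tools/experiment.py, showing the dict-like/attribute-like access, "add", and "true divide" behavior described in its docstring. The metric names and the import path are hypothetical examples, not taken from real OTX output.

# Illustrative sketch only; assumes the patched tools/experiment.py is importable.
from tools.experiment import EvalResult  # hypothetical import path

fold1 = EvalResult()
fold1["mDice"] = 0.75            # dict-like set; "mDice" is a made-up metric name
fold1.avg_time_per_image = 0.5   # attribute-like set

fold2 = EvalResult()
fold2.mDice = 0.25
fold2["avg_time_per_image"] = 1.5

# __add__ requires both operands to hold exactly the same metrics;
# __truediv__ divides every metric, so averaging two runs looks like:
avg = (fold1 + fold2) / 2
print(avg.mDice, avg["avg_time_per_image"])  # 0.5 1.0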