Skip to content

Commit e780cca

Browse files
author
Songki Choi
authored
Fix F1 auto-threshold to choose best largest confidence (#2371)
* Fix F1 auto-threshold to choose best largest confidence * Update license notice * Update change log --------- Signed-off-by: Songki Choi <songki.choi@intel.com>
1 parent 48989b2 commit e780cca

File tree

3 files changed

+28
-20
lines changed

3 files changed

+28
-20
lines changed

CHANGELOG.md

+18
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,24 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## \[v1.5.0\]
6+
7+
### New features
8+
9+
-
10+
11+
### Enhancements
12+
13+
-
14+
15+
### Bug fixes
16+
17+
- Fix F1 auto-threshold to choose best largest confidence (<https://github.com/openvinotoolkit/training_extensions/pull/2371>)
18+
19+
### Known issues
20+
21+
- OpenVINO(==2023.0) IR inference is not working well on 2-stage models (e.g. Mask-RCNN) exported from torch==1.13.1
22+
523
## \[v1.4.0\]
624

725
### New features

src/otx/api/usecases/evaluation/f_measure.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
"""This module contains the f-measure performance provider class."""
2-
3-
# Copyright (C) 2021-2022 Intel Corporation
2+
# Copyright (C) 2021-2023 Intel Corporation
43
# SPDX-License-Identifier: Apache-2.0
54
#
65

7-
86
import logging
97
from typing import Dict, List, Optional, Tuple
108

@@ -363,7 +361,7 @@ def get_results_per_confidence(
363361
result.f_measure_curve[class_name].append(result_point[class_name].f_measure)
364362
result.precision_curve[class_name].append(result_point[class_name].precision)
365363
result.recall_curve[class_name].append(result_point[class_name].recall)
366-
if all_classes_f_measure > result.best_f_measure:
364+
if all_classes_f_measure > 0.0 and all_classes_f_measure >= result.best_f_measure:
367365
result.best_f_measure = all_classes_f_measure
368366
result.best_threshold = confidence_threshold
369367
return result
@@ -417,7 +415,7 @@ def get_results_per_nms(
417415
result.precision_curve[class_name].append(result_point[class_name].precision)
418416
result.recall_curve[class_name].append(result_point[class_name].recall)
419417

420-
if all_classes_f_measure >= result.best_f_measure:
418+
if all_classes_f_measure > 0.0 and all_classes_f_measure >= result.best_f_measure:
421419
result.best_f_measure = all_classes_f_measure
422420
result.best_threshold = nms_threshold
423421
return result

tests/unit/api/usecases/evaluation/test_f_measure.py

+7-15
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,6 @@
1-
# Copyright (C) 2020-2021 Intel Corporation
1+
# Copyright (C) 2020-2023 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
23
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
6-
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing,
10-
# software distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions
13-
# and limitations under the License.
144

155
import datetime
166
from typing import cast
@@ -962,7 +952,7 @@ def test_f_measure_calculator_get_results_per_confidence(self):
962952
# Check "_AggregatedResults" object returned by "get_results_per_confidence" when All Classes f-measure is more
963953
# than best f-measure in results_per_confidence
964954
expected_results_per_confidence = _AggregatedResults(["class_1", "class_2"])
965-
for confidence_threshold in np.arange(*[0.6, 0.9]):
955+
for confidence_threshold in np.arange(*[0.6, 0.9, 0.1]):
966956
result_point = f_measure_calculator.evaluate_classes(
967957
classes=["class_1", "class_2"],
968958
iou_threshold=0.7,
@@ -978,7 +968,7 @@ def test_f_measure_calculator_get_results_per_confidence(self):
978968

979969
actual_results_per_confidence = f_measure_calculator.get_results_per_confidence(
980970
classes=["class_1", "class_2"],
981-
confidence_range=[0.6, 0.9],
971+
confidence_range=[0.6, 0.9, 0.1],  # arange(0.6, 0.9, 0.1)
982972
iou_threshold=0.7,
983973
)
984974
assert actual_results_per_confidence.all_classes_f_measure_curve == (
@@ -987,7 +977,9 @@ def test_f_measure_calculator_get_results_per_confidence(self):
987977
assert actual_results_per_confidence.f_measure_curve == expected_results_per_confidence.f_measure_curve
988978
assert actual_results_per_confidence.recall_curve == expected_results_per_confidence.recall_curve
989979
assert actual_results_per_confidence.best_f_measure == 0.5454545454545453
990-
assert actual_results_per_confidence.best_threshold == 0.6
980+
# 0.6 -> 0.54, 0.7 -> 0.54, 0.8 -> 0.54, 0.9 -> 0.44
981+
# Best "LARGEST" threshold should be 0.8 (considering numerical error)
982+
assert abs(actual_results_per_confidence.best_threshold - 0.8) < 0.001
991983
# Check "_AggregatedResults" object returned by "get_results_per_confidence" when All Classes f-measure is less
992984
# than best f-measure in results_per_confidence
993985
actual_results_per_confidence = f_measure_calculator.get_results_per_confidence(

0 commit comments

Comments
 (0)