Revert MAP implementation to pycocotools backend #1832

Merged Jul 3, 2023 (33 commits; showing changes from 27 commits)

Commits:
9058fee  refactor (SkafteNicki, Jun 8, 2023)
48f3788  tests (SkafteNicki, Jun 9, 2023)
8055b8e  Merge branch 'master' into new_map (Borda, Jun 12, 2023)
b586a42  working reference (SkafteNicki, Jun 13, 2023)
b8c99e7  fix dtype casting (SkafteNicki, Jun 13, 2023)
13fd068  remove old code (SkafteNicki, Jun 13, 2023)
3c6f01c  improve docs (SkafteNicki, Jun 13, 2023)
46f1f8a  Merge branch 'master' into new_map (SkafteNicki, Jun 13, 2023)
6132051  changelog (SkafteNicki, Jun 13, 2023)
908f658  refactor (SkafteNicki, Jun 13, 2023)
fd21da0  fix doc formatting (SkafteNicki, Jun 13, 2023)
5efc562  Merge branch 'master' into new_map (SkafteNicki, Jun 13, 2023)
8c0a41d  mypy (SkafteNicki, Jun 13, 2023)
5f993e4  Merge branch 'master' into new_map (SkafteNicki, Jun 13, 2023)
8c6748d  Merge branch 'master' into new_map (SkafteNicki, Jun 14, 2023)
5651da0  skip doctest on missing import (SkafteNicki, Jun 14, 2023)
1e7bb62  remove helper (SkafteNicki, Jun 15, 2023)
188ff33  Merge branch 'master' into new_map (SkafteNicki, Jun 19, 2023)
dc875d9  fix tests (SkafteNicki, Jun 21, 2023)
2dec2db  fix tests (SkafteNicki, Jun 21, 2023)
19f8c56  Merge branch 'master' into new_map (SkafteNicki, Jun 21, 2023)
c1d665f  Merge branch 'master' into new_map (SkafteNicki, Jun 22, 2023)
fb3d0e7  Merge branch 'master' into new_map (SkafteNicki, Jun 29, 2023)
b6141dc  Merge branch 'master' into new_map (SkafteNicki, Jun 29, 2023)
b97eeb9  readd old implementation (SkafteNicki, Jun 29, 2023)
e910a6e  Merge branch 'new_map' of https://github.com/PyTorchLightning/metrics… (SkafteNicki, Jun 29, 2023)
3d244dc  Merge branch 'master' into new_map (SkafteNicki, Jun 30, 2023)
f5ba0fd  ignore mypy (SkafteNicki, Jul 1, 2023)
c1dfb27  Merge branch 'new_map' of https://github.com/PyTorchLightning/metrics… (SkafteNicki, Jul 1, 2023)
b75056d  Merge branch 'master' into new_map (SkafteNicki, Jul 3, 2023)
94587ad  Merge branch 'master' into new_map (mergify[bot], Jul 3, 2023)
bad9952  Merge branch 'master' into new_map (mergify[bot], Jul 3, 2023)
40823c4  Merge branch 'master' into new_map (mergify[bot], Jul 3, 2023)

4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -152,6 +152,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Changed calculation in `PearsonCorrCoeff` to be more robust in certain cases ([#1729](https://github.com/Lightning-AI/torchmetrics/pull/1729))


- Changed `MeanAveragePrecision` to `pycocotools` backend ([#1832](https://github.com/Lightning-AI/torchmetrics/pull/1832))


### Deprecated

- Deprecated domain metrics import from package root (
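
For orientation, a minimal sketch of how the metric is driven after this change, assuming the standard torchmetrics detection API (per-image dicts of boxes/scores/labels, as in the tests below); the box values are illustrative:

import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# one image: predictions carry boxes (xyxy), scores and labels; targets carry boxes and labels
preds = [{
    "boxes": torch.tensor([[258.0, 41.0, 606.0, 285.0]]),
    "scores": torch.tensor([0.536]),
    "labels": torch.tensor([0], dtype=torch.int32),
}]
target = [{
    "boxes": torch.tensor([[214.0, 41.0, 562.0, 285.0]]),
    "labels": torch.tensor([0], dtype=torch.int32),
}]

metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)
metric.update(preds, target)
result = metric.compute()  # dict with map, map_50, map_75, mar_100, per-class values, ...

The public update/compute pattern is unchanged by this PR; only the backend computing the numbers moves to pycocotools.
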
970 changes: 970 additions & 0 deletions src/torchmetrics/detection/_mean_ap.py

Large diffs are not rendered by default.

1,112 changes: 441 additions & 671 deletions src/torchmetrics/detection/mean_ap.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion tests/unittests/conftest.py
@@ -17,10 +17,11 @@

import pytest
import torch
from torch.multiprocessing import Pool, set_start_method
from torch.multiprocessing import Pool, set_sharing_strategy, set_start_method

with contextlib.suppress(RuntimeError):
set_start_method("spawn")
set_sharing_strategy("file_system")

NUM_PROCESSES = 2 # torch.cuda.device_count() if torch.cuda.is_available() else 2
NUM_BATCHES = 2 * NUM_PROCESSES  # needs to be divisible by the number of processes
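
Note: switching to the "file_system" sharing strategy here is presumably to avoid "Too many open files" errors when the DDP test workers pass many tensors between processes; the default "file_descriptor" strategy can exhaust the open-file limit.
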
3 changes: 3 additions & 0 deletions tests/unittests/detection/__init__.py
@@ -3,3 +3,6 @@
from unittests import _PATH_ROOT

_SAMPLE_DETECTION_SEGMENTATION = os.path.join(_PATH_ROOT, "_data", "detection", "instance_segmentation_inputs.json")
_DETECTION_VAL = os.path.join(_PATH_ROOT, "_data", "detection", "instances_val2014_100.json")
_DETECTION_BBOX = os.path.join(_PATH_ROOT, "_data", "detection", "instances_val2014_fakebbox100_results.json")
_DETECTION_SEGM = os.path.join(_PATH_ROOT, "_data", "detection", "instances_val2014_fakesegm100_results.json")
334 changes: 137 additions & 197 deletions tests/unittests/detection/test_map.py
@@ -11,76 +11,157 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import contextlib
import io
from collections import namedtuple
from copy import deepcopy
from functools import partial

import numpy as np
import pytest
import torch
from pycocotools import mask
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from torch import IntTensor, Tensor
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchmetrics.utilities.imports import _TORCHVISION_AVAILABLE, _TORCHVISION_GREATER_EQUAL_0_8
from torchmetrics.utilities.imports import _PYCOCOTOOLS_AVAILABLE, _TORCHVISION_GREATER_EQUAL_0_8

from unittests.detection import _SAMPLE_DETECTION_SEGMENTATION
from unittests.detection import _DETECTION_BBOX, _DETECTION_SEGM, _DETECTION_VAL, _SAMPLE_DETECTION_SEGMENTATION
from unittests.helpers.testers import MetricTester

Input = namedtuple("Input", ["preds", "target"])
_pytest_condition = not (_PYCOCOTOOLS_AVAILABLE and _TORCHVISION_GREATER_EQUAL_0_8)


def _create_inputs_masks() -> Input:
with open(_SAMPLE_DETECTION_SEGMENTATION) as fp:
inputs_json = json.load(fp)

_mask_unsqueeze_bool = lambda m: Tensor(mask.decode(m)).unsqueeze(0).bool()
_masks_stack_bool = lambda ms: Tensor(np.stack([mask.decode(m) for m in ms])).bool()

return Input(
preds=[
[
{
"masks": _mask_unsqueeze_bool(inputs_json["preds"][0]),
"scores": Tensor([0.236]),
"labels": IntTensor([4]),
},
{
"masks": _masks_stack_bool([inputs_json["preds"][1], inputs_json["preds"][2]]),
"scores": Tensor([0.318, 0.726]),
"labels": IntTensor([3, 2]),
}, # 73
],
[
{
"masks": _mask_unsqueeze_bool(inputs_json["preds"][0]),
"scores": Tensor([0.236]),
"labels": IntTensor([4]),
},
{
"masks": _masks_stack_bool([inputs_json["preds"][1], inputs_json["preds"][2]]),
"scores": Tensor([0.318, 0.726]),
"labels": IntTensor([3, 2]),
}, # 73
],
],
target=[
[
{"masks": _mask_unsqueeze_bool(inputs_json["targets"][0]), "labels": IntTensor([4])}, # 42
{
"masks": _masks_stack_bool([inputs_json["targets"][1], inputs_json["targets"][2]]),
"labels": IntTensor([2, 2]),
}, # 73
],
[
{"masks": _mask_unsqueeze_bool(inputs_json["targets"][0]), "labels": IntTensor([4])}, # 42
{
"masks": _masks_stack_bool([inputs_json["targets"][1], inputs_json["targets"][2]]),
"labels": IntTensor([2, 2]),
}, # 73
],
],
def _generate_coco_inputs(iou_type):
"""Generates inputs for the MAP metric.

The inputs are generated from the official COCO results json files:
https://github.com/cocodataset/cocoapi/tree/master/results
and should therefore correspond directly to the results reported there.
"""
batched_preds, batched_target = MeanAveragePrecision.coco_to_tm(
_DETECTION_BBOX if iou_type == "bbox" else _DETECTION_SEGM, _DETECTION_VAL, iou_type
)

# create 10 batches of 10 preds/targets each
batched_preds = [batched_preds[10 * i : 10 * (i + 1)] for i in range(10)]
batched_target = [batched_target[10 * i : 10 * (i + 1)] for i in range(10)]
return batched_preds, batched_target


_coco_bbox_input = _generate_coco_inputs("bbox")
_coco_segm_input = _generate_coco_inputs("segm")


def _compare_against_coco_fn(preds, target, iou_type, iou_thresholds=None, rec_thresholds=None, class_metrics=True):
"""Taken from https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb."""
with contextlib.redirect_stdout(io.StringIO()):
gt = COCO(_DETECTION_VAL)
dt = gt.loadRes(_DETECTION_BBOX) if iou_type == "bbox" else gt.loadRes(_DETECTION_SEGM)

coco_eval = COCOeval(gt, dt, iou_type)
if iou_thresholds is not None:
coco_eval.params.iouThrs = np.array(iou_thresholds, dtype=np.float64)
if rec_thresholds is not None:
coco_eval.params.recThrs = np.array(rec_thresholds, dtype=np.float64)

coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
global_stats = deepcopy(coco_eval.stats)

map_per_class_values = torch.Tensor([-1])
mar_100_per_class_values = torch.Tensor([-1])
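# assumed: the category ids present in this val2014 subset (ids 0-90 minus the excluded set), i.e. the classes the metric reports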
classes = torch.tensor(
list(set(torch.arange(91).tolist()) - {0, 12, 19, 26, 29, 30, 45, 66, 68, 69, 71, 76, 83, 87, 89})
)

if class_metrics:
map_per_class_list = []
mar_100_per_class_list = []
for class_id in classes.tolist():
coco_eval.params.catIds = [class_id]
with contextlib.redirect_stdout(io.StringIO()):
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
class_stats = coco_eval.stats
map_per_class_list.append(torch.Tensor([class_stats[0]]))
mar_100_per_class_list.append(torch.Tensor([class_stats[8]]))

map_per_class_values = torch.Tensor(map_per_class_list)
mar_100_per_class_values = torch.Tensor(mar_100_per_class_list)

return {
"map": Tensor([global_stats[0]]),
"map_50": Tensor([global_stats[1]]),
"map_75": Tensor([global_stats[2]]),
"map_small": Tensor([global_stats[3]]),
"map_medium": Tensor([global_stats[4]]),
"map_large": Tensor([global_stats[5]]),
"mar_1": Tensor([global_stats[6]]),
"mar_10": Tensor([global_stats[7]]),
"mar_100": Tensor([global_stats[8]]),
"mar_small": Tensor([global_stats[9]]),
"mar_medium": Tensor([global_stats[10]]),
"mar_large": Tensor([global_stats[11]]),
"map_per_class": map_per_class_values,
"mar_100_per_class": mar_100_per_class_values,
"classes": classes,
}


@pytest.mark.skipif(_pytest_condition, reason="test requires that torchvision>=0.8.0 and pycocotools are installed")
@pytest.mark.parametrize("iou_type", ["bbox", "segm"])
@pytest.mark.parametrize("ddp", [False, True])
class TestMAPUsingCOCOReference(MetricTester):
"""Test map metric on the reference coco data."""

@pytest.mark.parametrize("iou_thresholds", [None, [0.25, 0.5, 0.75]])
@pytest.mark.parametrize("rec_thresholds", [None, [0.25, 0.5, 0.75]])
def test_map(self, iou_type, iou_thresholds, rec_thresholds, ddp):
"""Test modular implementation for correctness."""
preds, target = _coco_bbox_input if iou_type == "bbox" else _coco_segm_input
self.run_class_metric_test(
ddp=ddp,
preds=preds,
target=target,
metric_class=MeanAveragePrecision,
reference_metric=partial(
_compare_against_coco_fn,
iou_type=iou_type,
iou_thresholds=iou_thresholds,
rec_thresholds=rec_thresholds,
class_metrics=False,
),
metric_args={
"iou_type": iou_type,
"iou_thresholds": iou_thresholds,
"rec_thresholds": rec_thresholds,
"class_metrics": False,
},
check_batch=False,
atol=1e-2,
)

def test_map_classwise(self, iou_type, ddp):
"""Test modular implementation for correctness with classwise=True. Needs bigger atol to be stable."""
preds, target = _coco_bbox_input if iou_type == "bbox" else _coco_segm_input
self.run_class_metric_test(
ddp=ddp,
preds=preds,
target=target,
metric_class=MeanAveragePrecision,
reference_metric=partial(_compare_against_coco_fn, iou_type=iou_type, class_metrics=True),
metric_args={"iou_type": iou_type, "class_metrics": True},
check_batch=False,
atol=1e-1,
)


Input = namedtuple("Input", ["preds", "target"])


_inputs = Input(
preds=[
@@ -244,154 +325,13 @@ def _create_inputs_masks() -> Input:
{
"boxes": Tensor([[1.0, 2.0, 3.0, 4.0]]),
"scores": Tensor([0.8]), # target does not have scores
"labels": Tensor([1]),
"labels": IntTensor([1]),
},
],
],
)


def _compare_fn(preds, target) -> dict:
"""Comparison function for map implementation.

Official pycocotools results calculated from a subset of https://github.com/cocodataset/cocoapi/tree/master/results
All classes
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.637
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.859
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.761
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.622
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.800
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.635
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.432
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.652
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.652
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.673
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.800
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.633

Class 0
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.725
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.780

Class 1
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.800
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.800

Class 2
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.454
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.450

Class 3
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = -1.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = -1.000

Class 4
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.650
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.650

Class 49
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.556
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.580
"""
return {
"map": Tensor([0.637]),
"map_50": Tensor([0.859]),
"map_75": Tensor([0.761]),
"map_small": Tensor([0.622]),
"map_medium": Tensor([0.800]),
"map_large": Tensor([0.635]),
"mar_1": Tensor([0.432]),
"mar_10": Tensor([0.652]),
"mar_100": Tensor([0.652]),
"mar_small": Tensor([0.673]),
"mar_medium": Tensor([0.800]),
"mar_large": Tensor([0.633]),
"map_per_class": Tensor([0.725, 0.800, 0.454, -1.000, 0.650, 0.556]),
"mar_100_per_class": Tensor([0.780, 0.800, 0.450, -1.000, 0.650, 0.580]),
"classes": Tensor([0, 1, 2, 3, 4, 49]),
}


def _compare_fn_segm(preds, target) -> dict:
"""Comparison function for map implementation for instance segmentation.

Official pycocotools results calculated from a subset of https://github.com/cocodataset/cocoapi/tree/master/results
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.752
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.252
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.352
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.350
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.350
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = -1.000
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.350
"""
return {
"map": Tensor([0.352]),
"map_50": Tensor([0.752]),
"map_75": Tensor([0.252]),
"map_small": Tensor([-1]),
"map_medium": Tensor([-1]),
"map_large": Tensor([0.352]),
"mar_1": Tensor([0.35]),
"mar_10": Tensor([0.35]),
"mar_100": Tensor([0.35]),
"mar_small": Tensor([-1]),
"mar_medium": Tensor([-1]),
"mar_large": Tensor([0.35]),
"map_per_class": Tensor([0.4039604, -1.0, 0.3]),
"mar_100_per_class": Tensor([0.4, -1.0, 0.3]),
"classes": Tensor([2, 3, 4]),
}


_pytest_condition = not (_TORCHVISION_AVAILABLE and _TORCHVISION_GREATER_EQUAL_0_8)


@pytest.mark.skipif(_pytest_condition, reason="test requires that torchvision>=0.8.0 is installed")
@pytest.mark.parametrize("compute_on_cpu", [True])
class TestMAP(MetricTester):
"""Test the MAP metric for object detection predictions.

Results are compared to the original values from the pycocotools implementation, using the first 10 fake
predictions from the official repo:
https://github.com/cocodataset/cocoapi/blob/master/results/instances_val2014_fakebbox100_results.json
"""

atol = 1e-2

@pytest.mark.parametrize("ddp", [False, True])
def test_map_bbox(self, compute_on_cpu, ddp):
"""Test modular implementation for correctness."""
self.run_class_metric_test(
ddp=ddp,
preds=_inputs.preds,
target=_inputs.target,
metric_class=MeanAveragePrecision,
reference_metric=_compare_fn,
check_batch=False,
metric_args={"class_metrics": True, "compute_on_cpu": compute_on_cpu},
)

@pytest.mark.parametrize("ddp", [False, True])
def test_map_segm(self, compute_on_cpu, ddp):
"""Test modular implementation for correctness."""
_inputs_masks = _create_inputs_masks()
self.run_class_metric_test(
ddp=ddp,
preds=_inputs_masks.preds,
target=_inputs_masks.target,
metric_class=MeanAveragePrecision,
reference_metric=_compare_fn_segm,
check_batch=False,
metric_args={"class_metrics": True, "compute_on_cpu": compute_on_cpu, "iou_type": "segm"},
)


# noinspection PyTypeChecker
@pytest.mark.skipif(_pytest_condition, reason="test requires that torchvision>=0.8.0 is installed")
def test_error_on_wrong_init():
"""Test class raises the expected errors."""
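
As a usage note, the `coco_to_tm` classmethod exercised in `_generate_coco_inputs` above can also be called directly. A sketch, assuming the positional signature used in the test (detections json, ground-truth json, iou_type) and the cocoapi sample files referenced by the fixtures:

from torchmetrics.detection.mean_ap import MeanAveragePrecision

# convert official COCO-format json files into lists of torchmetrics preds/target dicts
preds, target = MeanAveragePrecision.coco_to_tm(
    "instances_val2014_fakebbox100_results.json",  # detection results
    "instances_val2014_100.json",                  # ground-truth annotations
    "bbox",                                        # iou_type
)

metric = MeanAveragePrecision(iou_type="bbox")
metric.update(preds, target)
result = metric.compute()  # should track pycocotools' COCOeval summary for the same files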