# Handling undefined labels at the annotation statistics (#1232)

### Summary
Regarding issue #1204, modify the `compute_ann_statistics` function to
handle undefined labels, and add a corresponding unit test.
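
For illustration, a minimal sketch of the scenario, adapted from the new
`test_stats_with_invalid_label` unit test below: the dataset defines three
categories, but one item carries a `Label(3)` that is outside them.
Previously this failed with an `IndexError`; now the undefined label is
counted under its raw integer id.

```python
from datumaro.components.annotation import Label
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.operations import compute_ann_statistics

# Three defined categories, but "item3" carries Label(3), which is undefined.
dataset = Dataset.from_iterable(
    [DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
    categories=["label_%s" % i for i in range(3)],
)

stats = compute_ann_statistics(dataset)

# The undefined label is reported under its raw integer id:
# {"label_0": [1, 0.25], "label_1": [1, 0.25], "label_2": [1, 0.25], 3: [1, 0.25]}
print(stats["annotations"]["labels"]["distribution"])
```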


### How to test
The changes are covered by the new unit tests in
`tests/unit/operations/test_statistics.py` (see below).

### Checklist
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```
jihyeonyi authored Dec 21, 2023
1 parent cce5fc9 commit cfb6832
Showing 5 changed files with 327 additions and 278 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1221>)
- Fix Kinetics data format to have media data
(<https://github.com/openvinotoolkit/datumaro/pull/1223>)
- Handling undefined labels at the annotation statistics
(<https://github.com/openvinotoolkit/datumaro/pull/1232>)

## 16/11/2023 - Release 1.5.1
### Enhancements
24 changes: 20 additions & 4 deletions src/datumaro/components/operations.py
@@ -5,6 +5,7 @@
import hashlib
import logging as log
import warnings
from collections import defaultdict
from copy import deepcopy
from typing import Callable, Dict, Optional, Set, Tuple

@@ -225,10 +226,20 @@ def _extractor_stats(subset_name):


def compute_ann_statistics(dataset: IDataset):
labels = dataset.categories().get(AnnotationType.label, LabelCategories())
warnings.warn(
"We are planning to change the type of stats['annotations']['labels']['distribution'] "
"and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. "
"If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.",
FutureWarning,
)
labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())

def get_label(ann):
return labels.items[ann.label].name if ann.label is not None else None
try:
return labels.items[ann.label].name if ann.label is not None else None
except IndexError:
log.warning(f"annotation({ann}) has undefined label({ann.label})")
return ann.label

stats = {
"images count": 0,
Expand All @@ -253,21 +264,26 @@ def get_label(ann):
}
label_stat = {
"count": 0,
"distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
"attributes": {},
}

stats["annotations"]["labels"] = label_stat
segm_stat = {
"avg. area": 0,
"area distribution": [], # a histogram with 10 bins
# (min, min+10%), ..., (min+90%, max) -> (count, total%)
"pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
}
stats["annotations"]["segments"] = segm_stat
segm_areas = []
pixel_dist = segm_stat["pixel distribution"]
total_pixels = 0

for l in labels.items:
label_stat["distribution"][l.name] = [0, 0]
pixel_dist[l.name] = [0, 0]

for item in dataset:
if len(item.annotations) == 0:
stats["unannotated images"].append(item.id)
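
The key change above replaces the pre-built `{l.name: [0, 0] for l in
labels.items}` dictionaries with `defaultdict`s that are then seeded with
the defined labels. A standalone sketch (plain Python, not the Datumaro
source) of why this pattern tolerates undefined label ids:

```python
from collections import defaultdict

# label -> [count, total%]; unseen keys get a fresh [0, 0] on first access
distribution = defaultdict(lambda: [0, 0])
for name in ["label_0", "label_1", "label_2"]:
    distribution[name] = [0, 0]  # defined labels always appear, even with zero hits

# Label id 3 has no entry in the categories, but counting it just works:
for label in ["label_0", "label_2", 3]:
    distribution[label][0] += 1

total = sum(count for count, _ in distribution.values())
for entry in distribution.values():
    entry[1] = entry[0] / total if total else 0

print(dict(distribution))
# {'label_0': [1, 0.33...], 'label_1': [0, 0], 'label_2': [1, 0.33...], 3: [1, 0.33...]}
```
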
3 changes: 3 additions & 0 deletions tests/requirements.py
@@ -61,6 +61,9 @@ class Requirements:
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
DATUM_BUG_721 = "Explain command cannot find the model"
DATUM_BUG_873 = "Error using datum stats"
DATUM_BUG_1204 = (
"Statistics raise an error when there is a label annotation not in the category"
)


class SkipMessages:
302 changes: 301 additions & 1 deletion tests/unit/operations/test_statistics.py
@@ -8,11 +8,16 @@
import numpy as np
import pytest

from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image, PointCloud
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
from datumaro.components.operations import (
IMAGE_STATS_SCHEMA,
compute_ann_statistics,
compute_image_statistics,
)

from tests.requirements import Requirements, mark_requirement

@@ -109,3 +114,298 @@ def test_invalid_media_type(
with pytest.warns(UserWarning, match="only Image media_type is allowed"):
actual = compute_image_statistics(fxt_point_cloud_dataset)
assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]


class AnnStatisticsTest:
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_stats(self):
dataset = Dataset.from_iterable(
[
DatasetItem(
id=1,
media=Image.from_numpy(data=np.ones((5, 5, 3))),
annotations=[
Caption("hello"),
Caption("world"),
Label(
2,
attributes={
"x": 1,
"y": "2",
},
),
Bbox(
1,
2,
2,
2,
label=2,
attributes={
"score": 0.5,
},
),
Bbox(
5,
6,
2,
2,
attributes={
"x": 1,
"y": "3",
"occluded": True,
},
),
Points([1, 2, 2, 0, 1, 1], label=0),
Mask(
label=3,
image=np.array(
[
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
),
),
],
),
DatasetItem(
id=2,
media=Image.from_numpy(data=np.ones((2, 4, 3))),
annotations=[
Label(
2,
attributes={
"x": 2,
"y": "2",
},
),
Bbox(
1,
2,
2,
2,
label=3,
attributes={
"score": 0.5,
},
),
Bbox(
5,
6,
2,
2,
attributes={
"x": 2,
"y": "3",
"occluded": False,
},
),
Ellipse(
5,
6,
2,
2,
attributes={
"x": 2,
"y": "3",
"occluded": False,
},
),
],
),
DatasetItem(id=3),
DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
],
categories=["label_%s" % i for i in range(4)],
)

expected = {
"images count": 4,
"annotations count": 11,
"unannotated images count": 2,
"unannotated images": ["3", "2.2"],
"annotations by type": {
"label": {
"count": 2,
},
"polygon": {
"count": 0,
},
"polyline": {
"count": 0,
},
"bbox": {
"count": 4,
},
"mask": {
"count": 1,
},
"points": {
"count": 1,
},
"caption": {
"count": 2,
},
"cuboid_3d": {"count": 0},
"super_resolution_annotation": {"count": 0},
"depth_annotation": {"count": 0},
"ellipse": {"count": 1},
"hash_key": {"count": 0},
"feature_vector": {"count": 0},
"tabular": {"count": 0},
"unknown": {"count": 0},
},
"annotations": {
"labels": {
"count": 6,
"distribution": {
"label_0": [1, 1 / 6],
"label_1": [0, 0.0],
"label_2": [3, 3 / 6],
"label_3": [2, 2 / 6],
},
"attributes": {
"x": {
"count": 2, # annotations with no label are skipped
"values count": 2,
"values present": ["1", "2"],
"distribution": {
"1": [1, 1 / 2],
"2": [1, 1 / 2],
},
},
"y": {
"count": 2, # annotations with no label are skipped
"values count": 1,
"values present": ["2"],
"distribution": {
"2": [2, 2 / 2],
},
},
# must not include "special" attributes like "occluded"
},
},
"segments": {
"avg. area": (4 * 2 + 9 * 1) / 3,
"area distribution": [
{"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
{"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
{"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
{"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
{"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
{"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
{"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
{"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
{"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
{"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
],
"pixel distribution": {
"label_0": [0, 0.0],
"label_1": [0, 0.0],
"label_2": [4, 4 / 17],
"label_3": [13, 13 / 17],
},
},
},
}

actual = compute_ann_statistics(dataset)

assert actual == expected

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_stats_with_empty_dataset(self):
label_names = ["label_%s" % i for i in range(4)]
dataset = Dataset.from_iterable(
[
DatasetItem(id=1),
DatasetItem(id=3),
],
categories=label_names,
)

expected = self._get_stats_template(label_names)
expected["images count"] = 2
expected["unannotated images count"] = 2
expected["unannotated images"] = ["1", "3"]

actual = compute_ann_statistics(dataset)
assert actual == expected

@mark_requirement(Requirements.DATUM_BUG_1204)
def test_stats_with_invalid_label(self):
label_names = ["label_%s" % i for i in range(3)]
dataset = Dataset.from_iterable(
iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
categories=label_names,
)

expected = self._get_stats_template(label_names)
expected["images count"] = 4
expected["annotations count"] = 4
expected["annotations by type"]["label"]["count"] = 4
expected["annotations"]["labels"]["count"] = 4
expected["annotations"]["labels"]["distribution"] = {
"label_0": [1, 0.25],
"label_1": [1, 0.25],
"label_2": [1, 0.25],
3: [1, 0.25], # label which does not exist in categories.
}

actual = compute_ann_statistics(dataset)

assert actual == expected

@staticmethod
def _get_stats_template(label_names: list):
return {
"images count": 0,
"annotations count": 0,
"unannotated images count": 0,
"unannotated images": [],
"annotations by type": {
"label": {
"count": 0,
},
"polygon": {
"count": 0,
},
"polyline": {
"count": 0,
},
"bbox": {
"count": 0,
},
"mask": {
"count": 0,
},
"points": {
"count": 0,
},
"caption": {
"count": 0,
},
"cuboid_3d": {"count": 0},
"super_resolution_annotation": {"count": 0},
"depth_annotation": {"count": 0},
"ellipse": {"count": 0},
"hash_key": {"count": 0},
"feature_vector": {"count": 0},
"tabular": {"count": 0},
"unknown": {"count": 0},
},
"annotations": {
"labels": {
"count": 0,
"distribution": {n: [0, 0] for n in label_names},
"attributes": {},
},
"segments": {
"avg. area": 0.0,
"area distribution": [],
"pixel distribution": {n: [0, 0] for n in label_names},
},
},
}
