Handle undefined labels in annotation statistics

Yi, Jihyeon · Yi, Jihyeon · commit 867a3582884a · 2023-12-21T14:13:20.000+09:00
diff --git a/src/datumaro/components/operations.py b/src/datumaro/components/operations.py
@@ -5,6 +5,7 @@
 import hashlib
 import logging as log
 import warnings
+from collections import defaultdict
 from copy import deepcopy
 from typing import Callable, Dict, Optional, Set, Tuple
 
@@ -225,10 +226,14 @@ def _extractor_stats(subset_name):
 
 
 def compute_ann_statistics(dataset: IDataset):
-    labels = dataset.categories().get(AnnotationType.label, LabelCategories())
+    labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())
 
     def get_label(ann):
-        return labels.items[ann.label].name if ann.label is not None else None
+        idx = ann.label  # None means the annotation carries no label
+        if idx is None or 0 <= idx < len(labels.items):  # a negative index must not wrap
+            return None if idx is None else labels.items[idx].name
+        log.warning(f"annotation({ann}) has undefined label({idx})")
+        return idx  # undefined labels are reported under their raw index
 
     stats = {
         "images count": 0,
@@ -253,21 +258,26 @@ def get_label(ann):
     }
     label_stat = {
         "count": 0,
-        "distribution": {l.name: [0, 0] for l in labels.items},  # label -> (count, total%)
+        "distribution": defaultdict(lambda: [0, 0]),  # label -> (count, total%)
         "attributes": {},
     }
+
     stats["annotations"]["labels"] = label_stat
     segm_stat = {
         "avg. area": 0,
         "area distribution": [],  # a histogram with 10 bins
         # (min, min+10%), ..., (min+90%, max) -> (count, total%)
-        "pixel distribution": {l.name: [0, 0] for l in labels.items},  # label -> (count, total%)
+        "pixel distribution": defaultdict(lambda: [0, 0]),  # label -> (count, total%)
     }
     stats["annotations"]["segments"] = segm_stat
     segm_areas = []
     pixel_dist = segm_stat["pixel distribution"]
     total_pixels = 0
 
+    for l in labels.items:
+        label_stat["distribution"][l.name] = [0, 0]
+        pixel_dist[l.name] = [0, 0]
+
     for item in dataset:
         if len(item.annotations) == 0:
             stats["unannotated images"].append(item.id)
diff --git a/tests/requirements.py b/tests/requirements.py
@@ -61,6 +61,9 @@ class Requirements:
     DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
     DATUM_BUG_721 = "Explain command cannot find the model"
     DATUM_BUG_873 = "Error using datum stats"
+    DATUM_BUG_1204 = (
+        "Statistics raise an error when there is a label annotation not in the category"
+    )
 
 
 class SkipMessages:
diff --git a/tests/unit/operations/test_statistics.py b/tests/unit/operations/test_statistics.py
@@ -8,11 +8,16 @@
 import numpy as np
 import pytest
 
+from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
 from datumaro.components.dataset import Dataset
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.errors import DatumaroError
 from datumaro.components.media import Image, PointCloud
-from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
+from datumaro.components.operations import (
+    IMAGE_STATS_SCHEMA,
+    compute_ann_statistics,
+    compute_image_statistics,
+)
 
 from tests.requirements import Requirements, mark_requirement
 
@@ -109,3 +114,298 @@ def test_invalid_media_type(
             with pytest.warns(UserWarning, match="only Image media_type is allowed"):
                 actual = compute_image_statistics(fxt_point_cloud_dataset)
             assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]
+
+
+class AnnStatisticsTest:
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_stats(self):  # compares the full statistics dict for a mixed-annotation dataset
+        dataset = Dataset.from_iterable(
+            [
+                DatasetItem(
+                    id=1,
+                    media=Image.from_numpy(data=np.ones((5, 5, 3))),
+                    annotations=[
+                        Caption("hello"),
+                        Caption("world"),
+                        Label(
+                            2,
+                            attributes={
+                                "x": 1,
+                                "y": "2",
+                            },
+                        ),
+                        Bbox(
+                            1,
+                            2,
+                            2,
+                            2,
+                            label=2,
+                            attributes={
+                                "score": 0.5,
+                            },
+                        ),
+                        Bbox(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 1,
+                                "y": "3",
+                                "occluded": True,
+                            },
+                        ),
+                        Points([1, 2, 2, 0, 1, 1], label=0),
+                        Mask(
+                            label=3,
+                            image=np.array(
+                                [
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 1, 1, 1],
+                                    [0, 0, 0, 0, 0],
+                                    [0, 0, 0, 0, 0],
+                                ]
+                            ),
+                        ),
+                    ],
+                ),
+                DatasetItem(
+                    id=2,
+                    media=Image.from_numpy(data=np.ones((2, 4, 3))),
+                    annotations=[
+                        Label(
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "2",
+                            },
+                        ),
+                        Bbox(
+                            1,
+                            2,
+                            2,
+                            2,
+                            label=3,
+                            attributes={
+                                "score": 0.5,
+                            },
+                        ),
+                        Bbox(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "3",
+                                "occluded": False,
+                            },
+                        ),
+                        Ellipse(
+                            5,
+                            6,
+                            2,
+                            2,
+                            attributes={
+                                "x": 2,
+                                "y": "3",
+                                "occluded": False,
+                            },
+                        ),
+                    ],
+                ),
+                DatasetItem(id=3),
+                DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
+            ],
+            categories=["label_%s" % i for i in range(4)],
+        )
+
+        expected = {
+            "images count": 4,
+            "annotations count": 11,
+            "unannotated images count": 2,
+            "unannotated images": ["3", "2.2"],
+            "annotations by type": {
+                "label": {
+                    "count": 2,
+                },
+                "polygon": {
+                    "count": 0,
+                },
+                "polyline": {
+                    "count": 0,
+                },
+                "bbox": {
+                    "count": 4,
+                },
+                "mask": {
+                    "count": 1,
+                },
+                "points": {
+                    "count": 1,
+                },
+                "caption": {
+                    "count": 2,
+                },
+                "cuboid_3d": {"count": 0},
+                "super_resolution_annotation": {"count": 0},
+                "depth_annotation": {"count": 0},
+                "ellipse": {"count": 1},
+                "hash_key": {"count": 0},
+                "feature_vector": {"count": 0},
+                "tabular": {"count": 0},
+                "unknown": {"count": 0},
+            },
+            "annotations": {
+                "labels": {
+                    "count": 6,
+                    "distribution": {
+                        "label_0": [1, 1 / 6],
+                        "label_1": [0, 0.0],
+                        "label_2": [3, 3 / 6],
+                        "label_3": [2, 2 / 6],
+                    },
+                    "attributes": {
+                        "x": {
+                            "count": 2,  # annotations with no label are skipped
+                            "values count": 2,
+                            "values present": ["1", "2"],
+                            "distribution": {
+                                "1": [1, 1 / 2],
+                                "2": [1, 1 / 2],
+                            },
+                        },
+                        "y": {
+                            "count": 2,  # annotations with no label are skipped
+                            "values count": 1,
+                            "values present": ["2"],
+                            "distribution": {
+                                "2": [2, 2 / 2],
+                            },
+                        },
+                        # must not include "special" attributes like "occluded"
+                    },
+                },
+                "segments": {
+                    "avg. area": (4 * 2 + 9 * 1) / 3,
+                    "area distribution": [
+                        {"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
+                        {"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
+                        {"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
+                        {"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
+                        {"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
+                        {"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
+                        {"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
+                        {"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
+                        {"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
+                        {"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
+                    ],
+                    "pixel distribution": {
+                        "label_0": [0, 0.0],
+                        "label_1": [0, 0.0],
+                        "label_2": [4, 4 / 17],
+                        "label_3": [13, 13 / 17],
+                    },
+                },
+            },
+        }
+
+        actual = compute_ann_statistics(dataset)
+
+        assert actual == expected
+
+    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
+    def test_stats_with_empty_dataset(self):  # items without annotations -> zeroed statistics
+        label_names = ["label_%s" % i for i in range(4)]
+        dataset = Dataset.from_iterable(
+            [
+                DatasetItem(id=1),
+                DatasetItem(id=3),
+            ],
+            categories=label_names,
+        )
+
+        expected = self._get_stats_template(label_names)
+        expected["images count"] = 2
+        expected["unannotated images count"] = 2
+        expected["unannotated images"] = ["1", "3"]
+
+        actual = compute_ann_statistics(dataset)
+        assert actual == expected
+
+    @mark_requirement(Requirements.DATUM_BUG_1204)
+    def test_stats_with_invalid_label(self):  # Label(3) has no entry among the 3 categories
+        label_names = ["label_%s" % i for i in range(3)]
+        dataset = Dataset.from_iterable(
+            iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
+            categories=label_names,
+        )
+
+        expected = self._get_stats_template(label_names)
+        expected["images count"] = 4
+        expected["annotations count"] = 4
+        expected["annotations by type"]["label"]["count"] = 4
+        expected["annotations"]["labels"]["count"] = 4
+        expected["annotations"]["labels"]["distribution"] = {
+            "label_0": [1, 0.25],
+            "label_1": [1, 0.25],
+            "label_2": [1, 0.25],
+            3: [1, 0.25],  # index 3 is not in the categories, so the key is the raw index
+        }
+
+        actual = compute_ann_statistics(dataset)
+
+        assert actual == expected
+
+    @staticmethod
+    def _get_stats_template(label_names: list):  # all-zero stats dict keyed by label_names
+        return {
+            "images count": 0,
+            "annotations count": 0,
+            "unannotated images count": 0,
+            "unannotated images": [],
+            "annotations by type": {
+                "label": {
+                    "count": 0,
+                },
+                "polygon": {
+                    "count": 0,
+                },
+                "polyline": {
+                    "count": 0,
+                },
+                "bbox": {
+                    "count": 0,
+                },
+                "mask": {
+                    "count": 0,
+                },
+                "points": {
+                    "count": 0,
+                },
+                "caption": {
+                    "count": 0,
+                },
+                "cuboid_3d": {"count": 0},
+                "super_resolution_annotation": {"count": 0},
+                "depth_annotation": {"count": 0},
+                "ellipse": {"count": 0},
+                "hash_key": {"count": 0},
+                "feature_vector": {"count": 0},
+                "tabular": {"count": 0},
+                "unknown": {"count": 0},
+            },
+            "annotations": {
+                "labels": {
+                    "count": 0,
+                    "distribution": {n: [0, 0] for n in label_names},
+                    "attributes": {},
+                },
+                "segments": {
+                    "avg. area": 0.0,
+                    "area distribution": [],
+                    "pixel distribution": {n: [0, 0] for n in label_names},
+                },
+            },
+        }
diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py