Skip to content

Commit 867a358

Browse files
author
Yi, Jihyeon
committed
Handle undefined labels in the annotation statistics
1 parent cce5fc9 commit 867a358

File tree

4 files changed

+319
-278
lines changed

4 files changed

+319
-278
lines changed

src/datumaro/components/operations.py

+14-4
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import hashlib
66
import logging as log
77
import warnings
8+
from collections import defaultdict
89
from copy import deepcopy
910
from typing import Callable, Dict, Optional, Set, Tuple
1011

@@ -225,10 +226,14 @@ def _extractor_stats(subset_name):
225226

226227

227228
def compute_ann_statistics(dataset: IDataset):
228-
labels = dataset.categories().get(AnnotationType.label, LabelCategories())
229+
labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())
229230

230231
def get_label(ann):
231-
return labels.items[ann.label].name if ann.label is not None else None
232+
try:
233+
return labels.items[ann.label].name if ann.label is not None else None
234+
except IndexError:
235+
log.warning(f"annotation({ann}) has undefined label({ann.label})")
236+
return ann.label
232237

233238
stats = {
234239
"images count": 0,
@@ -253,21 +258,26 @@ def get_label(ann):
253258
}
254259
label_stat = {
255260
"count": 0,
256-
"distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
261+
"distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
257262
"attributes": {},
258263
}
264+
259265
stats["annotations"]["labels"] = label_stat
260266
segm_stat = {
261267
"avg. area": 0,
262268
"area distribution": [], # a histogram with 10 bins
263269
# (min, min+10%), ..., (min+90%, max) -> (count, total%)
264-
"pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
270+
"pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
265271
}
266272
stats["annotations"]["segments"] = segm_stat
267273
segm_areas = []
268274
pixel_dist = segm_stat["pixel distribution"]
269275
total_pixels = 0
270276

277+
for l in labels.items:
278+
label_stat["distribution"][l.name] = [0, 0]
279+
pixel_dist[l.name] = [0, 0]
280+
271281
for item in dataset:
272282
if len(item.annotations) == 0:
273283
stats["unannotated images"].append(item.id)

tests/requirements.py

+3
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,9 @@ class Requirements:
6161
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
6262
DATUM_BUG_721 = "Explain command cannot find the model"
6363
DATUM_BUG_873 = "Error using datum stats"
64+
DATUM_BUG_1204 = (
65+
"Statistics raise an error when there is a label annotation not in the category"
66+
)
6467

6568

6669
class SkipMessages:

tests/unit/operations/test_statistics.py

+301-1
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,16 @@
88
import numpy as np
99
import pytest
1010

11+
from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
1112
from datumaro.components.dataset import Dataset
1213
from datumaro.components.dataset_base import DatasetItem
1314
from datumaro.components.errors import DatumaroError
1415
from datumaro.components.media import Image, PointCloud
15-
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
16+
from datumaro.components.operations import (
17+
IMAGE_STATS_SCHEMA,
18+
compute_ann_statistics,
19+
compute_image_statistics,
20+
)
1621

1722
from tests.requirements import Requirements, mark_requirement
1823

@@ -109,3 +114,298 @@ def test_invalid_media_type(
109114
with pytest.warns(UserWarning, match="only Image media_type is allowed"):
110115
actual = compute_image_statistics(fxt_point_cloud_dataset)
111116
assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]
117+
118+
119+
class AnnStatisticsTest:
120+
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
121+
def test_stats(self):
122+
dataset = Dataset.from_iterable(
123+
[
124+
DatasetItem(
125+
id=1,
126+
media=Image.from_numpy(data=np.ones((5, 5, 3))),
127+
annotations=[
128+
Caption("hello"),
129+
Caption("world"),
130+
Label(
131+
2,
132+
attributes={
133+
"x": 1,
134+
"y": "2",
135+
},
136+
),
137+
Bbox(
138+
1,
139+
2,
140+
2,
141+
2,
142+
label=2,
143+
attributes={
144+
"score": 0.5,
145+
},
146+
),
147+
Bbox(
148+
5,
149+
6,
150+
2,
151+
2,
152+
attributes={
153+
"x": 1,
154+
"y": "3",
155+
"occluded": True,
156+
},
157+
),
158+
Points([1, 2, 2, 0, 1, 1], label=0),
159+
Mask(
160+
label=3,
161+
image=np.array(
162+
[
163+
[0, 0, 1, 1, 1],
164+
[0, 0, 1, 1, 1],
165+
[0, 0, 1, 1, 1],
166+
[0, 0, 0, 0, 0],
167+
[0, 0, 0, 0, 0],
168+
]
169+
),
170+
),
171+
],
172+
),
173+
DatasetItem(
174+
id=2,
175+
media=Image.from_numpy(data=np.ones((2, 4, 3))),
176+
annotations=[
177+
Label(
178+
2,
179+
attributes={
180+
"x": 2,
181+
"y": "2",
182+
},
183+
),
184+
Bbox(
185+
1,
186+
2,
187+
2,
188+
2,
189+
label=3,
190+
attributes={
191+
"score": 0.5,
192+
},
193+
),
194+
Bbox(
195+
5,
196+
6,
197+
2,
198+
2,
199+
attributes={
200+
"x": 2,
201+
"y": "3",
202+
"occluded": False,
203+
},
204+
),
205+
Ellipse(
206+
5,
207+
6,
208+
2,
209+
2,
210+
attributes={
211+
"x": 2,
212+
"y": "3",
213+
"occluded": False,
214+
},
215+
),
216+
],
217+
),
218+
DatasetItem(id=3),
219+
DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
220+
],
221+
categories=["label_%s" % i for i in range(4)],
222+
)
223+
224+
expected = {
225+
"images count": 4,
226+
"annotations count": 11,
227+
"unannotated images count": 2,
228+
"unannotated images": ["3", "2.2"],
229+
"annotations by type": {
230+
"label": {
231+
"count": 2,
232+
},
233+
"polygon": {
234+
"count": 0,
235+
},
236+
"polyline": {
237+
"count": 0,
238+
},
239+
"bbox": {
240+
"count": 4,
241+
},
242+
"mask": {
243+
"count": 1,
244+
},
245+
"points": {
246+
"count": 1,
247+
},
248+
"caption": {
249+
"count": 2,
250+
},
251+
"cuboid_3d": {"count": 0},
252+
"super_resolution_annotation": {"count": 0},
253+
"depth_annotation": {"count": 0},
254+
"ellipse": {"count": 1},
255+
"hash_key": {"count": 0},
256+
"feature_vector": {"count": 0},
257+
"tabular": {"count": 0},
258+
"unknown": {"count": 0},
259+
},
260+
"annotations": {
261+
"labels": {
262+
"count": 6,
263+
"distribution": {
264+
"label_0": [1, 1 / 6],
265+
"label_1": [0, 0.0],
266+
"label_2": [3, 3 / 6],
267+
"label_3": [2, 2 / 6],
268+
},
269+
"attributes": {
270+
"x": {
271+
"count": 2, # annotations with no label are skipped
272+
"values count": 2,
273+
"values present": ["1", "2"],
274+
"distribution": {
275+
"1": [1, 1 / 2],
276+
"2": [1, 1 / 2],
277+
},
278+
},
279+
"y": {
280+
"count": 2, # annotations with no label are skipped
281+
"values count": 1,
282+
"values present": ["2"],
283+
"distribution": {
284+
"2": [2, 2 / 2],
285+
},
286+
},
287+
# must not include "special" attributes like "occluded"
288+
},
289+
},
290+
"segments": {
291+
"avg. area": (4 * 2 + 9 * 1) / 3,
292+
"area distribution": [
293+
{"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
294+
{"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
295+
{"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
296+
{"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
297+
{"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
298+
{"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
299+
{"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
300+
{"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
301+
{"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
302+
{"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
303+
],
304+
"pixel distribution": {
305+
"label_0": [0, 0.0],
306+
"label_1": [0, 0.0],
307+
"label_2": [4, 4 / 17],
308+
"label_3": [13, 13 / 17],
309+
},
310+
},
311+
},
312+
}
313+
314+
actual = compute_ann_statistics(dataset)
315+
316+
assert actual == expected
317+
318+
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
319+
def test_stats_with_empty_dataset(self):
320+
label_names = ["label_%s" % i for i in range(4)]
321+
dataset = Dataset.from_iterable(
322+
[
323+
DatasetItem(id=1),
324+
DatasetItem(id=3),
325+
],
326+
categories=label_names,
327+
)
328+
329+
expected = self._get_stats_template(label_names)
330+
expected["images count"] = 2
331+
expected["unannotated images count"] = 2
332+
expected["unannotated images"] = ["1", "3"]
333+
334+
actual = compute_ann_statistics(dataset)
335+
assert actual == expected
336+
337+
@mark_requirement(Requirements.DATUM_BUG_1204)
338+
def test_stats_with_invalid_label(self):
339+
label_names = ["label_%s" % i for i in range(3)]
340+
dataset = Dataset.from_iterable(
341+
iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
342+
categories=label_names,
343+
)
344+
345+
expected = self._get_stats_template(label_names)
346+
expected["images count"] = 4
347+
expected["annotations count"] = 4
348+
expected["annotations by type"]["label"]["count"] = 4
349+
expected["annotations"]["labels"]["count"] = 4
350+
expected["annotations"]["labels"]["distribution"] = {
351+
"label_0": [1, 0.25],
352+
"label_1": [1, 0.25],
353+
"label_2": [1, 0.25],
354+
3: [1, 0.25], # label which does not exist in categories.
355+
}
356+
357+
actual = compute_ann_statistics(dataset)
358+
359+
assert actual == expected
360+
361+
@staticmethod
362+
def _get_stats_template(label_names: list):
363+
return {
364+
"images count": 0,
365+
"annotations count": 0,
366+
"unannotated images count": 0,
367+
"unannotated images": [],
368+
"annotations by type": {
369+
"label": {
370+
"count": 0,
371+
},
372+
"polygon": {
373+
"count": 0,
374+
},
375+
"polyline": {
376+
"count": 0,
377+
},
378+
"bbox": {
379+
"count": 0,
380+
},
381+
"mask": {
382+
"count": 0,
383+
},
384+
"points": {
385+
"count": 0,
386+
},
387+
"caption": {
388+
"count": 0,
389+
},
390+
"cuboid_3d": {"count": 0},
391+
"super_resolution_annotation": {"count": 0},
392+
"depth_annotation": {"count": 0},
393+
"ellipse": {"count": 0},
394+
"hash_key": {"count": 0},
395+
"feature_vector": {"count": 0},
396+
"tabular": {"count": 0},
397+
"unknown": {"count": 0},
398+
},
399+
"annotations": {
400+
"labels": {
401+
"count": 0,
402+
"distribution": {n: [0, 0] for n in label_names},
403+
"attributes": {},
404+
},
405+
"segments": {
406+
"avg. area": 0.0,
407+
"area distribution": [],
408+
"pixel distribution": {n: [0, 0] for n in label_names},
409+
},
410+
},
411+
}

0 commit comments

Comments
 (0)