Handling undefined labels at the annotation statistics #1232

Merged · 3 commits · Dec 21, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -39,6 +39,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/1221>)
- Fix Kinetics data format to have media data
(<https://github.com/openvinotoolkit/datumaro/pull/1223>)
- Handling undefined labels at the annotation statistics
(<https://github.com/openvinotoolkit/datumaro/pull/1232>)

## 16/11/2023 - Release 1.5.1
### Enhancements
24 changes: 20 additions & 4 deletions src/datumaro/components/operations.py
@@ -5,6 +5,7 @@
import hashlib
import logging as log
import warnings
from collections import defaultdict
from copy import deepcopy
from typing import Callable, Dict, Optional, Set, Tuple

@@ -225,10 +226,20 @@ def _extractor_stats(subset_name):


def compute_ann_statistics(dataset: IDataset):
labels = dataset.categories().get(AnnotationType.label, LabelCategories())
warnings.warn(
"We are planning to change the type of stats['annotations']['labels']['distribution'] "
"and stats['annotations']['segments']['pixel distribution'] from `list` to `(named) tuple`. "
"If you are checking the types in your code, please revisit it after upgrading datumaro>=2.0.0.",
FutureWarning,
)
labels: LabelCategories = dataset.categories().get(AnnotationType.label, LabelCategories())

def get_label(ann):
return labels.items[ann.label].name if ann.label is not None else None
try:
return labels.items[ann.label].name if ann.label is not None else None
except IndexError:
log.warning(f"annotation({ann}) has undefined label({ann.label})")
return ann.label

stats = {
"images count": 0,
Expand All @@ -253,21 +264,26 @@ def get_label(ann):
}
label_stat = {
"count": 0,
"distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
"attributes": {},
}

stats["annotations"]["labels"] = label_stat
segm_stat = {
"avg. area": 0,
"area distribution": [], # a histogram with 10 bins
# (min, min+10%), ..., (min+90%, max) -> (count, total%)
"pixel distribution": {l.name: [0, 0] for l in labels.items}, # label -> (count, total%)
"pixel distribution": defaultdict(lambda: [0, 0]), # label -> (count, total%)
}
stats["annotations"]["segments"] = segm_stat
segm_areas = []
pixel_dist = segm_stat["pixel distribution"]
total_pixels = 0
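# Note (editorial comment, not part of the original patch): the loop below
# pre-seeds the declared labels so zero-count entries still appear in the output,
# while the defaultdicts above additionally accept the raw label indices that
# get_label() returns for annotations whose label is not in the categories.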

for l in labels.items:
label_stat["distribution"][l.name] = [0, 0]
pixel_dist[l.name] = [0, 0]

for item in dataset:
if len(item.annotations) == 0:
stats["unannotated images"].append(item.id)
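For reference, a minimal usage sketch of the new behavior (it mirrors the `test_stats_with_invalid_label` test added below and assumes a datumaro build that contains this patch): an annotation whose label index is not declared in the categories no longer makes `compute_ann_statistics` raise an `IndexError`; it is counted under its raw integer index.

```python
from datumaro.components.annotation import Label
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.operations import compute_ann_statistics

# Three labels are declared, but item3 carries label index 3, which is undefined.
dataset = Dataset.from_iterable(
    [DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
    categories=["label_0", "label_1", "label_2"],
)

stats = compute_ann_statistics(dataset)  # previously raised IndexError
print(stats["annotations"]["labels"]["distribution"])
# {'label_0': [1, 0.25], 'label_1': [1, 0.25], 'label_2': [1, 0.25], 3: [1, 0.25]}
```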
3 changes: 3 additions & 0 deletions tests/requirements.py
@@ -61,6 +61,9 @@ class Requirements:
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
DATUM_BUG_721 = "Explain command cannot find the model"
DATUM_BUG_873 = "Error using datum stats"
DATUM_BUG_1204 = (
"Statistics raise an error when there is a label annotation not in the category"
)


class SkipMessages:
302 changes: 301 additions & 1 deletion tests/unit/operations/test_statistics.py
@@ -8,11 +8,16 @@
import numpy as np
import pytest

from datumaro.components.annotation import Bbox, Caption, Ellipse, Label, Mask, Points
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image, PointCloud
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics
from datumaro.components.operations import (
IMAGE_STATS_SCHEMA,
compute_ann_statistics,
compute_image_statistics,
)

from tests.requirements import Requirements, mark_requirement

@@ -109,3 +114,298 @@ def test_invalid_media_type(
with pytest.warns(UserWarning, match="only Image media_type is allowed"):
actual = compute_image_statistics(fxt_point_cloud_dataset)
assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]


class AnnStatisticsTest:
@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_stats(self):
dataset = Dataset.from_iterable(
[
DatasetItem(
id=1,
media=Image.from_numpy(data=np.ones((5, 5, 3))),
annotations=[
Caption("hello"),
Caption("world"),
Label(
2,
attributes={
"x": 1,
"y": "2",
},
),
Bbox(
1,
2,
2,
2,
label=2,
attributes={
"score": 0.5,
},
),
Bbox(
5,
6,
2,
2,
attributes={
"x": 1,
"y": "3",
"occluded": True,
},
),
Points([1, 2, 2, 0, 1, 1], label=0),
Mask(
label=3,
image=np.array(
[
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
),
),
],
),
DatasetItem(
id=2,
media=Image.from_numpy(data=np.ones((2, 4, 3))),
annotations=[
Label(
2,
attributes={
"x": 2,
"y": "2",
},
),
Bbox(
1,
2,
2,
2,
label=3,
attributes={
"score": 0.5,
},
),
Bbox(
5,
6,
2,
2,
attributes={
"x": 2,
"y": "3",
"occluded": False,
},
),
Ellipse(
5,
6,
2,
2,
attributes={
"x": 2,
"y": "3",
"occluded": False,
},
),
],
),
DatasetItem(id=3),
DatasetItem(id="2.2", media=Image.from_numpy(data=np.ones((2, 4, 3)))),
],
categories=["label_%s" % i for i in range(4)],
)

expected = {
"images count": 4,
"annotations count": 11,
"unannotated images count": 2,
"unannotated images": ["3", "2.2"],
"annotations by type": {
"label": {
"count": 2,
},
"polygon": {
"count": 0,
},
"polyline": {
"count": 0,
},
"bbox": {
"count": 4,
},
"mask": {
"count": 1,
},
"points": {
"count": 1,
},
"caption": {
"count": 2,
},
"cuboid_3d": {"count": 0},
"super_resolution_annotation": {"count": 0},
"depth_annotation": {"count": 0},
"ellipse": {"count": 1},
"hash_key": {"count": 0},
"feature_vector": {"count": 0},
"tabular": {"count": 0},
"unknown": {"count": 0},
},
"annotations": {
"labels": {
"count": 6,
"distribution": {
"label_0": [1, 1 / 6],
"label_1": [0, 0.0],
"label_2": [3, 3 / 6],
"label_3": [2, 2 / 6],
},
"attributes": {
"x": {
"count": 2, # annotations with no label are skipped
"values count": 2,
"values present": ["1", "2"],
"distribution": {
"1": [1, 1 / 2],
"2": [1, 1 / 2],
},
},
"y": {
"count": 2, # annotations with no label are skipped
"values count": 1,
"values present": ["2"],
"distribution": {
"2": [2, 2 / 2],
},
},
# must not include "special" attributes like "occluded"
},
},
"segments": {
"avg. area": (4 * 2 + 9 * 1) / 3,
"area distribution": [
{"min": 4.0, "max": 4.5, "count": 2, "percent": 2 / 3},
{"min": 4.5, "max": 5.0, "count": 0, "percent": 0.0},
{"min": 5.0, "max": 5.5, "count": 0, "percent": 0.0},
{"min": 5.5, "max": 6.0, "count": 0, "percent": 0.0},
{"min": 6.0, "max": 6.5, "count": 0, "percent": 0.0},
{"min": 6.5, "max": 7.0, "count": 0, "percent": 0.0},
{"min": 7.0, "max": 7.5, "count": 0, "percent": 0.0},
{"min": 7.5, "max": 8.0, "count": 0, "percent": 0.0},
{"min": 8.0, "max": 8.5, "count": 0, "percent": 0.0},
{"min": 8.5, "max": 9.0, "count": 1, "percent": 1 / 3},
],
"pixel distribution": {
"label_0": [0, 0.0],
"label_1": [0, 0.0],
"label_2": [4, 4 / 17],
"label_3": [13, 13 / 17],
},
},
},
}

actual = compute_ann_statistics(dataset)

assert actual == expected

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_stats_with_empty_dataset(self):
label_names = ["label_%s" % i for i in range(4)]
dataset = Dataset.from_iterable(
[
DatasetItem(id=1),
DatasetItem(id=3),
],
categories=label_names,
)

expected = self._get_stats_template(label_names)
expected["images count"] = 2
expected["unannotated images count"] = 2
expected["unannotated images"] = ["1", "3"]

actual = compute_ann_statistics(dataset)
assert actual == expected

@mark_requirement(Requirements.DATUM_BUG_1204)
def test_stats_with_invalid_label(self):
label_names = ["label_%s" % i for i in range(3)]
dataset = Dataset.from_iterable(
iterable=[DatasetItem(id=f"item{i}", annotations=[Label(i)]) for i in range(4)],
categories=label_names,
)

expected = self._get_stats_template(label_names)
expected["images count"] = 4
expected["annotations count"] = 4
expected["annotations by type"]["label"]["count"] = 4
expected["annotations"]["labels"]["count"] = 4
expected["annotations"]["labels"]["distribution"] = {
"label_0": [1, 0.25],
"label_1": [1, 0.25],
"label_2": [1, 0.25],
3: [1, 0.25], # label which does not exist in categories.
}

actual = compute_ann_statistics(dataset)

assert actual == expected

@staticmethod
def _get_stats_template(label_names: list):
return {
"images count": 0,
"annotations count": 0,
"unannotated images count": 0,
"unannotated images": [],
"annotations by type": {
"label": {
"count": 0,
},
"polygon": {
"count": 0,
},
"polyline": {
"count": 0,
},
"bbox": {
"count": 0,
},
"mask": {
"count": 0,
},
"points": {
"count": 0,
},
"caption": {
"count": 0,
},
"cuboid_3d": {"count": 0},
"super_resolution_annotation": {"count": 0},
"depth_annotation": {"count": 0},
"ellipse": {"count": 0},
"hash_key": {"count": 0},
"feature_vector": {"count": 0},
"tabular": {"count": 0},
"unknown": {"count": 0},
},
"annotations": {
"labels": {
"count": 0,
"distribution": {n: [0, 0] for n in label_names},
"attributes": {},
},
"segments": {
"avg. area": 0.0,
"area distribution": [],
"pixel distribution": {n: [0, 0] for n in label_names},
},
},
}