Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UserWarning if an invalid media_type comes to image statistics computation #891

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## \[Unreleased\]

### Bug fixes
- Emit a UserWarning when an item with an invalid media_type is passed to image statistics computation
(<https://github.com/openvinotoolkit/datumaro/pull/891>)

## 23/03/2023 - Release 1.1.0
### New features
- Add with_subset_dirs decorator (Add ImagenetWithSubsetDirsImporter)
Expand Down
49 changes: 34 additions & 15 deletions datumaro/components/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import hashlib
import logging as log
import warnings
from collections import OrderedDict
from copy import deepcopy
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
Expand All @@ -29,6 +30,7 @@
AnnotationsTooCloseError,
ConflictingCategoriesError,
DatasetMergeError,
DatumaroError,
FailedAttrVotingError,
FailedLabelVotingError,
MismatchingAttributesError,
Expand Down Expand Up @@ -1458,26 +1460,45 @@ def _compute_stats(stats, counts, mean_accessor, variance_accessor):
*__class__._compute_stats(stats[h:], counts[h:], m, v),
)

def __len__(self) -> int:
    # Number of items whose pixel statistics have been accumulated so far,
    # i.e. the size of the internal per-item stats mapping.
    return len(self._stats)


# Template for the result of compute_image_statistics().
# NOTE: holds mutable values ([], {}), so it must be deep-copied before being
# filled in (compute_image_statistics() does `deepcopy(IMAGE_STATS_SCHEMA)`).
IMAGE_STATS_SCHEMA = {
    "dataset": {
        "images count": 0,
        "unique images count": 0,
        "repeated images count": 0,
        "repeated images": [],  # [[id1, id2], [id3, id4, id5], ...]
    },
    "subsets": {},
}


def compute_image_statistics(dataset: IDataset):
stats = {
"dataset": {
"images count": 0,
"unique images count": 0,
"repeated images count": 0,
"repeated images": [], # [[id1, id2], [id3, id4, id5], ...]
},
"subsets": {},
}
if dataset.media_type() != Image:
raise DatumaroError(
f"Your dataset's media_type is {dataset.media_type()}, "
"but only Image media_type is allowed."
)

stats = deepcopy(IMAGE_STATS_SCHEMA)

stats_counter = _MeanStdCounter()
unique_counter = _ItemMatcher()

for item in dataset:
if not isinstance(item.media, Image):
warnings.warn(
f"item (id: {item.id}, subset: {item.subset})"
f" has media_type, {item.media} but only Image media_type is allowed."
)
continue

stats_counter.accumulate(item)
unique_counter.process_item(item)

def _extractor_stats(subset_name, extractor):
def _extractor_stats(subset_name):
sub_counter = _MeanStdCounter()
sub_counter._stats = {
k: v
Expand All @@ -1488,7 +1509,7 @@ def _extractor_stats(subset_name, extractor):
available = len(sub_counter._stats) != 0

stats = {
"images count": len(extractor),
"images count": len(sub_counter),
}

if available:
Expand All @@ -1510,16 +1531,14 @@ def _extractor_stats(subset_name, extractor):
return stats

for subset_name in dataset.subsets():
stats["subsets"][subset_name] = _extractor_stats(
subset_name, dataset.get_subset(subset_name)
)
stats["subsets"][subset_name] = _extractor_stats(subset_name)

unique_items = unique_counter.get_result()
repeated_items = [sorted(g) for g in unique_items.values() if 1 < len(g)]

stats["dataset"].update(
{
"images count": len(dataset),
"images count": len(stats_counter),
"unique images count": len(unique_items),
"repeated images count": len(repeated_items),
"repeated images": repeated_items, # [[id1, id2], [id3, id4, id5], ...]
Expand Down
1 change: 1 addition & 0 deletions tests/requirements.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class Requirements:
DATUM_BUG_606 = "transform with resize also changed the image extension from .jpg to .png"
DATUM_BUG_618 = "ResizeTransform returns broken image pixels"
DATUM_BUG_721 = "Explain command cannot find the model"
DATUM_BUG_873 = "Error using datum stats"


class SkipMessages:
Expand Down
Empty file.
102 changes: 102 additions & 0 deletions tests/unit/operations/test_statistics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

from typing import List, Tuple
from unittest.mock import patch

import numpy as np
import pytest

from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import DatumaroError
from datumaro.components.media import Image, PointCloud
from datumaro.components.operations import IMAGE_STATS_SCHEMA, compute_image_statistics

from tests.requirements import Requirements, mark_requirement


@pytest.fixture
def fxt_image_dataset_expected_mean_std():
    """Per-channel (mean, std) targets used to synthesize the image dataset."""
    return [100, 50, 150], [20, 50, 10]


@pytest.fixture
def fxt_image_dataset(fxt_image_dataset_expected_mean_std: Tuple[List[int], List[int]]):
    """Dataset of 4 random images plus one exact duplicate placed in the "train" subset.

    Pixels are drawn from a normal distribution around the fixture's expected
    per-channel mean/std so the computed statistics can be checked against them.
    """
    np.random.seed(3003)

    mean, std = fxt_image_dataset_expected_mean_std

    sizes = [(3000, 100), (800, 600), (400, 200), (700, 300)]
    items = []
    for idx, (width, height) in enumerate(sizes):
        pixels = np.random.normal(mean, std, size=(height, width, 3))
        items.append(DatasetItem(id=idx, media=Image(data=pixels)))

    dataset = Dataset.from_iterable(items)
    # Re-insert item "1" under a new id/subset to create a repeated image.
    dataset.put(dataset.get("1"), id="5", subset="train")
    return dataset


@pytest.fixture
def fxt_point_cloud_dataset():
    """Dataset of 5 PointCloud items — an invalid media type for image statistics."""
    items = [DatasetItem(id=idx, media=PointCloud(path="dummy.pcd")) for idx in range(5)]
    return Dataset.from_iterable(items, media_type=PointCloud)


# NOTE(review): pytest's default collection pattern is `python_classes = Test*`;
# confirm the project config also collects `*Test`-suffixed classes, otherwise
# these tests are silently skipped.
class ImageStatisticsTest:
    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_image_stats(
        self,
        fxt_image_dataset: Dataset,
        fxt_image_dataset_expected_mean_std: Tuple[List[int], List[int]],
    ):
        """compute_image_statistics() reports counts, duplicates and per-subset mean/std."""
        expected_mean, expected_std = fxt_image_dataset_expected_mean_std

        actual = compute_image_statistics(fxt_image_dataset)

        expected_dataset_section = {
            "images count": 5,
            "unique images count": 4,
            "repeated images count": 1,
            "repeated images": [[("1", "default"), ("5", "train")]],
        }
        assert actual["dataset"] == expected_dataset_section
        assert actual["subsets"]["default"]["images count"] == 4
        assert actual["subsets"]["train"]["images count"] == 1

        # The computed stats list channels in reverse order relative to the
        # fixture's expectations — hence the [::-1] before comparing.
        actual_mean = actual["subsets"]["default"]["image mean"][::-1]
        actual_std = actual["subsets"]["default"]["image std"][::-1]

        for expected, observed in zip(expected_mean, actual_mean):
            assert observed == pytest.approx(expected, 1e-2)
        for expected, observed in zip(expected_std, actual_std):
            assert observed == pytest.approx(expected, 1e-2)

    @mark_requirement(Requirements.DATUM_BUG_873)
    def test_invalid_media_type(
        self,
        fxt_point_cloud_dataset: Dataset,
    ):
        """Non-Image media is rejected at dataset level and warned about at item level."""
        # A dataset whose media_type() is PointCloud must be rejected outright.
        with pytest.raises(DatumaroError, match="only Image media_type is allowed"):
            compute_image_statistics(fxt_point_cloud_dataset)

        # Exceptional case of #873: the dataset claims Image media_type while its
        # items carry PointCloud media — items are skipped with a UserWarning and
        # the statistics stay at their schema defaults.
        with patch.object(Dataset, "media_type", return_value=Image):
            with pytest.warns(UserWarning, match="only Image media_type is allowed"):
                actual = compute_image_statistics(fxt_point_cloud_dataset)
            assert actual["dataset"] == IMAGE_STATS_SCHEMA["dataset"]
38 changes: 0 additions & 38 deletions tests/unit/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
UnionMerge,
WrongGroupError,
compute_ann_statistics,
compute_image_statistics,
find_unique_images,
mean_std,
)
Expand Down Expand Up @@ -62,43 +61,6 @@ def test_mean_std(self):
for estd, astd in zip(expected_std, actual_std):
self.assertAlmostEqual(estd, astd, places=0)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_image_stats(self):
expected_mean = [100, 50, 150]
expected_std = [20, 50, 10]

dataset = Dataset.from_iterable(
[
DatasetItem(
id=i,
media=Image(data=np.random.normal(expected_mean, expected_std, size=(h, w, 3))),
)
for i, (w, h) in enumerate([(3000, 100), (800, 600), (400, 200), (700, 300)])
]
)
dataset.put(dataset.get("1"), id="5", subset="train")

actual = compute_image_statistics(dataset)

self.assertEqual(
actual["dataset"],
{
"images count": 5,
"unique images count": 4,
"repeated images count": 1,
"repeated images": [[("1", "default"), ("5", "train")]],
},
)
self.assertEqual(actual["subsets"]["default"]["images count"], 4)
self.assertEqual(actual["subsets"]["train"]["images count"], 1)

actual_mean = actual["subsets"]["default"]["image mean"][::-1]
actual_std = actual["subsets"]["default"]["image std"][::-1]
for em, am in zip(expected_mean, actual_mean):
self.assertAlmostEqual(em, am, places=0)
for estd, astd in zip(expected_std, actual_std):
self.assertAlmostEqual(estd, astd, places=0)

@mark_requirement(Requirements.DATUM_GENERAL_REQ)
def test_stats(self):
dataset = Dataset.from_iterable(
Expand Down