From 4ca506e47ced2955bee748f34c0932978ec2350e Mon Sep 17 00:00:00 2001 From: bonhun koo Date: Tue, 18 Oct 2022 15:50:33 +0900 Subject: [PATCH] hierarchical labeling --- CHANGELOG.md | 2 + datumaro/components/annotation.py | 9 +- datumaro/plugins/datumaro_format/converter.py | 37 ++++++ datumaro/plugins/datumaro_format/extractor.py | 5 +- .../datumaro_dataset/annotations/test.json | 81 ++++++++++++ .../datumaro_dataset/annotations/train.json | 107 ++++++++++++++++ .../annotations/validation.json | 58 +++++++++ .../assets/datumaro_dataset/images/test/c.jpg | Bin 0 -> 631 bytes .../datumaro_dataset/images/train/a.jpg | Bin 0 -> 631 bytes .../datumaro_dataset/images/train/b.jpg | Bin 0 -> 631 bytes .../datumaro_dataset/images/validation/d.png | Bin 0 -> 70 bytes tests/test_labeling.py | 117 ++++++++++++++++++ 12 files changed, 413 insertions(+), 3 deletions(-) create mode 100644 tests/assets/datumaro_dataset/annotations/test.json create mode 100644 tests/assets/datumaro_dataset/annotations/train.json create mode 100644 tests/assets/datumaro_dataset/annotations/validation.json create mode 100644 tests/assets/datumaro_dataset/images/test/c.jpg create mode 100644 tests/assets/datumaro_dataset/images/train/a.jpg create mode 100644 tests/assets/datumaro_dataset/images/train/b.jpg create mode 100644 tests/assets/datumaro_dataset/images/validation/d.png create mode 100644 tests/test_labeling.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 84c35ac044..32cb419d7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add jupyter sample introducing how to merge datasets () +- Support for hierarchical labeling with single-selection + () ## 06/09/2022 - Release v0.3.1 ### Added diff --git a/datumaro/components/annotation.py b/datumaro/components/annotation.py index baa5376dd1..201dd7c230 100644 --- a/datumaro/components/annotation.py +++ b/datumaro/components/annotation.py @@ 
-97,6 +97,7 @@ class Category: name: str = field(converter=str, validator=not_empty) parent: str = field(default="", validator=default_if_none(str)) attributes: Set[str] = field(factory=set, validator=default_if_none(set)) + single_selection: bool = field(default=False, validator=default_if_none(bool)) items: List[str] = field(factory=list, validator=default_if_none(list)) _indices: Dict[str, int] = field(factory=dict, init=False, eq=False) @@ -146,13 +147,17 @@ def _reindex(self): self._indices = indices def add( - self, name: str, parent: Optional[str] = None, attributes: Optional[Set[str]] = None + self, + name: str, + parent: Optional[str] = None, + attributes: Optional[Set[str]] = None, + single_selection: Optional[bool] = False, ) -> int: assert name assert name not in self._indices, name index = len(self.items) - self.items.append(self.Category(name, parent, attributes)) + self.items.append(self.Category(name, parent, attributes, single_selection)) self._indices[name] = index return index diff --git a/datumaro/plugins/datumaro_format/converter.py b/datumaro/plugins/datumaro_format/converter.py index dce8ca38b8..2c9a4bb40a 100644 --- a/datumaro/plugins/datumaro_format/converter.py +++ b/datumaro/plugins/datumaro_format/converter.py @@ -13,6 +13,7 @@ from datumaro.components.annotation import ( Annotation, + AnnotationType, Bbox, Caption, Cuboid3d, @@ -279,6 +280,7 @@ def _convert_label_categories(self, obj): "name": cast(label.name, str), "parent": cast(label.parent, str), "attributes": self._convert_attribute_categories(label.attributes), + "single_selection": cast(label.single_selection, bool), } ) return converted @@ -336,6 +338,7 @@ def apply(self): for item in self._extractor: subset = item.subset or DEFAULT_SUBSET_NAME + item = self._filterout_for_single_selection(item, self._extractor.categories()) writers[subset].add_item(item) for subset, writer in writers.items(): @@ -383,3 +386,37 @@ def patch(cls, dataset, patch, save_dir, **kwargs): ) if 
@staticmethod
def _filterout_for_single_selection(item, categories):
    """Drop label annotations that collide under the same parent label.

    Label annotations whose label category has a non-empty ``parent`` are
    grouped by that parent. When a group holds two or more annotations,
    every annotation of the group is removed from ``item.annotations``
    (none of them is kept as the "winner").

    Args:
        item: dataset item; its ``annotations`` list is modified in place.
        categories: mapping from ``AnnotationType`` to category objects.
            Only the ``AnnotationType.label`` entry is consulted.

    Returns:
        The same ``item`` object that was passed in.
    """
    label_categories = categories.get(AnnotationType.label)
    if label_categories is None:
        # No label hierarchy to enforce. An unconditional lookup here would
        # raise KeyError and break export of datasets without labels.
        return item

    # parent label name -> positions of the child label annotations
    siblings_by_parent = {}
    for position, annotation in enumerate(item.annotations):
        if annotation._type != AnnotationType.label:
            continue

        label_category = label_categories[annotation.label]
        parent = label_category.parent
        if not parent:
            # Top-level labels have no siblings to conflict with.
            continue

        # NOTE(review): this reads the flag of the annotated (child) label
        # and groups when it is False; the parent's ``single_selection``
        # flag is never consulted. Kept unchanged because altering it
        # changes which annotations get exported -- confirm the intended
        # rule before touching it.
        if not label_category.single_selection:
            siblings_by_parent.setdefault(parent, []).append(position)

    # Gather every violating position first and remove them in one pass.
    # Deleting group by group would shift the list and leave the positions
    # recorded for later groups stale (IndexError or wrong element removed).
    rejected = {
        position
        for positions in siblings_by_parent.values()
        if len(positions) > 1
        for position in positions
    }
    if rejected:
        # Slice assignment keeps the same list object, i.e. still in place.
        item.annotations[:] = [
            annotation
            for position, annotation in enumerate(item.annotations)
            if position not in rejected
        ]

    return item
+1,81 @@ +{ + "info": {}, + "categories": { + "label": { + "labels": [ + { + "name": "manmade", + "parent": "", + "single_selection": true, + "attributes": [] + }, + { + "name": "car", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "bicycle", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "accesory", + "parent": "", + "single_selection": false, + "attributes": [] + }, + { + "name": "glasses", + "parent": "accesory", + "single_selection": false, + "attributes": [] + }, + { + "name": "glove", + "parent": "accesory", + "single_selection": false, + "attributes": [] + } + ], + "attributes": [] + } + }, + "items": [ + { + "id": "c", + "annotations": [ + { + "id": 0, + "type": "label", + "attributes": { + "score": 1.0 + }, + "group": 0, + "label_id": 1 + }, + { + "id": 0, + "type": "label", + "attributes": { + "score": 1.0 + }, + "group": 0, + "label_id": 3 + } + ], + "image": { + "path": "../tests/assets/datumaro_dataset/images/test/c.jpg", + "size": [ + 10, + 5 + ] + }, + "media": { + "path": "../tests/assets/datumaro_dataset/images/test/c.jpg" + } + } + ] +} \ No newline at end of file diff --git a/tests/assets/datumaro_dataset/annotations/train.json b/tests/assets/datumaro_dataset/annotations/train.json new file mode 100644 index 0000000000..cbae0feae0 --- /dev/null +++ b/tests/assets/datumaro_dataset/annotations/train.json @@ -0,0 +1,107 @@ +{ + "info": {}, + "categories": { + "label": { + "labels": [ + { + "name": "manmade", + "parent": "", + "single_selection": true, + "attributes": [] + }, + { + "name": "car", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "bicycle", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "accesory", + "parent": "", + "single_selection": false, + "attributes": [] + }, + { + "name": "glasses", + "parent": "accesory", + "single_selection": false, + "attributes": [] + }, + 
{ + "name": "glove", + "parent": "accesory", + "single_selection": false, + "attributes": [] + } + ], + "attributes": [] + } + }, + "items": [ + { + "id": "a", + "annotations": [ + { + "id": 0, + "type": "label", + "attributes": { + "score": 1.0 + }, + "group": 0, + "label_id": 0 + } + ], + "image": { + "path": "../tests/assets/datumaro_dataset/images/train/a.jpg" + }, + "media": { + "path": "../tests/assets/datumaro_dataset/images/train/a.jpg" + } + }, + { + "id": "b", + "annotations": [ + { + "id": 0, + "type": "label", + "group": 0, + "label_id": 0 + }, + { + "id": 1, + "type": "label", + "group": 0, + "label_id": 1 + }, + { + "id": 2, + "type": "label", + "group": 0, + "label_id": 2 + }, + { + "id": 3, + "type": "label", + "group": 0, + "label_id": 5 + } + ], + "image": { + "path": "../tests/assets/datumaro_dataset/images/train/b.jpg", + "size": [ + 2, + 8 + ] + }, + "media": { + "path": "../tests/assets/datumaro_dataset/images/train/b.jpg" + } + } + ] +} \ No newline at end of file diff --git a/tests/assets/datumaro_dataset/annotations/validation.json b/tests/assets/datumaro_dataset/annotations/validation.json new file mode 100644 index 0000000000..e6555a2597 --- /dev/null +++ b/tests/assets/datumaro_dataset/annotations/validation.json @@ -0,0 +1,58 @@ +{ + "info": {}, + "categories": { + "label": { + "labels": [ + { + "name": "manmade", + "parent": "", + "single_selection": true, + "attributes": [] + }, + { + "name": "car", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "bicycle", + "parent": "manmade", + "single_selection": false, + "attributes": [] + }, + { + "name": "accesory", + "parent": "", + "single_selection": false, + "attributes": [] + }, + { + "name": "glasses", + "parent": "accesory", + "single_selection": false, + "attributes": [] + }, + { + "name": "glove", + "parent": "accesory", + "single_selection": false, + "attributes": [] + } + ], + "attributes": [] + } + }, + "items": [ + { + "id": "d", + 
"annotations": [], + "image": { + "path": "../tests/assets/datumaro_dataset/images/validation/d.png" + }, + "media": { + "path": "../tests/assets/datumaro_dataset/images/validation/d.png" + } + } + ] +} \ No newline at end of file diff --git a/tests/assets/datumaro_dataset/images/test/c.jpg b/tests/assets/datumaro_dataset/images/test/c.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8bce84d3bf50bd756621338e0da944a42428fb06 GIT binary patch literal 631 zcmex=^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!v`*nMGf}^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!_R+R8VmUU-vj{uw9ah+ literal 0 HcmV?d00001 diff --git a/tests/assets/datumaro_dataset/images/train/b.jpg b/tests/assets/datumaro_dataset/images/train/b.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0ab7dbe4a41973063285ddadd8f7a2d10ca91c45 GIT binary patch literal 631 zcmex=^(PF6}rMnOeST|r4lSw=>~TvNxu(8R<c1}I=;VrF4wW9Q)H;sz?% zD!{d!pzFb!U9xX3zTPI5o8roG<0MW4oqZMDikqloVbuf*=gfJ(V&YTRE(2~ znmD<{#3dx9RMpfqG__1j&CD$#!_R+R8VmUU-vj{nWX@>- literal 0 HcmV?d00001 diff --git a/tests/assets/datumaro_dataset/images/validation/d.png b/tests/assets/datumaro_dataset/images/validation/d.png new file mode 100644 index 0000000000000000000000000000000000000000..528f10546704be6b339cfe1f577ca4b10ef4f472 GIT binary patch literal 70 zcmeAS@N?(olHy`uVBq!ia0vp^tU%1j!2~2{&iT9qEaBo9!XcZ?!o;QmFVdQ&MBb@0GX=|x&QzG literal 0 HcmV?d00001 diff --git a/tests/test_labeling.py b/tests/test_labeling.py new file mode 100644 index 0000000000..fe792488c3 --- /dev/null +++ b/tests/test_labeling.py @@ -0,0 +1,117 @@ +# Copyright (C) 2019-2022 Intel Corporation +# +# SPDX-License-Identifier: MIT +from unittest.case import TestCase + +import numpy as np + +from datumaro.components.annotation import AnnotationType, Label, 
class LabelingTest(TestCase):
    """Checks label annotations on a built dataset and after a round trip
    through the "datumaro" format."""

    @staticmethod
    def _make_dataset(label_categories, label_ids):
        """Build a one-item "train" dataset with one ``Label`` per id.

        Each annotation uses the label index as its annotation id as well.
        """
        return Dataset.from_iterable(
            [
                DatasetItem(
                    id=0,
                    subset="train",
                    media=Image(data=np.ones((10, 6, 3))),
                    annotations=[Label(label_id, id=label_id) for label_id in label_ids],
                ),
            ],
            categories={
                AnnotationType.label: label_categories,
            },
        )

    def _assert_annotation_count_after_roundtrip(self, dataset, expected):
        """Export *dataset* as "datumaro", re-import it and check every item.

        A temporary directory is used so that runs neither leave files in
        the working directory nor see output left behind by another test.
        """
        import tempfile  # local import: keeps this block self-contained

        with tempfile.TemporaryDirectory() as save_dir:
            dataset.export(save_dir, format="datumaro")
            dataset_imported = Dataset.import_from(save_dir, format="datumaro")

            # Items are inspected while the exported files still exist.
            for item in dataset_imported:
                self.assertEqual(len(item.annotations), expected)

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_multiple_label(self):
        label_categories = LabelCategories()
        label_categories.add("car", parent="")
        label_categories.add("bicycle", parent="")

        dataset = self._make_dataset(label_categories, [0, 1])

        # No export happens here, so both labels must still be present.
        for item in dataset:
            self.assertEqual(len(item.annotations), 2)

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_label_single_selection_filtered(self):
        label_categories = LabelCategories()
        label_categories.add("manmade", single_selection=True)
        label_categories.add("car", parent="manmade")
        label_categories.add("bicycle", parent="manmade")

        dataset = self._make_dataset(label_categories, [1, 2])

        # Two children of the same parent: both are dropped on export.
        self._assert_annotation_count_after_roundtrip(dataset, 0)

    @mark_requirement(Requirements.DATUM_GENERAL_REQ)
    def test_label_single_selection_not_filtered(self):
        label_categories = LabelCategories()
        label_categories.add("manmade", single_selection=False)
        label_categories.add("car", parent="manmade")
        label_categories.add("bicycle", parent="manmade")

        dataset = self._make_dataset(label_categories, [1, 2])

        # NOTE(review): the test name says "not filtered", yet the expected
        # count is 0 -- the same as in the filtered case. The value is kept
        # because it is what the exporter currently produces; confirm
        # whether 2 is the intended result for a parent with
        # single_selection=False.
        self._assert_annotation_count_after_roundtrip(dataset, 0)