Merge pull request #47 from openvinotoolkit/develop

Release v0.1.3
openvinotoolkit · Oct 29, 2020 · c59e169 · c59e169
2 parents 86f7f6b + ba439fc
commit c59e169
Show file tree

Hide file tree

Showing 25 changed files with 509 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Security
 -
 
+## 10/28/2020 - Release v0.1.3
+### Added
+- `ImageNet` and `ImageNetTxt` dataset formats (<https://github.com/openvinotoolkit/datumaro/pull/41>)
+
+### Changed
+-
+
+### Deprecated
+-
+
+### Removed
+-
+
+### Fixed
+- Default `label-map` parameter value for VOC converter (<https://github.com/openvinotoolkit/datumaro/pull/34>)
+- Randomness of random split transform (<https://github.com/openvinotoolkit/datumaro/pull/38>)
+- `Transform.subsets()` method (<https://github.com/openvinotoolkit/datumaro/pull/38>)
+- Supported unknown image formats in TF Detection API converter (<https://github.com/openvinotoolkit/datumaro/pull/40>)
+- Supported empty attribute values in CVAT extractor (<https://github.com/openvinotoolkit/datumaro/pull/45>)
+
+### Security
+-
+
 
 ## 10/05/2020 - Release v0.1.2
 ### Added

diff --git a/README.md b/README.md
@@ -113,6 +113,7 @@ CVAT annotations                             ---> Publication, statistics etc.
   - [TF Detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/using_your_own_dataset.md) (`bboxes`, `masks`)
   - [MOT sequences](https://arxiv.org/pdf/1906.04567.pdf)
   - [MOTS PNG](https://www.vision.rwth-aachen.de/page/mots)
+  - [ImageNet](http://image-net.org/)
   - [CVAT](https://github.com/opencv/cvat/blob/develop/cvat/apps/documentation/xml_format.md)
   - [LabelMe](http://labelme.csail.mit.edu/Release3.0)
 - Dataset building

diff --git a/datumaro/components/extractor.py b/datumaro/components/extractor.py
@@ -643,7 +643,7 @@ def categories(self):
     def subsets(self):
         if self._subsets is None:
             self._subsets = set(self._extractor.subsets())
-        return self._subsets
+        return super().subsets()
 
     def __len__(self):
         assert self._length in {None, 'parent'} or isinstance(self._length, int)

diff --git a/datumaro/plugins/cvat_format/extractor.py b/datumaro/plugins/cvat_format/extractor.py
@@ -86,7 +86,7 @@ def _parse(cls, path):
                     }
             elif ev == 'end':
                 if el.tag == 'attribute' and attributes is not None:
-                    attr_value = el.text
+                    attr_value = el.text or ''
                     if el.text in ['true', 'false']:
                         attr_value = attr_value == 'true'
                     else:

diff --git a/datumaro/plugins/imagenet_format.py b/datumaro/plugins/imagenet_format.py
@@ -0,0 +1,89 @@
+
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from glob import glob
+import logging as log
+import os
+import os.path as osp
+
+from datumaro.components.extractor import (DatasetItem, Label,
+    LabelCategories, AnnotationType, SourceExtractor, Importer
+)
+from datumaro.components.converter import Converter
+
+
+class ImagenetPath:
+    """Names shared by the ImageNet extractor and converter."""
+
+    # Directory (and file name prefix) of the images that have no labels
+    IMAGES_DIR_NO_LABEL = 'no_label'
+    # Extension of the image files of the format
+    IMAGES_EXT = '.jpg'
+
+
+class ImagenetExtractor(SourceExtractor):
+    """
+    Reads a dataset stored in the ImageNet directory layout.
+
+    Every directory in the dataset root is treated as a label, except for
+    the 'no_label' one, which holds the images without labels. Only the
+    '.jpg' files located directly in these directories are read. The item
+    id is the file name without the extension and without its first
+    len(<label>) + 1 characters (the '<label>_' prefix). Files with the
+    same id found in several label directories form a single item with
+    several labels.
+    """
+
+    def __init__(self, path, subset=None):
+        assert osp.isdir(path), path
+        super().__init__(subset=subset)
+
+        self._categories = self._load_categories(path)
+        self._items = list(self._load_items(path).values())
+
+    def _load_categories(self, path):
+        """Creates a label category for each label directory in 'path'."""
+        label_cat = LabelCategories()
+        for images_dir in sorted(os.listdir(path)):
+            # listdir() also returns plain files; a stray file in the
+            # dataset root is not a label
+            if not osp.isdir(osp.join(path, images_dir)):
+                continue
+            if images_dir != ImagenetPath.IMAGES_DIR_NO_LABEL:
+                label_cat.add(images_dir)
+        return { AnnotationType.label: label_cat }
+
+    def _load_items(self, path):
+        """
+        Collects the items from the '<path>/<label>/<image>' files.
+
+        Returns: dict(item id -> DatasetItem)
+        """
+        items = {}
+        # glob() returns paths in arbitrary order, so they are sorted to make
+        # the order of the items and of their labels reproducible
+        for image_path in sorted(glob(osp.join(path, '*', '*'))):
+            if osp.splitext(image_path)[1] != ImagenetPath.IMAGES_EXT:
+                continue
+            label = osp.basename(osp.dirname(image_path))
+            # Strip the '<label>_' prefix from the file name
+            image_name = osp.splitext(osp.basename(image_path))[0][len(label) + 1:]
+            item = items.get(image_name)
+            if item is None:
+                # The first file found for an id provides the item image
+                item = DatasetItem(id=image_name, subset=self._subset,
+                    image=image_path)
+                items[image_name] = item
+            if label != ImagenetPath.IMAGES_DIR_NO_LABEL:
+                label = self._categories[AnnotationType.label].find(label)[0]
+                item.annotations.append(Label(label=label))
+        return items
+
+
+class ImagenetImporter(Importer):
+    """Describes a directory as a dataset source in the 'imagenet' format."""
+
+    @classmethod
+    def find_sources(cls, path):
+        """
+        Returns a list with a single 'imagenet' source if 'path' is
+        a directory, and an empty list otherwise.
+        """
+        sources = []
+        if osp.isdir(path):
+            sources.append({ 'url': path, 'format': 'imagenet' })
+        return sources
+
+
+class ImagenetConverter(Converter):
+    """
+    Saves a dataset in the ImageNet directory layout.
+
+    An image is written once per each of its labels, as
+    '<label>/<label>_<item id>.jpg'. Images without labels are written
+    to the 'no_label' directory. Subsets are not separated in the output.
+    """
+
+    DEFAULT_IMAGE_EXT = ImagenetPath.IMAGES_EXT
+
+    def apply(self):
+        if 1 < len(self._extractor.subsets()):
+            log.warning("ImageNet format supports exporting only a single "
+                "subset, subset information will not be used.")
+
+        subset_dir = self._save_dir
+        extractor = self._extractor
+        for item in extractor:
+            image_name = item.id
+            # Only the classification labels select the output directories.
+            # Other annotation types also have a 'label' field and must not
+            # be exported as image classes.
+            labels = set(p.label for p in item.annotations
+                if p.type == AnnotationType.label)
+            for label in labels:
+                label_name = extractor.categories()[AnnotationType.label][label].name
+                self._save_image(item, osp.join(subset_dir, label_name,
+                    '%s_%s%s' % \
+                    (label_name, image_name, ImagenetPath.IMAGES_EXT)
+                ))
+
+            if not labels:
+                self._save_image(item, osp.join(subset_dir,
+                    ImagenetPath.IMAGES_DIR_NO_LABEL,
+                    ImagenetPath.IMAGES_DIR_NO_LABEL + '_' +
+                    image_name + ImagenetPath.IMAGES_EXT))
diff --git a/datumaro/plugins/imagenet_txt_format.py b/datumaro/plugins/imagenet_txt_format.py
@@ -0,0 +1,105 @@
+
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+from glob import glob
+import os
+import os.path as osp
+
+from datumaro.components.extractor import (DatasetItem, Label,
+    LabelCategories, AnnotationType, SourceExtractor, Importer
+)
+from datumaro.components.converter import Converter
+
+
+class ImagenetTxtPath:
+    """Names shared by the ImageNet txt extractor, importer and converter."""
+
+    # Default directory with the images, next to the annotation files
+    IMAGE_DIR = 'images'
+    # File with the label names, one per line
+    LABELS_FILE = 'synsets.txt'
+
+class ImagenetTxtExtractor(SourceExtractor):
+    """
+    Reads one subset of a dataset in the ImageNet txt format.
+
+    Parameters:
+        path - an annotation file with lines of the form
+            '<item id> [<label id> ...]'. The file name without
+            the extension is used as the subset name.
+        labels - an optional list of label names. By default, the names
+            are read from the 'synsets.txt' file located next to 'path'.
+        image_dir - an optional directory with images, relative to the
+            directory of 'path'. 'images' by default. Item images are
+            expected at '<image_dir>/<item id>.jpg'.
+    """
+
+    def __init__(self, path, labels=None, image_dir=None):
+        assert osp.isfile(path), path
+        super().__init__(subset=osp.splitext(osp.basename(path))[0])
+
+        if not image_dir:
+            image_dir = ImagenetTxtPath.IMAGE_DIR
+        self.image_dir = osp.join(osp.dirname(path), image_dir)
+
+        if labels is None:
+            labels = osp.join(osp.dirname(path), ImagenetTxtPath.LABELS_FILE)
+            labels = self._parse_labels(labels)
+        else:
+            assert all(isinstance(e, str) for e in labels)
+
+        self._categories = self._load_categories(labels)
+        self._items = list(self._load_items(path).values())
+
+    @staticmethod
+    def _parse_labels(path):
+        """Reads label names from a file, one name per line."""
+        with open(path, encoding='utf-8') as labels_file:
+            return [s.strip() for s in labels_file]
+
+    def _load_categories(self, labels):
+        """Builds dataset categories from a sequence of label names."""
+        return { AnnotationType.label: LabelCategories().from_iterable(labels) }
+
+    def _load_items(self, path):
+        """
+        Parses the annotation file.
+
+        Returns: dict(item id -> DatasetItem). If an id is repeated,
+        the last line with it is used.
+        """
+        items = {}
+        label_count = len(self._categories[AnnotationType.label])
+        with open(path, encoding='utf-8') as f:
+            for line in f:
+                fields = line.split()
+                if not fields:
+                    # A blank line (e.g. at the end of the file)
+                    # describes no item
+                    continue
+                item_id = fields[0]
+                label_ids = [int(label_id) for label_id in fields[1:]]
+                anno = []
+                for label in label_ids:
+                    assert 0 <= label < label_count, \
+                        "Image '%s': unknown label id '%s'" % (item_id, label)
+                    anno.append(Label(label))
+                items[item_id] = DatasetItem(id=item_id, subset=self._subset,
+                    image=osp.join(self.image_dir, item_id + '.jpg'),
+                    annotations=anno)
+        return items
+
+
+class ImagenetTxtImporter(Importer):
+    """Finds the ImageNet txt annotation files in a directory."""
+
+    @classmethod
+    def find_sources(cls, path):
+        """
+        Collects 'imagenet_txt' sources for every '*.txt' file found
+        directly in 'path', except for the file with label names.
+        """
+        sources = []
+        for subset_path in glob(osp.join(path, '*.txt')):
+            if osp.basename(subset_path) == ImagenetTxtPath.LABELS_FILE:
+                continue
+            sources.extend(cls._find_sources_recursive(
+                subset_path, '.txt', 'imagenet_txt'))
+        return sources
+
+
+class ImagenetTxtConverter(Converter):
+    """
+    Saves a dataset in the ImageNet txt format.
+
+    Writes a '<subset name>.txt' annotation file per subset, with lines
+    of the form '<item id> <label id> ...', and the 'synsets.txt' file
+    with label names. Images, if requested, go to the 'images' directory.
+    """
+
+    DEFAULT_IMAGE_EXT = '.jpg'
+
+    def apply(self):
+        save_dir = self._save_dir
+        os.makedirs(save_dir, exist_ok=True)
+
+        for subset_name, subset in self._extractor.subsets().items():
+            # item id -> label ids of the item, already converted to strings
+            item_labels = {}
+            for item in subset:
+                item_labels[item.id] = [str(ann.label)
+                    for ann in item.annotations
+                    if ann.type == AnnotationType.label]
+
+                if self._save_images and item.has_image:
+                    image_path = osp.join(save_dir, ImagenetTxtPath.IMAGE_DIR,
+                        self._make_image_filename(item))
+                    self._save_image(item, image_path)
+
+            subset_file = osp.join(save_dir, '%s.txt' % subset_name)
+            with open(subset_file, 'w', encoding='utf-8') as f:
+                for item_id, label_ids in item_labels.items():
+                    f.write('%s %s\n' % (item_id, ' '.join(label_ids)))
+
+        # Label names are written one per line, in the order of the categories
+        with open(osp.join(save_dir, ImagenetTxtPath.LABELS_FILE), 'w',
+                encoding='utf-8') as f:
+            label_names = [label.name for label in
+                self._extractor.categories()[AnnotationType.label]]
+            f.write('\n'.join(label_names))
diff --git a/datumaro/plugins/mots_format.py b/datumaro/plugins/mots_format.py
@@ -113,6 +113,7 @@ def apply(self):
             subset_dir = osp.join(self._save_dir, subset_name)
             images_dir = osp.join(subset_dir, MotsPath.IMAGE_DIR)
             anno_dir = osp.join(subset_dir, MotsPath.MASKS_DIR)
+            os.makedirs(anno_dir, exist_ok=True)
 
             for item in subset:
                 log.debug("Converting item '%s'", item.id)

diff --git a/datumaro/plugins/tf_detection_api_format/converter.py b/datumaro/plugins/tf_detection_api_format/converter.py
@@ -199,7 +199,7 @@ def _make_tf_example(self, item):
     def _save_image(self, item, path=None):
         src_ext = item.image.ext.lower()
         dst_ext = osp.splitext(osp.basename(path))[1].lower()
-        fmt = DetectionApiPath.IMAGE_EXT_FORMAT.get(dst_ext)
+        fmt = DetectionApiPath.IMAGE_EXT_FORMAT.get(dst_ext, '')
         if not fmt:
             log.warning("Item '%s': can't find format string for the '%s' "
                 "image extension, the corresponding field will be empty." % \

diff --git a/datumaro/plugins/tf_detection_api_format/format.py b/datumaro/plugins/tf_detection_api_format/format.py
@@ -8,6 +8,6 @@ class DetectionApiPath:
     ANNOTATIONS_DIR = 'annotations'
 
     DEFAULT_IMAGE_EXT = '.jpg'
-    IMAGE_EXT_FORMAT = {'.jpg': 'jpeg', '.png': 'png'}
+    IMAGE_EXT_FORMAT = {'.jpg': 'jpeg', '.jpeg': 'jpeg', '.png': 'png'}
 
     LABELMAP_FILE = 'label_map.pbtxt'
diff --git a/datumaro/plugins/transforms.py b/datumaro/plugins/transforms.py
@@ -355,24 +355,27 @@ def __init__(self, extractor, splits, seed=None):
 
         dataset_size = len(extractor)
         indices = list(range(dataset_size))
-
         random.seed(seed)
         random.shuffle(indices)
         parts = []
         s = 0
-        for subset, ratio in splits:
+        lower_boundary = 0
+        for split_idx, (subset, ratio) in enumerate(splits):
             s += ratio
-            boundary = int(s * dataset_size)
-            parts.append((boundary, subset))
-
+            upper_boundary = int(s * dataset_size)
+            if split_idx == len(splits) - 1:
+                upper_boundary = dataset_size
+            subset_indices = set(indices[lower_boundary : upper_boundary])
+            parts.append((subset_indices, subset))
+            lower_boundary = upper_boundary
         self._parts = parts
 
         self._subsets = set(s[0] for s in splits)
         self._length = 'parent'
 
     def _find_split(self, index):
-        for boundary, subset in self._parts:
-            if index < boundary:
+        for subset_indices, subset in self._parts:
+            if index in subset_indices:
                 return subset
         return subset # all the possible remainder goes to the last split
 

diff --git a/datumaro/plugins/voc_format/converter.py b/datumaro/plugins/voc_format/converter.py
@@ -103,7 +103,7 @@ def __init__(self, extractor, save_dir,
         self._allow_attributes = allow_attributes
 
         if label_map is None:
-            label_map = LabelmapType.source
+            label_map = LabelmapType.source.name
         self._load_categories(label_map)
 
     def apply(self):

diff --git a/datumaro/util/test_utils.py b/datumaro/util/test_utils.py
@@ -92,7 +92,7 @@ def compare_datasets(test, expected, actual, ignored_attrs=None,
             x.subset == item_a.subset)
         test.assertFalse(item_b is None, item_a.id)
         test.assertEqual(item_a.attributes, item_b.attributes)
-        if require_images or \
+        if (require_images and item_a.has_image and item_a.image.has_data) or \
                 item_a.has_image and item_a.image.has_data and \
                 item_b.has_image and item_b.image.has_data:
             test.assertEqual(item_a.image, item_b.image, item_a.id)

diff --git a/datumaro/version.py b/datumaro/version.py
@@ -1 +1 @@
-VERSION = '0.1.0'
+VERSION = '0.1.3'
diff --git a/docs/user_manual.md b/docs/user_manual.md
@@ -97,6 +97,10 @@ List of supported formats:
 - MOTS (png)
   - [Format specification](https://www.vision.rwth-aachen.de/page/mots)
   - [Dataset example](../tests/assets/mots_dataset)
+- ImageNet (`classification`, `detection`)
+  - [Dataset example](../tests/assets/imagenet_dataset)
+  - [Dataset example (txt for classification)](../tests/assets/imagenet_txt_dataset)
+  - Detection format is the same as in PASCAL VOC
 - CVAT
   - [Format specification](https://github.com/opencv/cvat/blob/develop/cvat/apps/documentation/xml_format.md)
   - [Dataset example](../tests/assets/cvat_dataset)

diff --git a/tests/assets/imagenet_dataset/label_0/label_0_1.jpg b/tests/assets/imagenet_dataset/label_0/label_0_1.jpg
diff --git a/tests/assets/imagenet_dataset/label_0/label_0_2.jpg b/tests/assets/imagenet_dataset/label_0/label_0_2.jpg
diff --git a/tests/assets/imagenet_dataset/label_1/label_1_1.jpg b/tests/assets/imagenet_dataset/label_1/label_1_1.jpg
diff --git a/tests/assets/imagenet_txt_dataset/images/1.jpg b/tests/assets/imagenet_txt_dataset/images/1.jpg
diff --git a/tests/assets/imagenet_txt_dataset/images/2.jpg b/tests/assets/imagenet_txt_dataset/images/2.jpg
diff --git a/tests/assets/imagenet_txt_dataset/synsets.txt b/tests/assets/imagenet_txt_dataset/synsets.txt
@@ -0,0 +1,10 @@
+label_0
+label_1
+label_2
+label_3
+label_4
+label_5
+label_6
+label_7
+label_8
+label_9
diff --git a/tests/assets/imagenet_txt_dataset/train.txt b/tests/assets/imagenet_txt_dataset/train.txt
@@ -0,0 +1,4 @@
+1 0
+2 5
+3 3
+4 5
diff --git a/tests/test_cvat_format.py b/tests/test_cvat_format.py
@@ -150,18 +150,18 @@ def test_can_save_and_load(self):
         label_categories = LabelCategories()
         for i in range(10):
             label_categories.add(str(i))
-        label_categories.items[2].attributes.update(['a1', 'a2'])
+        label_categories.items[2].attributes.update(['a1', 'a2', 'empty'])
         label_categories.attributes.update(['occluded'])
 
         source_dataset = Dataset.from_iterable([
             DatasetItem(id=0, subset='s1', image=np.zeros((5, 10, 3)),
                 annotations=[
                     Polygon([0, 0, 4, 0, 4, 4],
                         label=1, group=4,
-                        attributes={ 'occluded': True }),
+                        attributes={ 'occluded': True}),
                     Points([1, 1, 3, 2, 2, 3],
                         label=2,
-                        attributes={ 'a1': 'x', 'a2': 42,
+                        attributes={ 'a1': 'x', 'a2': 42, 'empty': '',
                             'unknown': 'bar' }),
                     Label(1),
                     Label(2, attributes={ 'a1': 'y', 'a2': 44 }),
@@ -199,7 +199,7 @@ def test_can_save_and_load(self):
                         attributes={ 'occluded': True }),
                     Points([1, 1, 3, 2, 2, 3],
                         label=2,
-                        attributes={ 'occluded': False,
+                        attributes={ 'occluded': False, 'empty': '',
                             'a1': 'x', 'a2': 42 }),
                     Label(1),
                     Label(2, attributes={ 'a1': 'y', 'a2': 44 }),
-Original file line number
+Diff line change
@@ -0,0 +1,10 @@
+    label_0
+    label_1
+    label_2
+    label_3
+    label_4
+    label_5
+    label_6
+    label_7
+    label_8
+    label_9