Fix Mapillary Vistas data format (#977)

### Summary

This PR fixes the Mapillary Vistas data format support:

- Added `format_version` ("v1.2" or "v2.0", default "v2.0") and `parse_polygon`
  options to the format base and importer, exposed on the CLI as
  `--format-version` and `--parse-polygon`.
- Raised `ImportError` when `parse_polygon=True` is combined with v1.2, which
  has no polygon annotations.
- Resolved the panoptic config file per version (`panoptic_2018.json` for v1.2,
  `panoptic_2020.json` for v2.0) and added `PANOPTIC_DIR` to the v1.2
  annotation directories.
- Parsed polygon annotations inline while loading panoptic and instance items,
  replacing the separate `_load_polygons()` pass, and built instance items by
  walking the images directory; polygon coordinates are now cast to `int`.
- Converted `%`-style error and warning messages to f-strings.
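
A minimal usage sketch of the new options through the Python API (hedged: the
`Dataset.import_from` entry point and the `mapillary_vistas` format name follow
Datumaro's usual conventions and are not part of this diff):

```python
# Sketch only: assumes the "mapillary_vistas" format name and the public
# Dataset.import_from() API; paths are placeholders.
from datumaro.components.dataset import Dataset

# v2.0 layout ships polygon JSON files, so parse_polygon may be enabled.
dataset = Dataset.import_from(
    "path/to/mapillary_vistas",
    "mapillary_vistas",
    format_version="v2.0",
    parse_polygon=True,
)

# v1.2 has no polygon annotations; requesting them raises ImportError.
dataset_v12 = Dataset.import_from(
    "path/to/mapillary_vistas",
    "mapillary_vistas",
    format_version="v1.2",
)
```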

### How to test
The changes are covered by unit tests (see the checklist below). For a manual
check against a local copy of the dataset, a hedged sketch:
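
```python
# Hedged manual check: the dataset path is a placeholder for a local copy of
# Mapillary Vistas v2.0; Polygon lives in datumaro.components.annotation.
from datumaro.components.annotation import Polygon
from datumaro.components.dataset import Dataset

dataset = Dataset.import_from(
    "path/to/mapillary_vistas",
    "mapillary_vistas",
    format_version="v2.0",
    parse_polygon=True,
)
assert any(
    isinstance(ann, Polygon) for item in dataset for ann in item.annotations
), "expected polygon annotations to be parsed for v2.0"
```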

### Checklist
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [x] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [ ] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Co-authored-by: wonjuleee <wonju@intel.com>
wonjuleee authored Apr 27, 2023
1 parent 6f1c6dd commit ce714d2

Showing 53 changed files with 685 additions and 790 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -6,11 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## \[Unreleased\]

### New features
- Add CocoRoboflowImporter
(<https://github.com/openvinotoolkit/datumaro/pull/976>)

### Enhancements

### Bug fixes
- Fix Mapillary Vistas data format (<https://github.com/openvinotoolkit/datumaro/pull/977>)

## 20/04/2023 - Release 1.2.0
### New features
- Add Skill Up section to documentation
164 changes: 86 additions & 78 deletions datumaro/plugins/data_formats/mapillary_vistas/base.py
@@ -1,7 +1,6 @@
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
import glob
import logging as log
import os
import os.path as osp
@@ -34,8 +33,21 @@

class _MapillaryVistasBase(SubsetBase):
def __init__(
self, path, task, subset=None, use_original_config=False, keep_original_category_ids=False
self,
path,
task,
subset=None,
use_original_config=False,
keep_original_category_ids=False,
format_version="v2.0",
parse_polygon=False,
):
if format_version == "v1.2" and parse_polygon is True:
raise ImportError(
f"Format version {format_version} is not available for polygons. "
"Please try with v2.0 for parsing polygons."
)

assert osp.isdir(path), path
self._path = path
if subset is None:
@@ -45,24 +57,25 @@ def __init__(
annotations_dirs = [d for d in os.listdir(path) if d in MapillaryVistasPath.ANNOTATION_DIRS]

if len(annotations_dirs) == 0:
expected_dirs = ",".join(MapillaryVistasPath.ANNOTATION_DIRS[format_version])
raise NotADirectoryError(
"Can't find annotation directory at %s. "
"Expected one of these directories: %s"
% (path, ",".join(MapillaryVistasPath.ANNOTATIONS_DIR_PATTERNS))
f"Can't find annotation directory at {path}. "
f"Expected one of these directories: {expected_dirs}."
)
elif len(annotations_dirs) > 1:
skipped_dirs = ",".join(annotations_dirs[1:])
log.warning(
"Directory(-es): %s will be skipped, dataset should contain "
"only one annotation directory" % ",".join(annotations_dirs[1:])
f"Directory(-es): {skipped_dirs} will be skipped, dataset should "
"contain only one annotation directory"
)

self._use_original_config = use_original_config
self._format_version = annotations_dirs[0]
self._annotations_dir = osp.join(path, annotations_dirs[0])
self._format_version = format_version
self._parse_polygon = parse_polygon
self._annotations_dir = osp.join(path, format_version)
self._images_dir = osp.join(path, MapillaryVistasPath.IMAGES_DIR)
self._task = task

if self._task == MapillaryVistasTask.instances:
if task == MapillaryVistasTask.instances:
if has_meta_file(path):
self._categories = make_mapillary_instance_categories(parse_meta_file(path))
else:
@@ -75,16 +88,16 @@ def __init__(
)
self._items = self._load_panoptic_items(panoptic_config)

@staticmethod
def _load_panoptic_config(path):
def _load_panoptic_config(self, path):
panoptic_config_path = osp.join(
path, MapillaryVistasPath.PANOPTIC_DIR, MapillaryVistasPath.PANOPTIC_CONFIG
path,
MapillaryVistasPath.PANOPTIC_DIR,
MapillaryVistasPath.PANOPTIC_CONFIG[self._format_version],
)

if not osp.isfile(panoptic_config_path):
raise FileNotFoundError(
"Can't find panoptic config file: '%s' at '%s'"
% (MapillaryVistasPath.PANOPTIC_CONFIG, panoptic_config_path)
f"Can't find panoptic config file: {MapillaryVistasPath.PANOPTIC_CONFIG} at {panoptic_config_path}"
)

return parse_json_file(panoptic_config_path)
@@ -127,6 +140,8 @@ def _load_panoptic_items(self, config):
for img in config["images"]
}

polygon_dir = osp.join(self._annotations_dir, MapillaryVistasPath.POLYGON_DIR)

for item_ann in config["annotations"]:
item_id = item_ann["image_id"]
image = None
@@ -136,13 +151,13 @@ def _load_panoptic_items(self, config):
size=self._get_image_size(images_info[item_id]),
)

annotations = []
mask_path = osp.join(
self._annotations_dir, MapillaryVistasPath.PANOPTIC_DIR, item_ann["file_name"]
)
mask = lazy_image(mask_path, loader=self._load_pan_mask)
mask = CompiledMask(instance_mask=mask)

annotations = []
for segment_info in item_ann["segments_info"]:
cat_id = self._get_label_id(segment_info)
segment_id = segment_info["id"]
@@ -157,11 +172,24 @@
)
)

if self._parse_polygon:
polygon_path = osp.join(polygon_dir, item_id + ".json")
item_info = parse_json_file(polygon_path)

polygons = item_info["objects"]
for polygon in polygons:
label = polygon["label"]
label_id = self._categories[AnnotationType.label].find(label)[0]
if label_id is None:
label_id = self._categories[AnnotationType.label].add(label)

points = [int(coord) for point in polygon["polygon"] for coord in point]
annotations.append(Polygon(label=label_id, points=points))

items[item_id] = DatasetItem(
id=item_id, subset=self._subset, annotations=annotations, media=image
)

self._load_polygons(items)
return items.values()

def _load_instances_categories(self):
@@ -180,80 +208,60 @@ def _load_instances_categories(self):
def _load_instances_items(self):
items = {}

instances_dir = osp.join(self._annotations_dir, MapillaryVistasPath.INSTANCES_DIR)
for instance_path in find_images(instances_dir, recursive=True):
item_id = osp.splitext(osp.relpath(instance_path, instances_dir))[0]
# class_dir = osp.join(self._annotations_dir, MapillaryVistasPath.CLASS_DIR)
# for class_path in find_images(class_dir, recursive=True):
# item_id = osp.splitext(osp.relpath(class_path, class_dir))[0]
# if item_id in items:
# continue

mask = load_image(instance_path, dtype=np.uint32)
# from PIL import Image as PILImage

annotations = []
for uval in np.unique(mask):
label_id, instance_id = uval >> 8, uval & 255
annotations.append(
Mask(image=self._lazy_extract_mask(mask, uval), label=label_id, id=instance_id)
)
# class_mask = np.array(PILImage.open(class_path))
# classes = np.unique(class_mask)

items[item_id] = DatasetItem(id=item_id, subset=self._subset, annotations=annotations)
# annotations = []
# for label_id in classes:
# annotations.append(
# Mask(label=label_id, image=self._lazy_extract_mask(class_mask, label_id))
# )

class_dir = osp.join(self._annotations_dir, MapillaryVistasPath.CLASS_DIR)
for class_path in find_images(class_dir, recursive=True):
item_id = osp.splitext(osp.relpath(class_path, class_dir))[0]
if item_id in items:
continue
# items[item_id] = DatasetItem(id=item_id, subset=self._subset, annotations=annotations)

from PIL import Image as PILImage
instance_dir = osp.join(self._annotations_dir, MapillaryVistasPath.INSTANCES_DIR)
polygon_dir = osp.join(self._annotations_dir, MapillaryVistasPath.POLYGON_DIR)
for image_path in find_images(self._images_dir, recursive=True):
item_id = osp.splitext(osp.relpath(image_path, self._images_dir))[0]
image = Image.from_file(path=image_path)

class_mask = np.array(PILImage.open(class_path))
classes = np.unique(class_mask)
instance_path = osp.join(instance_dir, item_id + MapillaryVistasPath.MASK_EXT)
mask = load_image(instance_path, dtype=np.uint32)

annotations = []
for label_id in classes:
for uval in np.unique(mask):
label_id, instance_id = uval >> 8, uval & 255
annotations.append(
Mask(label=label_id, image=self._lazy_extract_mask(class_mask, label_id))
Mask(image=self._lazy_extract_mask(mask, uval), label=label_id, id=instance_id)
)

items[item_id] = DatasetItem(id=item_id, subset=self._subset, annotations=annotations)
if self._parse_polygon:
polygon_path = osp.join(polygon_dir, item_id + ".json")
item_info = parse_json_file(polygon_path)

for image_path in find_images(self._images_dir, recursive=True):
item_id = osp.splitext(osp.relpath(image_path, self._images_dir))[0]
image = Image.from_file(path=image_path)
if item_id in items:
items[item_id].media = image
else:
items[item_id] = DatasetItem(id=item_id, subset=self._subset, media=image)
polygons = item_info["objects"]
for polygon in polygons:
label = polygon["label"]
label_id = self._categories[AnnotationType.label].find(label)[0]
if label_id is None:
label_id = self._categories[AnnotationType.label].add(label)

self._load_polygons(items)
return items.values()

def _load_polygons(self, items):
polygons_dir = osp.join(self._annotations_dir, MapillaryVistasPath.POLYGON_DIR)
for item_path in glob.glob(osp.join(polygons_dir, "**", "*.json"), recursive=True):
item_id = osp.splitext(osp.relpath(item_path, polygons_dir))[0]
item = items.get(item_id)
item_info = {}
item_info = parse_json_file(item_path)
points = [int(coord) for point in polygon["polygon"] for coord in point]
annotations.append(Polygon(label=label_id, points=points))

image_size = self._get_image_size(item_info)
if image_size and item.has_image:
item.media = item.image.from_self(size=image_size)
items[item_id] = DatasetItem(
id=item_id, subset=self._subset, media=image, annotations=annotations
)

polygons = item_info["objects"]
annotations = []
for polygon in polygons:
label = polygon["label"]
label_id = self._categories[AnnotationType.label].find(label)[0]
if label_id is None:
label_id = self._categories[AnnotationType.label].add(label)

points = [coord for point in polygon["polygon"] for coord in point]
annotations.append(Polygon(label=label_id, points=points))

if item is None:
items[item_id] = DatasetItem(
id=item_id, subset=self._subset, annotations=annotations
)
else:
item.annotations.extend(annotations)
return items.values()

@staticmethod
def _get_image_size(image_info):
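Note for reviewers: two small decoding details in `base.py` are easy to miss.
The uint32 instance masks pack the class id in the high bits and the instance
index in the low byte, and each polygon's `[x, y]` point list is flattened to
ints for `Polygon(points=...)`. A standalone sketch with synthetic inputs (not
the loader itself; the label string below is made up):

```python
import numpy as np

# 1) Instance mask encoding, mirroring `uval >> 8` / `uval & 255`
#    in _load_instances_items(): value = (label_id << 8) | instance_id.
mask = np.array(
    [[(7 << 8) | 1, (7 << 8) | 1],
     [(7 << 8) | 2, (13 << 8) | 0]],
    dtype=np.uint32,
)
for uval in np.unique(mask):
    label_id, instance_id = uval >> 8, uval & 255
    print(f"label={label_id} instance={instance_id}")
# label=7 instance=1
# label=7 instance=2
# label=13 instance=0

# 2) Polygon flattening, mirroring the comprehension in the loaders:
#    a synthetic "objects" entry with [x, y] pairs becomes a flat int list.
polygon = {"label": "construction--fence", "polygon": [[10.0, 20.0], [30.5, 40.9]]}
points = [int(coord) for point in polygon["polygon"] for coord in point]
print(points)  # [10, 20, 30, 40]
```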
6 changes: 3 additions & 3 deletions datumaro/plugins/data_formats/mapillary_vistas/format.py
@@ -1,4 +1,4 @@
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

@@ -49,12 +49,12 @@ class MapillaryVistasPath:
MASK_EXT = ".png"

ANNOTATION_DIRS = {
"v1.2": [CLASS_DIR, INSTANCES_DIR],
"v1.2": [CLASS_DIR, INSTANCES_DIR, PANOPTIC_DIR],
"v2.0": [CLASS_DIR, INSTANCES_DIR, PANOPTIC_DIR, POLYGON_DIR],
}

CONFIG_FILES = {"v1.2": "config_v1.2.json", "v2.0": "config_v2.0.json"}
PANOPTIC_CONFIG = "panoptic_2020.json"
PANOPTIC_CONFIG = {"v1.2": "panoptic_2018.json", "v2.0": "panoptic_2020.json"}

CLASS_BY_DIR = {
INSTANCES_DIR: MapillaryVistasTask.instances,
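Note for reviewers: with `PANOPTIC_CONFIG` keyed by version, config resolution
becomes a plain dict lookup. A minimal sketch (the directory constant is a
placeholder; the real values live in `format.py`):

```python
import os.path as osp

PANOPTIC_DIR = "panoptic"  # placeholder; the actual constant is defined in format.py
PANOPTIC_CONFIG = {"v1.2": "panoptic_2018.json", "v2.0": "panoptic_2020.json"}

def panoptic_config_path(annotations_dir: str, format_version: str) -> str:
    # Mirrors the lookup added to _MapillaryVistasBase._load_panoptic_config().
    return osp.join(annotations_dir, PANOPTIC_DIR, PANOPTIC_CONFIG[format_version])

print(panoptic_config_path("dataset/v1.2", "v1.2"))
# dataset/v1.2/panoptic/panoptic_2018.json (with the placeholder above)
```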
27 changes: 20 additions & 7 deletions datumaro/plugins/data_formats/mapillary_vistas/importer.py
@@ -1,4 +1,4 @@
# Copyright (C) 2021 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
import glob
@@ -7,6 +7,7 @@

from datumaro.components.dataset_base import DEFAULT_SUBSET_NAME
from datumaro.components.importer import Importer
from datumaro.util import str_to_bool

from .base import MapillaryVistasInstancesBase, MapillaryVistasPanopticBase
from .format import MapillaryVistasPath, MapillaryVistasTask
@@ -21,6 +22,18 @@ class MapillaryVistasImporter:
@classmethod
def build_cmdline_parser(cls, **kwargs):
parser = super().build_cmdline_parser(**kwargs)
parser.add_argument(
"--format-version",
default="v2.0",
type=str,
help="Use original config*.json file for your version of dataset",
)
parser.add_argument(
"--parse-polygon",
type=str_to_bool,
default=False,
help="Use original config*.json file for your version of dataset",
)
parser.add_argument(
"--use-original-config",
action="store_true",
@@ -39,15 +52,15 @@ def __call__(self, path, **extra_params):
subsets = self.find_sources(path)

if len(subsets) == 0:
raise Exception("Failed to find Mapillary Vistas dataset at '%s'" % path)
raise Exception(f"Failed to find Mapillary Vistas dataset at {path}")

tasks = list(set(task for subset in subsets.values() for task in subset))
selected_task = tasks[0]
if 1 < len(tasks):
task_types = ",".join(task.name for task in tasks)
log.warning(
"Found potentially conflicting source types: %s"
"Only one one type will be used: %s"
% (",".join(task.name for task in tasks), selected_task.name)
f"Found potentially conflicting source types: {task_types}"
f"Only one one type will be used: {selected_task.name}"
)

if selected_task == MapillaryVistasTask.instances:
@@ -60,8 +73,8 @@

if not has_config and not extra_params.get("use_original_config"):
raise Exception(
"Failed to find config*.json at '%s'. "
"See extra args for using original config" % path
f"Failed to find config*.json at {path}. "
"See extra args for using original config."
)

sources = [
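Note for reviewers: `--parse-polygon` goes through `str_to_bool`, so it takes
an explicit value rather than acting as a store-true flag. A hedged sketch of
the wiring with plain argparse (the `str_to_bool` body below is an assumption
about `datumaro.util`'s semantics, not a copy of it):

```python
import argparse

def str_to_bool(value: str) -> bool:
    # Assumed semantics: accept the usual true/false spellings.
    lowered = value.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Can't convert {value!r} to bool")

parser = argparse.ArgumentParser()
parser.add_argument("--format-version", default="v2.0", type=str)
parser.add_argument("--parse-polygon", type=str_to_bool, default=False)

args = parser.parse_args(["--format-version", "v1.2", "--parse-polygon", "false"])
print(args.format_version, args.parse_polygon)  # v1.2 False
```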
