openvinotoolkit · vinnamkim · Jul 6, 2023 · Jul 5, 2023 · Jul 5, 2023 · Jul 5, 2023
@@ -15,12 +15,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1049>, <https://github.com/openvinotoolkit/datumaro/pull/1063>, <https://github.com/openvinotoolkit/datumaro/pull/1064>)
 - Add OVMSLauncher
   (<https://github.com/openvinotoolkit/datumaro/pull/1056>)
+- Add Prune API
+  (<https://github.com/openvinotoolkit/datumaro/pull/1058>)
 - Add TritonLauncher
   (<https://github.com/openvinotoolkit/datumaro/pull/1059>)
 - Migrate DVC v3.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/1072>)
-- Add Prune API
-  (<https://github.com/openvinotoolkit/datumaro/pull/1058>)
+- Support mask annotations for CVAT data format
+  (<https://github.com/openvinotoolkit/datumaro/pull/1078>)
 
 ### Enhancements
 - Enhance import performance for built-in plugins

@@ -22,6 +22,7 @@ Supported annotation types:
 - `Points`
 - `Polygon`
 - `PolyLine`
+- `Mask`
 
 Supported annotation attributes:
 - It supports any arbitrary boolean, floating number, or string attribute.

@@ -7,13 +7,15 @@
 from copy import deepcopy
 from typing import Optional
 
+import numpy as np
 from defusedxml import ElementTree
 
 from datumaro.components.annotation import (
     AnnotationType,
     Bbox,
     Label,
     LabelCategories,
+    Mask,
     Points,
     Polygon,
     PolyLine,
@@ -23,6 +25,7 @@
 from datumaro.components.format_detection import FormatDetectionContext
 from datumaro.components.importer import ImportContext, Importer
 from datumaro.components.media import Image
+from datumaro.util import mask_tools
 
 from .format import CvatPath
 
@@ -46,8 +49,6 @@ def _find_meta_root(path: str):
 
 
 class CvatBase(SubsetBase):
-    _SUPPORTED_SHAPES = ("box", "polygon", "polyline", "points")
-
     def __init__(
         self,
         path: str,
@@ -103,7 +104,7 @@ def _parse(self, path):
                         "height": el.attrib.get("height"),
                     }
                     subset = el.attrib.get("subset")
-                elif el.tag in self._SUPPORTED_SHAPES and (track or image):
+                elif el.tag in CvatPath.SUPPORTED_IMPORT_SHAPES and (track or image):
                     attributes = {}
                     shape = {
                         "type": None,
@@ -134,7 +135,7 @@ def _parse(self, path):
                         except ValueError:
                             pass
                     attributes[el.attrib["name"]] = attr_value
-                elif el.tag in self._SUPPORTED_SHAPES:
+                elif el.tag in CvatPath.SUPPORTED_IMPORT_SHAPES:
                     if track is not None:
                         shape["frame"] = el.attrib["frame"]
                         shape["outside"] = el.attrib.get("outside") == "1"
@@ -159,14 +160,22 @@ def _parse(self, path):
                                 ],
                             )
                         )
+                    elif el.tag == "mask":
+                        shape["rle"] = el.attrib["rle"]
+                        shape["left"] = el.attrib["left"]
+                        shape["top"] = el.attrib["top"]
+                        shape["width"] = el.attrib["width"]
+                        shape["height"] = el.attrib["height"]
                     else:
                         shape["points"] = []
                         for pair in el.attrib["points"].split(";"):
                             shape["points"].extend(map(float, pair.split(",")))
 
                     if subset is None or subset == self._subset:
                         frame_desc = items.get(shape["frame"], {"annotations": []})
-                        frame_desc["annotations"].append(self._parse_shape_ann(shape, categories))
+                        frame_desc["annotations"].append(
+                            self._parse_shape_ann(shape, categories, image)
+                        )
                         items[shape["frame"]] = frame_desc
                     shape = None
 
@@ -240,7 +249,7 @@ def _parse_meta(meta_root):
         return categories, frame_size, attribute_types
 
     @classmethod
-    def _parse_shape_ann(cls, ann, categories):
+    def _parse_shape_ann(cls, ann, categories, image):
         ann_id = ann.get("id", 0)
         ann_type = ann["type"]
 
@@ -307,6 +316,35 @@ def _parse_shape_ann(cls, ann, categories):
                 group=group,
             )
 
+        elif ann_type == "mask":
+            rle = ann.get("rle")
+            mask_w, mask_h = int(ann.get("width")), int(ann.get("height"))
+            mask_l, mask_t = int(ann.get("left")), int(ann.get("top"))
+            img_w, img_h = int(image.get("width")), int(image.get("height"))
+
+            rle_uncompressed = {
+                "counts": np.array([int(str_num) for str_num in rle.split(",")], dtype=np.uint32),
+                "size": np.array([mask_w, mask_h]),
+            }
+
+            def _gen_mask():
+                # Manual testing with a dataset exported from CVAT 2.5 showed that
+                # the RLE in the dataset encodes a (W, H) binary 2D np.ndarray, not (H, W).
+                # Therefore, we need to transpose it to make its shape (H, W).
+                mask = mask_tools.rle_to_mask(rle_uncompressed).transpose()
+                canvas = np.zeros(shape=[img_h, img_w], dtype=np.uint8)
+                canvas[mask_t : mask_t + mask_h, mask_l : mask_l + mask_w] = mask
+                return canvas
+
+            return Mask(
+                image=_gen_mask,
+                label=label_id,
+                z_order=z_order,
+                id=ann_id,
+                attributes=attributes,
+                group=group,
+            )
+
         else:
             raise NotImplementedError("Unknown annotation type '%s'" % ann_type)
 

@@ -17,7 +17,7 @@
 from datumaro.components.errors import MediaTypeError
 from datumaro.components.exporter import Exporter
 from datumaro.components.media import Image
-from datumaro.util import cast, pairs
+from datumaro.util import cast, mask_tools, pairs
 
 from .format import CvatPath
 
@@ -106,6 +106,11 @@ def open_points(self, points):
         self.xmlgen.startElement("points", points)
         self._level += 1
 
+    def open_mask(self, mask):
+        self._indent()
+        self.xmlgen.startElement("mask", mask)
+        self._level += 1
+
     def open_tag(self, tag):
         self._indent()
         self.xmlgen.startElement("tag", tag)
@@ -134,6 +139,9 @@ def close_polyline(self):
     def close_points(self):
         self._close_element("points")
 
+    def close_mask(self):
+        self._close_element("mask")
+
     def close_tag(self):
         self._close_element("tag")
 
@@ -182,12 +190,7 @@ def _write_track(self, track):
 
         self._writer.open_track(track_info)
         for ann in annotations:
-            if ann.type in {
-                AnnotationType.points,
-                AnnotationType.polyline,
-                AnnotationType.polygon,
-                AnnotationType.bbox,
-            }:
+            if ann.type in CvatPath.SUPPORTED_EXPORT_SHAPES:
                 self._write_shape(ann, write_label_info=False, write_frame=True)
         self._writer.close_track()
 
@@ -254,12 +257,7 @@ def _write_item(self, item, index):
         self._writer.open_image(image_info)
 
         for ann in item.annotations:
-            if ann.type in {
-                AnnotationType.points,
-                AnnotationType.polyline,
-                AnnotationType.polygon,
-                AnnotationType.bbox,
-            }:
+            if ann.type in CvatPath.SUPPORTED_EXPORT_SHAPES:
                 self._write_shape(ann, item)
             elif ann.type == AnnotationType.label:
                 self._write_tag(ann, item)
@@ -389,6 +387,22 @@ def _write_shape(self, shape, item=None, write_label_info=True, write_frame=Fals
                     ]
                 )
             )
+        elif shape.type == AnnotationType.mask:
+            # Manual testing with a dataset exported from CVAT 2.5 showed that
+            # the RLE in the dataset encodes a (W, H) binary 2D np.ndarray, not (H, W).
+            # Therefore, we need to transpose the (H, W) mask to make its shape (W, H).
+            mask = shape.image.transpose()
+            rle_uncompressed = mask_tools.mask_to_rle(mask)
+            width, height = mask.shape
+            shape_data.update(
+                OrderedDict(
+                    rle=", ".join([str(c) for c in rle_uncompressed["counts"]]),
+                    left=str(0),
+                    top=str(0),
+                    width=str(width),
+                    height=str(height),
+                )
+            )
         else:
             shape_data.update(
                 OrderedDict(
@@ -418,6 +432,8 @@ def _write_shape(self, shape, item=None, write_label_info=True, write_frame=Fals
             self._writer.open_polyline(shape_data)
         elif shape.type == AnnotationType.points:
             self._writer.open_points(shape_data)
+        elif shape.type == AnnotationType.mask:
+            self._writer.open_mask(shape_data)
         else:
             raise NotImplementedError("unknown shape type")
 
@@ -456,6 +472,8 @@ def _write_shape(self, shape, item=None, write_label_info=True, write_frame=Fals
             self._writer.close_polyline()
         elif shape.type == AnnotationType.points:
             self._writer.close_points()
+        elif shape.type == AnnotationType.mask:
+            self._writer.close_mask()
         else:
             raise NotImplementedError("unknown shape type")
 

@@ -3,9 +3,27 @@
 # SPDX-License-Identifier: MIT
 
 
+from datumaro.components.annotation import AnnotationType
+
+
 class CvatPath:
     IMAGES_DIR = "images"
 
     IMAGE_EXT = ".jpg"
 
     BUILTIN_ATTRS = {"occluded", "outside", "keyframe", "track_id"}
+
+    SUPPORTED_IMPORT_SHAPES = {
+        "box",
+        "polygon",
+        "polyline",
+        "points",
+        "mask",
+    }
+    SUPPORTED_EXPORT_SHAPES = {
+        AnnotationType.bbox,
+        AnnotationType.polygon,
+        AnnotationType.polyline,
+        AnnotationType.points,
+        AnnotationType.mask,
+    }
@@ -4,9 +4,10 @@
 
 from functools import partial
 from itertools import chain
-from typing import Tuple
+from typing import Dict, Tuple
 
 import numpy as np
+from pycocotools import mask as pycocotools_mask
 
 from datumaro._capi import encode
 from datumaro.util.image import lazy_image, load_image
@@ -223,15 +224,14 @@ def mask_to_polygons(mask, area_threshold=1):
     Returns:
         A list of polygons like [[x1,y1, x2,y2 ...], [...]]
     """
-    from pycocotools import mask as mask_utils
 
     contours = extract_contours(mask)
 
     polygons = []
     for contour in contours:
         # Check if the polygon is big enough
-        rle = mask_utils.frPyObjects([contour], mask.shape[0], mask.shape[1])
-        area = sum(mask_utils.area(rle))
+        rle = pycocotools_mask.frPyObjects([contour], mask.shape[0], mask.shape[1])
+        area = sum(pycocotools_mask.area(rle))
         if area_threshold <= area:
             polygons.append(contour)
     return polygons
@@ -296,26 +296,24 @@ def crop_covered_segments(
                 ...
             ]
     """
-    from pycocotools import mask as mask_utils
-
     segments = [[s] for s in segments]
-    input_rles = [mask_utils.frPyObjects(s, height, width) for s in segments]
+    input_rles = [pycocotools_mask.frPyObjects(s, height, width) for s in segments]
 
     for i, rle_bottom in enumerate(input_rles):
-        area_bottom = sum(mask_utils.area(rle_bottom))
+        area_bottom = sum(pycocotools_mask.area(rle_bottom))
         if area_bottom < area_threshold:
             segments[i] = [] if not return_masks else None
             continue
 
         rles_top = []
         for j in range(i + 1, len(input_rles)):
             rle_top = input_rles[j]
-            iou = sum(mask_utils.iou(rle_bottom, rle_top, [0]))[0]
+            iou = sum(pycocotools_mask.iou(rle_bottom, rle_top, [0]))[0]
 
             if iou <= iou_threshold:
                 continue
 
-            area_top = sum(mask_utils.area(rle_top))
+            area_top = sum(pycocotools_mask.area(rle_top))
             area_ratio = area_top / area_bottom
 
             # If a segment is fully inside another one, skip this segment
@@ -334,11 +332,11 @@ def crop_covered_segments(
             continue
 
         rle_bottom = rle_bottom[0]
-        bottom_mask = mask_utils.decode(rle_bottom).astype(np.uint8)
+        bottom_mask = pycocotools_mask.decode(rle_bottom).astype(np.uint8)
 
         if rles_top:
-            rle_top = mask_utils.merge(rles_top)
-            top_mask = mask_utils.decode(rle_top).astype(np.uint8)
+            rle_top = pycocotools_mask.merge(rles_top)
+            top_mask = pycocotools_mask.decode(rle_top).astype(np.uint8)
 
             bottom_mask -= top_mask
             bottom_mask[bottom_mask != 1] = 0
@@ -352,14 +350,23 @@ def crop_covered_segments(
 
 
 def rles_to_mask(rles, width, height):
-    from pycocotools import mask as mask_utils
-
-    rles = mask_utils.frPyObjects(rles, height, width)
-    rles = mask_utils.merge(rles)
-    mask = mask_utils.decode(rles)
+    rles = pycocotools_mask.frPyObjects(rles, height, width)
+    rles = pycocotools_mask.merge(rles)
+    mask = pycocotools_mask.decode(rles)
     return mask
 
 
+def rle_to_mask(rle_uncompressed: Dict[str, np.ndarray]) -> np.ndarray:
+    """Decode an uncompressed RLE dictionary into a binary mask (2D np.ndarray).
+
+    Such an uncompressed RLE dictionary can be obtained from
+    the datumaro.util.mask_tools.mask_to_rle() function.
+    """
+    resulting_mask = pycocotools_mask.frPyObjects(rle_uncompressed, *rle_uncompressed["size"])
+    resulting_mask = pycocotools_mask.decode(resulting_mask)
+    return resulting_mask
+
+
 def find_mask_bbox(mask) -> Tuple[int, int, int, int]:
     cols = np.any(mask, axis=0)
     rows = np.any(mask, axis=1)