Merge back 1.2.0rc2 (#942)
vinnamkim authored Apr 17, 2023
2 parents ed3cf05 + 08425db commit f57b37c
Showing 53 changed files with 2,819 additions and 1,114 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/docs_stable.yml
@@ -0,0 +1,34 @@
name: Build Docs for releases

on:
workflow_dispatch: # run on request (no need for PR)
release:
types: [published]

jobs:
Build-Docs:
runs-on: ubuntu-20.04
permissions:
contents: write
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
fetch-depth: 0 # otherwise, you will fail to push refs to the dest repo
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"
- name: Install dependencies
run: python -m pip install tox
- name: Build-Docs
run: |
echo RELEASE_VERSION=${GITHUB_REF#refs/*/} >> $GITHUB_ENV
tox -e build-doc
# - name: Deploy
# uses: peaceiris/actions-gh-pages@v3
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# publish_dir: ./public
# destination_dir: ${{ env.RELEASE_VERSION }}
# force_orphan: true
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## \[Unreleased\]

## 14/04/2023 - Release 1.2.0rc1
## 17/04/2023 - Release 1.2.0rc2
### New features
- Add LossDynamicsAnalyzer for noisy label detection
(<https://github.com/openvinotoolkit/datumaro/pull/928>)
File renamed without changes.
168 changes: 89 additions & 79 deletions datumaro/cli/commands/merge.py
@@ -1,19 +1,18 @@
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import argparse
import logging as log
import os
import os.path as osp
from collections import OrderedDict

from datumaro.components.dataset import DEFAULT_FORMAT
from datumaro.components.environment import Environment
from datumaro.components.errors import DatasetMergeError, DatasetQualityError, ProjectNotFoundError
from datumaro.components.errors import ProjectNotFoundError
from datumaro.components.hl_ops import HLOps
from datumaro.components.merge.intersect_merge import IntersectMerge
from datumaro.components.project import ProjectBuildTargets
from datumaro.util import dump_json_file
from datumaro.util.scope import scope_add, scoped

from ..util import MultilineFormatter, join_cli_args
@@ -23,18 +22,35 @@

def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
help="Merge few projects",
help="Merge few datasets or projects",
description="""
Merges multiple datasets into one and produces a new dataset.
The command can be useful if you have few annotations and wish
to merge them, taking into consideration potential overlaps and
conflicts. This command can try to find common ground by voting or
return a list of conflicts.|n
The command can be useful if you want to merge several datasets
which have homogeneous or heterogeneous label categories.
We provide three merge policies: 1) "exact" for homogeneous datasets;
2) "union" or 3) "intersect" for heterogeneous datasets. The default merge
policy is "union".
|n
|n
In simple cases, when dataset images do not intersect and new
labels are not added, the recommended way of merging is using the
"patch" command. It will offer better performance and provide the same
results.|n
labels are not added, you can use the "exact" merge policy.
|n
|n
On the other hand, when two datasets have different label categories or
share the same (id, subset) pairs in their dataset items, you have to use
the "union" or "intersect" merge.
|n
The "union" merge is the simpler way to merge them.
It takes the union of the source label categories and merges
their dataset items. If the same (id, subset) pair occurs in several sources, it appends
a suffix to each item's id to avoid duplicate (id, subset) pairs in the merged dataset.
|n
The "intersect" policy is a more sophisticated way to merge heterogeneous datasets.
Use it when you want to merge annotations for the same (id, subset) pairs
coming from different datasets, taking into consideration potential overlaps
and conflicts between the annotations. The "intersect" merge policy can try to find
common ground by voting or return a list of conflicts.
|n
|n
This command has multiple forms:|n
1) %(prog)s <revpath>|n
@@ -64,12 +80,15 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
options, which are passed after the '--' separator (see examples),
pass '-- -h' for more info. If not stated otherwise, by default
only annotations are exported; to include images pass
'--save-images' parameter.|n
'--save-media' parameter.|n
|n
Examples:|n
- Merge annotations from 3 (or more) annotators:|n
|s|s%(prog)s project1/ project2/ project3/|n
|n
- Merge datasets with varying merge policies:|n
|s|s%(prog)s project1/ project2/ -m <union|intersect|exact>|n
|n
- Check groups of the merged dataset for consistency:|n
|s|s|slook for groups consisting of 'person', 'hand', 'head', 'foot'|n
|s|s%(prog)s project1/ project2/ -g 'person,hand?,head,foot?'|n
@@ -84,7 +103,7 @@ def build_parser(parser_ctor=argparse.ArgumentParser):
|s|s%(prog)s HEAD~2:source-2 path/to/dataset2:yolo
|n
- Merge datasets and save in different format:|n
|s|s%(prog)s -f voc dataset1/:yolo path2/:coco -- --save-images
|s|s%(prog)s -f voc dataset1/:yolo path2/:coco -- --save-media
""",
formatter_class=MultilineFormatter,
)
@@ -97,34 +116,11 @@ def _group(s):
) # workaround for -- eaten by positionals
parser.add_argument("targets", nargs="+", help="Target dataset revpaths (repeatable)")
parser.add_argument(
"-iou",
"--iou-thresh",
default=0.25,
type=float,
help="IoU match threshold for segments (default: %(default)s)",
)
parser.add_argument(
"-oconf",
"--output-conf-thresh",
default=0.0,
type=float,
help="Confidence threshold for output " "annotations (default: %(default)s)",
)
parser.add_argument(
"--quorum",
default=0,
type=int,
help="Minimum count for a label and attribute voting "
"results to be counted (default: %(default)s)",
)
parser.add_argument(
"-g",
"--groups",
action="append",
type=_group,
help="A comma-separated list of labels in "
"annotation groups to check. '?' postfix can be added to a label to "
"make it optional in the group (repeatable)",
"-m",
"--merge-policy",
default="union",
type=str,
help="Policy for how to merge datasets (default: %(default)s)",
)
parser.add_argument(
"-o",
@@ -152,6 +148,42 @@ def _group(s):
"Must be specified after the main command arguments and after "
"the '--' separator",
)

intersect_args = parser.add_argument_group(
title="intersect policy arguments",
description="These parameters are optional for the intersect policy only.",
)
intersect_args.add_argument(
"-iou",
"--iou-thresh",
default=0.25,
type=float,
help="IoU match threshold for segments (default: %(default)s)",
)
intersect_args.add_argument(
"-oconf",
"--output-conf-thresh",
default=0.0,
type=float,
help="Confidence threshold for output " "annotations (default: %(default)s)",
)
intersect_args.add_argument(
"--quorum",
default=0,
type=int,
help="Minimum count for a label and attribute voting "
"results to be counted (default: %(default)s)",
)
intersect_args.add_argument(
"-g",
"--groups",
action="append",
type=_group,
help="A comma-separated list of labels in "
"annotation groups to check. '?' postfix can be added to a label to "
"make it optional in the group (repeatable)",
)
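
# Illustrative note (not part of this commit): the intersect-specific options
# above take effect only when the user selects '-m intersect', e.g.
#   datum merge project1/ project2/ -m intersect -iou 0.5 --quorum 2 -g 'person,hand?'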

parser.set_defaults(command=merge_command)

return parser
@@ -222,48 +254,26 @@ def merge_command(args):
except Exception as e:
raise CliException(str(e))

merger = IntersectMerge(
conf=IntersectMerge.Conf(
pairwise_dist=args.iou_thresh,
groups=args.groups or [],
output_conf_thresh=args.output_conf_thresh,
quorum=args.quorum,
)
report_path = osp.join(dst_dir, "merge_report.json")
options = (
{
"conf": IntersectMerge.Conf(
pairwise_dist=args.iou_thresh,
groups=args.groups or [],
output_conf_thresh=args.output_conf_thresh,
quorum=args.quorum,
)
}
if args.merge_policy == "intersect"
else {}
)
merged_dataset = HLOps.merge(
*source_datasets, merge_policy=args.merge_policy, report_path=report_path, **options
)
merged_dataset = merger.merge(source_datasets)

merged_dataset.export(save_dir=dst_dir, format=exporter, **export_args)

report_path = osp.join(dst_dir, "merge_report.json")
save_merge_report(merger, report_path)

log.info("Merge results have been saved to '%s'" % dst_dir)
log.info("Report has been saved to '%s'" % report_path)

return 0


def save_merge_report(merger, path):
item_errors = OrderedDict()
source_errors = OrderedDict()
all_errors = []

for e in merger.errors:
if isinstance(e, DatasetQualityError):
item_errors[str(e.item_id)] = item_errors.get(str(e.item_id), 0) + 1
elif isinstance(e, DatasetMergeError):
for s in e.sources:
source_errors[str(s)] = source_errors.get(s, 0) + 1
item_errors[str(e.item_id)] = item_errors.get(str(e.item_id), 0) + 1

all_errors.append(str(e))

errors = OrderedDict(
[
("Item errors", item_errors),
("Source errors", source_errors),
("All errors", all_errors),
]
)

dump_json_file(path, errors, indent=True)
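
The merge policies described in the command help above feed the new HLOps.merge call in merge_command. Below is a minimal, illustrative sketch of driving the same flow from Python; it assumes HLOps.merge keeps the signature shown in this diff (merge_policy and report_path keyword arguments), and the dataset paths and format names ("dataset_a/", "coco", etc.) are placeholders.

from datumaro.components.dataset import Dataset
from datumaro.components.hl_ops import HLOps

# Import two source datasets; paths and formats are placeholders.
ds_a = Dataset.import_from("dataset_a/", "coco")
ds_b = Dataset.import_from("dataset_b/", "voc")

# "union" is the default policy; "exact" and "intersect" are the alternatives.
# The IoU/quorum/group options from the CLI are only relevant for "intersect".
merged = HLOps.merge(
    ds_a,
    ds_b,
    merge_policy="union",
    report_path="merge_report.json",
)

# Export as merge_command does; save_media mirrors the '--save-media' flag.
merged.export(save_dir="merged/", format="datumaro", save_media=True)
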
2 changes: 1 addition & 1 deletion datumaro/cli/util/project.py
@@ -153,7 +153,7 @@ def split_local_revpath(revpath: str) -> Tuple[Revision, str]:
A local revpath is a path to a revision within the current project.
The syntax is:
- [ <revision> : ] [ <target> ]
- [ <revision> : ] [ <target> ]
At least one part must be present.
Returns: (revision, build target)
5 changes: 5 additions & 0 deletions datumaro/components/abstracts/__init__.py
@@ -0,0 +1,5 @@
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

from .merger import *
37 changes: 37 additions & 0 deletions datumaro/components/abstracts/merger.py
@@ -0,0 +1,37 @@
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

from abc import ABC, abstractmethod
from typing import Dict, Optional, Sequence, Type

from datumaro.components.dataset_base import IDataset
from datumaro.components.dataset_item_storage import (
DatasetItemStorage,
DatasetItemStorageDatasetView,
)
from datumaro.components.media import MediaElement

__all__ = ["IMerger"]


class IMerger(ABC):
@abstractmethod
def merge_infos(self, sources: Sequence[IDataset]) -> Dict:
raise NotImplementedError

@abstractmethod
def merge_categories(self, sources: Sequence[IDataset]) -> Dict:
raise NotImplementedError

@abstractmethod
def merge_media_types(self, sources: Sequence[IDataset]) -> Optional[Type[MediaElement]]:
raise NotImplementedError

@abstractmethod
def merge(self, sources: Sequence[IDataset]) -> DatasetItemStorage:
raise NotImplementedError

@abstractmethod
def __call__(self, *datasets: IDataset) -> DatasetItemStorageDatasetView:
raise NotImplementedError
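
The IMerger interface above only fixes the merging contract; the concrete policies live under datumaro.components.merge. As a rough illustration of that contract, here is a hypothetical, deliberately naive subclass. The class name ConcatMerger and its behaviour are invented for this sketch, which also assumes that IDataset exposes infos(), categories() and media_type() accessors and that DatasetItemStorageDatasetView accepts the keyword arguments shown.

from typing import Dict, Optional, Sequence, Type

from datumaro.components.abstracts import IMerger
from datumaro.components.dataset_base import IDataset
from datumaro.components.dataset_item_storage import (
    DatasetItemStorage,
    DatasetItemStorageDatasetView,
)
from datumaro.components.media import MediaElement


class ConcatMerger(IMerger):
    """Hypothetical merger that simply concatenates all source items."""

    def merge_infos(self, sources: Sequence[IDataset]) -> Dict:
        infos: Dict = {}
        for source in sources:
            infos.update(source.infos())
        return infos

    def merge_categories(self, sources: Sequence[IDataset]) -> Dict:
        # Assumes every source shares identical label categories.
        return dict(sources[0].categories()) if sources else {}

    def merge_media_types(self, sources: Sequence[IDataset]) -> Optional[Type[MediaElement]]:
        return sources[0].media_type() if sources else None

    def merge(self, sources: Sequence[IDataset]) -> DatasetItemStorage:
        storage = DatasetItemStorage()
        for source in sources:
            for item in source:
                # Items with a duplicate (id, subset) simply replace earlier ones here.
                storage.put(item)
        return storage

    def __call__(self, *datasets: IDataset) -> DatasetItemStorageDatasetView:
        # The keyword names below are an assumption about the view's constructor.
        return DatasetItemStorageDatasetView(
            parent=self.merge(datasets),
            infos=self.merge_infos(datasets),
            categories=self.merge_categories(datasets),
            media_type=self.merge_media_types(datasets),
        )
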
4 changes: 2 additions & 2 deletions datumaro/components/annotations/matcher.py
@@ -7,7 +7,7 @@
import numpy as np
from attr import attrib, attrs

from datumaro.components.merge.base import Merger
from datumaro.components.abstracts import IMerger
from datumaro.util.annotation_util import (
OKS,
approximate_line,
@@ -97,7 +97,7 @@ def match_segments(

@attrs(kw_only=True)
class AnnotationMatcher:
_context: Optional[Merger] = attrib(default=None)
_context: Optional[IMerger] = attrib(default=None)

def match_annotations(self, sources):
raise NotImplementedError()