openvinotoolkit · IRDonch · Jan 25, 2022 · Dec 7, 2021 · Dec 7, 2021 · Dec 1, 2021
@@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   (<https://github.com/openvinotoolkit/datumaro/pull/584>)
 - MARS format (import-only)
   (<https://github.com/openvinotoolkit/datumaro/pull/585>)
+- Command to detect the format of a dataset
+  (<https://github.com/openvinotoolkit/datumaro/pull/576>)
 
 ### Changed
 - The `pycocotools` dependency lower bound is raised to `2.0.4`.

@@ -80,6 +80,8 @@ def _get_known_commands():
         ("", None, ''),
         ("Dataset operations:", None, ''),
         ('convert', commands.convert, "Convert dataset between formats"),
+        ('detect-format', commands.detect_format,
+            "Detect the format of a dataset"),
         ('diff', commands.diff, "Compare datasets"),
         ('download', commands.download, "Download a publicly available dataset"),
         ('explain', commands.explain, "Run Explainable AI algorithm for model"),

@@ -1,11 +1,11 @@
-# Copyright (C) 2019-2021 Intel Corporation
+# Copyright (C) 2019-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
 # pylint: disable=redefined-builtin
 
 from . import (
-    add, checkout, commit, convert, create, diff, download, explain, export,
-    filter, import_, info, log, merge, patch, remove, stats, status, transform,
-    validate,
+    add, checkout, commit, convert, create, detect_format, diff, download,
+    explain, export, filter, import_, info, log, merge, patch, remove, stats,
+    status, transform, validate,
 )
@@ -0,0 +1,118 @@
+# Copyright (C) 2021-2022 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import json
+
+from datumaro.cli.util import MultilineFormatter
+from datumaro.cli.util.project import load_project
+from datumaro.components.environment import Environment
+from datumaro.components.errors import ProjectNotFoundError
+from datumaro.components.format_detection import (
+    RejectionReason, detect_dataset_format,
+)
+from datumaro.util.scope import scope_add, scoped
+
+
+def build_parser(parser_ctor=argparse.ArgumentParser):
+    """
+    Builds the command-line parser of the "detect-format" command.
+
+    Args:
+        `parser_ctor` - the callable used to create the parser; it receives
+            the `help`, `description` and `formatter_class` keyword
+            arguments.
+
+    Returns: the created parser, with `detect_format_command` set as the
+        default value of `command`.
+    """
+
+    # The leading whitespace is a part of the text given to the parser,
+    # so the indentation inside this literal must stay as it is.
+    description = """
+        Attempts to detect the format of a dataset in a directory.
+        Currently, only local directories are supported.|n
+        |n
+        By default, this command shows a human-readable report with the ID
+        of the format that was detected (if any). If Datumaro is unable to
+        unambiguously determine a single format, all matching formats will
+        be shown.|n
+        |n
+        To see why other formats were rejected, use --show-rejections. To get
+        machine-readable output, use --json-report.|n
+        |n
+        The value of -p/--project is used as a context for plugins.|n
+        |n
+        Example:|n
+        |s|s%(prog)s --show-rejections path/to/dataset
+        """
+
+    parser = parser_ctor(
+        help="Detect the format of a dataset",
+        description=description,
+        formatter_class=MultilineFormatter,
+    )
+
+    # (flags, options) pairs; they are registered in the listed order
+    arguments = [
+        (['url'], {
+            'help': "URL to the dataset; a path to a directory",
+        }),
+        (['-p', '--project'], {
+            'dest': 'project_dir',
+            'help': "Directory of the project to use as the context "
+                "(default: current dir)",
+        }),
+        (['--show-rejections'], {
+            'action': 'store_true',
+            'help': "Describe why each supported format that wasn't detected "
+                "was rejected",
+        }),
+        (['--json-report'], {
+            'help': "Path to which to save a JSON report describing detected "
+                "and rejected formats. By default, no report is saved.",
+        }),
+    ]
+    for flags, options in arguments:
+        parser.add_argument(*flags, **options)
+
+    parser.set_defaults(command=detect_format_command)
+
+    return parser
+
+def get_sensitive_args():
+    """
+    Returns a mapping from a command function to the list of names of
+    its arguments that are marked as sensitive.
+    """
+
+    sensitive_args = {}
+    sensitive_args[detect_format_command] = ['url']
+    return sensitive_args
+
+@scoped
+def detect_format_command(args):
+    """
+    Runs format detection on `args.url` and reports the outcome.
+
+    The detected format (or formats) is printed in a human-readable form.
+    With `args.show_rejections`, the rejected formats and the rejection
+    messages are printed as well. If `args.json_report` is set, the whole
+    report is also written to that path as JSON.
+    """
+
+    # A missing project is only an error when the project directory
+    # was specified explicitly; otherwise a fresh environment is used.
+    try:
+        project = scope_add(load_project(args.project_dir))
+    except ProjectNotFoundError:
+        if args.project_dir:
+            raise
+        project = None
+
+    env = project.env if project is not None else Environment()
+
+    rejections = {}
+    report = {'rejected_formats': rejections}
+
+    def rejection_callback(
+        format_name: str, reason: RejectionReason, human_message: str,
+    ):
+        rejections[format_name] = {
+            'reason': reason.name,
+            'message': human_message,
+        }
+
+    candidates = (
+        (format_name, importer.detect)
+        for format_name, importer in env.importers.items.items()
+    )
+    detected_formats = detect_dataset_format(
+        candidates, args.url, rejection_callback=rejection_callback)
+    report['detected_formats'] = detected_formats
+
+    match_count = len(detected_formats)
+    if match_count == 0:
+        print("Unable to detect the format")
+    elif match_count == 1:
+        print(f"Detected format: {detected_formats[0]}")
+    else:
+        print("Ambiguous dataset; detected the following formats:")
+        print()
+        for format_name in sorted(detected_formats):
+            print(f"- {format_name}")
+
+    if args.show_rejections:
+        print()
+        if not rejections:
+            print("No formats were rejected.")
+        else:
+            print("The following formats were rejected:")
+            print()
+
+            # format names are unique keys, so sorting them gives
+            # the same order as sorting the (name, rejection) pairs
+            for format_name in sorted(rejections):
+                print(f"{format_name}:")
+                message = rejections[format_name]['message']
+                for line in message.split('\n'):
+                    print(f"  {line}")
+
+    if args.json_report:
+        with open(args.json_report, 'w', encoding='UTF-8') as report_file:
+            json.dump(report, report_file, indent=4)
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Intel Corporation
+# Copyright (C) 2020-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -13,9 +13,7 @@
 import os.path as osp
 
 from datumaro.components.cli_plugin import CliPlugin, plugin_types
-from datumaro.components.format_detection import (
-    FormatRequirementsUnmet, apply_format_detector,
-)
+from datumaro.components.format_detection import detect_dataset_format
 from datumaro.util.os_util import import_foreign_module, split_path
 
 T = TypeVar('T')
@@ -232,33 +230,8 @@ def is_format_known(self, name):
         return name in self.importers or name in self.extractors
 
     def detect_dataset(self, path):
-        max_confidence = 0
-        matches = []
-
-        if not osp.exists(path):
-            raise FileNotFoundError(f"Path {path} doesn't exist")
-
-        for format_name, importer in self.importers.items.items():
-            log.debug("Checking '%s' format...", format_name)
-            try:
-                new_confidence = apply_format_detector(path, importer.detect)
-            except FormatRequirementsUnmet as cf:
-                log.debug("Format did not match")
-                if len(cf.failed_alternatives) > 1:
-                    log.debug("None of the following requirements were met:")
-                else:
-                    log.debug("The following requirement was not met:")
-
-                for req in cf.failed_alternatives:
-                    log.debug("  %s", req)
-            else:
-                log.debug("Format matched with confidence %d", new_confidence)
-
-                # keep only matches with the highest confidence
-                if new_confidence > max_confidence:
-                    matches = [format_name]
-                    max_confidence = new_confidence
-                elif new_confidence == max_confidence:
-                    matches.append(format_name)
-
-        return matches
+        return detect_dataset_format(
+            ((format_name, importer.detect)
+                for format_name, importer in self.importers.items.items()),
+            path,
+        )
@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: MIT
 
-from enum import IntEnum
+from enum import Enum, IntEnum, auto
 from typing import (
-    Callable, Collection, Iterator, List, NoReturn, Optional, Sequence, TextIO,
-    Union,
+    Any, Callable, Collection, Iterable, Iterator, List, NoReturn, Optional,
+    Sequence, TextIO, Tuple, Union,
 )
 import contextlib
 import fnmatch
 import glob
+import logging as log
 import os.path as osp
 
+from typing_extensions import Protocol
+
 
 class FormatDetectionConfidence(IntEnum):
     """
@@ -59,6 +62,18 @@ def __init__(self, failed_alternatives: Sequence[str]) -> None:
         assert failed_alternatives
         self.failed_alternatives = tuple(failed_alternatives)
 
+    def __str__(self) -> str:
+        """
+        Returns a multi-line message: a header line followed by
+        each failed alternative on its own line, indented by two spaces.
+        """
+
+        if len(self.failed_alternatives) > 1:
+            header = "None of the following requirements were met:"
+        else:
+            header = "The following requirement was not met:"
+
+        requirements = ['  ' + req for req in self.failed_alternatives]
+
+        return '\n'.join([header, *requirements])
+
+
 class FormatDetectionContext:
     """
     An instance of this class is given to a dataset format detector.
@@ -331,3 +346,86 @@ def apply_format_detector(
         context.fail(f"root path {dataset_root_path} must refer to a directory")
 
     return detector(context) or FormatDetectionConfidence.MEDIUM
+
+class RejectionReason(Enum):
+    """
+    The reasons for which a format can be rejected during detection.
+    """
+
+    # the format detector reported requirements that were not met
+    unmet_requirements = auto()
+
+    # another format was matched with more confidence
+    insufficient_confidence = auto()
+
+class RejectionCallback(Protocol):
+    """
+    The signature of a callable that receives a report about
+    a rejected format: the name of the format, the reason of
+    the rejection and a human-readable explanation.
+    """
+
+    def __call__(
+        self,
+        format_name: str,
+        reason: RejectionReason,
+        human_message: str,
+    ) -> Any:
+        ...
+
+def detect_dataset_format(
+    formats: Iterable[Tuple[str, FormatDetector]],
+    path: str,
+    *,
+    rejection_callback: Optional[RejectionCallback] = None,
+) -> Sequence[str]:
+    """
+    Determines which format(s) the dataset at the specified path belongs to.
+
+    The function applies each supplied detector to the given path and decides
+    whether the corresponding format is detected or rejected. A format may be
+    rejected if the detector fails or if it succeeds with less confidence than
+    another detector (other rejection reasons might be added in the future).
+
+    Args:
+        `formats` - The formats to be considered. Each element of the
+            iterable must be a tuple of a format name and a `FormatDetector`
+            instance.
+
+        `path` - the filesystem path to the dataset to be analyzed.
+
+        `rejection_callback` - Unless `None`, called for every rejected format
+            to report the reason it was rejected.
+
+    Returns: a sequence of detected format names. Only the formats matched
+        with the highest confidence are included, in the order in which
+        they were checked.
+
+    Raises:
+        `FileNotFoundError` - if `path` does not exist.
+    """
+
+    if not osp.exists(path):
+        raise FileNotFoundError(f"Path {path} doesn't exist")
+
+    def report_rejection(
+        format_name: str, reason: RejectionReason, human_message: str,
+    ):
+        # the callback is optional, so it is checked in a single place
+        if rejection_callback:
+            rejection_callback(format_name, reason, human_message)
+
+    def report_insufficient_confidence(
+        format_name: str,
+        format_with_more_confidence: str,
+    ):
+        report_rejection(
+            format_name, RejectionReason.insufficient_confidence,
+            f"Another format ({format_with_more_confidence}) "
+                "was matched with more confidence",
+        )
+
+    max_confidence = 0
+    matches = []
+
+    for format_name, detector in formats:
+        log.debug("Checking '%s' format...", format_name)
+        try:
+            new_confidence = apply_format_detector(path, detector)
+        except FormatRequirementsUnmet as ex:
+            human_message = str(ex)
+            report_rejection(
+                format_name, RejectionReason.unmet_requirements,
+                human_message)
+            log.debug(human_message)
+        else:
+            log.debug("Format matched with confidence %d", new_confidence)
+
+            # keep only matches with the highest confidence
+            if new_confidence > max_confidence:
+                # everything matched so far is superseded by this format
+                for match in matches:
+                    report_insufficient_confidence(match, format_name)
+
+                matches = [format_name]
+                max_confidence = new_confidence
+            elif new_confidence == max_confidence:
+                matches.append(format_name)
+            else: # new confidence is less than max
+                # max_confidence is only raised together with a match being
+                # recorded, so there is a match to refer to here
+                report_insufficient_confidence(format_name, matches[0])
+
+    return matches
@@ -25,6 +25,7 @@ weight: 3
   - [Commit](./command-reference/commit)
   - [Convert](./command-reference/convert)
   - [Create](./command-reference/create)
+  - [Detect format](./command-reference/detect-format)
   - [Diff](./command-reference/diff)
   - [Download](./command-reference/download)
   - [Explain](./command-reference/explain)

@@ -29,6 +29,7 @@ flowchart LR
     s===s_remove[remove]:::hideclass
   d====_add[add]:::filloneclass
   d====_create[create]:::filloneclass
+  d====_detect_format[detect-format]:::filloneclass
   d====_download[download]:::filloneclass
   d====_export[export]:::filloneclass
   d====_import[import]:::filloneclass