Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a dedicated command to detect a dataset's format #576

Merged
merged 12 commits into from
Jan 25, 2022
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
(<https://github.com/openvinotoolkit/datumaro/pull/584>)
- MARS format (import-only)
(<https://github.com/openvinotoolkit/datumaro/pull/585>)
- Command to detect the format of a dataset
(<https://github.com/openvinotoolkit/datumaro/pull/576>)

### Changed
- The `pycocotools` dependency lower bound is raised to `2.0.4`.
Expand Down
2 changes: 2 additions & 0 deletions datumaro/cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ def _get_known_commands():
("", None, ''),
("Dataset operations:", None, ''),
('convert', commands.convert, "Convert dataset between formats"),
('detect-format', commands.detect_format,
"Detect the format of a dataset"),
('diff', commands.diff, "Compare datasets"),
('download', commands.download, "Download a publicly available dataset"),
('explain', commands.explain, "Run Explainable AI algorithm for model"),
Expand Down
8 changes: 4 additions & 4 deletions datumaro/cli/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Copyright (C) 2019-2021 Intel Corporation
# Copyright (C) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

# pylint: disable=redefined-builtin

from . import (
add, checkout, commit, convert, create, diff, download, explain, export,
filter, import_, info, log, merge, patch, remove, stats, status, transform,
validate,
add, checkout, commit, convert, create, detect_format, diff, download,
explain, export, filter, import_, info, log, merge, patch, remove, stats,
status, transform, validate,
)
118 changes: 118 additions & 0 deletions datumaro/cli/commands/detect_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright (C) 2021-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

import argparse
import json

from datumaro.cli.util import MultilineFormatter
from datumaro.cli.util.project import load_project
from datumaro.components.environment import Environment
from datumaro.components.errors import ProjectNotFoundError
from datumaro.components.format_detection import (
RejectionReason, detect_dataset_format,
)
from datumaro.util.scope import scope_add, scoped


def build_parser(parser_ctor=argparse.ArgumentParser):
    """
    Creates the argument parser of the "detect-format" command.

    Args:
        `parser_ctor` - the callable used to construct the parser. It is
            invoked with the `help`, `description` and `formatter_class`
            keyword arguments.

    Returns: the configured parser, whose default `command` is
        `detect_format_command`.
    """

    # The "|n" and "|s" markers are kept verbatim; presumably they are
    # interpreted by MultilineFormatter (confirm against its implementation).
    description = """
Attempts to detect the format of a dataset in a directory.
Currently, only local directories are supported.|n
|n
By default, this command shows a human-readable report with the ID
of the format that was detected (if any). If Datumaro is unable to
unambiguously determine a single format, all matching formats will
be shown.|n
|n
To see why other formats were rejected, use --show-rejections. To get
machine-readable output, use --json-report.|n
|n
The value of -p/--project is used as a context for plugins.|n
|n
Example:|n
|s|s%(prog)s --show-rejections path/to/dataset
"""

    parser = parser_ctor(
        help="Detect the format of a dataset",
        description=description,
        formatter_class=MultilineFormatter,
    )

    # (flags, options) pairs, registered in this order
    argument_specs = (
        (('url',), {
            'help': "URL to the dataset; a path to a directory",
        }),
        (('-p', '--project'), {
            'dest': 'project_dir',
            'help': "Directory of the project to use as the context "
                "(default: current dir)",
        }),
        (('--show-rejections',), {
            'action': 'store_true',
            'help': "Describe why each supported format that wasn't detected "
                "was rejected",
        }),
        (('--json-report',), {
            'help': "Path to which to save a JSON report describing detected "
                "and rejected formats. By default, no report is saved.",
        }),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    parser.set_defaults(command=detect_format_command)

    return parser

def get_sensitive_args():
    """
    Maps each command function of this module to the names of its arguments
    that are considered sensitive (as the function name indicates).

    Only the `url` argument of `detect_format_command` is listed.
    """

    sensitive_by_command = {}
    sensitive_by_command[detect_format_command] = ['url']
    return sensitive_by_command

@scoped
def detect_format_command(args):
    """
    Implements the "detect-format" command.

    Runs format detection on `args.url` using the importers of the project
    environment (or of a fresh `Environment` when no project is found),
    prints a human-readable report and, if `args.json_report` is set,
    saves a JSON report to that path.

    Raises:
        ProjectNotFoundError - if `args.project_dir` was given explicitly
            and no project could be loaded from it.
    """

    try:
        project = scope_add(load_project(args.project_dir))
    except ProjectNotFoundError:
        # A missing project is only an error when one was requested explicitly
        if args.project_dir:
            raise
        project = None

    env = Environment() if project is None else project.env

    # `rejections` is shared with the report so that the callback below
    # fills the report in as detection proceeds.
    rejections = {}
    report = {'rejected_formats': rejections}

    def rejection_callback(
        format_name: str, reason: RejectionReason, human_message: str,
    ):
        rejections[format_name] = {
            'reason': reason.name,
            'message': human_message,
        }

    candidates = (
        (name, importer.detect)
        for name, importer in env.importers.items.items()
    )
    detected_formats = detect_dataset_format(
        candidates, args.url, rejection_callback=rejection_callback)
    report['detected_formats'] = detected_formats

    if not detected_formats:
        print("Unable to detect the format")
    elif len(detected_formats) == 1:
        print(f"Detected format: {detected_formats[0]}")
    else:
        print("Ambiguous dataset; detected the following formats:")
        print()
        for name in sorted(detected_formats):
            print(f"- {name}")

    if args.show_rejections:
        print()
        if not rejections:
            print("No formats were rejected.")
        else:
            print("The following formats were rejected:")
            print()

            for name in sorted(rejections):
                print(f"{name}:")
                for line in rejections[name]['message'].split('\n'):
                    print(f" {line}")

    if args.json_report:
        with open(args.json_report, 'w', encoding='UTF-8') as report_file:
            json.dump(report, report_file, indent=4)
41 changes: 7 additions & 34 deletions datumaro/components/environment.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2020-2021 Intel Corporation
# Copyright (C) 2020-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT

Expand All @@ -13,9 +13,7 @@
import os.path as osp

from datumaro.components.cli_plugin import CliPlugin, plugin_types
from datumaro.components.format_detection import (
FormatRequirementsUnmet, apply_format_detector,
)
from datumaro.components.format_detection import detect_dataset_format
from datumaro.util.os_util import import_foreign_module, split_path

T = TypeVar('T')
Expand Down Expand Up @@ -232,33 +230,8 @@ def is_format_known(self, name):
return name in self.importers or name in self.extractors

def detect_dataset(self, path):
    """
    Determines which of the registered importer formats match the dataset
    located at `path`.

    The work is delegated to `detect_dataset_format`, which is given the
    `detect` attribute of every registered importer, so that the detection
    logic lives in a single place.

    Returns: a sequence of detected format names.

    Raises:
        FileNotFoundError - if `path` doesn't exist (raised by
            `detect_dataset_format`).
    """

    return detect_dataset_format(
        ((format_name, importer.detect)
            for format_name, importer in self.importers.items.items()),
        path,
    )
104 changes: 101 additions & 3 deletions datumaro/components/format_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
#
# SPDX-License-Identifier: MIT

from enum import IntEnum
from enum import Enum, IntEnum, auto
from typing import (
Callable, Collection, Iterator, List, NoReturn, Optional, Sequence, TextIO,
Union,
Any, Callable, Collection, Iterable, Iterator, List, NoReturn, Optional,
Sequence, TextIO, Tuple, Union,
)
import contextlib
import fnmatch
import glob
import logging as log
import os.path as osp

from typing_extensions import Protocol


class FormatDetectionConfidence(IntEnum):
"""
Expand Down Expand Up @@ -59,6 +62,18 @@ def __init__(self, failed_alternatives: Sequence[str]) -> None:
assert failed_alternatives
self.failed_alternatives = tuple(failed_alternatives)

def __str__(self) -> str:
    """
    Builds a multi-line, human-readable description of the failure:
    a heading followed by one line per failed alternative.
    """

    if len(self.failed_alternatives) > 1:
        heading = "None of the following requirements were met:"
    else:
        heading = "The following requirement was not met:"

    details = [' ' + req for req in self.failed_alternatives]

    return '\n'.join([heading, *details])

class FormatDetectionContext:
"""
An instance of this class is given to a dataset format detector.
Expand Down Expand Up @@ -331,3 +346,86 @@ def apply_format_detector(
context.fail(f"root path {dataset_root_path} must refer to a directory")

return detector(context) or FormatDetectionConfidence.MEDIUM

class RejectionReason(Enum):
    """
    Identifies why a format was rejected during dataset format detection.
    A value of this type is passed to the rejection callback of
    `detect_dataset_format`.
    """

    # The format's detector raised FormatRequirementsUnmet
    unmet_requirements = auto()
    # Another format was matched with more confidence
    insufficient_confidence = auto()

class RejectionCallback(Protocol):
    """
    The signature of a callback that `detect_dataset_format` invokes
    for every rejected format.

    Args:
        `format_name` - the name of the rejected format.

        `reason` - the kind of rejection.

        `human_message` - a human-readable explanation of the rejection.

    The return value is not used by `detect_dataset_format`.
    """

    def __call__(self,
        format_name: str, reason: RejectionReason, human_message: str,
    ) -> Any:
        ...

def detect_dataset_format(
    formats: Iterable[Tuple[str, FormatDetector]],
    path: str,
    *,
    rejection_callback: Optional[RejectionCallback] = None,
) -> Sequence[str]:
    """
    Determines which format(s) the dataset at the specified path belongs to.

    The function applies each supplied detector to the given path and decides
    whether the corresponding format is detected or rejected. A format may be
    rejected if the detector fails or if it succeeds with less confidence than
    another detector (other rejection reasons might be added in the future).

    Args:
        `formats` - The formats to be considered. Each element of the
            iterable must be a tuple of a format name and a `FormatDetector`
            instance.

        `path` - the filesystem path to the dataset to be analyzed.

        `rejection_callback` - Unless `None`, called for every rejected format
            to report the reason it was rejected.

    Returns: a sequence of detected format names. Only the formats matched
        with the highest confidence are included, in the order in which
        they were supplied.

    Raises:
        FileNotFoundError - if `path` doesn't exist.
    """

    if not osp.exists(path):
        raise FileNotFoundError(f"Path {path} doesn't exist")

    def report_insufficient_confidence(
        format_name: str,
        format_with_more_confidence: str,
    ):
        if rejection_callback:
            rejection_callback(
                format_name, RejectionReason.insufficient_confidence,
                f"Another format ({format_with_more_confidence}) "
                    "was matched with more confidence",
            )

    max_confidence = 0
    matches = []

    for format_name, detector in formats:
        log.debug("Checking '%s' format...", format_name)
        try:
            new_confidence = apply_format_detector(path, detector)
        except FormatRequirementsUnmet as ex:
            human_message = str(ex)
            if rejection_callback:
                rejection_callback(
                    format_name, RejectionReason.unmet_requirements,
                    human_message)
            # The message is passed as an argument, not as the format string,
            # so that its contents are never treated as formatting directives
            log.debug("%s", human_message)
            continue

        log.debug("Format matched with confidence %d", new_confidence)

        # keep only matches with the highest confidence
        if new_confidence > max_confidence:
            # every match collected so far is superseded by the new one
            for match in matches:
                report_insufficient_confidence(match, format_name)

            matches = [format_name]
            max_confidence = new_confidence
        elif new_confidence == max_confidence:
            matches.append(format_name)
        else: # new confidence is less than max
            # NOTE(review): this relies on `matches` being non-empty here,
            # i.e. on detectors never reporting a confidence below
            # the initial `max_confidence` of 0 - confirm
            report_insufficient_confidence(format_name, matches[0])

    return matches
1 change: 1 addition & 0 deletions site/content/en/docs/user-manual/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ weight: 3
- [Commit](./command-reference/commit)
- [Convert](./command-reference/convert)
- [Create](./command-reference/create)
- [Detect format](./command-reference/detect-format)
- [Diff](./command-reference/diff)
- [Download](./command-reference/download)
- [Explain](./command-reference/explain)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ flowchart LR
s===s_remove[remove]:::hideclass
d====_add[add]:::filloneclass
d====_create[create]:::filloneclass
d====_detect_format[detect-format]:::filloneclass
d====_download[download]:::filloneclass
d====_export[export]:::filloneclass
d====_import[import]:::filloneclass
Expand Down
Loading