Enrich stack trace while importing dataset to improve user experience #992

Merged: 12 commits, May 9, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Enhancements
- Use autosummary for fully-automatic Python module docs generation
(<https://github.com/openvinotoolkit/datumaro/pull/973>)
- Enrich stack trace for better user experience when importing
(<https://github.com/openvinotoolkit/datumaro/pull/992>)

### Bug fixes
- Fix Mapillary Vistas data format
7 changes: 6 additions & 1 deletion datumaro/components/dataset.py
@@ -32,6 +32,7 @@
from datumaro.components.errors import (
CategoriesRedefinedError,
ConflictingCategoriesError,
DatasetImportError,
DatasetInfosRedefinedError,
MediaTypeError,
MultipleFormatsMatchError,
@@ -1249,7 +1250,11 @@ def import_from(
if eager:
dataset.init_cache()
except _ImportFail as e:
raise e.__cause__
cause = e.__cause__ if getattr(e, "__cause__", None) is not None else e
cause.__traceback__ = e.__traceback__
raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from cause
except Exception as e:
raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from e

dataset._source_path = path
dataset._format = format
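Note on the new exception handling above: the goal is to surface a single `DatasetImportError` while keeping the underlying failure reachable through `__cause__` with its original traceback. A minimal, self-contained sketch of the pattern, with stand-in class and function names rather than the real Datumaro internals:

```python
class _ImportFail(Exception):
    pass


class DatasetImportError(Exception):
    pass


def _detect_and_load(path):
    # Hypothetical low-level plugin failure carrying a chained root cause.
    raise _ImportFail("importer failed") from ValueError("bad annotation file")


def import_from(path, format):
    try:
        _detect_and_load(path)
    except _ImportFail as e:
        # Prefer the root cause but fall back to the wrapper, and keep the
        # original traceback so the full stack remains visible.
        cause = e.__cause__ if e.__cause__ is not None else e
        cause.__traceback__ = e.__traceback__
        raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from cause
    except Exception as e:
        raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from e


try:
    import_from("./data", "coco")
except DatasetImportError as err:
    print(err)                  # Failed to import dataset 'coco' at './data'.
    print(repr(err.__cause__))  # ValueError('bad annotation file')
```

Raising `from cause` keeps both tracebacks in the printed stack trace, which is the user-experience improvement this PR targets.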
4 changes: 3 additions & 1 deletion datumaro/components/environment.py
@@ -294,7 +294,9 @@ def detect_dataset(
else FormatDetectionConfidence.NONE
)

return [str(format) for format in all_matched_formats if format.confidence == max_conf]
return sorted(
[str(format) for format in all_matched_formats if format.confidence == max_conf]
)

def __reduce__(self):
return (self.__class__, ())
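For context on the `sorted(...)` wrapper: when several formats tie at the maximum confidence, the order of the matched names is otherwise implementation-dependent. A tiny illustration with hypothetical format names:

```python
# Hypothetical tie between three formats at the maximum confidence.
all_matched_formats = {"ade20k2017", "coco", "voc"}

# Without sorted(), the result follows the collection's iteration order;
# with it, callers, tests, and error messages see the same order every run.
print(sorted(str(fmt) for fmt in all_matched_formats))
# ['ade20k2017', 'coco', 'voc']
```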
6 changes: 4 additions & 2 deletions datumaro/components/errors.py
@@ -272,10 +272,12 @@ def __str__(self):

@define(auto_exc=False)
class DatasetNotFoundError(DatasetImportError):
path = field()
path: str = field()
format: str = field()
template: str = field(default="Failed to find dataset '{format}' at '{path}'")

def __str__(self):
return f"Failed to find dataset at '{self.path}'"
return self.template.format(path=self.path, format=self.format)


@define(auto_exc=False)
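The reworked `DatasetNotFoundError` now carries both the path and the format name and renders them through a template. A runnable approximation, assuming the attrs-style `define`/`field` helpers imported elsewhere in errors.py and a simplified base class, not the real hierarchy:

```python
from attrs import define, field


class DatasetImportError(Exception):
    pass


@define(auto_exc=False)
class DatasetNotFoundError(DatasetImportError):
    path: str = field()
    format: str = field()
    template: str = field(default="Failed to find dataset '{format}' at '{path}'")

    def __str__(self):
        return self.template.format(path=self.path, format=self.format)


# Importers now pass their format name as the second argument (see importer.py below).
print(DatasetNotFoundError("./data", "coco"))
# Failed to find dataset 'coco' at './data'
```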
4 changes: 2 additions & 2 deletions datumaro/components/importer.py
@@ -113,11 +113,11 @@ def find_sources_with_params(cls, path: str, **extra_params) -> List[Dict]:

def __call__(self, path, **extra_params):
if not path or not osp.exists(path):
raise DatasetNotFoundError(path)
raise DatasetNotFoundError(path, self.NAME)

found_sources = self.find_sources_with_params(osp.normpath(path), **extra_params)
if not found_sources:
raise DatasetNotFoundError(path)
raise DatasetNotFoundError(path, self.NAME)

sources = []
for desc in found_sources:
23 changes: 14 additions & 9 deletions datumaro/plugins/data_formats/ade20k2017.py
@@ -1,19 +1,22 @@
# Copyright (C) 2020-2021 Intel Corporation
# Copyright (C) 2020-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import errno
import glob
import logging as log
import os
import os.path as osp
import re
from typing import Optional

import numpy as np

from datumaro.components.annotation import AnnotationType, CompiledMask, LabelCategories, Mask
from datumaro.components.dataset_base import DatasetBase, DatasetItem
from datumaro.components.errors import InvalidAnnotationError
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
from datumaro.util.meta_file_util import has_meta_file, parse_meta_file
@@ -29,16 +32,16 @@ class Ade20k2017Path:


class Ade20k2017Base(DatasetBase):
def __init__(self, path):
def __init__(self, path: str, *, ctx: Optional[ImportContext] = None):
if not osp.isdir(path):
raise FileNotFoundError("Can't read dataset directory '%s'" % path)
raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

# exclude dataset meta file
subsets = [subset for subset in os.listdir(path) if osp.splitext(subset)[-1] != ".json"]
if len(subsets) < 1:
raise FileNotFoundError("Can't read subsets in directory '%s'" % path)
raise FileNotFoundError(errno.ENOENT, "Can't find subsets in directory", path)

super().__init__(subsets=sorted(subsets))
super().__init__(subsets=sorted(subsets), ctx=ctx)
self._path = path

self._items = []
@@ -127,16 +130,18 @@ def _load_items(self, subset):
def _load_item_info(self, path):
attr_path = osp.splitext(path)[0] + "_atr.txt"
if not osp.isfile(attr_path):
raise Exception("Can't find annotation file for image %s" % path)
raise FileNotFoundError(
errno.ENOENT, "Can't find annotation file for image %s" % path, attr_path
)

item_info = []
with open(attr_path, "r", encoding="utf-8") as f:
for line in f:
columns = [s.strip() for s in line.split("#")]
if len(columns) != 6:
raise Exception("Invalid line in %s" % attr_path)
raise InvalidAnnotationError("Invalid line in %s" % attr_path)
if columns[5][0] != '"' or columns[5][-1] != '"':
raise Exception(
raise InvalidAnnotationError(
"Attributes column are expected \
in double quotes, file %s"
% attr_path
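A quick aside on the errno-style constructors used above (not part of the diff): passing `(errno, message, filename)` to `FileNotFoundError` or `NotADirectoryError` populates `errno`, `strerror`, and `filename`, so the rendered message follows the conventional OSError form:

```python
import errno

e = FileNotFoundError(errno.ENOENT, "Can't find annotation file for image 1.jpg", "1_atr.txt")
print(e)                    # [Errno 2] Can't find annotation file for image 1.jpg: '1_atr.txt'
print(e.errno, e.filename)  # 2 1_atr.txt

d = NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", "./not_a_dir")
print(d)                    # e.g. [Errno 20] Can't find dataset directory: './not_a_dir'
```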
20 changes: 10 additions & 10 deletions datumaro/plugins/data_formats/ade20k2020.py
@@ -1,12 +1,14 @@
# Copyright (C) 2020-2022 Intel Corporation
# Copyright (C) 2020-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import errno
import glob
import logging as log
import os
import os.path as osp
import re
from typing import Optional

import numpy as np

@@ -19,7 +21,7 @@
)
from datumaro.components.dataset_base import DatasetBase, DatasetItem
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.util import parse_json
from datumaro.util.image import IMAGE_EXTENSIONS, find_images, lazy_image, load_image
@@ -37,16 +39,16 @@ class Ade20k2020Path:


class Ade20k2020Base(DatasetBase):
def __init__(self, path):
def __init__(self, path: str, *, ctx: Optional[ImportContext] = None):
if not osp.isdir(path):
raise FileNotFoundError("Can't read dataset directory '%s'" % path)
raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

# exclude dataset meta file
subsets = [subset for subset in os.listdir(path) if osp.splitext(subset)[-1] != ".json"]
if len(subsets) < 1:
raise FileNotFoundError("Can't read subsets in directory '%s'" % path)
raise FileNotFoundError(errno.ENOENT, "Can't find subsets in directory", path)

super().__init__(subsets=sorted(subsets))
super().__init__(subsets=sorted(subsets), ctx=ctx)
self._path = path

self._items = []
@@ -167,10 +169,8 @@ def _load_item_info(self, path):
json_path = osp.splitext(path)[0] + ".json"
item_info = []
if not osp.isfile(json_path):
raise Exception(
"Can't find annotation file (*.json) \
for image %s"
% path
raise FileNotFoundError(
errno.ENOENT, "Can't find annotation file for image %s" % path, json_path
)

with open(json_path, "r", encoding="latin-1") as f:
15 changes: 13 additions & 2 deletions datumaro/plugins/data_formats/arrow/base.py
@@ -4,11 +4,13 @@

import os.path as osp
import struct
from typing import List, Optional

import pyarrow as pa

from datumaro.components.dataset_base import SubsetBase
from datumaro.components.errors import MediaTypeError
from datumaro.components.importer import ImportContext
from datumaro.components.media import MediaType
from datumaro.components.merge import get_merger
from datumaro.plugins.data_formats.datumaro.base import DatumaroBase
@@ -19,10 +21,19 @@


class ArrowBase(SubsetBase):
def __init__(self, path, ctx, subset, additional_paths=[]):
def __init__(
self,
path: str,
additional_paths: Optional[List[str]] = None,
*,
subset: Optional[str] = None,
ctx: Optional[ImportContext] = None,
):
super().__init__(subset=subset, ctx=ctx)

self._paths = [path] + additional_paths
self._paths = [path]
if additional_paths:
self._paths += additional_paths

self._load()

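Besides the keyword-only `subset`/`ctx` arguments, the `additional_paths=[]` default was replaced with `None` because a mutable default is created once at function definition time and shared across calls. A small illustration of the pitfall (not Datumaro code):

```python
def collect(path, extra=[]):          # the shared-list pitfall
    extra.append(path)
    return extra


def collect_fixed(path, extra=None):  # fresh list per call, as in ArrowBase now
    paths = [path]
    if extra:
        paths += extra
    return paths


print(collect("a"), collect("b"))              # ['a', 'b'] ['a', 'b'] -- state leaks between calls
print(collect_fixed("a"), collect_fixed("b"))  # ['a'] ['b']
```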
46 changes: 29 additions & 17 deletions datumaro/plugins/data_formats/ava/ava.py
@@ -1,19 +1,21 @@
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import csv
import errno
import os
import os.path as osp
from typing import Optional

import google.protobuf.text_format as text_format

from datumaro.components.annotation import AnnotationType, Bbox, LabelCategories
from datumaro.components.dataset_base import DatasetItem, SubsetBase
from datumaro.components.errors import DatasetImportError, MediaTypeError
from datumaro.components.errors import MediaTypeError
from datumaro.components.exporter import Exporter
from datumaro.components.format_detection import FormatDetectionConfidence, FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import Image
from datumaro.util.os_util import find_files

@@ -32,32 +34,41 @@ class AvaPath:


class AvaBase(SubsetBase):
def __init__(self, path):
def __init__(
self,
path: str,
*,
subset: Optional[str] = None,
ctx: Optional[ImportContext] = None,
):
if not osp.isfile(path):
raise DatasetImportError(f"Can't find CSV file at '{path}'")
raise FileNotFoundError(errno.ENOENT, "Can't find CSV file", path)
self._path = path

subset = (
osp.splitext(osp.basename(path))[0]
.replace(AvaPath.ANNOTATION_PREFIX, "")
.replace(AvaPath.ANNOTATION_VERSION, "")
)
super().__init__(subset=subset)
if not subset:
subset = (
osp.splitext(osp.basename(path))[0]
.replace(AvaPath.ANNOTATION_PREFIX, "")
.replace(AvaPath.ANNOTATION_VERSION, "")
)
super().__init__(subset=subset, ctx=ctx)

if path.endswith(osp.join(AvaPath.ANNOTATION_DIR, osp.basename(path))):
self._rootpath = path.rsplit(AvaPath.ANNOTATION_DIR, maxsplit=1)[0]
else:
raise DatasetImportError(
raise FileNotFoundError(
errno.ENOENT,
f"Annotation path ({path}) should be under the directory which is named {AvaPath.ANNOTATION_DIR}. "
"If not, Datumaro fails to find the root path for this dataset."
"If not, Datumaro fails to find the root path for this dataset.",
)

if self._rootpath and osp.isdir(osp.join(self._rootpath, AvaPath.IMAGE_DIR)):
self._images_dir = osp.join(self._rootpath, AvaPath.IMAGE_DIR)
else:
raise DatasetImportError(
raise FileNotFoundError(
errno.ENOENT,
f"Root path ({self._rootpath}) should contain the directory which is named {AvaPath.IMAGE_DIR}. "
"If not, Datumaro fails to find the image directory path."
"If not, Datumaro fails to find the image directory path.",
)

self._infos = self._load_infos(osp.dirname(path))
@@ -78,9 +89,10 @@ def _load_infos(self, path):

def _load_categories(self, category_path):
if not osp.exists(category_path):
raise DatasetImportError(
raise FileNotFoundError(
errno.ENOENT,
f"Label lists cannot be found in ({category_path}). "
"If not, Datumaro fails to import AVA action dataset."
"If not, Datumaro fails to import AVA action dataset.",
)

with open(category_path, "r") as f:
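The `AvaBase` constructor now only derives the subset name from the CSV file name when no `subset` is passed in. A sketch of that derivation; the prefix and version strings below are placeholders for the `AvaPath` constants, not their actual values:

```python
import os.path as osp

ANNOTATION_PREFIX = "ava_"     # stand-in for AvaPath.ANNOTATION_PREFIX
ANNOTATION_VERSION = "_v2.2"   # stand-in for AvaPath.ANNOTATION_VERSION


def derive_subset(path, subset=None):
    if not subset:
        subset = (
            osp.splitext(osp.basename(path))[0]
            .replace(ANNOTATION_PREFIX, "")
            .replace(ANNOTATION_VERSION, "")
        )
    return subset


print(derive_subset("annotations/ava_train_v2.2.csv"))                # train
print(derive_subset("annotations/ava_train_v2.2.csv", subset="val"))  # val (explicit subset wins)
```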
12 changes: 7 additions & 5 deletions datumaro/plugins/data_formats/brats.py
@@ -1,17 +1,19 @@
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2023 Intel Corporation
#
# SPDX-License-Identifier: MIT

import errno
import glob
import os.path as osp
from typing import Optional

import nibabel as nib
import numpy as np

from datumaro.components.annotation import AnnotationType, LabelCategories, Mask
from datumaro.components.dataset_base import DatasetItem, SubsetBase
from datumaro.components.format_detection import FormatDetectionContext
from datumaro.components.importer import Importer
from datumaro.components.importer import ImportContext, Importer
from datumaro.components.media import MultiframeImage


@@ -22,17 +24,17 @@ class BratsPath:


class BratsBase(SubsetBase):
def __init__(self, path):
def __init__(self, path: str, *, ctx: Optional[ImportContext] = None):
if not osp.isdir(path):
raise FileNotFoundError("Can't read dataset directory '%s'" % path)
raise NotADirectoryError(errno.ENOTDIR, "Can't find dataset directory", path)

self._subset_suffix = osp.basename(path)[len(BratsPath.IMAGES_DIR) :]
subset = None
if self._subset_suffix == "Tr":
subset = "train"
elif self._subset_suffix == "Ts":
subset = "test"
super().__init__(subset=subset, media_type=MultiframeImage)
super().__init__(subset=subset, media_type=MultiframeImage, ctx=ctx)

self._root_dir = osp.dirname(path)
self._categories = self._load_categories()