Mergeback 1.9.1 to develop (#1623)
<!-- Contributing guide:
https://github.com/openvinotoolkit/datumaro/blob/develop/CONTRIBUTING.md
-->

### Summary

<!--
Resolves #111 and #222.
Depends on #1000 (for series of dependent commits).

This PR introduces this capability to make the project better in this
and that.

- Added this feature
- Removed that feature
- Fixed the problem #1234
-->

### How to test
<!-- Describe the testing procedure for reviewers, if changes are
not fully covered by unit tests or manual testing can be complicated.
-->

### Checklist
<!-- Put an 'x' in all the boxes that apply -->
- [ ] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [ ] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [ ] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: Ilya Trushkin <ilya.trushkin@intel.com>
Co-authored-by: Sooah Lee <sooah.lee@intel.com>
Co-authored-by: Ilya Trushkin <ilya.trushkin@intel.com>
Co-authored-by: williamcorsel <31770711+williamcorsel@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Wonju Lee <wonju.lee@intel.com>
6 people authored Sep 27, 2024
1 parent c4d7bb4 commit fdd1ac2

Showing 16 changed files with 184 additions and 24 deletions.
13 changes: 12 additions & 1 deletion CHANGELOG.md
```diff
@@ -21,7 +21,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ### Bug fixes

-## \[Q3 2024 Release 1.9.0\]
+## Q4 2024 Release 1.9.1
+### Enhancements
+- Support multiple labels for kaggle format
+  (<https://github.com/openvinotoolkit/datumaro/pull/1607>)
+- Use DataFrame.map instead of DataFrame.applymap
+  (<https://github.com/openvinotoolkit/datumaro/pull/1613>)
+
+### Bug fixes
+- Fix StreamDataset merging when importing in eager mode
+  (<https://github.com/openvinotoolkit/datumaro/pull/1609>)
+
+## Q3 2024 Release 1.9.0
 ### New features
 - Add a new CLI command: datum format
   (<https://github.com/openvinotoolkit/datumaro/pull/1570>)
```
12 changes: 12 additions & 0 deletions docs/source/docs/release_notes.rst
```diff
@@ -4,6 +4,18 @@ Release Notes
 .. toctree::
    :maxdepth: 1

+v1.9.1 (2024 Q3)
+----------------
+
+Enhancements
+^^^^^^^^^^^^
+- Support multiple labels for kaggle format
+- Use DataFrame.map instead of DataFrame.applymap
+
+Bug fixes
+^^^^^^^^^
+- Fix StreamDataset merging when importing in eager mode
+
 v1.9.0 (2024 Q3)
 ----------------
```
11 changes: 8 additions & 3 deletions src/datumaro/components/dataset.py
```diff
@@ -1023,17 +1023,22 @@ class _MergedStreamDataset(cls):
            def __init__(self, *sources: IDataset):
                from datumaro.components.hl_ops import HLOps

-               self.merged = HLOps.merge(*sources, merge_policy=merge_policy)
+               self._merged = HLOps.merge(*sources, merge_policy=merge_policy)
+               self._data = self._merged._data
                self._env = env
                self._format = DEFAULT_FORMAT
                self._source_path = None
                self._options = {}

            def __iter__(self):
-               yield from self.merged
+               yield from self._merged
+
+           @property
+           def is_stream(self):
+               return True

            def subsets(self) -> Dict[str, DatasetSubset]:
-               return self.merged.subsets()
+               return self._merged.subsets()

        return _MergedStreamDataset(*sources)
```
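This hunk renames the wrapped dataset to `_merged`, re-exposes its `_data`, and adds an explicit `is_stream` property, so a dataset merged from stream sources no longer loses its streaming flag when imported in eager mode. Below is a minimal sketch of the call that `_MergedStreamDataset` wraps; the paths, the `imagenet` format name, and the `union` policy are assumptions for illustration, not taken from this commit:

```python
# Hedged sketch, not the library's test code: HLOps.merge is the call that
# _MergedStreamDataset wraps above. Paths, format name, and merge policy
# are placeholders.
from datumaro.components.dataset import Dataset
from datumaro.components.hl_ops import HLOps

a = Dataset.import_from("path/to/ds_a", "imagenet")  # hypothetical inputs
b = Dataset.import_from("path/to/ds_b", "imagenet")

merged = HLOps.merge(a, b, merge_policy="union")
for item in merged:  # __iter__ delegates to the merged dataset
    print(item.id)
```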
75 changes: 59 additions & 16 deletions src/datumaro/plugins/data_formats/kaggle/base.py
```diff
@@ -77,21 +77,43 @@ def _parse_bbox_coords(self, bbox_str):
         # expected to output [x1, y1, x2, y2]
         return [float(coord.strip()) for coord in coords]

-    def _load_annotations(self, datas: list, indices: Dict[str, int], bbox_flag: bool):
+    def _load_annotations(
+        self, datas: list, indices: Dict[str, Union[int, Dict[str, int]]], bbox_flag: bool
+    ):
         if "label" in indices:
-            label_name = str(datas[indices["label"]])
-            label, cat = self._label_cat.find(label_name)
-            if not cat:
-                self._label_cat.add(label_name)
-                label, _ = self._label_cat.find(label_name)
+            label_indices = indices["label"]
+            if isinstance(label_indices, dict):
+                labels = []
+                list_values = datas[1:]
+                index_to_label = {v: k for k, v in label_indices.items()}
+                present_labels = [
+                    index_to_label[i + 1] for i, value in enumerate(list_values) if value == "1"
+                ]
+
+                for label_name in present_labels:
+                    label, cat = self._label_cat.find(label_name)
+                    if not cat:
+                        self._label_cat.add(label_name)
+                        label, _ = self._label_cat.find(label_name)
+                    labels.append(Label(label=label))
+            else:
+                label_name = str(datas[indices["label"]])
+                label, cat = self._label_cat.find(label_name)
+                if not cat:
+                    self._label_cat.add(label_name)
+                    label, _ = self._label_cat.find(label_name)
         else:
             _, cat = self._label_cat.find("object")
             if not cat:
                 self._label_cat.add("object")
             label = 0

         if "label" in indices and not bbox_flag:
+            label_indices = indices["label"]
+            if isinstance(label_indices, dict):
+                return labels
             return Label(label=label)

         if bbox_flag:
             if "bbox" in indices:
                 coords = self._parse_bbox_coords(datas[indices["bbox"]])
@@ -125,7 +147,14 @@ def _load_items(self, ann_file: str, columns: Dict[str, Union[str, list]]):

         indices = {"media": df_fields.index(columns["media"])}
         if "label" in columns:
-            indices.update({"label": df_fields.index(columns["label"])})
+            label_columns = columns["label"]
+            if isinstance(label_columns, list):
+                indices_label = {}
+                for label in label_columns:
+                    indices_label[label] = df_fields.index(label)
+                indices.update({"label": indices_label})
+            else:
+                indices.update({"label": df_fields.index(label_columns)})

         bbox_flag = False
         bbox_index = columns.get("bbox")
@@ -165,16 +194,30 @@ def _load_items(self, ann_file: str, columns: Dict[str, Union[str, list]]):
                 continue

             ann = self._load_annotations(data_info, indices, bbox_flag)
-            self._ann_types.add(ann.type)
-            if item_id in items:
-                items[item_id].annotations.append(ann)
-            else:
-                items[item_id] = DatasetItem(
-                    id=item_id,
-                    subset=self._subset,
-                    media=Image.from_file(path=media_path),
-                    annotations=[ann],
-                )
+            if isinstance(ann, list):
+                for label in ann:
+                    self._ann_types.add(label.type)
+                if item_id in items:
+                    for label in ann:
+                        items[item_id].annotations.append(label)
+                else:
+                    items[item_id] = DatasetItem(
+                        id=item_id,
+                        subset=self._subset,
+                        media=Image.from_file(path=media_path),
+                        annotations=ann,
+                    )
+            else:
+                self._ann_types.add(ann.type)
+                if item_id in items:
+                    items[item_id].annotations.append(ann)
+                else:
+                    items[item_id] = DatasetItem(
+                        id=item_id,
+                        subset=self._subset,
+                        media=Image.from_file(path=media_path),
+                        annotations=[ann],
+                    )
         return items.values()

     def categories(self):
```
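Together, these hunks let `columns["label"]` be either a single column name or a list of one-hot label columns. A hedged usage sketch follows; the `kaggle_image_csv` format name is an assumption, and the paths and column names simply mirror the test assets added below:

```python
# Sketch of the new multi-label import path; the format name and paths are
# assumed for illustration and mirror the test parameters added below.
import os.path as osp

from datumaro.components.dataset import Dataset

root = "tests/assets/kaggle_dataset/image_csv_multi_label"
dataset = Dataset.import_from(
    osp.join(root, "images"),
    "kaggle_image_csv",
    ann_file=osp.join(root, "ann.csv"),
    columns={"media": "image_name", "label": ["dog", "cat", "person"]},
)
```

Note that the one-hot parsing assumes the media column comes first: `datas[1:]` pairs the remaining row values with the label columns by position (`index_to_label[i + 1]`).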
2 changes: 1 addition & 1 deletion src/datumaro/plugins/transforms.py
```diff
@@ -1974,7 +1974,7 @@ def refine_tabular_media(self, item):
             or item.media.table.dtype(col) is int
         ]

-        df[str_cols] = df[str_cols].applymap(lambda x: self.remove_unnecessary_char(x))
+        df[str_cols] = df[str_cols].map(lambda x: self.remove_unnecessary_char(x))

         if not (self._outlier_value):
             self.check_outlier(media.table.data[float_cols + int_cols], float_cols + int_cols)
```
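Context for this one-liner: pandas 2.1 deprecated `DataFrame.applymap` in favor of the element-wise `DataFrame.map`, so the call is swapped to avoid the deprecation warning. A minimal standalone illustration:

```python
# Element-wise cleanup with DataFrame.map (pandas >= 2.1); it behaves the
# same as the deprecated DataFrame.applymap.
import pandas as pd

df = pd.DataFrame({"a": [" x ", " y "], "b": [" z", "w "]})
cleaned = df.map(str.strip)  # was: df.applymap(str.strip)
print(cleaned)
```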
2 changes: 1 addition & 1 deletion src/datumaro/version.py
```diff
@@ -1 +1 @@
-__version__ = "1.9.0"
+__version__ = "1.10.0.dev0"
```
7 changes: 7 additions & 0 deletions tests/assets/kaggle_dataset/image_csv_multi_label/ann.csv
```diff
@@ -0,0 +1,7 @@
+image_name,dog,cat,person
+1.jpg,1,0,0
+2.jpg,0,1,0
+3.jpg,0,0,1
+4.jpg,1,1,0
+5.jpg,1,0,1
+6.jpg,0,1,1
```
7 changes: 7 additions & 0 deletions tests/assets/kaggle_dataset/image_csv_multi_label/ann_wo_ext.csv

```diff
@@ -0,0 +1,7 @@
+image_name,dog,cat,person
+1,1,0,0
+2,0,1,0
+3,0,0,1
+4,1,1,0
+5,1,0,1
+6,0,1,1
```
(The remaining 6 changed files are binary test images — presumably the `1.jpg`–`6.jpg` files referenced by the CSV assets above — and cannot be displayed.)
70 changes: 70 additions & 0 deletions tests/unit/data_formats/test_kaggle.py
```diff
@@ -20,6 +20,9 @@
 from tests.utils.test_utils import compare_datasets

 DUMMY_DATASET_IMAGE_CSV_DIR = get_test_asset_path("kaggle_dataset", "image_csv")
+DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR = get_test_asset_path(
+    "kaggle_dataset", "image_csv_multi_label"
+)
 DUMMY_DATASET_IMAGE_CSV_DET_DIR = get_test_asset_path("kaggle_dataset", "image_csv_det")
 DUMMY_DATASET_IMAGE_TXT_DIR = get_test_asset_path("kaggle_dataset", "image_txt")
 DUMMY_DATASET_IMAGE_TXT_DET_DIR = get_test_asset_path("kaggle_dataset", "image_txt_det")
@@ -72,6 +75,51 @@ def fxt_img_dataset() -> Dataset:
     )


+@pytest.fixture
+def fxt_img_multi_label_dataset() -> Dataset:
+    return Dataset.from_iterable(
+        [
+            DatasetItem(
+                id="1",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=0)],
+            ),
+            DatasetItem(
+                id="2",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=1)],
+            ),
+            DatasetItem(
+                id="3",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=2)],
+            ),
+            DatasetItem(
+                id="4",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=0), Label(label=1)],
+            ),
+            DatasetItem(
+                id="5",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=0), Label(label=2)],
+            ),
+            DatasetItem(
+                id="6",
+                subset="default",
+                media=Image.from_numpy(data=np.ones((5, 10, 3))),
+                annotations=[Label(label=1), Label(label=2)],
+            ),
+        ],
+        categories=["dog", "cat", "person"],
+    )
+
+
 @pytest.fixture
 def fxt_img_det_dataset() -> Dataset:
     return Dataset.from_iterable(
@@ -321,6 +369,8 @@ def fxt_coco_dataset() -> Dataset:
 IDS = [
     "IMAGE_CSV",
     "IMAGE_CSV_WO_EXT",
+    "IMAGE_CSV_MULTI_LB",
+    "IMAGE_CSV_MULTI_LB_WO_EXT",
     "IMAGE_CSV_DET",
     "IMAGE_CSV_DET2",
     "IMAGE_CSV_DET3",
@@ -372,6 +422,26 @@ def test_can_detect(self, fxt_dataset_dir: str):
             "columns": {"media": "image_name", "label": "label_name"},
         },
     ),
+    (
+        DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR,
+        "images",
+        "fxt_img_multi_label_dataset",
+        KaggleImageCsvBase,
+        {
+            "ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann.csv"),
+            "columns": {"media": "image_name", "label": ["dog", "cat", "person"]},
+        },
+    ),
+    (
+        DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR,
+        "images",
+        "fxt_img_multi_label_dataset",
+        KaggleImageCsvBase,
+        {
+            "ann_file": osp.join(DUMMY_DATASET_IMAGE_CSV_MULTI_LB_DIR, "ann_wo_ext.csv"),
+            "columns": {"media": "image_name", "label": ["dog", "cat", "person"]},
+        },
+    ),
     (
         DUMMY_DATASET_IMAGE_CSV_DET_DIR,
         "images",
```
9 changes: 7 additions & 2 deletions tests/unit/test_imagenet_format.py
```diff
@@ -7,6 +7,7 @@
 import pytest

 from datumaro.components.annotation import AnnotationType, Label, LabelCategories
+from datumaro.components.contexts.importer import ImportErrorPolicy
 from datumaro.components.dataset import Dataset, StreamDataset
 from datumaro.components.dataset_base import DatasetItem
 from datumaro.components.environment import Environment
@@ -214,7 +215,9 @@ def _create_expected_dataset(self):
     @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
     def test_can_import(self, dataset_cls, is_stream, helper_tc):
         expected_dataset = self._create_expected_dataset()
-        dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
+        dataset = dataset_cls.import_from(
+            self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
+        )
         assert dataset.is_stream == is_stream

         compare_datasets(helper_tc, expected_dataset, dataset, require_media=True)
@@ -240,7 +243,9 @@ class ImagenetWithSubsetDirsImporterTest(ImagenetImporterTest):
     @mark_requirement(Requirements.DATUM_GENERAL_REQ)
     @pytest.mark.parametrize("dataset_cls, is_stream", [(Dataset, False), (StreamDataset, True)])
     def test_can_import(self, dataset_cls, is_stream, helper_tc):
-        dataset = dataset_cls.import_from(self.DUMMY_DATASET_DIR, self.IMPORTER_NAME)
+        dataset = dataset_cls.import_from(
+            self.DUMMY_DATASET_DIR, self.IMPORTER_NAME, error_policy=ImportErrorPolicy()
+        )
         assert dataset.is_stream == is_stream

         for subset_name, subset in dataset.subsets().items():
```
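The only functional change here is threading an explicit `error_policy` into `import_from`. A sketch of the same call outside the test harness — the dataset path is a placeholder, and the `imagenet` format name is assumed from the test's subject:

```python
# Grounded in the diff above: pass an explicit ImportErrorPolicy to
# import_from. The dataset path is hypothetical.
from datumaro.components.contexts.importer import ImportErrorPolicy
from datumaro.components.dataset import Dataset, StreamDataset

for dataset_cls in (Dataset, StreamDataset):
    dataset = dataset_cls.import_from(
        "path/to/imagenet_root",
        "imagenet",
        error_policy=ImportErrorPolicy(),
    )
    print(dataset_cls.__name__, dataset.is_stream)
```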
