diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ffef85abc..679dcfaceb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () - Enrich stack trace for better user experience when importing () +- Save and load hashkey for explorer + () ### Bug fixes - Fix Mapillary Vistas data format diff --git a/datumaro/cli/commands/explore.py b/datumaro/cli/commands/explore.py index 29d09b76c2..be85a4f17c 100644 --- a/datumaro/cli/commands/explore.py +++ b/datumaro/cli/commands/explore.py @@ -4,14 +4,13 @@ import argparse import logging as log +import os import os.path as osp - -import numpy as np +import shutil from datumaro.components.errors import ProjectNotFoundError from datumaro.components.explorer import Explorer -from datumaro.components.visualizer import Visualizer -from datumaro.util.image import save_image +from datumaro.util import str_to_bool from datumaro.util.scope import scope_add, scoped from ..util import MultilineFormatter @@ -42,10 +41,7 @@ def build_parser(parser_ctor=argparse.ArgumentParser): formatter_class=MultilineFormatter, ) - parser.add_argument( - "_positionals", nargs=argparse.REMAINDER, help=argparse.SUPPRESS - ) # workaround for -- eaten by positionals - parser.add_argument("target", nargs="+", default="project", help="Target dataset") + parser.add_argument("target", nargs="?", help="Target dataset") parser.add_argument( "-q", "--query", @@ -61,9 +57,25 @@ def build_parser(parser_ctor=argparse.ArgumentParser): help="Directory of the project to operate on (default: current dir)", ) parser.add_argument( - "-s", "--save", dest="save", default=True, help="Save explorer result as png" + "-s", + "--save", + action="store_true", + default=False, + help="Save explorer result files on explore_result folder", + ) + parser.add_argument( + "--stage", + type=str_to_bool, + default=True, + help=""" + Include this action as a project build step. + If true, this operation will be saved in the project + build tree, allowing to reproduce the resulting dataset later. + Applicable only to main project targets (i.e. 
data sources + and the 'project' target, but not intermediate stages) + (default: %(default)s) + """, ) - parser.set_defaults(command=explore_command) return parser @@ -75,7 +87,7 @@ def get_sensitive_args(): "target", "query", "topk", - "save", + "project_dir", ] } @@ -89,36 +101,61 @@ def explore_command(args): if args.project_dir: raise - dataset, _ = parse_full_revpath(args.target[0], project) + if args.target: + targets = [args.target] + else: + targets = list(project.working_tree.sources) + + source_datasets = [] + for target in targets: + target_dataset, _ = parse_full_revpath(target, project) + source_datasets.append(target_dataset) + + explorer_args = {"save_hashkey": True} + build_tree = project.working_tree.clone() + for target in targets: + build_tree.build_targets.add_explore_stage(target, params=explorer_args) - explorer = Explorer(dataset) + explorer = Explorer(*source_datasets) + for dataset in source_datasets: + dst_dir = dataset.data_path + dataset.save(dst_dir, save_media=True) + + if args.stage: + project.working_tree.config.update(build_tree.config) + project.working_tree.save() # Get query datasetitem through query path if osp.exists(args.query): - query_datasetitem = dataset.get_datasetitem_by_path(args.query) + query_datasetitem = None + for dataset in source_datasets: + try: + query_datasetitem = dataset.get_datasetitem_by_path(args.query) + except Exception: + continue + if not query_datasetitem: + break else: query_datasetitem = args.query results = explorer.explore_topk(query_datasetitem, args.topk) - subset_list = [] - id_list = [] result_path_list = [] - log.info("Most similar {} results of query in dataset".format(args.topk)) + log.info(f"Most similar {args.topk} results of query in dataset") for result in results: - subset_list.append(result.subset) - id_list.append(result.id) path = getattr(result.media, "path", None) result_path_list.append(path) - log.info("id: {} | subset: {} | path : {}".format(result.id, result.subset, path)) - - visualizer = Visualizer(dataset, figsize=(20, 20), alpha=0) - fig = visualizer.vis_gallery(id_list, subset_list) + log.info(f"id: {result.id} | subset: {result.subset} | path : {path}") if args.save: - fig.canvas.draw() - data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - save_image(osp.join("./explorer.png"), data, create_dir=True) + saved_result_path = osp.join(args.project_dir, "explore_result") + if osp.exists(saved_result_path): + shutil.rmtree(saved_result_path) + os.makedirs(saved_result_path) + for result in results: + saved_subset_path = osp.join(saved_result_path, result.subset) + if not osp.exists(saved_subset_path): + os.makedirs(saved_subset_path) + shutil.copyfile(path, osp.join(saved_subset_path, result.id + ".jpg")) return 0 diff --git a/datumaro/components/explorer.py b/datumaro/components/explorer.py index 5d898b6073..4079fc3ece 100644 --- a/datumaro/components/explorer.py +++ b/datumaro/components/explorer.py @@ -2,15 +2,15 @@ # # SPDX-License-Identifier: MIT -from typing import List, Optional, Union +from typing import List, Optional, Sequence, Union import numpy as np from datumaro.components.annotation import HashKey -from datumaro.components.dataset import IDataset +from datumaro.components.dataset import Dataset from datumaro.components.dataset_base import DatasetItem from datumaro.components.errors import MediaTypeError -from datumaro.components.media import Image +from datumaro.components.media import Image, MediaElement 
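# --- Illustrative sketch (not part of the patch) ----------------------------
# A minimal, hedged example of the flow that the reworked explore_command
# above performs through the Python API: build an Explorer over one or more
# datasets, persist the computed HashKey annotations by re-saving the
# sources, run a top-k query, and copy the hits into an explore_result
# folder. All paths, the query text and the topk value are placeholders.
import os
import os.path as osp
import shutil

from datumaro.components.dataset import Dataset
from datumaro.components.explorer import Explorer

datasets = [
    Dataset.import_from("path/to/source-1", "datumaro"),
    Dataset.import_from("path/to/source-2", "datumaro"),
]

# Explorer runs CLIP inference only for items that do not carry a HashKey
# annotation yet; previously saved keys are reused.
explorer = Explorer(*datasets)

# Re-saving the sources persists the hash keys (hash_key_meta/hash_keys.json),
# so later explore runs can skip inference.
for dataset in datasets:
    dataset.save(dataset.data_path, save_media=True)

# A plain string query is embedded with the CLIP text model.
results = explorer.explore_topk("a photo of a cat", topk=5)
for result in results:
    out_dir = osp.join("explore_result", result.subset)
    os.makedirs(out_dir, exist_ok=True)
    shutil.copyfile(result.media.path, osp.join(out_dir, result.id + ".jpg"))
# -----------------------------------------------------------------------------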
from datumaro.plugins.explorer import ExplorerLauncher @@ -25,10 +25,18 @@ def calculate_hamming(B1, B2): return distH +def select_uninferenced_dataset(dataset): + uninferenced_dataset = Dataset(media_type=MediaElement) + for item in dataset: + if not any(isinstance(annotation, HashKey) for annotation in item.annotations): + uninferenced_dataset.put(item) + return uninferenced_dataset + + class Explorer: def __init__( self, - dataset: IDataset, + *datasets: Sequence[Dataset], topk: int = 10, ) -> None: """ @@ -41,28 +49,50 @@ def __init__( topk: Number of images. """ - self._model = ExplorerLauncher(model_name="clip_visual_ViT-B_32") - self._text_model = ExplorerLauncher(model_name="clip_text_ViT-B_32") - inference = dataset.run_model(self._model, append_annotation=True) + self._model = None + self._text_model = None self._topk = topk - database_keys = [] item_list = [] - for item in inference: - for annotation in item.annotations: - if isinstance(annotation, HashKey): - try: - hash_key = annotation.hash_key[0] - hash_key = np.unpackbits(hash_key, axis=-1) - database_keys.append(hash_key) - item_list.append(item) - except Exception: - hash_key = None + datasets_to_infer = [select_uninferenced_dataset(dataset) for dataset in datasets] + datasets = self.compute_hash_key(datasets, datasets_to_infer) + + for dataset in datasets: + for item in dataset: + for annotation in item.annotations: + if isinstance(annotation, HashKey): + try: + hash_key = annotation.hash_key[0] + hash_key = np.unpackbits(hash_key, axis=-1) + database_keys.append(hash_key) + item_list.append(item) + except Exception: + continue self._database_keys = database_keys self._item_list = item_list + @property + def model(self): + if self._model is None: + self._model = ExplorerLauncher(model_name="clip_visual_ViT-B_32") + return self._model + + @property + def text_model(self): + if self._text_model is None: + self._text_model = ExplorerLauncher(model_name="clip_text_ViT-B_32") + return self._text_model + + def compute_hash_key(self, datasets, datasets_to_infer): + for dataset in datasets_to_infer: + if len(dataset) > 0: + dataset.run_model(self.model, append_annotation=True) + for dataset, dataset_to_infer in zip(datasets, datasets_to_infer): + dataset.update(dataset_to_infer) + return datasets + def explore_topk( self, query: Union[DatasetItem, str, List[DatasetItem], List[str]], @@ -91,7 +121,7 @@ def explore_topk( break query_hash_key_list.append(q_hash_key) elif isinstance(q, str): - q_hash_key = self._text_model.launch(q)[0][0].hash_key + q_hash_key = self.text_model.launch(q)[0][0].hash_key query_hash_key_list.append(q_hash_key) sims = np.zeros(shape=database_keys.shape[0] * len(query_hash_key_list)) @@ -131,7 +161,7 @@ def cal_ind(x): pass elif isinstance(query, str): - query_key = self._text_model.launch(query)[0][0].hash_key + query_key = self.text_model.launch(query)[0][0].hash_key else: raise MediaTypeError( "Unexpected media type of query '%s'. 
" diff --git a/datumaro/components/exporter.py b/datumaro/components/exporter.py index 3daf6d17e9..7244ba06cb 100644 --- a/datumaro/components/exporter.py +++ b/datumaro/components/exporter.py @@ -24,7 +24,7 @@ ) from datumaro.components.media import Image, PointCloud from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter -from datumaro.util.meta_file_util import save_meta_file +from datumaro.util.meta_file_util import save_hashkey_file, save_meta_file from datumaro.util.os_util import rmtree from datumaro.util.scope import on_error_do, scoped @@ -174,6 +174,7 @@ def __init__( image_ext: Optional[str] = None, default_image_ext: Optional[str] = None, save_dataset_meta: bool = False, + save_hashkey_meta: bool = False, ctx: Optional[ExportContext] = None, ): default_image_ext = default_image_ext or self.DEFAULT_IMAGE_EXT @@ -202,6 +203,7 @@ def __init__( self._save_dir = save_dir self._save_dataset_meta = save_dataset_meta + self._save_hashkey_meta = save_hashkey_meta # TODO: refactor this variable. # Can be used by a subclass to store the current patch info @@ -278,9 +280,14 @@ def _save_point_cloud(self, item=None, path=None, *, name=None, subdir=None, bas def _save_meta_file(self, path): save_meta_file(path, self._extractor.categories()) + def _save_hashkey_file(self, path): + save_hashkey_file(path, self._extractor) + # TODO: Currently, ExportContextComponent is introduced only for Datumaro and DatumaroBinary format # for multi-processing. We need to propagate this to everywhere in Datumaro 1.2.0 + + class ExportContextComponent: def __init__( self, diff --git a/datumaro/components/project.py b/datumaro/components/project.py index 8aa2d1a875..f9043a21a0 100644 --- a/datumaro/components/project.py +++ b/datumaro/components/project.py @@ -288,6 +288,7 @@ class BuildStageType(Enum): filter = auto() convert = auto() inference = auto() + explore = auto() class Pipeline: @@ -915,6 +916,18 @@ def add_convert_stage( name=name, ) + def add_explore_stage( + self, target: str, params: Optional[Dict] = None, name: Optional[str] = None + ): + return self.add_stage( + target, + { + "type": BuildStageType.explore.name, + "params": params or {}, + }, + name=name, + ) + @staticmethod def make_target_name(target: str, stage: Optional[str] = None) -> str: if stage: diff --git a/datumaro/plugins/data_formats/datumaro/base.py b/datumaro/plugins/data_formats/datumaro/base.py index 65ebdddb17..aaa6cfe249 100644 --- a/datumaro/plugins/data_formats/datumaro/base.py +++ b/datumaro/plugins/data_formats/datumaro/base.py @@ -5,12 +5,15 @@ import os.path as osp from typing import Optional +import numpy as np + from datumaro.components.annotation import ( AnnotationType, Bbox, Caption, Cuboid3d, Ellipse, + HashKey, Label, LabelCategories, MaskCategories, @@ -25,6 +28,7 @@ from datumaro.components.importer import ImportContext from datumaro.components.media import Image, MediaElement, PointCloud from datumaro.util import parse_json_file +from datumaro.util.meta_file_util import has_hashkey_file, parse_hashkey_file from datumaro.version import __version__ from .format import DATUMARO_FORMAT_VERSION, DatumaroPath @@ -99,7 +103,8 @@ def _load_impl(self, path: str) -> None: """Actual implementation of loading Datumaro format.""" self._infos = self._load_infos(self._parsed_anns) self._categories = self._load_categories(self._parsed_anns) - self._items = self._load_items(self._parsed_anns) + items = self._load_items(self._parsed_anns) + self._items = self._load_hash_key(items) @staticmethod 
def _load_infos(parsed): @@ -147,6 +152,7 @@ def _load_categories(parsed): def _load_items(self, parsed): items = [] + for item_desc in parsed["items"]: item_id = item_desc["id"] @@ -204,6 +210,16 @@ def _load_items(self, parsed): return items + def _load_hash_key(self, items): + if not has_hashkey_file(self._rootpath): + return items + + hashkey_dict = parse_hashkey_file(self._rootpath) + for item in items: + hash_key = hashkey_dict[item.subset + "/" + item.id] + item.annotations.append(HashKey(hash_key=np.asarray(hash_key, dtype=np.uint8))) + return items + @staticmethod def _load_annotations(item): parsed = item["annotations"] @@ -317,6 +333,8 @@ def _load_annotations(item): ) ) + elif ann_type == AnnotationType.hash_key: + continue else: raise NotImplementedError() diff --git a/datumaro/plugins/data_formats/datumaro/exporter.py b/datumaro/plugins/data_formats/datumaro/exporter.py index 61248f04d5..e0a92d9f54 100644 --- a/datumaro/plugins/data_formats/datumaro/exporter.py +++ b/datumaro/plugins/data_formats/datumaro/exporter.py @@ -20,6 +20,7 @@ Caption, Cuboid3d, Ellipse, + HashKey, Label, LabelCategories, Mask, @@ -174,6 +175,8 @@ def add_item(self, item: DatasetItem, *args, **kwargs): converted_ann = self._convert_cuboid_3d_object(ann) elif isinstance(ann, Ellipse): converted_ann = self._convert_ellipse_object(ann) + elif isinstance(ann, HashKey): + continue else: raise NotImplementedError() annotations.append(converted_ann) @@ -419,6 +422,8 @@ def apply(self, pool: Optional[Pool] = None, *args, **kwargs): subset = item.subset or DEFAULT_SUBSET_NAME writers[subset].add_item(item, pool) + self._check_hash_key_existence(item) + for subset, writer in writers.items(): if self._patch and subset in self._patch.updated_subsets and writer.is_empty(): if osp.isfile(writer.ann_file): @@ -428,6 +433,9 @@ def apply(self, pool: Optional[Pool] = None, *args, **kwargs): writer.write(pool) + if self._save_hashkey_meta: + self._save_hashkey_file(self._save_dir) + @classmethod def patch(cls, dataset, patch, save_dir, **kwargs): for subset in patch.updated_subsets: @@ -460,3 +468,11 @@ def patch(cls, dataset, patch, save_dir, **kwargs): related_images_path = osp.join(save_dir, cls.PATH_CLS.IMAGES_DIR, item.subset, item.id) if osp.isdir(related_images_path): shutil.rmtree(related_images_path) + + def _check_hash_key_existence(self, item): + if self._save_hashkey_meta: + return + for annotation in item.annotations: + if isinstance(annotation, HashKey): + self._save_hashkey_meta = True + return diff --git a/datumaro/plugins/explorer.py b/datumaro/plugins/explorer.py index 1aa85f3a1f..6ce5ee8087 100644 --- a/datumaro/plugins/explorer.py +++ b/datumaro/plugins/explorer.py @@ -62,16 +62,13 @@ def infer(self, inputs): prompt_text = f"a photo of a {inputs}" inputs = self._tokenize(prompt_text) inputs = {self._input_blob: inputs} - else: - if not inputs.any(): - # media.data is None case - return None - + elif isinstance(inputs, np.ndarray): # when processing a query key, we expand HWC to NHWC if len(inputs.shape) == 3: inputs = np.expand_dims(inputs, axis=0) - inputs = self.process_inputs(inputs) + else: + raise ValueError(f"inputs={inputs} is not allowed type.") results = self._net.infer(inputs) hash_key = self._compute_hash(results[self._output_blobs]) diff --git a/datumaro/plugins/openvino_plugin/samples/clip_visual_ViT-B_32_interp.py b/datumaro/plugins/openvino_plugin/samples/clip_visual_ViT-B_32_interp.py index e8de3f18f2..e673904be8 100644 --- 
a/datumaro/plugins/openvino_plugin/samples/clip_visual_ViT-B_32_interp.py +++ b/datumaro/plugins/openvino_plugin/samples/clip_visual_ViT-B_32_interp.py @@ -2,9 +2,12 @@ # # SPDX-License-Identifier: MIT +import os.path as osp + import numpy as np from datumaro.components.annotation import AnnotationType, HashKey, LabelCategories +from datumaro.util.samples import get_samples_path def normalize(inputs): @@ -26,4 +29,12 @@ def process_outputs(inputs, outputs): def get_categories(): label_categories = LabelCategories() + + openvino_plugin_samples_dir = get_samples_path() + imagenet_class_path = osp.join(openvino_plugin_samples_dir, "imagenet.class") + with open(imagenet_class_path, "r", encoding="utf-8") as file: + for line in file.readlines(): + label = line.strip() + label_categories.add(label) + return {AnnotationType.label: label_categories} diff --git a/datumaro/util/meta_file_util.py b/datumaro/util/meta_file_util.py index b10effdd2e..7c6925208e 100644 --- a/datumaro/util/meta_file_util.py +++ b/datumaro/util/meta_file_util.py @@ -1,14 +1,16 @@ -# Copyright (C) 2022 Intel Corporation +# Copyright (C) 2022-2023 Intel Corporation # # SPDX-License-Identifier: MIT +import os import os.path as osp from collections import OrderedDict -from datumaro.components.annotation import AnnotationType +from datumaro.components.annotation import AnnotationType, HashKey from datumaro.util import dump_json_file, find, parse_json_file DATASET_META_FILE = "dataset_meta.json" +DATASET_HASHKEY_FILE = "hash_keys.json" def is_meta_file(path): @@ -19,10 +21,21 @@ def has_meta_file(path): return osp.isfile(get_meta_file(path)) +def has_hashkey_file(path): + return osp.isfile(get_hashkey_file(path)) + + def get_meta_file(path): return osp.join(path, DATASET_META_FILE) +def get_hashkey_file(path): + hashkey_folder_path = osp.join(path, "hash_key_meta") + if not osp.exists(hashkey_folder_path): + os.makedirs(hashkey_folder_path) + return osp.join(hashkey_folder_path, DATASET_HASHKEY_FILE) + + def parse_meta_file(path): meta_file = path if osp.isdir(path): @@ -72,3 +85,44 @@ def save_meta_file(path, categories): meta_file = get_meta_file(path) dump_json_file(meta_file, dataset_meta, indent=True) + + +def parse_hashkey_file(path): + meta_file = path + if osp.isdir(path): + meta_file = get_hashkey_file(path) + + if not osp.exists(meta_file): + return None + + dataset_meta = parse_json_file(meta_file) + + hashkey_dict = OrderedDict() + for id_, hashkey in dataset_meta.get("hashkey", {}).items(): + hashkey_dict[id_] = hashkey + + return hashkey_dict + + +def save_hashkey_file(path, item_list): + dataset_hashkey = {} + + if osp.isdir(path): + meta_file = get_hashkey_file(path) + + hashkey_dict = parse_hashkey_file(path) + if not hashkey_dict: + hashkey_dict = {} + + for item in item_list: + item_id = item.id + item_subset = item.subset + for annotation in item.annotations: + if isinstance(annotation, HashKey): + hashkey = annotation.hash_key + break + hashkey_dict.update({item_subset + "/" + item_id: hashkey.tolist()}) + + dataset_hashkey["hashkey"] = hashkey_dict + + dump_json_file(meta_file, dataset_hashkey, indent=True) diff --git a/docs/source/docs/command-reference/context_free/explorer.md b/docs/source/docs/command-reference/context_free/explorer.md index 75a3ae9b28..1bbec624b1 100644 --- a/docs/source/docs/command-reference/context_free/explorer.md +++ b/docs/source/docs/command-reference/context_free/explorer.md @@ -4,16 +4,16 @@ This command explore similar data results for query on dataset. 
You can use your own query with any image file or text description, and you can even provide a list of queries. The result includes the top-k most similar data in the target dataset, and a visualization of the result is saved as a PNG file. This feature is meant to help users understand dataset properties more easily.

-Explorer is a feature that operates on hash basis. Once you put dataset that use as a datasetbase, Explorer calculates hash for every datasetitems in the dataset. Currently, hash of each data is computed based on the CLIP ([article](https://arxiv.org/abs/2103.00020)), which could support both image and text modality. Supported model format is Openvino IR and those are uploaded in [openvinotoolkit storage](https://storage.openvinotoolkit.org/repositories/datumaro/models/). When you call Explorer class, hash of whole dataset is started to compute. For database, we use hash for image of each datasetitem. Through CLIP, we extracted feature of image, converted it to binary value and pack the elements into bits. Each hash information is saved as `HashKey` in annotations. Hence, once you call Explorer for the dataset, all datasetitems in dataset have `HashKey` in each annotations.
+Explorer operates on a hash basis. Once you provide a dataset to use as the database, Explorer calculates a hash for every dataset item in it. Currently, the hash of each item is computed with CLIP ([article](https://arxiv.org/abs/2103.00020)), which supports both the image and text modalities. The supported model format is OpenVINO IR, and the models are uploaded to [openvinotoolkit storage](https://storage.openvinotoolkit.org/repositories/datumaro/models/). When the Explorer class is called, it starts computing the hash of the whole dataset. For the database, we use the hash of each dataset item's image: CLIP extracts the image feature, which is converted to binary values and packed into bits. Each hash is saved as a `HashKey` annotation, so once you call Explorer on a dataset, every dataset item carries a `HashKey` in its annotations.

To explore similar data in a dataset, you first need to set a query. A query can be an image, a text description, a list of images, a list of texts, or a mixed list of images and texts. The query does not need to be an image that exists in the dataset; you can use any data for which you want to find similar items. You also need to set top-k, i.e. how many similar items you want to retrieve. The default value for top-k is 50, so set it explicitly if you want fewer results. For a single query, we compute the hamming distance between the query hash and the hash of every item in the dataset, sort those distances, and select the top-k items with the shortest distance. For a list query, we repeat the distance computation for each query and select the top-k items across the whole dataset.

-The command can be applied to a dataset. And if you want to use multiple dataset as database, you could use merged dataset. The current project (`-p/--project`) is also used a context for plugins, so it can be useful for dataset paths having custom formats. When not specified, the current project's working tree is used. To save visualized result (`-s/--save`) is turned on as default. This visualized result is based on [Visualizer](../../jupyter_notebook_examples/visualizer).
+The command can be applied to a dataset. If you want to use multiple datasets as the database, you can use a merged dataset. The current project (`-p/--project`) is also used as a context for plugins, so it can be useful for dataset paths having custom formats.
When not specified, the current project's working tree is used. To save visualized result (`-s/--save`) is turned off as default. This visualized result is based on [Visualizer](../../jupyter_notebook_examples/visualizer). Usage: ```console -datum explore [-q or ] - [-topk TOPK] [-p PROJECT_DIR] [-s SAVE] target [target ...] +datum explore [target] -q or + [-topk TOPK] [-p PROJECT_DIR] [-s SAVE] [--stage STAGE] ``` Parameters: @@ -22,7 +22,12 @@ Parameters: - `-q, --query` (string) - Image path or text to use as query. - `-topk` (int) - Number how much you want to find similar data. - `-p, --project` (string) - Directory of the project to operate on (default: current directory). -- `-s, --save` (bool) - Save visualized result of similar dataset. +- `-s, --save` (bool) - Save explorer result files on explore_result folder. +- `--stage` (bool) - Include this action as a project build step. + If true, this operation will be saved in the project + build tree, allowing to reproduce the resulting dataset later. + Applicable only to main project targets (i.e. data sources + and the `project` target, but not intermediate stages). Enabled by default. Examples: - Explore top10 similar images of image query diff --git a/tests/integration/cli/test_explore.py b/tests/integration/cli/test_explore.py index 60e1f85b1d..6b97d196a8 100644 --- a/tests/integration/cli/test_explore.py +++ b/tests/integration/cli/test_explore.py @@ -1,14 +1,16 @@ import os.path as osp import platform +from glob import glob from unittest import TestCase, skipIf import numpy as np +from datumaro.cli.util.project import load_project from datumaro.components.annotation import Caption, Label from datumaro.components.dataset import Dataset from datumaro.components.dataset_base import DatasetItem from datumaro.components.media import Image -from datumaro.components.project import Project +from datumaro.util.meta_file_util import has_hashkey_file, parse_hashkey_file from datumaro.util.scope import scope_add, scoped from ...requirements import Requirements, mark_requirement @@ -24,26 +26,57 @@ def test_dataset(self): train_img[2, :] = 0 test_img = np.full((5, 5, 3), 0, dtype=np.uint8) test_img[2, :] = 255 - train_Image = Image.from_numpy(data=train_img) + dataset = Dataset.from_iterable( [ DatasetItem( id=1, subset="train", - media=train_Image, - annotations=[Label(1, id=1), Caption("cat")], + media=Image.from_numpy(data=train_img), + annotations=[Label(0), Caption("cat")], ), DatasetItem( id=2, subset="train", - media=train_Image, - annotations=[Label(1, id=1), Caption("cat")], + media=Image.from_numpy(data=train_img), + annotations=[Label(0), Caption("cat")], ), DatasetItem( id=3, subset="test", media=Image.from_numpy(data=test_img), - annotations=[Label(2, id=2), Caption("dog")], + annotations=[Label(1), Caption("dog")], + ), + ] + ) + return dataset + + @property + def test_dataset2(self): + train_img = np.full((5, 5, 3), 255, dtype=np.uint8) + train_img[2, :] = 0 + test_img = np.full((5, 5, 3), 0, dtype=np.uint8) + test_img[2, :] = 255 + + dataset = Dataset.from_iterable( + [ + DatasetItem( + id=4, + subset="train", + media=Image.from_numpy(data=train_img), + annotations=[Label(0), Caption("cat")], + ), + DatasetItem( + id=5, + subset="train", + media=Image.from_numpy(data=train_img), + annotations=[Label(0), Caption("cat")], + ), + DatasetItem( + id=6, + subset="test", + media=Image.from_numpy(data=test_img), + annotations=[Label(1), Caption("dog")], ), ] ) @@ -56,16 +89,124 @@ def test_dataset(self): ) 
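# --- Illustrative sketch (not part of the patch) ----------------------------
# The tests below exercise the save/load hashkey round trip end to end. As a
# quick illustration of the on-disk format they rely on, this hedged snippet
# reads the keys back from a source directory; "path/to/source" is a
# placeholder (e.g. a project's source-1 data dir after an explore run).
import numpy as np

from datumaro.util.meta_file_util import has_hashkey_file, parse_hashkey_file

src_dir = "path/to/source"
if has_hashkey_file(src_dir):  # hash_key_meta/hash_keys.json exists
    # parse_hashkey_file maps "<subset>/<id>" to the packed uint8 hash key,
    # stored as a plain list in JSON.
    hashkeys = parse_hashkey_file(src_dir)
    for key, value in hashkeys.items():
        packed = np.asarray(value, dtype=np.uint8)
        bits = np.unpackbits(packed)  # the bit vector Explorer compares
        print(key, packed.size, bits.size)
# -----------------------------------------------------------------------------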
@mark_requirement(Requirements.DATUM_GENERAL_REQ) @scoped - def test_can_explore_dataset(self): + def test_can_explore_dataset_w_target(self): test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") dataset_url = osp.join(test_dir, "dataset") + train_image_path = osp.join(test_dir, "train", "1.jpg") self.test_dataset.export(dataset_url, "datumaro", save_media=True) + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset_url) + run( + self, + "explore", + "source-1", + "-q", + train_image_path, + "-topk", + "2", + "-p", + proj_dir, + "-s", + ) + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_explore_dataset_wo_target(self): + test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") + dataset_url = osp.join(test_dir, "dataset") + train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset_url, "datumaro", save_media=True) + + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset_url) + run(self, "explore", "-q", train_image_path, "-topk", "2", "-p", proj_dir, "-s") + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_explore_added_dataset_w_target(self): + test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") + dataset1_url = osp.join(test_dir, "dataset1") train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) + + run( + self, + "explore", + "source-1", + "-q", + train_image_path, + "-topk", + "2", + "-p", + proj_dir, + ) + + dataset2_url = osp.join(proj_dir, "dataset2") + self.test_dataset2.save(dataset2_url, save_media=True) + run(self, "project", "add", "-p", proj_dir, "-f", "datumaro", dataset2_url) + run( + self, + "explore", + "source-1", + "-q", + train_image_path, + "-topk", + "2", + "-p", + proj_dir, + "-s", + ) + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_explore_added_dataset_wo_target(self): + test_dir = scope_add(TestDir()) proj_dir = osp.join(test_dir, "proj") - with Project.init(proj_dir) as project: - 
project.import_source("source-1", dataset_url, "datumaro", no_cache=True) + dataset1_url = osp.join(test_dir, "dataset1") + train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) run( self, @@ -74,5 +215,238 @@ def test_can_explore_dataset(self): train_image_path, "-topk", "2", + "-p", + proj_dir, + ) + + dataset2_url = osp.join(proj_dir, "source-2") + self.test_dataset2.save(dataset2_url, save_media=True) + run(self, "project", "add", "-p", proj_dir, "-f", "datumaro", dataset2_url) + run( + self, + "explore", + "-q", + train_image_path, + "-topk", + "4", + "-p", proj_dir, + "-s", ) + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_explore_merged_dataset_w_target(self): + test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") + dataset1_url = osp.join(test_dir, "dataset1") + train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) + run(self, "explore", "-q", train_image_path, "-topk", "2", "-p", proj_dir, "source-1") + + dataset2_url = osp.join(test_dir, "dataset2") + self.test_dataset2.save(dataset2_url, save_media=True) + result_dir = osp.join(test_dir, "result") + + run( + self, + "merge", + "-f", + "datumaro", + "-o", + result_dir, + dataset1_url, + dataset2_url, + ) + run(self, "project", "import", "-n", "result", "-p", proj_dir, "-f", "datumaro", result_dir) + run( + self, + "explore", + "result", + "-q", + train_image_path, + "-topk", + "4", + "-p", + proj_dir, + "-s", + ) + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_explore_merged_dataset_wo_target(self): + test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") + dataset1_url = osp.join(test_dir, "dataset1") + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + + train_image_path = osp.join(test_dir, "train", "1.jpg") + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) + run(self, "explore", "-q", train_image_path, "-topk", "2", "-p", proj_dir) + + dataset2_url = osp.join(test_dir, "dataset2") + self.test_dataset2.save(dataset2_url, save_media=True) + result_dir = osp.join(test_dir, "result") + + run( + self, + "merge", + "-f", + "datumaro", + "-o", + result_dir, + dataset1_url, + dataset2_url, + ) + run(self, "project", "import", "-n", "result", "-p", proj_dir, "-f", 
"datumaro", result_dir) + run( + self, + "explore", + "-q", + train_image_path, + "-topk", + "4", + "-p", + proj_dir, + "-s", + ) + + saved_result_path = osp.join(proj_dir, "explore_result") + results = glob(osp.join(saved_result_path, "**", "*"), recursive=True) + + self.assertIn(osp.join(saved_result_path, "train", "1.jpg"), results) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_can_checkout_load_hashkey(self): + test_dir = scope_add(TestDir()) + proj_dir = osp.join(test_dir, "proj") + dataset1_url = osp.join(test_dir, "dataset1") + train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) + run(self, "explore", "source-1", "-q", train_image_path, "-topk", "2", "-p", proj_dir) + run(self, "project", "commit", "-p", proj_dir, "-m", "commit1") + + commit1_proj = load_project(proj_dir) + commit1_srcs = list(commit1_proj.working_tree.config.sources.keys()) + self.assertTrue(len(commit1_srcs), 1) + src_dir = commit1_proj.source_data_dir(commit1_srcs[0]) + self.assertTrue(has_hashkey_file(src_dir)) + commit1_hashkey = parse_hashkey_file(src_dir) + + # check stage added + new_tree = commit1_proj.working_tree.clone() + stage = new_tree.build_targets.add_explore_stage("source-1", params={"save_hashkey": True}) + self.assertTrue(stage in new_tree.build_targets) + + dataset2_url = osp.join(test_dir, "dataset2") + self.test_dataset2.save(dataset2_url, save_media=True) + result_dir = osp.join(test_dir, "result") + + run( + self, + "merge", + "-f", + "datumaro", + "-o", + result_dir, + "-p", + proj_dir, + dataset2_url + ":datumaro", + ) + run(self, "project", "import", "-n", "result", "-p", proj_dir, "-f", "datumaro", result_dir) + run( + self, + "explore", + "result", + "-q", + train_image_path, + "-topk", + "2", + "-p", + proj_dir, + ) + + run(self, "project", "commit", "-p", proj_dir, "-m", "commit2") + commit2_proj = load_project(proj_dir) + commit2_srcs = list(commit2_proj.working_tree.config.sources.keys()) + self.assertTrue(len(commit2_srcs), 2) + + src_dir = commit2_proj.source_data_dir("result") + self.assertTrue(has_hashkey_file(src_dir)) + commit2_hashkey = parse_hashkey_file(src_dir) + + self.assertTrue(len(commit2_hashkey) > len(commit1_hashkey)) + + run(self, "project", "checkout", "-p", proj_dir, "HEAD~1") + checkout_proj = load_project(proj_dir) + checkout_srcs = list(checkout_proj.working_tree.config.sources.keys()) + self.assertTrue(len(checkout_srcs), 1) + src_dir = checkout_proj.source_data_dir(checkout_srcs[0]) + checkout_hashkey = parse_hashkey_file(src_dir) + + self.assertEqual(len(checkout_hashkey), len(commit1_hashkey)) + self.assertEqual(checkout_hashkey["train/1"], commit1_hashkey["train/1"]) + self.assertEqual(checkout_hashkey["train/2"], commit1_hashkey["train/2"]) + self.assertEqual(checkout_hashkey["test/3"], commit1_hashkey["test/3"]) + + @skipIf( + platform.system() == "Darwin", + "Segmentation fault only occurs on MacOS: " + "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", + ) + @mark_requirement(Requirements.DATUM_GENERAL_REQ) + @scoped + def test_parse_hashkey(self): + test_dir = scope_add(TestDir()) + proj_dir = 
osp.join(test_dir, "proj") + dataset1_url = osp.join(test_dir, "dataset1") + train_image_path = osp.join(test_dir, "train", "1.jpg") + + self.test_dataset.export(dataset1_url, "datumaro", save_media=True) + run(self, "project", "create", "-o", proj_dir) + run(self, "project", "import", "-p", proj_dir, "-f", "datumaro", dataset1_url) + run(self, "explore", "-q", train_image_path, "-topk", "2", "-p", proj_dir) + + proj = load_project(proj_dir) + srcs = list(proj.working_tree.config.sources.keys()) + src_dir = proj.source_data_dir(srcs[0]) + hashkey = parse_hashkey_file(src_dir) + + for item in self.test_dataset: + item_id = item.id + item_subset = item.subset + self.assertIsNotNone(hashkey[item_subset + "/" + item_id]) diff --git a/tests/unit/test_explorer.py b/tests/unit/test_explorer.py index db1cac4acf..1ef7152d1a 100644 --- a/tests/unit/test_explorer.py +++ b/tests/unit/test_explorer.py @@ -135,43 +135,14 @@ def test_explore_txt_query(self): """ with TestDir() as test_dir: converter = partial(DatumaroExporter.convert, save_media=True) - converter(self.test_dataset_black_white, test_dir) + converter(self.test_dataset, test_dir) imported_dataset = Dataset.import_from(test_dir, "datumaro") explorer = Explorer(imported_dataset) - result = explorer.explore_topk("a photo of white background", topk=2) + result = explorer.explore_topk( + "a photo of a upper white and bottom black background", topk=2 + ) self.assertEqual(result[0].subset, result[1].subset) - @skipIf( - platform.system() == "Darwin", - "Segmentation fault only occurs on MacOS: " - "https://github.com/openvinotoolkit/datumaro/actions/runs/4202399957/jobs/7324077250", - ) - @mark_requirement(Requirements.DATUM_GENERAL_REQ) - def test_data_none(self): - """ - Description: - Check that data does not have any media. - - Input data: - Dataset whose data is None. - - Expected results: - Raise ValueError as data should have hash_key. - - Steps - 1. Import datumaro dataset which contain None media data. - 2. Set Explorer and try explore_topk to find similar media of query. - 3. Check whether ValueError raised properly or not. - """ - imported_dataset = Dataset.import_from("./tests/assets/datumaro_dataset/legacy", "datumaro") - for i, item in enumerate(imported_dataset): - if i == 0: - query = item - explorer = Explorer(imported_dataset) - with self.assertRaises(ValueError) as capture: - explorer.explore_topk(query, topk=2) - self.assertEqual("Database should have hash_key", str(capture.exception)) - @skipIf( platform.system() == "Darwin", "Segmentation fault only occurs on MacOS: "
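# --- Illustrative sketch (not part of the patch) ----------------------------
# The Explorer ranks database items by hamming distance over the unpacked
# hash-key bits, as described in the documentation above. calculate_hamming's
# exact formulation is elided in this diff, so the helper below is only an
# assumed equivalent; the 512-bit key length and the random data are
# arbitrary placeholders.
import numpy as np


def hamming_distance(query_bits: np.ndarray, database_bits: np.ndarray) -> np.ndarray:
    # query_bits: (n_bits,) vector of 0/1; database_bits: (n_items, n_bits)
    return np.count_nonzero(database_bits != query_bits, axis=1)


# Packed uint8 keys (as stored in hash_keys.json) are unpacked to bits before
# comparison, mirroring the np.unpackbits call in Explorer.__init__.
rng = np.random.default_rng(0)
database = rng.integers(0, 2, size=(100, 512), dtype=np.uint8)
query = rng.integers(0, 2, size=512, dtype=np.uint8)

distances = hamming_distance(query, database)
topk = 5
topk_indices = np.argsort(distances)[:topk]  # smallest distance = most similar
print(topk_indices, distances[topk_indices])
# -----------------------------------------------------------------------------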