Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Python import to make CLI entrypoint faster #1182

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## \[Unreleased\]
### Enhancements
- Optimize Python import to make CLI entrypoint faster
(<https://github.com/openvinotoolkit/datumaro/pull/1182>)

## 16/11/2023 - Release 1.5.1
### Enhancements
- Enhance Datumaro data format stream importer performance
(<https://github.com/openvinotoolkit/datumaro/pull/1153>)
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ exclude_lines = [
# Don't complain if non-runnable code isn't run:
'if 0:',
'if __name__ == .__main__.:',

# Don't complain about code that only runs under type checking:
'if TYPE_CHECKING:'
]

# don't fail on the code that can be found
Expand Down
3 changes: 2 additions & 1 deletion src/datumaro/cli/commands/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from shutil import rmtree

from datumaro.cli.util.errors import CliException
from datumaro.plugins.synthetic_data import FractalImageGenerator
from datumaro.util.definitions import get_datumaro_cache_dir

from ..util import MultilineFormatter
Expand Down Expand Up @@ -76,6 +75,8 @@ def get_sensitive_args():


def generate_command(args):
from datumaro.plugins.synthetic_data import FractalImageGenerator

log.info("Generating dataset...")
output_dir = args.output_dir

Expand Down
22 changes: 13 additions & 9 deletions src/datumaro/cli/util/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,27 @@
from collections import Counter
from enum import Enum, auto
from itertools import zip_longest
from typing import Union
from typing import TYPE_CHECKING, Union

import cv2
import numpy as np

from datumaro.components.media import Image

with warnings.catch_warnings():
warnings.simplefilter("ignore")
import tensorboardX as tb

from datumaro.components.annotation import AnnotationType, LabelCategories
from datumaro.components.dataset import IDataset
from datumaro.components.media import Image
from datumaro.util import parse_str_enum_value
from datumaro.util.image import save_image
from datumaro.util.import_util import lazy_import

if TYPE_CHECKING:
import matplotlib.pyplot as plt

with warnings.catch_warnings():
warnings.simplefilter("ignore")
import tensorboardX as tb
else:
tb = lazy_import("tensorboardX")
plt = lazy_import("matplotlib.pyplot")


class DistanceCompareVisualizer:
Expand Down Expand Up @@ -291,8 +297,6 @@ def save_as_tensorboard(self, img, name):
self._file_writer.add_image(name, img)

def save_conf_matrix(self, conf_matrix, filename):
import matplotlib.pyplot as plt

def _get_class_map(label_categories):
classes = None
if label_categories is not None:
Expand Down
14 changes: 10 additions & 4 deletions src/datumaro/components/algorithms/hash_key_inference/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@
#
# SPDX-License-Identifier: MIT

from typing import Sequence
from typing import TYPE_CHECKING, Sequence

from datumaro.components.dataset import Dataset
from datumaro.plugins.explorer import ExplorerLauncher

if TYPE_CHECKING:
import datumaro.plugins.explorer as explorer
else:
from datumaro.util.import_util import lazy_import

explorer = lazy_import("datumaro.plugins.explorer")


class HashInference:
Expand All @@ -15,13 +21,13 @@ def __init__(self, *datasets: Sequence[Dataset]) -> None:
@property
def model(self):
if self._model is None:
self._model = ExplorerLauncher(model_name="clip_visual_ViT-B_32")
self._model = explorer.ExplorerLauncher(model_name="clip_visual_ViT-B_32")
return self._model

@property
def text_model(self):
if self._text_model is None:
self._text_model = ExplorerLauncher(model_name="clip_text_ViT-B_32")
self._text_model = explorer.ExplorerLauncher(model_name="clip_text_ViT-B_32")
return self._text_model

def _compute_hash_key(self, datasets, datasets_to_infer):
Expand Down
19 changes: 16 additions & 3 deletions src/datumaro/components/algorithms/hash_key_inference/prune.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,10 @@
import math
import random
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import numpy as np
from sklearn.cluster import KMeans

import datumaro.plugins.ndr as ndr
from datumaro.components.algorithms.hash_key_inference.base import HashInference
from datumaro.components.algorithms.hash_key_inference.hashkey_util import (
calculate_hamming,
Expand All @@ -23,6 +21,13 @@
from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem

if TYPE_CHECKING:
import datumaro.plugins.ndr as ndr
else:
from datumaro.util.import_util import lazy_import

ndr = lazy_import("datumaro.plugins.ndr")


def match_num_item_for_cluster(ratio, dataset_len, cluster_num_item_list):
total_num_selected_item = math.ceil(dataset_len * ratio)
Expand Down Expand Up @@ -94,6 +99,8 @@ class Centroid(PruneBase):
"""

def base(self, ratio, num_centers, labels, database_keys, item_list, source):
from sklearn.cluster import KMeans

num_selected_centers = math.ceil(len(item_list) * ratio)
kmeans = KMeans(n_clusters=num_selected_centers, random_state=0)
clusters = kmeans.fit_predict(database_keys)
Expand Down Expand Up @@ -124,6 +131,8 @@ class ClusteredRandom(PruneBase):
"""

def base(self, ratio, num_centers, labels, database_keys, item_list, source):
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=num_centers, random_state=0)
clusters = kmeans.fit_predict(database_keys)
cluster_ids, cluster_num_item_list = np.unique(clusters, return_counts=True)
Expand All @@ -148,6 +157,8 @@ class QueryClust(PruneBase):
"""

def base(self, ratio, num_centers, labels, database_keys, item_list, source):
from sklearn.cluster import KMeans

center_dict = {i: None for i in range(1, num_centers)}
for item in item_list:
for anno in item.annotations:
Expand Down Expand Up @@ -199,6 +210,8 @@ class Entropy(PruneBase):
"""

def base(self, ratio, num_centers, labels, database_keys, item_list, source):
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=num_centers, random_state=0)
clusters = kmeans.fit_predict(database_keys)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@

from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Sequence, Tuple

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.figure import Figure
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple

from datumaro.components.annotation import AnnotationType, LabelCategories
from datumaro.components.dataset_base import IDataset
from datumaro.errors import DatasetError

if TYPE_CHECKING:
from matplotlib.figure import Figure
from pandas import DataFrame, Series
else:
DataFrame, Series, Figure = None, None, None


__all__ = ["LossDynamicsAnalyzer", "NoisyLabelCandidate"]


Expand Down Expand Up @@ -92,30 +95,32 @@ def alpha(self) -> float:
return self._alpha

@property
def mean_loss_dyns(self) -> pd.Series:
def mean_loss_dyns(self) -> Series:
"""Pandas Series object obtained by averaging all EMA loss dynamics statistics"""
return self._mean_loss_dyns

@property
def mean_loss_dyns_per_label(self) -> Dict[LabelCategories.Category, pd.Series]:
def mean_loss_dyns_per_label(self) -> Dict[LabelCategories.Category, Series]:
"""A dictionary of Pandas Series object obtained
by averaging EMA loss dynamics statistics according to the label category"""
label_categories = self._dataset.categories()[AnnotationType.label]
return {label_categories[k]: v for k, v in self._mean_loss_dyns_per_label.items()}

@property
def ema_dataframe(self) -> pd.DataFrame:
def ema_dataframe(self) -> DataFrame:
"""Pandas DataFrame including full EMA loss dynamics statistics."""
return self._df

@staticmethod
def _parse_to_dataframe(
dataset: IDataset, ema_alpha: float = 0.001, tracking_loss_type: Optional[str] = None
) -> pd.DataFrame:
) -> DataFrame:
"""Parse loss dynamics statistics from Datumaro dataset to Pandas DataFrame."""
key = (
"loss_dynamics" if tracking_loss_type is None else f"loss_dynamics_{tracking_loss_type}"
)
import pandas as pd

ema_loss_dyns_list = []
for item in dataset:
for ann in item.annotations:
Expand Down Expand Up @@ -167,6 +172,8 @@ def plot_ema_loss_dynamics(
figsize: Tuple[int, int] = (4, 3),
**kwargs,
) -> Figure:
import matplotlib.pyplot as plt

if mode == "mean":
cands_by_label_id = {None: candidates}
elif mode == "label_mean":
Expand Down
15 changes: 13 additions & 2 deletions src/datumaro/components/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import logging as log
import os.path as osp
from functools import partial
from inspect import isclass
from inspect import getmodule, isclass
from typing import (
Callable,
Dict,
Expand Down Expand Up @@ -202,7 +202,18 @@ def _get_plugin_exports(cls, module, types):
continue
exports.append(getattr(module, symbol))

exports = [s for s in exports if isclass(s) and issubclass(s, types) and s not in types]
exports = [
s
for s in exports
if isclass(s)
and issubclass(s, types)
and s not in types
and (
getmodule(s)
is None # Custom plugin (in the Datumaro project) can be a single file and have no module
or not getmodule(s).__package__.startswith("datumaro.components")
)
]

return exports

Expand Down
39 changes: 28 additions & 11 deletions src/datumaro/components/extractor_tfds.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,21 @@
import itertools
import logging as log
import os.path as osp
from importlib.util import find_spec
from types import SimpleNamespace as namespace
from typing import Any, Callable, Dict, Iterator, Mapping, Optional, Sequence, Tuple, Type, Union
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterator,
Mapping,
Optional,
Sequence,
Tuple,
Type,
Union,
)

import attrs
import numpy as np
Expand All @@ -19,17 +32,21 @@
from datumaro.components.media import Image, MediaElement
from datumaro.util.tf_util import import_tf

try:
tf = import_tf()
import tensorflow_datasets as tfds
except ImportError:
log.debug(
"Unable to import TensorFlow or TensorFlow Datasets. "
"Dataset downloading via TFDS is disabled."
)
TFDS_EXTRACTOR_AVAILABLE = False
TFDS_EXTRACTOR_AVAILABLE = True if find_spec("tensorflow_datasets") is not None else False

if TYPE_CHECKING:
try:
tf = import_tf()
import tensorflow_datasets as tfds
except ImportError:
log.debug(
"Unable to import TensorFlow or TensorFlow Datasets. "
"Dataset downloading via TFDS is disabled."
)
else:
TFDS_EXTRACTOR_AVAILABLE = True
from datumaro.util.import_util import lazy_import

tfds = lazy_import("tensorflow_datasets")


@frozen(kw_only=True)
Expand Down
12 changes: 7 additions & 5 deletions src/datumaro/components/lazy_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
#
# SPDX-License-Identifier: MIT

import logging as log
from abc import ABC, abstractclassmethod
from importlib import import_module
from importlib.util import find_spec
from typing import List, Optional, Sequence, Type, Union

from datumaro.components.dataset_base import DatasetBase
Expand Down Expand Up @@ -56,11 +58,11 @@ def get_lazy_plugin(
plugin_type: str,
extra_deps: List[str] = [],
) -> Optional[LazyPlugin]:
try:
for extra_dep in extra_deps:
import_module(extra_dep)
except ImportError:
return None
for extra_dep in extra_deps:
spec = find_spec(extra_dep)
if spec is None:
log.debug(f"Cannot import extra dep={extra_dep} for plugin_name={plugin_name}.")
return None

plugin_type_cls = STR_TO_PLUGIN_TYPES[plugin_type]

Expand Down
10 changes: 9 additions & 1 deletion src/datumaro/components/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from copy import deepcopy
from enum import IntEnum
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Expand All @@ -29,7 +30,6 @@

import cv2
import numpy as np
import pandas as pd

from datumaro.components.crypter import NULL_CRYPTER, Crypter
from datumaro.components.errors import DatumaroError, MediaShapeError
Expand All @@ -42,6 +42,14 @@
save_image,
)

if TYPE_CHECKING:
import pandas as pd
else:
from datumaro.util.import_util import lazy_import

pd = lazy_import("pandas")


AnyData = TypeVar("AnyData", bytes, np.ndarray)


Expand Down
Loading