diff --git a/graphistry/Engine.py b/graphistry/Engine.py
index 8bc2bc2b1d..6255f48b3f 100644
--- a/graphistry/Engine.py
+++ b/graphistry/Engine.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from typing import Any, Optional, Union
 from enum import Enum
-
+from .dep_manager import deps
 
 class Engine(Enum):
     PANDAS : str = 'pandas'
@@ -21,17 +21,6 @@ class EngineAbstract(Enum):
 DataframeLocalLike = Any  # pdf, cudf
 GraphistryLke = Any
 
-#TODO use new importer when it lands (this is copied from umap_utils)
-def lazy_cudf_import_has_dependancy():
-    try:
-        import warnings
-
-        warnings.filterwarnings("ignore")
-        import cudf  # type: ignore
-
-        return True, "ok", cudf
-    except ModuleNotFoundError as e:
-        return False, e, None
 
 def resolve_engine(
     engine: Union[EngineAbstract, str],
@@ -58,15 +47,14 @@ def resolve_engine(
         if isinstance(g_or_df, pd.DataFrame):
             return Engine.PANDAS
 
-        has_cudf_dependancy_, _, _ = lazy_cudf_import_has_dependancy()
-        if has_cudf_dependancy_:
-            import cudf
+        cudf = deps.cudf
+        if cudf:
             if isinstance(g_or_df, cudf.DataFrame):
                 return Engine.CUDF
             raise ValueError(f'Expected cudf dataframe, got: {type(g_or_df)}')
 
-    has_cudf_dependancy_, _, _ = lazy_cudf_import_has_dependancy()
-    if has_cudf_dependancy_:
+    cudf = deps.cudf
+    if cudf:
         return Engine.CUDF
     return Engine.PANDAS
@@ -86,7 +74,7 @@ def df_to_engine(df, engine: Engine):
         else:
             return df.to_pandas()
     elif engine == Engine.CUDF:
-        import cudf
+        cudf = deps.cudf
         if isinstance(df, cudf.DataFrame):
             return df
         else:
@@ -97,7 +85,7 @@ def df_concat(engine: Engine):
     if engine == Engine.PANDAS:
         return pd.concat
     elif engine == Engine.CUDF:
-        import cudf
+        cudf = deps.cudf
         return cudf.concat
     raise NotImplementedError("Only pandas/cudf supported")
@@ -105,7 +93,7 @@ def df_cons(engine: Engine):
     if engine == Engine.PANDAS:
         return pd.DataFrame
     elif engine == Engine.CUDF:
-        import cudf
+        cudf = deps.cudf
         return cudf.DataFrame
     raise NotImplementedError("Only pandas/cudf supported")
@@ -113,6 +101,6 @@ def s_cons(engine: Engine):
     if engine == Engine.PANDAS:
         return pd.Series
     elif engine == Engine.CUDF:
-        import cudf
+        cudf = deps.cudf
         return cudf.Series
     raise NotImplementedError("Only pandas/cudf supported")
diff --git a/graphistry/compute/cluster.py b/graphistry/compute/cluster.py
index 585b17acd8..f5f43d389d 100644
--- a/graphistry/compute/cluster.py
+++ b/graphistry/compute/cluster.py
@@ -10,6 +10,7 @@
 from graphistry.constants import CUML, UMAP_LEARN, DBSCAN  # noqa type: ignore
 from graphistry.features import ModelDict
 from graphistry.feature_utils import get_matrix_by_column_parts
+from graphistry.dep_manager import deps
 
 logger = logging.getLogger("compute.cluster")
@@ -21,36 +22,12 @@
 DBSCANEngineConcrete = Literal["cuml", "umap_learn"]
 DBSCANEngine = Literal[DBSCANEngineConcrete, "auto"]
 
-
-def lazy_dbscan_import_has_dependency():
-    has_min_dependency = True
-    DBSCAN = None
-    try:
-        from sklearn.cluster import DBSCAN
-    except ImportError:
-        has_min_dependency = False
-        logger.info("Please install sklearn for CPU DBSCAN")
-
-    has_cuml_dependency = True
+if deps.sklearn:
+    from sklearn.cluster import DBSCAN as dbscan  # CPU DBSCAN lives in sklearn
+else:
+    dbscan = None
+if deps.cuml:
+    from cuml import DBSCAN as cuDBSCAN
+else:
     cuDBSCAN = None
-    try:
-        from cuml import DBSCAN as cuDBSCAN
-    except ImportError:
-        has_cuml_dependency = False
-        logger.info("Please install cuml for GPU DBSCAN")
-
-    return has_min_dependency, DBSCAN, has_cuml_dependency, cuDBSCAN
-
-def lazy_cudf_import_has_dependancy():
-    try:
-        import warnings
-
-        warnings.filterwarnings("ignore")
-        import cudf  # type: ignore
-
-        return True, "ok", cudf
-    except ModuleNotFoundError as e:
-        return False, e, None
+cudf = deps.cudf
 
 
 def resolve_cpu_gpu_engine(
@@ -59,15 +36,9 @@ def resolve_cpu_gpu_engine(
     if engine in [CUML, UMAP_LEARN, 'sklearn']:
         return engine  # type: ignore
     if engine in ["auto"]:
-        (
-            has_min_dependency,
-            _,
-            has_cuml_dependency,
-            _,
-        ) = lazy_dbscan_import_has_dependency()
-        if has_cuml_dependency:
+        if cuDBSCAN:
             return "cuml"
-        if has_min_dependency:
+        if dbscan:
             return "umap_learn"
 
     raise ValueError(  # noqa
@@ -89,9 +60,8 @@ def safe_cudf(X, y):
         else:
             new_kwargs[key] = value
         return new_kwargs['X'], new_kwargs['y']
-
-    has_cudf_dependancy_, _, cudf = lazy_cudf_import_has_dependancy()
-    if has_cudf_dependancy_:
+
+    if cudf:
         # print('DBSCAN CUML Matrices')
         return safe_cudf(X, y)
     else:
@@ -209,7 +179,11 @@ def _cluster_dbscan(
     ):
         """DBSCAN clustering on cpu or gpu infered by .engine flag
         """
-        _, DBSCAN, _, cuDBSCAN = lazy_dbscan_import_has_dependency()
+        if deps.sklearn:
+            from sklearn.cluster import DBSCAN
+        else:
+            DBSCAN = None
+        if deps.cuml:
+            from cuml import DBSCAN as cuDBSCAN
+        else:
+            cuDBSCAN = None
 
         if engine_dbscan in [CUML]:
             print('`g.transform_dbscan(..)` not supported for engine=cuml, will return `g.transform_umap(..)` instead')
diff --git a/graphistry/constants.py b/graphistry/constants.py
index f6fda05fd9..159d85310f 100644
--- a/graphistry/constants.py
+++ b/graphistry/constants.py
@@ -52,6 +52,8 @@
 # scikit-learn params
 SKLEARN = "sklearn"
 
+# GPU-requiring libraries; DepManager only imports these when a GPU is visible
+GPU_REQ = ['cudf', 'cupy', 'cuml', 'numba', 'cuda']
 # #############################################################
 # Caching and other internals
 CACHE_COERCION_SIZE = 100
diff --git a/graphistry/dep_manager.py b/graphistry/dep_manager.py
new file mode 100644
index 0000000000..a839efe1a0
--- /dev/null
+++ b/graphistry/dep_manager.py
@@ -0,0 +1,40 @@
+import importlib
+import subprocess
+
+from .constants import GPU_REQ
+
+class DepManager:
+    def __init__(self):
+        self.pkgs = {}
+
+    def __getattr__(self, pkg: str):
+        # only reached when `pkg` was not already cached via setattr below
+        self._proc_import(pkg)
+        try:
+            return self.pkgs[pkg]
+        except KeyError:
+            return None
+
+    def _proc_import(self, pkg: str):
+        # GPU-backed packages are only imported when a GPU is visible
+        if pkg in GPU_REQ and self._is_gpu_available():
+            self._add_deps(pkg)
+        elif pkg not in GPU_REQ:
+            self._add_deps(pkg)
+
+    def _is_gpu_available(self):
+        try:
+            output = subprocess.check_output("nvidia-smi", shell=True)
+            return len(output) > 0
+        except (subprocess.CalledProcessError, OSError):
+            return False
+
+    def _add_deps(self, pkg: str):
+        try:
+            pkg_val = importlib.import_module(pkg)
+            self.pkgs[pkg] = pkg_val
+            setattr(self, pkg, pkg_val)
+        except Exception:
+            pass
+
+
+deps = DepManager()
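Reviewer note (not part of the patch): `DepManager` centralizes the scattered `lazy_*_import_has_dependancy()` helpers. Attribute access imports the package once, caches it, and returns the module, or `None` when the import fails or when a `GPU_REQ` package is requested without a visible GPU. A minimal usage sketch under those semantics, with illustrative data:

```python
from graphistry.dep_manager import deps

cudf = deps.cudf   # GPU_REQ package: module only if installed and
                   # `nvidia-smi` succeeds; otherwise None
umap = deps.umap   # CPU package: module if installed, else None

if cudf:
    df = cudf.DataFrame({"x": [1, 2]})    # GPU path
else:
    import pandas as pd
    df = pd.DataFrame({"x": [1, 2]})      # CPU fallback
```

Successful imports are cached via `setattr`, so later lookups skip both the `nvidia-smi` probe and `importlib`; failed lookups are retried on every access.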
diff --git a/graphistry/dgl_utils.py b/graphistry/dgl_utils.py
index 56b5670f33..24247a9473 100644
--- a/graphistry/dgl_utils.py
+++ b/graphistry/dgl_utils.py
@@ -17,7 +17,7 @@
 )
 
 from .util import setup_logger
-
+from .dep_manager import deps
 
 if TYPE_CHECKING:
     import scipy
@@ -33,31 +33,9 @@
 else:
     MIXIN_BASE = object
 
-
-def lazy_dgl_import_has_dependency():
-    try:
-        import warnings
-        warnings.filterwarnings('ignore')
-        import dgl  # noqa: F811
-        return True, 'ok', dgl
-    except ModuleNotFoundError as e:
-        return False, e, None
-
-
-def lazy_torch_import_has_dependency():
-    try:
-        import warnings
-        warnings.filterwarnings('ignore')
-        import torch  # noqa: F811
-        return True, 'ok', torch
-    except ModuleNotFoundError as e:
-        return False, e, None
-
-
 logger = setup_logger(name=__name__)
 
-
 # #########################################################################################
 #
 #  Torch helpers
@@ -73,7 +51,7 @@ def convert_to_torch(X_enc: pd.DataFrame, y_enc: Optional[pd.DataFrame]):  # typ
     :param y_enc: DataFrame Matrix of Values for Target
     :return: Dictionary of torch encoded arrays
     """
-    _, _, torch = lazy_torch_import_has_dependency()  # noqa: F811
+    torch = deps.torch  # noqa: F811
 
     if not y_enc.empty:  # type: ignore
         data = {
@@ -98,7 +76,7 @@ def get_available_devices():
         device (torch.device): Main device (GPU 0 or CPU).
         gpu_ids (list): List of IDs of all GPUs that are available.
     """
-    _, _, torch = lazy_torch_import_has_dependency()  # noqa: F811
+    torch = deps.torch  # noqa: F811
 
     gpu_ids = []
     if torch.cuda.is_available():
@@ -181,7 +159,8 @@ def pandas_to_dgl_graph(
         sp_mat: sparse scipy matrix
         ordered_nodes_dict: dict ordered from most common src and dst nodes
     """
-    _, _, dgl = lazy_dgl_import_has_dependency()  # noqa: F811
+    dgl = deps.dgl  # noqa: F811
+
     sp_mat, ordered_nodes_dict = pandas_to_sparse_adjacency(df, src, dst, weight_col)
 
     g = dgl.from_scipy(sp_mat, device=device)  # there are other ways too
     logger.info(f"Graph Type: {type(g)}")
@@ -196,7 +175,7 @@ def get_torch_train_test_mask(n: int, ratio: float = 0.8):
     :param ratio: mimics train/test split. `ratio` sets number of True vs False mask entries.
     :return: train and test torch tensor masks
     """
-    _, _, torch = lazy_torch_import_has_dependency()  # noqa: F811
+    torch = deps.torch  # noqa: F811
 
     train_mask = torch.zeros(n, dtype=torch.bool).bernoulli(ratio)
     test_mask = ~train_mask
@@ -225,8 +204,8 @@ def dgl_lazy_init(self, train_split: float = 0.8, device: str = "cpu"):
         """
 
         if not self.dgl_initialized:
-            lazy_dgl_import_has_dependency()
-            lazy_torch_import_has_dependency()
+            deps.dgl
+            deps.torch
             self.train_split = train_split
             self.device = device
             self._removed_edges_previously = False
diff --git a/graphistry/embed_utils.py b/graphistry/embed_utils.py
index 81fc45fe8d..2f34ca31c6 100644
--- a/graphistry/embed_utils.py
+++ b/graphistry/embed_utils.py
@@ -2,35 +2,13 @@
 import numpy as np
 import pandas as pd
 from typing import Optional, Union, Callable, List, TYPE_CHECKING, Any, Tuple
-
 from .PlotterBase import Plottable
 from .compute.ComputeMixin import ComputeMixin
+from .dep_manager import deps
 
-def lazy_embed_import_dep():
-    try:
-        import torch
-        import torch.nn as nn
-        import dgl
-        from dgl.dataloading import GraphDataLoader
-        import torch.nn.functional as F
-        from .networks import HeteroEmbed
-        from tqdm import trange
-        return True, torch, nn, dgl, GraphDataLoader, HeteroEmbed, F, trange
-
-    except:
-        return False, None, None, None, None, None, None, None
-
-def check_cudf():
-    try:
-        import cudf
-        return True, cudf
-    except:
-        return False, object
-
-
 if TYPE_CHECKING:
-    _, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
+    import torch
     TT = torch.Tensor
     MIXIN_BASE = ComputeMixin
 else:
@@ -38,7 +16,7 @@ def check_cudf():
     MIXIN_BASE = object
     torch = Any
 
-has_cudf, cudf = check_cudf()
+cudf = deps.cudf
 
 XSymbolic = Optional[Union[List[str], str, pd.DataFrame]]
 ProtoSymbolic = Optional[Union[str, Callable[[TT, TT, TT], TT]]]  # type: ignore
@@ -99,8 +77,7 @@ def __init__(self):
         self._device = "cpu"
 
     def _preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -> Plottable:
-        #_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
-        import torch
+        torch = deps.torch
         log('Preprocessing embedding data')
         src, dst = res._source, res._destination
         relation = res._relation
@@ -147,7 +124,7 @@ def _preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -
         return res
 
     def _build_graph(self, res) -> Plottable:
-        _, _, _, dgl, _, _, _, _ = lazy_embed_import_dep()
+        dgl = deps.dgl
         s, r, t = res._triplets.T
 
         if res._train_idx is not None:
@@ -169,7 +146,10 @@ def _build_graph(self, res) -> Plottable:
 
     def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, device):
-        _, _, _, _, GraphDataLoader, HeteroEmbed, _, _ = lazy_embed_import_dep()
+        dgl_ = deps.dgl
+        if dgl_:
+            from dgl.dataloading import GraphDataLoader
+            from .networks import HeteroEmbed
         g_iter = SubgraphIterator(res._kg_dgl, sample_size, num_steps)
         g_dataloader = GraphDataLoader(
             g_iter, batch_size=batch_size, collate_fn=lambda x: x[0]
@@ -186,9 +166,11 @@ def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, devic
         )
         return model, g_dataloader
-
+
     def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_size:int, num_steps:int, device) -> Plottable:
-        _, torch, nn, _, _, _, _, trange = lazy_embed_import_dep()
+        torch = deps.torch
+        nn = torch.nn
+        trange = deps.tqdm.trange
         log('Training embedding')
         model, g_dataloader = res._init_model(res, batch_size, sample_size, num_steps, device)
         if hasattr(res, "_embed_model") and not res._build_new_embedding_model:
@@ -232,7 +214,7 @@ def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_siz
 
     @property
     def _gcn_node_embeddings(self):
-        _, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
+        torch = deps.torch
         g_dgl = self._kg_dgl.to(self._device)
         em = self._embed_model(g_dgl).detach()
         torch.cuda.empty_cache()
@@ -540,7 +522,7 @@ def fetch_triplets_for_inference(x_r):
 
     def _score(self, triplets: Union[np.ndarray, TT]) -> TT:  # type: ignore
-        _, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
+        torch = deps.torch
         emb = self._kg_embeddings.clone().detach()
         if not isinstance(triplets, torch.Tensor):
             triplets = torch.tensor(triplets)
@@ -571,7 +553,13 @@ def __len__(self) -> int:
         return self.num_steps
 
     def __getitem__(self, i:int):
-        _, torch, nn, dgl, GraphDataLoader, _, F, _ = lazy_embed_import_dep()
+        torch = deps.torch
+        dgl = deps.dgl
+        from torch import nn
+        from torch.nn import functional as F
+        from dgl.dataloading import GraphDataLoader
+
         eids = torch.from_numpy(np.random.choice(self.eids, self.sample_size))
 
         src, dst = self.g.find_edges(eids)
@@ -593,7 +581,7 @@ def __getitem__(self, i:int):
 
     @staticmethod
     def _sample_neg(triplets:np.ndarray, num_nodes:int) -> Tuple[TT, TT]:  # type: ignore
-        _, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
+        torch = deps.torch
         triplets = torch.tensor(triplets)
         h, r, t = triplets.T
         h_o_t = torch.randint(high=2, size=h.size())
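Reviewer note: the pattern above replaces 8-tuple unpacking (`_, torch, nn, ... = lazy_embed_import_dep()`) with direct attribute access, which returns `None` rather than raising when a package is missing. A minimal sketch of how a caller should guard for that, assuming torch is absent (`train` is an illustrative function, not part of the patch):

```python
from graphistry.dep_manager import deps

def train(n: int):
    torch = deps.torch  # None when torch is missing -- no exception yet
    if torch is None:
        # fail fast; otherwise the first torch.* call below would raise
        # AttributeError: 'NoneType' object has no attribute 'zeros'
        raise ImportError("torch is required; try `pip install graphistry[ai]`")
    return torch.zeros(n)
```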
logger.debug(f"sklearn VERSION: {sklearn_version}") - return True, 'ok' - except ModuleNotFoundError as e: - return False, e - -def lazy_import_has_dirty_cat(): - import warnings - warnings.filterwarnings("ignore") - try: - import dirty_cat - return True, 'ok', dirty_cat - except ModuleNotFoundError as e: - return False, e, None def assert_imported_text(): - has_dependancy_text_, import_text_exn, _ = lazy_import_has_dependancy_text() - if not has_dependancy_text_: + Sentence_Transformer = deps.sentence_transformers.SentenceTransformer + + if not Sentence_Transformer: logger.error( # noqa "AI Package sentence_transformers not found," "trying running `pip install graphistry[ai]`" ) - raise import_text_exn - def assert_imported(): - has_min_dependancy_, import_min_exn = lazy_import_has_min_dependancy() - if not has_min_dependancy_: + scipy = deps.scipy + dirty_cat = deps.dirty_cat + sklearn = deps.sklearn + if None not in [scipy, dirty_cat, sklearn]: + logger.debug(f"SCIPY VERSION: {scipy.__version__}") + logger.debug(f"Dirty CAT VERSION: {dirty_cat.__version__}") + logger.debug(f"sklearn VERSION: {sklearn.__version__}") + + else: logger.error( # noqa "AI Packages not found, trying running" # noqa "`pip install graphistry[ai]`" # noqa ) - raise import_min_exn + # err_list = [scipy_,dirty_cat_,sklearn_] + # import_min_exn = [e for e in err_list if None in e] + + raise ValueError( # noqa + 'dependencies required are "scipy", "dirty_cat", "sklearn", but did not receive.' + ) # ############################################################################ @@ -151,13 +133,10 @@ def resolve_feature_engine( if feature_engine in ["none", "pandas", "dirty_cat", "torch"]: return feature_engine # type: ignore - if feature_engine == "auto": - has_dependancy_text_, _, _ = lazy_import_has_dependancy_text() - if has_dependancy_text_: + if deps.sentence_transformers: return "torch" - has_min_dependancy_, _ = lazy_import_has_min_dependancy() - if has_min_dependancy_: + if deps.dirty_cat and deps.scipy and deps.sklearn: return "dirty_cat" return "pandas" @@ -173,7 +152,7 @@ def resolve_feature_engine( def resolve_y(df: Optional[pd.DataFrame], y: YSymbolic) -> pd.DataFrame: - if isinstance(y, pd.DataFrame) or 'cudf' in str(getmodule(y)): + if isinstance(y, pd.DataFrame) or 'cudf.core.dataframe' in str(getmodule(y)): return y # type: ignore if df is None: @@ -194,7 +173,7 @@ def resolve_y(df: Optional[pd.DataFrame], y: YSymbolic) -> pd.DataFrame: def resolve_X(df: Optional[pd.DataFrame], X: XSymbolic) -> pd.DataFrame: - if isinstance(X, pd.DataFrame) or 'cudf' in str(getmodule(X)): + if isinstance(X, pd.DataFrame) or 'cudf.core.dataframe' in str(getmodule(X)): return X # type: ignore if df is None: @@ -296,15 +275,36 @@ def remove_internal_namespace_if_present(df: pd.DataFrame): config.IMPLICIT_NODE_ID, "index", # in umap, we add as reindex ] + # if (len(df.columns) <= 2): + # df = df.rename(columns={c: c + '_1' for c in df.columns if c in reserved_namespace}) + # if (isinstance(df.columns.to_list()[0],int)): + # int_namespace = pd.to_numeric(df.columns, errors = 'ignore').dropna().to_list() # type: ignore + # df = df.rename(columns={c: str(c) + '_1' for c in df.columns if c in int_namespace}) + # else: + # df = df.drop(columns=reserved_namespace, errors="ignore") # type: ignore + # return df + + def rename_columns(df, reserved_namespace): + if len(df.columns) <= 2: + df = rename_reserved_columns(df, reserved_namespace) + df = rename_integer_columns(df) + else: + df = drop_reserved_columns(df, reserved_namespace) 
+ return df - if (len(df.columns) <= 2): - df = df.rename(columns={c: c + '_1' for c in df.columns if c in reserved_namespace}) - # if (isinstance(df.columns.to_list()[0],int)): - # int_namespace = pd.to_numeric(df.columns, errors = 'ignore').dropna().to_list() # type: ignore - # df = df.rename(columns={c: str(c) + '_1' for c in df.columns if c in int_namespace}) - else: - df = df.drop(columns=reserved_namespace, errors="ignore") # type: ignore - return df + def rename_reserved_columns(df, reserved_namespace): + rename_dict = {c: c + '_1' for c in df.columns if c in reserved_namespace} + return df.rename(columns=rename_dict) + + def rename_integer_columns(df): + int_columns = [c for c in df.columns if isinstance(c, int)] + rename_dict = {c: str(c) + '_1' for c in int_columns} + return df.rename(columns=rename_dict) + + def drop_reserved_columns(df, reserved_namespace): + return df.drop(columns=reserved_namespace, errors="ignore") + + return rename_columns(df, reserved_namespace) # ########################################################################### @@ -707,7 +707,7 @@ def encode_textual( max_df: float = 0.2, min_df: int = 3, ) -> Tuple[pd.DataFrame, List, Any]: - _, _, SentenceTransformer = lazy_import_has_dependancy_text() + SentenceTransformer = deps.sentence_transformers.SentenceTransformer t = time() text_cols = get_textual_columns( @@ -884,14 +884,15 @@ def process_dirty_dataframes( :return: Encoded data matrix and target (if not None), the data encoder, and the label encoder. """ - has_dirty_cat, _, dirty_cat = lazy_import_has_dirty_cat() - if has_dirty_cat: + # assert_imported() + dirty_cat = deps.dirty_cat + if dirty_cat: from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder from sklearn.preprocessing import FunctionTransformer t = time() all_numeric = is_dataframe_all_numeric(ndf) - if not all_numeric and has_dirty_cat: + if not all_numeric and dirty_cat: data_encoder = SuperVectorizer( auto_cast=True, cardinality_threshold=cardinality_threshold, @@ -938,7 +939,7 @@ def process_dirty_dataframes( X_enc, columns=features_transformed, index=ndf.index ) X_enc = X_enc.fillna(0.0) - elif all_numeric and not has_dirty_cat: + elif all_numeric and not dirty_cat: numeric_ndf = ndf.select_dtypes(include=[np.number]) # type: ignore logger.warning("-*-*- DataFrame is not numeric and no dirty_cat, dropping non-numeric") X_enc, _, data_encoder, _ = get_numeric_transformers(numeric_ndf, None) @@ -953,7 +954,7 @@ def process_dirty_dataframes( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and has_dirty_cat # noqa: E126,W503 + and dirty_cat # noqa: E126,W503 ): t2 = time() logger.debug("-Fitting Targets --\n%s", y.columns) @@ -1001,7 +1002,7 @@ def process_dirty_dataframes( y is not None and len(y.columns) > 0 # noqa: E126,W503 and not is_dataframe_all_numeric(y) # noqa: E126,W503 - and not has_dirty_cat # noqa: E126,W503 + and not dirty_cat # noqa: E126,W503 ): logger.warning("-*-*- y is not numeric and no dirty_cat, dropping non-numeric") y2 = y.select_dtypes(include=[np.number]) # type: ignore @@ -1124,8 +1125,7 @@ def process_nodes_dataframes( text_cols: List[str] = [] text_model: Any = None text_enc = pd.DataFrame([]) - has_deps_text, import_text_exn, _ = lazy_import_has_dependancy_text() - if has_deps_text and (feature_engine in ["torch", "auto"]): + if deps.sentence_transformers and (feature_engine in ["torch", "auto"]): text_enc, text_cols, text_model = encode_textual( df, min_words=min_words, @@ -1138,7 
+1138,7 @@ def process_nodes_dataframes( else: logger.debug( "! Skipping encoding any textual features" - f"since dependency {import_text_exn} is not met" + "since dependency Sentence Transformers is not met" ) other_df = df.drop(columns=text_cols, errors="ignore") # type: ignore @@ -1345,7 +1345,7 @@ def process_edge_dataframes( :return: Encoded data matrix and target (if not None), the data encoders, and the label encoder. """ - lazy_import_has_min_dependancy() + # scipy = deps.scipy from sklearn.preprocessing import ( MultiLabelBinarizer, ) @@ -1495,7 +1495,7 @@ def transform_text( text_cols: Union[List, str], ) -> pd.DataFrame: from sklearn.pipeline import Pipeline - _, _, SentenceTransformer = lazy_import_has_dependancy_text() + SentenceTransformer = deps.sentence_transformers.SentenceTransformer logger.debug("Transforming text using:") if isinstance(text_model, Pipeline): @@ -1986,6 +1986,8 @@ def _featurize_nodes( # `X = ndf[cols]` and `X = cols` resolve to same thing X_resolved = resolve_X(ndf, X) y_resolved = resolve_y(ndf, y) + + assert_imported() feature_engine = resolve_feature_engine(feature_engine) @@ -2033,8 +2035,7 @@ def _featurize_nodes( logger.info("--- [[ RE-USING NODE FEATURIZATION ]]") fresh_res = copy.copy(res) for attr in ["_node_features", "_node_target", "_node_encoder"]: - if hasattr(old_res, attr): - setattr(fresh_res, attr, getattr(old_res, attr)) + setattr(fresh_res, attr, getattr(old_res, attr)) return fresh_res @@ -2238,9 +2239,9 @@ def transform(self, df: pd.DataFrame, """ # This is temporary until cucat release - if 'cudf' in str(getmodule(df)): + if 'cudf.core.dataframe' in str(getmodule(df)): df = df.to_pandas() # type: ignore - if (y is not None) and ('cudf' in str(getmodule(y))): + if (y is not None) and ('cudf.core.dataframe' in str(getmodule(y))): y = y.to_pandas() # type: ignore if kind == "nodes": diff --git a/graphistry/tests/test_compute_cluster.py b/graphistry/tests/test_compute_cluster.py index 0afe003fe7..a75e6a6c20 100644 --- a/graphistry/tests/test_compute_cluster.py +++ b/graphistry/tests/test_compute_cluster.py @@ -4,12 +4,14 @@ import graphistry from graphistry.constants import DBSCAN from graphistry.util import ModelDict -from graphistry.compute.cluster import lazy_dbscan_import_has_dependency -from graphistry.umap_utils import lazy_umap_import_has_dependancy - -has_dbscan, _, has_gpu_dbscan, _ = lazy_dbscan_import_has_dependency() -has_umap, _, _ = lazy_umap_import_has_dependancy() +from graphistry.dep_manager import deps +umap = deps.umap +dbscan = deps.dbscan +if deps.cuml: + import cuml.DBSCAN as cuDBSCAN +else: + cuDBSCAN = None ndf = edf = pd.DataFrame({'src': [1, 2, 1, 4], 'dst': [4, 5, 6, 1], 'label': ['a', 'b', 'b', 'c']}) @@ -24,7 +26,7 @@ def _condition(self, g, kind): self.assertTrue(g._edge_dbscan is not None, 'instance has no `_edge_dbscan` method') self.assertTrue(DBSCAN in g._edges, 'edge df has no `_dbscan` attribute') - @pytest.mark.skipif(not has_dbscan or not has_umap, reason="requires ai dependencies") + @pytest.mark.skipif(not dbscan or not umap, reason="requires ai dependencies") def test_umap_cluster(self): g = graphistry.nodes(ndf).edges(edf, 'src', 'dst') for kind in ['nodes', 'edges']: @@ -37,14 +39,14 @@ def test_umap_cluster(self): else: self.assertEqual(g2._edges[DBSCAN].tolist(), g3._edges[DBSCAN].tolist()) - @pytest.mark.skipif(not has_dbscan, reason="requires ai dependencies") + @pytest.mark.skipif(not dbscan, reason="requires ai dependencies") def test_featurize_cluster(self): g = 
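Reviewer note: after this change, `resolve_feature_engine("auto")` reduces to truthiness checks on the probed modules. A sketch of the decision ladder (`pick_feature_engine` is an illustrative stand-in for the "auto" branch):

```python
from graphistry.dep_manager import deps

def pick_feature_engine() -> str:
    # mirrors resolve_feature_engine("auto") after this patch
    if deps.sentence_transformers:                     # text embeddings available
        return "torch"
    if deps.dirty_cat and deps.scipy and deps.sklearn:
        return "dirty_cat"                             # tabular encoders
    return "pandas"                                    # minimal fallback
```

Note the interplay with the new `assert_imported()` call in `_featurize_nodes`: it raises before the `pandas` fallback can be reached, so it may be worth confirming that hard failure is intended.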
diff --git a/graphistry/tests/test_compute_cluster.py b/graphistry/tests/test_compute_cluster.py
index 0afe003fe7..a75e6a6c20 100644
--- a/graphistry/tests/test_compute_cluster.py
+++ b/graphistry/tests/test_compute_cluster.py
@@ -4,12 +4,14 @@
 import graphistry
 from graphistry.constants import DBSCAN
 from graphistry.util import ModelDict
-from graphistry.compute.cluster import lazy_dbscan_import_has_dependency
-from graphistry.umap_utils import lazy_umap_import_has_dependancy
-
-has_dbscan, _, has_gpu_dbscan, _ = lazy_dbscan_import_has_dependency()
-has_umap, _, _ = lazy_umap_import_has_dependancy()
+from graphistry.dep_manager import deps
 
+umap = deps.umap
+dbscan = deps.sklearn  # sklearn provides the CPU DBSCAN used by .dbscan()
+if deps.cuml:
+    from cuml import DBSCAN as cuDBSCAN
+else:
+    cuDBSCAN = None
 
 ndf = edf = pd.DataFrame({'src': [1, 2, 1, 4], 'dst': [4, 5, 6, 1], 'label': ['a', 'b', 'b', 'c']})
@@ -24,7 +26,7 @@ def _condition(self, g, kind):
             self.assertTrue(g._edge_dbscan is not None, 'instance has no `_edge_dbscan` method')
             self.assertTrue(DBSCAN in g._edges, 'edge df has no `_dbscan` attribute')
 
-    @pytest.mark.skipif(not has_dbscan or not has_umap, reason="requires ai dependencies")
+    @pytest.mark.skipif(not dbscan or not umap, reason="requires ai dependencies")
     def test_umap_cluster(self):
         g = graphistry.nodes(ndf).edges(edf, 'src', 'dst')
         for kind in ['nodes', 'edges']:
@@ -37,14 +39,14 @@ def test_umap_cluster(self):
             else:
                 self.assertEqual(g2._edges[DBSCAN].tolist(), g3._edges[DBSCAN].tolist())
 
-    @pytest.mark.skipif(not has_dbscan, reason="requires ai dependencies")
+    @pytest.mark.skipif(not dbscan, reason="requires ai dependencies")
     def test_featurize_cluster(self):
         g = graphistry.nodes(ndf).edges(edf, 'src', 'dst')
         for kind in ['nodes', 'edges']:
             g = g.featurize(kind=kind, n_topics=2).dbscan(kind=kind, verbose=True)
             self._condition(g, kind)
 
-    @pytest.mark.skipif(not has_dbscan or not has_umap, reason="requires ai dependencies")
+    @pytest.mark.skipif(not dbscan or not umap, reason="requires ai dependencies")
     def test_dbscan_params(self):
         dbscan_params = [ModelDict('Testing UMAP', kind='nodes', min_dist=0.2, min_samples=1, cols=None, target=False, fit_umap_embedding=False, verbose=True, engine_dbscan='sklearn'),
@@ -57,7 +59,7 @@ def test_dbscan_params(self):
             g2 = g.dbscan(**params)
             self.assertTrue(g2._dbscan_params == params, f'dbscan params not set correctly, found {g2._dbscan_params} but expected {params}')
 
-    @pytest.mark.skipif(not has_gpu_dbscan or not has_umap, reason="requires ai dependencies")
+    @pytest.mark.skipif(not cuDBSCAN or not umap, reason="requires ai dependencies")
     def test_transform_dbscan(self):
         kind = 'nodes'
         g = graphistry.nodes(ndf).edges(edf, 'src', 'dst')
diff --git a/graphistry/tests/test_dgl_utils.py b/graphistry/tests/test_dgl_utils.py
index 760045eee6..36ff1d4d2d 100644
--- a/graphistry/tests/test_dgl_utils.py
+++ b/graphistry/tests/test_dgl_utils.py
@@ -4,11 +4,12 @@
 import pandas as pd
 
 from graphistry.util import setup_logger
-from graphistry.dgl_utils import lazy_dgl_import_has_dependency
+from graphistry.dep_manager import DepManager
 
-has_dgl, _, dgl = lazy_dgl_import_has_dependency()
+deps = DepManager()
+dgl = deps.dgl
 
-if has_dgl:
+if dgl:
     import torch
 
 logger = setup_logger(__name__)
@@ -112,7 +113,7 @@ def _test_cases_dgl(self, g):
                 G.ndata[k].sum(), torch.Tensor
             ), f"Node {G.ndata[k]} for {k} is not a Tensor"
 
-    @pytest.mark.skipif(not has_dgl, reason="requires DGL dependencies")
+    @pytest.mark.skipif(not dgl, reason="requires DGL dependencies")
     def test_build_dgl_graph_from_column_names(self):
         g = graphistry.edges(edf, src, dst).nodes(ndf, "ip")
 
@@ -126,7 +127,7 @@ def test_build_dgl_graph_from_column_names(self):
         )
         self._test_cases_dgl(g2)
 
-    @pytest.mark.skipif(not has_dgl, reason="requires DGL dependencies")
+    @pytest.mark.skipif(not dgl, reason="requires DGL dependencies")
     def test_build_dgl_graph_from_dataframes(self):
         g = graphistry.edges(edf, src, dst).nodes(ndf, "ip")
 
@@ -140,7 +141,7 @@ def test_build_dgl_graph_from_dataframes(self):
         )
         self._test_cases_dgl(g2)
 
-    @pytest.mark.skipif(not has_dgl, reason="requires DGL dependencies")
+    @pytest.mark.skipif(not dgl, reason="requires DGL dependencies")
     def test_build_dgl_graph_from_umap(self):
         # explicitly set node in .nodes() and not in .build_gnn()
         g = graphistry.nodes(ndf, "ip")
@@ -153,7 +154,7 @@ def test_build_dgl_graph_from_umap(self):
         )
         self._test_cases_dgl(g2)
 
-    @pytest.mark.skipif(not has_dgl, reason="requires DGL dependencies")
+    @pytest.mark.skipif(not dgl, reason="requires DGL dependencies")
     def test_build_dgl_graph_from_umap_no_node_column(self):
         g = graphistry.nodes(ndf)
         g.reset_caches()  # so that we redo calcs
@@ -165,7 +166,7 @@ def test_build_dgl_graph_from_umap_no_node_column(self):
         )
         self._test_cases_dgl(g2)
 
-    @pytest.mark.skipif(not has_dgl, reason="requires DGL dependencies")
+    @pytest.mark.skipif(not dgl, reason="requires DGL dependencies")
     @pytest.mark.xfail(reason="Mishandling datetimes: https://github.com/graphistry/pygraphistry/issues/381")
     def test_build_dgl_with_no_node_features(self):
         g = graphistry.edges(edf, src, dst)
diff --git a/graphistry/tests/test_embed_utils.py b/graphistry/tests/test_embed_utils.py
index 307bdd0266..8a4579b22e 100644
--- a/graphistry/tests/test_embed_utils.py
+++ b/graphistry/tests/test_embed_utils.py
@@ -4,17 +4,42 @@
 import unittest
 import graphistry
 import numpy as np
-
-from graphistry.embed_utils import lazy_embed_import_dep, check_cudf
+from graphistry.dep_manager import DepManager
 
 import logging
 logger = logging.getLogger(__name__)
 
-dep_flag, _, _, _, _, _, _, _ = lazy_embed_import_dep()
-has_cudf, cudf = check_cudf()
+deps = DepManager()
+# probe the optional packages these tests need; each name is a module or None
+torch_ = deps.torch
+dgl_ = deps.dgl
+tqdm_ = deps.tqdm
+nn_ = torch_.nn if torch_ else None
+if dgl_:
+    from dgl.dataloading import GraphDataLoader
+if torch_:
+    from torch import nn
+    from torch.nn import functional as F_
+if tqdm_:
+    from tqdm import trange
+# networks.HeteroEmbed depends on torch/dgl, so only import it when both exist
+if torch_ and dgl_:
+    from graphistry.networks import HeteroEmbed as HeteroEmbed_
+else:
+    HeteroEmbed_ = None
+
+dep_flag = None not in [torch_, dgl_, HeteroEmbed_, tqdm_]
+
+cudf = deps.cudf
+test_cudf = cudf is not None
 
 # enable tests if has cudf and env didn't explicitly disable
-is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0"
+is_test_cudf = test_cudf and os.environ["TEST_CUDF"] != "0"
 
 class TestEmbed(unittest.TestCase):
diff --git a/graphistry/tests/test_feature_utils.py b/graphistry/tests/test_feature_utils.py
index fa4333737a..7aeaa51917 100644
--- a/graphistry/tests/test_feature_utils.py
+++ b/graphistry/tests/test_feature_utils.py
@@ -14,18 +14,24 @@
     process_dirty_dataframes,
     process_nodes_dataframes,
     resolve_feature_engine,
-    lazy_import_has_min_dependancy,
-    lazy_import_has_dependancy_text,
     FastEncoder
 )
 
 from graphistry.features import topic_model, ngrams_model
 from graphistry.constants import SCALERS
+from graphistry.dep_manager import DepManager
 
 np.random.seed(137)
 
-has_min_dependancy, _ = lazy_import_has_min_dependancy()
-has_min_dependancy_text, _, _ = lazy_import_has_dependancy_text()
+deps = DepManager()
+dirty_cat = deps.dirty_cat
+scipy = deps.scipy
+sklearn = deps.sklearn
+has_min_dependancy = None not in [dirty_cat, scipy, sklearn]
+has_min_dependancy_text = deps.sentence_transformers
 
 logger = logging.getLogger(__name__)
 warnings.filterwarnings("ignore")
@@ -350,10 +356,8 @@ def cases_test_graph(self, g, name, value, kind="nodes", df=ndf_reddit):
         self.cases_check_edge_attributes(g)
 
         cols = ndf.columns
-        self.assertTrue(
-            np.all(ndf.fillna(0) == df[cols].fillna(0)),
-            f"Graphistry {kind}-dataframe does not match outside dataframe it was fed",
-        )
+        self.assertTrue(
+            np.array_equal(ndf.fillna(0), df[cols].fillna(0)),
+            f"Graphistry {kind}-dataframe does not match outside dataframe it was fed",
+        )
 
     def _test_featurizations(self, g, use_cols, targets, name, kind, df):
         with warnings.catch_warnings():
diff --git a/graphistry/tests/test_text_utils.py b/graphistry/tests/test_text_utils.py
index 649d74f89f..99e2fdcc6e 100644
--- a/graphistry/tests/test_text_utils.py
+++ b/graphistry/tests/test_text_utils.py
@@ -6,17 +6,20 @@
 import logging
 import numpy as np
 import pandas as pd
 
 from graphistry.feature_utils import remove_internal_namespace_if_present
 from graphistry.tests.test_feature_utils import (
     ndf_reddit,
     edge_df,
-    lazy_import_has_min_dependancy,
 )
 
-from graphistry.umap_utils import lazy_umap_import_has_dependancy
-
-has_dependancy, _ = lazy_import_has_min_dependancy()
-has_umap, _, _ = lazy_umap_import_has_dependancy()
+from graphistry.dep_manager import DepManager
 
+deps = DepManager()
+has_umap = deps.umap
+has_dependancy = None not in [deps.scipy, deps.dirty_cat, deps.sklearn]
 
 logger = logging.getLogger(__name__)
diff --git a/graphistry/tests/test_umap_utils.py b/graphistry/tests/test_umap_utils.py
index 3362e3405f..06245f3a3b 100644
--- a/graphistry/tests/test_umap_utils.py
+++ b/graphistry/tests/test_umap_utils.py
@@ -22,30 +22,22 @@
     edge_df2,
     edge2_target_df,
     model_avg_name,
-    lazy_import_has_min_dependancy,
     check_allclose_fit_transform_on_same_data,
 )
-from graphistry.umap_utils import (
-    lazy_umap_import_has_dependancy,
-    lazy_cuml_import_has_dependancy,
-    lazy_cudf_import_has_dependancy,
-)
-
-has_dependancy, _ = lazy_import_has_min_dependancy()
-has_cuml, _, _ = lazy_cuml_import_has_dependancy()
-has_umap, _, _ = lazy_umap_import_has_dependancy()
-has_cudf, _, cudf = lazy_cudf_import_has_dependancy()
+from graphistry.dep_manager import DepManager
 
-# print('has_dependancy', has_dependancy)
-# print('has_cuml', has_cuml)
-# print('has_umap', has_umap)
+deps = DepManager()
+has_dependancy = None not in [deps.scipy, deps.dirty_cat, deps.sklearn]  # min feature deps
+cuml = deps.cuml
+umap = deps.umap
+cudf = deps.cudf
 
 logger = logging.getLogger(__name__)
 warnings.filterwarnings("ignore")
 
 # enable tests if has cudf and env didn't explicitly disable
-is_test_cudf = has_cudf and os.environ["TEST_CUDF"] != "0"
+is_test_cudf = cudf and os.environ["TEST_CUDF"] != "0"
 
 triangleEdges = pd.DataFrame(
     {
@@ -88,7 +80,7 @@ def _eq(df1, df2):
 
 class TestUMAPFitTransform(unittest.TestCase):
     # check to see that .fit and transform gives similar embeddings on same data
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def setUp(self):
         verbose = True
         g = graphistry.nodes(ndf_reddit)
@@ -151,14 +143,14 @@ def setUp(self):
 
         self.g2e = g2
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_columns_match(self):
         assert set(self.X.columns) == set(self.x.columns), "Node Feature Columns do not match"
         assert set(self.Y.columns) == set(self.y.columns), "Node Target Columns do not match"
         assert set(self.Xe.columns) == set(self.xe.columns), "Edge Feature Columns do not match"
         assert set(self.Ye.columns) == set(self.ye.columns), "Edge Target Columns do not match"
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_index_match(self):
         # nodes
         d = self.g2._nodes.shape[0]
@@ -182,7 +174,7 @@ def test_index_match(self):
         assert _eq(self.Xe.index, self.xe.index).sum() == de, "Edge Feature Indexes do not match"
         assert _eq(self.Ye.index, self.ye.index).sum() == de, "Edge Target Indexes do not match"
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_node_index_match_in_infered_graph(self):
         # nodes
         g3 = self.g2._nodes
@@ -191,7 +183,7 @@ def test_node_index_match_in_infered_graph(self):
         assert _eq(g3.index, self.X.index).sum() == len(g3), "Node Transformed features Indexes do not match"
         assert _eq(g3.index, self.y.index).sum() == len(g3), "Node Transformed target Indexes do not match"
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_edge_index_match_in_infered_graph(self):
         g3 = self.g2e._edges
         assert _eq(g3.index, self.EMBe.index).sum() == len(g3), "Edge Emb Indexes do not match"
@@ -200,7 +192,7 @@ def test_edge_index_match_in_infered_graph(self):
         assert _eq(g3.index, self.ye.index).sum() == len(g3), "Edge Transformed Node target Indexes do not match"
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_umap_kwargs(self):
         umap_kwargs = {
             "n_components": 2,
@@ -244,7 +236,7 @@ def test_umap_kwargs(self):
             g5._umap_params == umap_kwargs2
         ), f"Umap params do not match, found {g5._umap_params} vs {umap_kwargs2}"
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_transform_umap(self):
         np.random.seed(41)
         test = self.test
@@ -268,7 +260,7 @@ def test_transform_umap(self):
             assert True
         else:
             objs = (pd.DataFrame,)
-            if has_cudf:
+            if cudf:
                 objs = (pd.DataFrame, cudf.DataFrame)
             assert len(g4) == 3
             assert isinstance(g4[0], objs)
@@ -294,7 +286,7 @@ def _check_attributes(self, g, attributes):
         msg = "Graphistry instance after umap should have `{}` as attribute"
         msg2 = "Graphistry instance after umap should not have None values for `{}`"
         objs = (pd.DataFrame,)
-        if has_cudf:
+        if cudf:
             objs = (pd.DataFrame, cudf.DataFrame)
 
         for attribute in attributes:
@@ -347,9 +339,9 @@ def cases_test_graph(self, g, kind="nodes", df=ndf_reddit, verbose=False):
         cols = ndf.columns
         logger.debug("g_nodes: %s", g._nodes)
         logger.debug("df: %s", df)
-        assert ndf.reset_index(drop=True).equals(df[cols].reset_index(drop=True))
+        assert np.array_equal(ndf, df[cols])
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def _test_umap(self, g, use_cols, targets, name, kind, df):
         for use_col in use_cols:
             for target in targets:
@@ -376,7 +368,8 @@ def _test_umap(self, g, use_cols, targets, name, kind, df):
 
                     self.cases_test_graph(g2, kind=kind, df=df)
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_umap_simplest(self):
         df = pd.DataFrame({
             'x': ['aa a' * 10, 'bb b' * 2, 'ccc ' * 20, 'dd abc', 'ee x1z'] * 10,
@@ -385,7 +378,7 @@ def test_umap_simplest(self):
         graphistry.nodes(df).umap()
         assert True
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_umap_edgecase(self):
         df = pd.DataFrame({
             'x': ['aa a' * 10, 'bb b' * 2, 'ccc ' * 20, 'dd abc', 'ee x1z'] * 10,
@@ -401,7 +394,7 @@ def test_umap_edgecase(self):
         graphistry.nodes(df).umap()
         assert True
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_node_umap(self):
         g = graphistry.nodes(triangleNodes)
         use_cols = [node_ints, node_floats, node_numeric]
@@ -415,7 +408,7 @@ def test_node_umap(self):
             df=triangleNodes,
         )
 
-    @pytest.mark.skipif(not has_umap, reason="requires umap feature dependencies")
+    @pytest.mark.skipif(not umap, reason="requires umap feature dependencies")
     def test_edge_umap(self):
         g = graphistry.edges(triangleEdges, "src", "dst")
         use_cols = [edge_ints, edge_floats, edge_numeric]
@@ -430,7 +423,7 @@ def test_edge_umap(self):
         )
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap, reason="requires umap feature dependencies"
+        not has_dependancy or not umap, reason="requires umap feature dependencies"
     )
     def test_filter_edges(self):
         for kind, g in [("nodes", graphistry.nodes(triangleNodes))]:
@@ -452,7 +445,7 @@ def test_filter_edges(self):
 
 class TestUMAPAIMethods(TestUMAPMethods):
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def _test_umap(self, g, use_cols, targets, name, kind, df):
@@ -492,7 +485,7 @@ def _test_umap(self, g, use_cols, targets, name, kind, df):
                     self.cases_test_graph(g2, kind=kind, df=df)
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_node_umap(self):
@@ -515,7 +508,7 @@ def test_node_umap(self):
         )
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_edge_umap(self):
@@ -537,7 +530,7 @@ def test_edge_umap(self):
         )
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_chaining_nodes(self):
@@ -560,7 +553,7 @@ def test_chaining_nodes(self):
         assert g2._node_embedding.shape == g3._node_embedding.shape  # kinda weak sauce
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_chaining_edges(self):
@@ -579,7 +572,7 @@ def test_chaining_edges(self):
         assert all(g2._edge_features == g3._edge_features)
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_feature_kwargs_yield_different_values_using_umap_api(self):
@@ -613,7 +606,7 @@ def test_feature_kwargs_yield_different_values_using_umap_api(self):
         assert g2._node_target.shape[1] == n_topics_target, "Targets "
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires ai+umap feature dependencies",
     )
     def test_filter_edges(self):
@@ -633,12 +626,12 @@ def test_filter_edges(self):
 
 
 @pytest.mark.skipif(
-    not has_dependancy or not has_cuml,
+    not has_dependancy or not cuml,
     reason="requires cuml feature dependencies",
 )
 class TestCUMLMethods(TestUMAPMethods):
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def _test_umap(self, g, use_cols, targets, name, kind, df):
@@ -677,7 +670,7 @@ def _test_umap(self, g, use_cols, targets, name, kind, df):
                     self.cases_test_graph(g2, kind=kind, df=df)
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def test_node_umap(self):
@@ -700,7 +693,7 @@ def test_node_umap(self):
         )
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def test_edge_umap(self):
@@ -722,7 +715,7 @@ def test_edge_umap(self):
         )
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def test_chaining_nodes(self):
@@ -745,7 +738,7 @@ def test_chaining_nodes(self):
         assert g2._node_embedding.shape == g3._node_embedding.shape  # kinda weak sauce
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def test_chaining_edges(self):
@@ -764,7 +757,7 @@ def test_chaining_edges(self):
         assert all(g2._edge_features == g3._edge_features)
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_cuml,
+        not has_dependancy or not cuml,
         reason="requires cuml feature dependencies",
     )
     def test_feature_kwargs_yield_different_values_using_umap_api(self):
@@ -798,7 +791,7 @@ def test_feature_kwargs_yield_different_values_using_umap_api(self):
         assert g2._node_target.shape[1] == n_topics_target, "Targets "
 
     @pytest.mark.skipif(
-        not has_dependancy or not has_umap,
+        not has_dependancy or not umap,
         reason="requires cuml feature dependencies",
     )
     def test_filter_edges(self):
@@ -826,7 +819,7 @@ def setUp(self):
         df['profile'] = np.random.randint(0,1000,size=(self.samples, 1))
         self.df = cudf.from_pandas(df)
 
-    @pytest.mark.skipif(not has_dependancy or not has_cuml, reason="requires cuml dependencies")
+    @pytest.mark.skipif(not has_dependancy or not cuml, reason="requires cuml dependencies")
     @pytest.mark.skipif(not is_test_cudf, reason="requires cudf")
     def test_base(self):
         graphistry.nodes(self.df).umap('auto')._node_embedding.shape == (self.samples, 2)
diff --git a/graphistry/umap_utils.py b/graphistry/umap_utils.py
index d2561739df..fb7b7d2b37 100644
--- a/graphistry/umap_utils.py
+++ b/graphistry/umap_utils.py
@@ -12,6 +12,7 @@
                             resolve_feature_engine)
 from .PlotterBase import Plottable, WeakValueDictionary
 from .util import check_set_memoize
+from .dep_manager import deps
 
 import logging
@@ -25,66 +26,29 @@
 
 ###############################################################################
 
-
-def lazy_umap_import_has_dependancy():
-    try:
-        import warnings
-
-        warnings.filterwarnings("ignore")
-        import umap  # noqa
-
-        return True, "ok", umap
-    except ModuleNotFoundError as e:
-        return False, e, None
-
-
-def lazy_cuml_import_has_dependancy():
-    try:
-        import warnings
-
-        warnings.filterwarnings("ignore")
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore")
-            import cuml  # type: ignore
-
-        return True, "ok", cuml
-    except ModuleNotFoundError as e:
-        return False, e, None
-
-def lazy_cudf_import_has_dependancy():
-    try:
-        import warnings
-
-        warnings.filterwarnings("ignore")
-        import cudf  # type: ignore
-
-        return True, "ok", cudf
-    except ModuleNotFoundError as e:
-        return False, e, None
-
 def assert_imported():
-    has_dependancy_, import_exn, _ = lazy_umap_import_has_dependancy()
-    if not has_dependancy_:
+    umap_ = deps.umap
+    if not umap_:
         logger.error("UMAP not found, trying running " "`pip install graphistry[ai]`")
-        raise import_exn
+        raise ImportError("umap-learn is required; run `pip install graphistry[ai]`")
 
 
 def assert_imported_cuml():
-    has_cuml_dependancy_, import_cuml_exn, _ = lazy_cuml_import_has_dependancy()
-    if not has_cuml_dependancy_:
+    cuml_ = deps.cuml
+    if not cuml_:
         logger.warning("cuML not found, trying running " "`pip install cuml`")
-        raise import_cuml_exn
+        raise ImportError("cuml is required for engine='cuml'; run `pip install cuml`")
 
 
 def is_legacy_cuml():
     try:
-        import cuml
-
-        vs = cuml.__version__.split(".")
-        if (vs[0] in ["0", "21"]) or (vs[0] == "22" and float(vs[1]) < 6):
-            return True
-        else:
-            return False
+        cuml = deps.cuml
+        if cuml:  # noqa
+            vs = cuml.__version__.split(".")
+            if (vs[0] in ["0", "21"]) or (vs[0] == "22" and float(vs[1]) < 6):
+                return True
+        return False
     except ModuleNotFoundError:
         return False
@@ -99,11 +63,11 @@ def resolve_umap_engine(
     if engine in [CUML, UMAP_LEARN]:
         return engine  # type: ignore
     if engine in ["auto"]:
-        has_cuml_dependancy_, _, _ = lazy_cuml_import_has_dependancy()
-        if has_cuml_dependancy_:
+        cuml_ = deps.cuml
+        if cuml_:
             return 'cuml'
-        has_umap_dependancy_, _, _ = lazy_umap_import_has_dependancy()
-        if has_umap_dependancy_:
+        umap_ = deps.umap
+        if umap_:
             return 'umap_learn'
 
     raise ValueError(  # noqa
@@ -113,9 +77,10 @@ def resolve_umap_engine(
     )
 
 
-def make_safe_gpu_dataframes(X, y, engine):
+def make_safe_gpu_dataframes(X, y, engine, has_cudf):
 
     def safe_cudf(X, y):
+        cudf = deps.cudf
         # remove duplicate columns
         if len(X.columns) != len(set(X.columns)):
             X = X.loc[:, ~X.columns.duplicated()]
@@ -133,9 +98,8 @@ def safe_cudf(X, y):
             else:
                 new_kwargs[key] = value
         return new_kwargs['X'], new_kwargs['y']
-
-    has_cudf_dependancy_, _, cudf = lazy_cudf_import_has_dependancy()
-    if has_cudf_dependancy_:
+
+    if has_cudf:
         return safe_cudf(X, y)
     else:
         return X, y
@@ -203,9 +167,9 @@ def umap_lazy_init(
     engine_resolved = resolve_umap_engine(engine)
     # FIXME remove as set_new_kwargs will always replace?
     if engine_resolved == UMAP_LEARN:
-        _, _, umap_engine = lazy_umap_import_has_dependancy()
+        umap_engine = deps.umap
     elif engine_resolved == CUML:
-        _, _, umap_engine = lazy_cuml_import_has_dependancy()
+        umap_engine = deps.cuml
     else:
         raise ValueError(
             "No umap engine, ensure 'auto', 'umap_learn', or 'cuml', and the library is installed"
@@ -335,14 +299,14 @@ def transform_umap(self, df: pd.DataFrame,
             fit_umap_embedding: Whether to infer graph from the UMAP embedding on the new data, default True
             verbose: Whether to print information about the graph inference
         """
-        df, y = make_safe_gpu_dataframes(df, y, 'pandas')
+        df, y = make_safe_gpu_dataframes(df, y, 'pandas', self.has_cudf)
         X, y_ = self.transform(df, y, kind=kind, return_graph=False, verbose=verbose)
-        X, y_ = make_safe_gpu_dataframes(X, y_, self.engine)  # type: ignore
+        X, y_ = make_safe_gpu_dataframes(X, y_, self.engine, self.has_cudf)  # type: ignore
         emb = self._umap.transform(X)  # type: ignore
         emb = self._bundle_embedding(emb, index=df.index)
         if return_graph and kind not in ["edges"]:
-            emb, _ = make_safe_gpu_dataframes(emb, None, 'pandas')  # for now so we don't have to touch infer_edges, force to pandas
-            X, y_ = make_safe_gpu_dataframes(X, y_, 'pandas')
+            emb, _ = make_safe_gpu_dataframes(emb, None, 'pandas', self.has_cudf)  # for now so we don't have to touch infer_edges, force to pandas
+            X, y_ = make_safe_gpu_dataframes(X, y_, 'pandas', self.has_cudf)
             g = self._infer_edges(emb, X, y_, df,
                                   infer_on_umap_embedding=fit_umap_embedding, merge_policy=merge_policy,
                                   eps=min_dist, sample=sample, n_neighbors=n_neighbors,
@@ -554,9 +518,10 @@ def umap(
         logger.debug("umap_kwargs: %s", umap_kwargs)
 
         # temporary until we have full cudf support in feature_utils.py
-        has_cudf, _, cudf = lazy_cudf_import_has_dependancy()
+        cudf = deps.cudf
+        self.has_cudf = cudf is not None
 
-        if has_cudf:
+        if self.has_cudf:
             flag_nodes_cudf = isinstance(self._nodes, cudf.DataFrame)
             flag_edges_cudf = isinstance(self._edges, cudf.DataFrame)
 
@@ -618,7 +583,7 @@ def umap(
             index_to_nodes_dict = nodes  # {}?
 
         # add the safe coercion here
-        X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine)  # type: ignore
+        X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine, self.has_cudf)  # type: ignore
 
         res = res._process_umap(
             res, X_, y_, kind, memoize, featurize_kwargs, verbose, **umap_kwargs
@@ -648,7 +613,7 @@ def umap(
             )
 
             # add the safe coercion here
-            X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine)  # type: ignore
+            X_, y_ = make_safe_gpu_dataframes(X_, y_, res.engine, self.has_cudf)  # type: ignore
 
             res = res._process_umap(
                 res, X_, y_, kind, memoize, featurize_kwargs, **umap_kwargs
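Reviewer note: `make_safe_gpu_dataframes` now takes the cudf availability flag from the caller (`self.has_cudf`) instead of re-probing cudf on every call. A calling-convention sketch with illustrative data:

```python
from graphistry.dep_manager import deps
from graphistry.umap_utils import make_safe_gpu_dataframes
import pandas as pd

X = pd.DataFrame({"a": [1, 2, 3]})
has_cudf = deps.cudf is not None

# callers now thread availability through explicitly; engine='pandas'
# forces CPU frames (as transform_umap does before infer_edges)
X_safe, y_safe = make_safe_gpu_dataframes(X, None, "pandas", has_cudf)
```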
diff --git a/mypy.ini b/mypy.ini
index 898e001146..2f88e199c4 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -94,3 +94,6 @@ ignore_missing_imports = True
 
 [mypy-cuml.*]
 ignore_missing_imports = True
+
+[mypy-tqdm.*]
+ignore_missing_imports = True
diff --git a/setup.py b/setup.py
index c81db1b09c..1664aa2491 100755
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,7 @@ def unique_flatten_dict(d):
     'squarify',
     'typing-extensions',
     'packaging >= 20.1',
+    'tqdm',
     'setuptools',
   ]