Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/depman gpufeat #517

Open
wants to merge 410 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
410 commits
Select commit Hold shift + click to select a range
3210019
test_text_utils deps check
dcolinmorgan Nov 24, 2023
abb999e
test_text_utils deps check
dcolinmorgan Nov 24, 2023
5192f79
typos
dcolinmorgan Nov 24, 2023
0d165dd
ignore type
dcolinmorgan Nov 24, 2023
032193a
lint
dcolinmorgan Nov 24, 2023
75207ce
lint
dcolinmorgan Nov 24, 2023
1f539f1
lint
dcolinmorgan Nov 24, 2023
219555b
lint
dcolinmorgan Nov 24, 2023
8b53e6d
lint
dcolinmorgan Nov 24, 2023
3380fa5
lint
dcolinmorgan Nov 24, 2023
c12ed7e
push test logic
dcolinmorgan Nov 24, 2023
ecdd72b
push test logic
dcolinmorgan Nov 24, 2023
181abfa
push test logic
dcolinmorgan Nov 24, 2023
703e923
push test logic
dcolinmorgan Nov 24, 2023
5d7f750
lint
dcolinmorgan Nov 24, 2023
849baae
lint
dcolinmorgan Nov 24, 2023
6935a91
lint
dcolinmorgan Nov 24, 2023
c1f94c2
lint
dcolinmorgan Nov 24, 2023
eeaef0b
dep_flag lint
dcolinmorgan Nov 24, 2023
8d4c1df
assert logic
dcolinmorgan Nov 24, 2023
37ea918
lint
dcolinmorgan Nov 27, 2023
8e32e0c
lint
dcolinmorgan Nov 27, 2023
1f5f243
lint
dcolinmorgan Nov 27, 2023
20430e0
lint
dcolinmorgan Nov 27, 2023
a3bb113
remove conditional
dcolinmorgan Nov 27, 2023
9528e4a
sklearn assert
dcolinmorgan Nov 27, 2023
d170ace
sklearn assert
dcolinmorgan Nov 27, 2023
6a508c4
sklearn assert
dcolinmorgan Nov 27, 2023
f5812bd
sklearn assert
dcolinmorgan Nov 27, 2023
976d1dd
cumml _v_ test
dcolinmorgan Nov 27, 2023
2faf466
cumml _v_ test
dcolinmorgan Nov 27, 2023
2c96419
lint
dcolinmorgan Nov 27, 2023
ab73859
lint
dcolinmorgan Nov 27, 2023
a379787
lint
dcolinmorgan Nov 27, 2023
580ef32
lint
dcolinmorgan Nov 27, 2023
2c35bb2
lint
dcolinmorgan Nov 27, 2023
3d5aa45
lint
dcolinmorgan Nov 27, 2023
260c3b7
remove two too precise tests
dcolinmorgan Nov 27, 2023
23e4257
lint
dcolinmorgan Nov 27, 2023
c6417f9
lint
dcolinmorgan Nov 27, 2023
457ef7a
lint
dcolinmorgan Nov 27, 2023
69e59e7
add sklearn to core dep
dcolinmorgan Nov 27, 2023
6977d67
add sklearn to core dep
dcolinmorgan Nov 27, 2023
bba6c00
add sklearn to core dep
dcolinmorgan Nov 27, 2023
533a750
add sklearn+umap to core dep
dcolinmorgan Nov 27, 2023
20b1f16
add sklearn+umap to core dep
dcolinmorgan Nov 27, 2023
dd23f25
add sklearn+umap to core dep
dcolinmorgan Nov 27, 2023
3b59258
add scipy, dc to core dep
dcolinmorgan Nov 27, 2023
5e63074
add scipy, dc to core dep
dcolinmorgan Nov 27, 2023
6db86a3
revert to working
dcolinmorgan Nov 27, 2023
42f6a75
Merge branch 'dev/dep_man' of https://github.com/graphistry/pygraphis…
dcolinmorgan Nov 27, 2023
aadc84b
clsoe
dcolinmorgan Nov 27, 2023
edbdf37
remove has_
dcolinmorgan Nov 27, 2023
0ec47bb
np.all to allclose
dcolinmorgan Nov 27, 2023
139f7f9
lint
dcolinmorgan Nov 27, 2023
3223a27
revert allclose
dcolinmorgan Nov 27, 2023
c47df98
drop assert
dcolinmorgan Nov 27, 2023
26cd5e9
drop assert
dcolinmorgan Nov 27, 2023
e47fa35
drop assert
dcolinmorgan Nov 27, 2023
d8f9e6d
lint
dcolinmorgan Nov 27, 2023
1b9f32e
Merge branch 'master' into dev/depman_gpufeat
dcolinmorgan Nov 28, 2023
2751aa9
update merge gpu_feat+dep_man
dcolinmorgan Nov 28, 2023
9896f82
lint
dcolinmorgan Nov 28, 2023
64153ab
lint
dcolinmorgan Nov 28, 2023
d86ef4e
lint
dcolinmorgan Nov 28, 2023
c370598
lint
dcolinmorgan Nov 28, 2023
a3ea5d0
add cu_cat to ai extra deps
dcolinmorgan Nov 29, 2023
30ca9ee
update cu_cat version with dep_man
dcolinmorgan Nov 30, 2023
d0997b4
if cudf add to test
dcolinmorgan Dec 1, 2023
836a9f4
use cc cpu not dc
dcolinmorgan Dec 1, 2023
6646b73
use cc cpu not dc
dcolinmorgan Dec 1, 2023
cf74443
use cc cpu not dc
dcolinmorgan Dec 1, 2023
da72b63
lint cc not dc
dcolinmorgan Dec 1, 2023
b062d59
lint cc not dc
dcolinmorgan Dec 1, 2023
e0e401e
lint cc not dc
dcolinmorgan Dec 1, 2023
2a0a9af
better setup install for cucat
dcolinmorgan Dec 1, 2023
05a1329
better setup install for cucat
dcolinmorgan Dec 1, 2023
760687e
better setup install for cucat
dcolinmorgan Dec 1, 2023
ad2c703
lint
dcolinmorgan Dec 1, 2023
454331a
lint
dcolinmorgan Dec 1, 2023
6e1cd20
test dataset
dcolinmorgan Dec 1, 2023
8289d51
test dataset
dcolinmorgan Dec 1, 2023
1573e1c
test dataset
dcolinmorgan Dec 1, 2023
14edf7b
test dataset
dcolinmorgan Dec 1, 2023
565e9ac
lint
dcolinmorgan Dec 1, 2023
c97c204
assert swap
dcolinmorgan Dec 1, 2023
4eb824f
assert swap
dcolinmorgan Dec 1, 2023
070c576
assert swap
dcolinmorgan Dec 1, 2023
1c73235
assert swap
dcolinmorgan Dec 1, 2023
d53a306
update tests with depman
dcolinmorgan Dec 4, 2023
1904df5
respond to most comments
dcolinmorgan Dec 4, 2023
a9d3d9e
respond to most comments
dcolinmorgan Dec 4, 2023
0dd4ed6
respond to most comments
dcolinmorgan Dec 4, 2023
6007eb7
respond to tqdm, <2 column comments
dcolinmorgan Dec 5, 2023
6d0cb1c
respond to tqdm, <2 column comments
dcolinmorgan Dec 5, 2023
86378eb
respond to tqdm, <2 column comments
dcolinmorgan Dec 5, 2023
5b36dd0
respond to tqdm
dcolinmorgan Dec 5, 2023
90ca97a
Merge branch 'master' into dev/dep_man
dcolinmorgan Dec 5, 2023
08de406
tqdm set_descr error
dcolinmorgan Dec 5, 2023
b236337
tqdm set_descr error
dcolinmorgan Dec 5, 2023
85e1e24
tqdm not trange has "set_description"
dcolinmorgan Dec 5, 2023
c86cb53
tqdm not trange has "set_description"
dcolinmorgan Dec 5, 2023
5d5146f
tqdm not trange has "set_description"
dcolinmorgan Dec 5, 2023
8640971
tqdm.tqdm
dcolinmorgan Dec 5, 2023
58d9810
tqdm.tqdm
dcolinmorgan Dec 5, 2023
d02d480
fallback to lazy import
dcolinmorgan Dec 5, 2023
a39928c
fallback to lazy import
dcolinmorgan Dec 5, 2023
cedd9ad
half lazy import
dcolinmorgan Dec 5, 2023
dcfdd9c
smart import
dcolinmorgan Dec 5, 2023
cc8c4d2
smart import
dcolinmorgan Dec 5, 2023
79045df
smart import
dcolinmorgan Dec 5, 2023
7bb1cc9
merge dep_man/master
dcolinmorgan Dec 5, 2023
0e4b19d
lint
dcolinmorgan Dec 5, 2023
f7e97df
asser cucat logic
dcolinmorgan Dec 7, 2023
0372b7c
asser cucat logic
dcolinmorgan Dec 7, 2023
3e7f0e0
base install cucat (move to [ai])
dcolinmorgan Dec 7, 2023
3eff36e
install cucat to extra-heavy
dcolinmorgan Dec 7, 2023
fb9d37c
wow typo cu-cat
dcolinmorgan Dec 7, 2023
0ac9516
cu_cat dep/vers install
dcolinmorgan Dec 8, 2023
2326237
cu_cat dep/vers install
dcolinmorgan Dec 8, 2023
56a0e73
cu_cat full replace dc
dcolinmorgan Dec 8, 2023
ca2e7bf
assert cucat fallback
dcolinmorgan Dec 8, 2023
5fb1f28
better warning url
dcolinmorgan Dec 8, 2023
8a6008a
better warning url
dcolinmorgan Dec 8, 2023
9a364a7
all safe dfs
dcolinmorgan Dec 18, 2023
1ad8e96
all safe dfs
dcolinmorgan Dec 18, 2023
0f14d99
Merge branch 'master' into dev/depman_gpufeat
dcolinmorgan Dec 18, 2023
17beba0
edge concat interop + dc + cudf interop pd
dcolinmorgan Dec 22, 2023
5ec85fd
Revert "edge concat interop + dc + cudf interop pd"
dcolinmorgan Jan 3, 2024
1386f0b
+assert error +dc default
dcolinmorgan Jan 3, 2024
8bf48e5
+assert error +dc default
dcolinmorgan Jan 3, 2024
69b5f3f
dc_only_feature_test
dcolinmorgan Jan 3, 2024
3bc04fa
cupyx csr toarray for features_out
dcolinmorgan Jan 4, 2024
1544927
cupyx csr toarray for features_out
dcolinmorgan Jan 4, 2024
495c031
cupyx csr toarray for features_out
dcolinmorgan Jan 4, 2024
8a41d10
add gpu-umap test, allow cucat to test w/o gpu
dcolinmorgan Jan 4, 2024
26b4f94
add gpu-umap test, allow cucat to test w/o gpu
dcolinmorgan Jan 4, 2024
707b404
dirty_cat version with Table&SuperVectorizer
dcolinmorgan Jan 4, 2024
93c4021
better dimension try
dcolinmorgan Jan 5, 2024
bef055e
soln for gmem lim
dcolinmorgan Jan 12, 2024
bb4e67a
soln for gmem lim
dcolinmorgan Jan 12, 2024
8241a1a
soln for gmem lim
dcolinmorgan Jan 15, 2024
c8421ef
remove gpu-cucat test
dcolinmorgan Jan 19, 2024
5a65b51
req sklearn==1.3.2 for now
dcolinmorgan Jan 19, 2024
569d09f
more cudf acrobatics, deal with duplicate colnames
dcolinmorgan Jan 19, 2024
1fb98c0
more cudf acrobatics, deal with duplicate colnames
dcolinmorgan Jan 19, 2024
e62c8ab
tweaks for gpufeat, still issues with coo matrix scaling
dcolinmorgan Jan 22, 2024
773ba7d
Update feature_utils.py
dcolinmorgan Jan 22, 2024
7901010
tweaks for scaling after featurization
dcolinmorgan Jan 22, 2024
f857d2f
better interop with cu_cat
dcolinmorgan Jan 23, 2024
ba28dd0
Update test_feature_utils.py
dcolinmorgan Jan 23, 2024
00b1e88
better interop with cu-cat
dcolinmorgan Jan 23, 2024
9250f44
better interop with cu-cat
dcolinmorgan Jan 23, 2024
916bf4c
pyg+cucat tests passing
dcolinmorgan Jan 24, 2024
f4b8ed8
pyg cucat+umap tests closer
dcolinmorgan Jan 24, 2024
ee181c2
rollback for feat pytest, constants working
dcolinmorgan Jan 24, 2024
67e4732
lint
dcolinmorgan Jan 24, 2024
0504e2b
feats tests pass, many umap
dcolinmorgan Jan 25, 2024
ee08701
more test tweaks to handle cupy/cudf comparisons
dcolinmorgan Jan 25, 2024
974f800
more test tweaks to handle cupy/cudf comparisons
dcolinmorgan Jan 26, 2024
5fe7b87
more tweaks
dcolinmorgan Jan 26, 2024
8793913
safe gpu umap tweaks
dcolinmorgan Jan 29, 2024
c40ad22
safe gpu umap tweaks
dcolinmorgan Jan 29, 2024
31e2a41
closer to umap full pass
dcolinmorgan Jan 30, 2024
b00ab9b
more cudf df tries
dcolinmorgan Jan 30, 2024
462ae91
full umap pass
dcolinmorgan Jan 30, 2024
aede506
full umap pass
dcolinmorgan Jan 30, 2024
5ba3a83
lint
dcolinmorgan Jan 30, 2024
ba4a398
lint
dcolinmorgan Jan 30, 2024
19d7f46
lint
dcolinmorgan Jan 30, 2024
b90bb8b
!=0 > empty, safe cupy umap
dcolinmorgan Feb 2, 2024
82d537e
type error tweak
dcolinmorgan Feb 5, 2024
58d463b
type error tweak
dcolinmorgan Feb 5, 2024
1546db1
type error tweak
dcolinmorgan Feb 5, 2024
0b7dc9f
lint
dcolinmorgan Feb 5, 2024
7f72d09
general deduplicates handle ndf_reddit
dcolinmorgan Feb 5, 2024
f87982a
hardcode ndf_reddit duplicate squashing
dcolinmorgan Feb 5, 2024
ce3f089
tweaks to appease cudf
dcolinmorgan Feb 5, 2024
26f1621
lint
dcolinmorgan Feb 5, 2024
0c046bb
lint
dcolinmorgan Feb 5, 2024
ca7ab4a
lint
dcolinmorgan Feb 5, 2024
52a1216
lint
dcolinmorgan Feb 5, 2024
2d49231
lint
dcolinmorgan Feb 5, 2024
f871e10
remove test umap copy()
dcolinmorgan Feb 5, 2024
cc90767
for umapai test pass
dcolinmorgan Feb 5, 2024
5c66802
ai patch for n_comp>2
dcolinmorgan Feb 6, 2024
9103a7c
parameterize feature_engine tests
dcolinmorgan Feb 7, 2024
63ad9ae
lint
dcolinmorgan Feb 7, 2024
b8c28aa
handle feat_eng via test params
dcolinmorgan Feb 7, 2024
85f1a70
lint
dcolinmorgan Feb 7, 2024
e7b8137
small cc v dc tweaks
dcolinmorgan Feb 8, 2024
a771fd6
missing parameterized tests
dcolinmorgan Feb 8, 2024
1d4723d
Merge remote-tracking branch 'origin/master' into dev/depman_gpufeat
dcolinmorgan Feb 8, 2024
9db0dd4
update cu-cat version
dcolinmorgan Feb 8, 2024
68fd472
test tweaks
dcolinmorgan Feb 9, 2024
d10655a
test tweaks
dcolinmorgan Feb 9, 2024
5180b09
remove auto
dcolinmorgan Feb 9, 2024
d1fe703
lint
dcolinmorgan Feb 14, 2024
e9a0a68
better cudf passif, test hack
dcolinmorgan Feb 14, 2024
c7a7676
towards better feat-eng concrete
dcolinmorgan Feb 15, 2024
9f095e1
lint
dcolinmorgan Feb 16, 2024
8a67f27
concreting
dcolinmorgan Feb 16, 2024
23a2f91
concreted again
dcolinmorgan Feb 16, 2024
283086c
lint
dcolinmorgan Feb 16, 2024
b78fc6a
test lint
dcolinmorgan Feb 16, 2024
c7c715e
auto engine back
dcolinmorgan Feb 16, 2024
03f0fc3
umap test engine inject
dcolinmorgan Feb 19, 2024
50562b7
umap test engine inject
dcolinmorgan Feb 19, 2024
75ac2b0
umap test engine inject
dcolinmorgan Feb 19, 2024
1f13df5
feat(gfql): export alias e
lmeyerov Feb 23, 2024
0a8efbf
wip(telemetry)
lmeyerov Feb 23, 2024
af5c1bc
test(chain): add failing gfql tests
lmeyerov Feb 23, 2024
30d64b2
fix(hop): debugging_hop=False in prod
lmeyerov Feb 24, 2024
167513d
fix(hop): debugging_hop=False in prod
lmeyerov Feb 24, 2024
66948c0
fix(hop): debugging_hop=False in prod
lmeyerov Feb 24, 2024
8ff98fa
fix(GFQL): some shorest path queries
lmeyerov Feb 24, 2024
5f02c49
garden(gfql): more logs
lmeyerov Feb 24, 2024
95836a1
fix(ci): work around ai fails via test env pinning
lmeyerov Feb 24, 2024
542416f
fix(deps): more dirty cat and umap env handling
lmeyerov Feb 24, 2024
1cb9020
fix(lint)
lmeyerov Feb 24, 2024
ff38bcc
fix(types)
lmeyerov Feb 24, 2024
4d493a9
fix(dirty_cat): missing import
lmeyerov Feb 24, 2024
4272d23
fix(lint)
lmeyerov Feb 24, 2024
795e6d1
docs(changelog); version
lmeyerov Feb 25, 2024
d2728e8
docs(publish): correct flow
lmeyerov Feb 25, 2024
6cb44ab
docs(0.33.2): bump for readthedocs resync
lmeyerov Feb 25, 2024
e41ab49
Merge branch 'master' into dev/depman_gpufeat
dcolinmorgan Feb 27, 2024
ff94946
update feature engine check
dcolinmorgan Feb 27, 2024
9c73438
type ignore
dcolinmorgan Feb 27, 2024
581df16
depman>lazy
dcolinmorgan Feb 27, 2024
993b9fe
edge determine engine logic fix
tanmoyio Feb 27, 2024
a1b61ac
euclidean is default, so comment out
dcolinmorgan Feb 27, 2024
b57978e
euclidean is default, so comment out
dcolinmorgan Feb 27, 2024
2d8ca8b
remove dup
dcolinmorgan Feb 27, 2024
e620baf
remove dup
dcolinmorgan Feb 27, 2024
4e7c9b5
euclidean for dirty_cat only since default for cucat
dcolinmorgan Feb 27, 2024
dcff47c
docker test fix
dcolinmorgan Feb 27, 2024
83f55f9
docker test fix
dcolinmorgan Feb 27, 2024
b193706
tweaks
dcolinmorgan Mar 1, 2024
8e2999a
more param to umap tests, last test= cuml V umap
dcolinmorgan Mar 1, 2024
12914c4
lint
dcolinmorgan Mar 1, 2024
db380ef
lint
dcolinmorgan Mar 1, 2024
3ebea98
lint
dcolinmorgan Mar 1, 2024
6faaa68
lint
dcolinmorgan Mar 1, 2024
bb4b994
lint
dcolinmorgan Mar 1, 2024
cb2b08a
more param line reduction
dcolinmorgan Mar 1, 2024
d3684f5
lint
dcolinmorgan Mar 1, 2024
f0b23f4
Merge branch 'master' into dev/depman_gpufeat
dcolinmorgan May 23, 2024
6d7df64
devman update for test
dcolinmorgan May 23, 2024
3462b97
replace try with specific ifs
dcolinmorgan Jul 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ jobs:
source pygraphistry/bin/activate
./bin/test-umap-learn-core.sh


test-full-ai:

needs: [ test-minimal-python ]
Expand Down
1 change: 0 additions & 1 deletion docker/test-gpu-local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,4 @@ docker run \
${NETWORK} \
graphistry/test-gpu:${TEST_CPU_VERSION} \
--maxfail=1 \
--ignore=graphistry/tests/test_feature_utils.py \
$@
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@
('py:class', 'umap'),
('py:class', 'sentence_transformers'),
('py:class', 'dirty_cat'),
('py:class', 'cu_cat'),
('py:class', 'sklearn'),
('py:class', 'scipy'),
('py:class', 'seaborn'),
Expand Down
16 changes: 13 additions & 3 deletions graphistry/ai_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
import numpy as np

from inspect import getmodule
import graphistry

from .constants import DISTANCE, WEIGHT, BATCH
Expand Down Expand Up @@ -422,7 +422,10 @@ def infer_self_graph(res,
assert (
emb.shape[0] == df.shape[0]
), "minibatches emb and X must have same number of rows since h(df) = emb"
df = df.assign(x=emb.x, y=emb.y) # add x and y to df for graphistry instance
if emb.x is not None:
df = df.assign(x=emb.x, y=emb.y) # add x and y to df for graphistry instance
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of try/catch, can we do value inspection?

else:
df = df.assign(x=emb[0], y=emb[1]) # if umap kwargs n_components > 2, take first 2 here
else: # if umap has been fit, but only transforming over features, need to add x and y or breaks plot binds of res
df['x'] = np.random.random(df.shape[0])
df['y'] = np.random.random(df.shape[0])
Expand All @@ -447,7 +450,14 @@ def infer_self_graph(res,

for i in range(X_new.shape[0]):
diff = X_previously_fit - X_new.iloc[i, :]
dist = np.linalg.norm(diff, axis=1) # Euclidean distance
try:
diff = np.array(diff, dtype = 'float')
except TypeError:
pass
if 'pandas' in str(getmodule(diff)):
dist = np.linalg.norm(diff, axis=1) # Euclidean distance
else:
dist = np.linalg.norm(diff.to_pandas(), axis=1) # Euclidean distance
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same, instead of try/catch, can we do value inspection?

mdists.append(dist)

m, std = np.mean(mdists), np.std(mdists)
Expand Down
4 changes: 1 addition & 3 deletions graphistry/compute/ComputeMixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,7 @@ def materialize_nodes(
import cudf
if isinstance(g._edges, cudf.DataFrame):
engine_concrete = Engine.CUDF
except ImportError:
pass
if engine == EngineAbstract.AUTO:
except:
raise ValueError('Could not determine engine for edges, expected pandas or cudf dataframe, got: {}'.format(type(g._edges)))
else:
engine_concrete = Engine(engine.value)
Expand Down
1 change: 1 addition & 0 deletions graphistry/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
# for preprocessors namespace
# for dirty_cat params
DIRTY_CAT = "dirty_cat"
CUDA_CAT = "cu_cat"
N_TOPICS_DEFAULT = 42
N_TOPICS_TARGET_DEFAULT = 7
N_HASHERS_DEFAULT = 100
Expand Down
30 changes: 30 additions & 0 deletions graphistry/dep_manager.py
Copy link
Contributor Author

@dcolinmorgan dcolinmorgan Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lmeyerov I will back this out everywhere (not too pervasive in overhaul yet, 10 files including tests), but it's pretty simple, works well, and gets rid of instances like in embed_utils where you end up using the lazy imports almost comically throughout, so I can put this into a separate PR, only to be reunited one day far off

_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
_, _, _, dgl, _, _, _, _ = lazy_embed_import_dep()

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import importlib
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from importlib import __import__, import_module ?


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this class benefits from a comment on its design and the problem it solves

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ex: maybe it does a fast & cached check of a package being in the path before actually importing, so there should be a testable and observable speedup?

class DepManager:
    """Lazy, cached importer for optional dependencies.

    Attribute access (e.g. ``deps.torch``) attempts to import the package
    of that name, caching the module on success so later lookups are cheap.
    A missing package yields ``None`` instead of raising, letting callers
    feature-detect optional dependencies with ``if deps.torch: ...``.
    """

    def __init__(self):
        # package name -> imported module; absent key means the import
        # has not succeeded (yet)
        self.pkgs = {}

    def __getattr__(self, pkg: str):
        # Python only calls __getattr__ for names not found normally, so
        # once _add_deps has setattr'd a module, this path is bypassed.
        self._add_deps(pkg)
        # None signals "not installed" to callers (deliberately no raise)
        return self.pkgs.get(pkg)

    def _add_deps(self, pkg: str):
        # Import and cache `pkg`; swallow import-time failures so a
        # missing optional dependency reads as None rather than crashing.
        # Exception (not bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            pkg_val = importlib.import_module(pkg)
            self.pkgs[pkg] = pkg_val
            setattr(self, pkg, pkg_val)
        except Exception:
            pass

    def import_from(self, pkg: str, name: str):
        """Equivalent of ``from pkg import name``, cached under ``name``.

        Uses importlib.import_module (recommended over ``__import__``)
        and stores the named member itself — not the containing module —
        so ``deps.pkgs[name]`` is directly usable.
        """
        try:
            module = importlib.import_module(pkg)
            self.pkgs[name] = getattr(module, name)
        except Exception:
            pass


# Shared singleton: import once, reuse everywhere.
deps = DepManager()
48 changes: 24 additions & 24 deletions graphistry/dgl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
)

from .util import setup_logger

from .dep_manager import deps

if TYPE_CHECKING:
import scipy
Expand All @@ -34,30 +34,29 @@
MIXIN_BASE = object


def lazy_dgl_import_has_dependency():
try:
import warnings
warnings.filterwarnings('ignore')
import dgl # noqa: F811
return True, 'ok', dgl
except ModuleNotFoundError as e:
return False, e, None
# def lazy_dgl_import_has_dependency():
# try:
# import warnings
# warnings.filterwarnings('ignore')
# import dgl # noqa: F811
# return True, 'ok', dgl
# except ModuleNotFoundError as e:
# return False, e, None


def lazy_torch_import_has_dependency():
try:
import warnings
warnings.filterwarnings('ignore')
import torch # noqa: F811
return True, 'ok', torch
except ModuleNotFoundError as e:
return False, e, None
# def lazy_torch_import_has_dependency():
# try:
# import warnings
# warnings.filterwarnings('ignore')
# import torch # noqa: F811
# return True, 'ok', torch
# except ModuleNotFoundError as e:
# return False, e, None


logger = setup_logger(name=__name__)



# #########################################################################################
#
# Torch helpers
Expand All @@ -73,7 +72,7 @@ def convert_to_torch(X_enc: pd.DataFrame, y_enc: Optional[pd.DataFrame]): # typ
:param y_enc: DataFrame Matrix of Values for Target
:return: Dictionary of torch encoded arrays
"""
_, _, torch = lazy_torch_import_has_dependency() # noqa: F811
torch = deps.torch # noqa: F811

if not y_enc.empty: # type: ignore
data = {
Expand All @@ -98,7 +97,7 @@ def get_available_devices():
device (torch.device): Main device (GPU 0 or CPU).
gpu_ids (list): List of IDs of all GPUs that are available.
"""
_, _, torch = lazy_torch_import_has_dependency() # noqa: F811
torch = deps.torch # noqa: F811

gpu_ids = []
if torch.cuda.is_available():
Expand Down Expand Up @@ -181,7 +180,8 @@ def pandas_to_dgl_graph(
sp_mat: sparse scipy matrix
ordered_nodes_dict: dict ordered from most common src and dst nodes
"""
_, _, dgl = lazy_dgl_import_has_dependency() # noqa: F811
dgl = deps.dgl # noqa: F811

sp_mat, ordered_nodes_dict = pandas_to_sparse_adjacency(df, src, dst, weight_col)
g = dgl.from_scipy(sp_mat, device=device) # there are other ways too
logger.info(f"Graph Type: {type(g)}")
Expand All @@ -196,7 +196,7 @@ def get_torch_train_test_mask(n: int, ratio: float = 0.8):
:param ratio: mimics train/test split. `ratio` sets number of True vs False mask entries.
:return: train and test torch tensor masks
"""
_, _, torch = lazy_torch_import_has_dependency() # noqa: F811
torch = deps.torch # noqa: F811

train_mask = torch.zeros(n, dtype=torch.bool).bernoulli(ratio)
test_mask = ~train_mask
Expand Down Expand Up @@ -225,8 +225,8 @@ def dgl_lazy_init(self, train_split: float = 0.8, device: str = "cpu"):
"""

if not self.dgl_initialized:
lazy_dgl_import_has_dependency()
lazy_torch_import_has_dependency()
deps.dgl
deps.torch
self.train_split = train_split
self.device = device
self._removed_edges_previously = False
Expand Down
70 changes: 30 additions & 40 deletions graphistry/embed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,22 @@
import numpy as np
import pandas as pd
from typing import Optional, Union, Callable, List, TYPE_CHECKING, Any, Tuple

from inspect import getmodule
from .PlotterBase import Plottable
from .compute.ComputeMixin import ComputeMixin
from .dep_manager import deps


def lazy_embed_import_dep():
try:
import torch
import torch.nn as nn
import dgl
from dgl.dataloading import GraphDataLoader
import torch.nn.functional as F
from .networks import HeteroEmbed
from tqdm import trange
return True, torch, nn, dgl, GraphDataLoader, HeteroEmbed, F, trange

except:
return False, None, None, None, None, None, None, None

def check_cudf():
try:
import cudf
return True, cudf
except:
return False, object


if TYPE_CHECKING:
_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
torch = deps.torch
TT = torch.Tensor
MIXIN_BASE = ComputeMixin
else:
TT = Any
MIXIN_BASE = object
torch = Any

has_cudf, cudf = check_cudf()
cudf = deps.cudf

XSymbolic = Optional[Union[List[str], str, pd.DataFrame]]
ProtoSymbolic = Optional[Union[str, Callable[[TT, TT, TT], TT]]] # type: ignore
Expand Down Expand Up @@ -99,8 +78,7 @@ def __init__(self):
self._device = "cpu"

def _preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -> Plottable:
#_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
import torch
torch = deps.torch
log('Preprocessing embedding data')
src, dst = res._source, res._destination
relation = res._relation
Expand Down Expand Up @@ -147,7 +125,7 @@ def _preprocess_embedding_data(self, res, train_split:Union[float, int] = 0.8) -
return res

def _build_graph(self, res) -> Plottable:
_, _, _, dgl, _, _, _, _ = lazy_embed_import_dep()
dgl = deps.dgl
s, r, t = res._triplets.T

if res._train_idx is not None:
Expand All @@ -169,7 +147,10 @@ def _build_graph(self, res) -> Plottable:


def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, device):
_, _, _, _, GraphDataLoader, HeteroEmbed, _, _ = lazy_embed_import_dep()
dgl_ = deps.dgl
if dgl_:
from dgl.dataloading import GraphDataLoader
from .networks import HeteroEmbed
g_iter = SubgraphIterator(res._kg_dgl, sample_size, num_steps)
g_dataloader = GraphDataLoader(
g_iter, batch_size=batch_size, collate_fn=lambda x: x[0]
Expand All @@ -186,9 +167,11 @@ def _init_model(self, res, batch_size:int, sample_size:int, num_steps:int, devic
)

return model, g_dataloader

def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_size:int, num_steps:int, device) -> Plottable:
_, torch, nn, _, _, _, _, trange = lazy_embed_import_dep()
torch = deps.torch
nn = deps.torch.nn
trange = deps.tqdm.trange
log('Training embedding')
model, g_dataloader = res._init_model(res, batch_size, sample_size, num_steps, device)
if hasattr(res, "_embed_model") and not res._build_new_embedding_model:
Expand Down Expand Up @@ -232,7 +215,7 @@ def _train_embedding(self, res, epochs:int, batch_size:int, lr:float, sample_siz

@property
def _gcn_node_embeddings(self):
_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
torch = deps.torch
g_dgl = self._kg_dgl.to(self._device)
em = self._embed_model(g_dgl).detach()
torch.cuda.empty_cache()
Expand Down Expand Up @@ -301,12 +284,12 @@ def embed(
"""
# this is temporary, will be fixed in future releases
try:
if isinstance(self._nodes, cudf.DataFrame):
if 'cudf' in str(getmodule(self._nodes)):
self._nodes = self._nodes.to_pandas()
except:
pass
try:
if isinstance(self._edges, cudf.DataFrame):
if 'cudf' in str(getmodule(self._edges)):
self._edges = self._edges.to_pandas()
except:
pass
Expand Down Expand Up @@ -436,7 +419,7 @@ def predict_links(
else:
# this is temporary, will be removed after gpu feature utils
try:
if isinstance(source, cudf.DataFrame):
if 'cudf' in str(getmodule(source)):
source = source.to_pandas() # type: ignore
except:
pass
Expand All @@ -448,7 +431,7 @@ def predict_links(
else:
# this is temporary, will be removed after gpu feature utils
try:
if isinstance(relation, cudf.DataFrame):
if 'cudf' in str(getmodule(relation)):
relation = relation.to_pandas() # type: ignore
except:
pass
Expand All @@ -460,7 +443,8 @@ def predict_links(
else:
# this is temporary, will be removed after gpu feature utils
try:
if isinstance(destination, cudf.DataFrame):
# if isinstance(destination, cudf.DataFrame):
if 'cudf' in str(getmodule(destination)):
destination = destination.to_pandas() # type: ignore
except:
pass
Expand Down Expand Up @@ -540,7 +524,7 @@ def fetch_triplets_for_inference(x_r):


def _score(self, triplets: Union[np.ndarray, TT]) -> TT: # type: ignore
_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
torch = deps.torch
emb = self._kg_embeddings.clone().detach()
if not isinstance(triplets, torch.Tensor):
triplets = torch.tensor(triplets)
Expand Down Expand Up @@ -571,7 +555,13 @@ def __len__(self) -> int:
return self.num_steps

def __getitem__(self, i:int):
_, torch, nn, dgl, GraphDataLoader, _, F, _ = lazy_embed_import_dep()
torch = deps.torch
from torch import nn
from torch.nn import functional as F
dgl = deps.dgl

from dgl.dataloading import GraphDataLoader

eids = torch.from_numpy(np.random.choice(self.eids, self.sample_size))

src, dst = self.g.find_edges(eids)
Expand All @@ -593,7 +583,7 @@ def __getitem__(self, i:int):

@staticmethod
def _sample_neg(triplets:np.ndarray, num_nodes:int) -> Tuple[TT, TT]: # type: ignore
_, torch, _, _, _, _, _, _ = lazy_embed_import_dep()
torch = deps.torch
triplets = torch.tensor(triplets)
h, r, t = triplets.T
h_o_t = torch.randint(high=2, size=h.size())
Expand Down
Loading
Loading