
[REVIEW]Fix out of index errors encountered with sampling on out of index samples #2825

Merged
merged 22 commits on Oct 20, 2022

Conversation

VibhuJawa
Member

@VibhuJawa VibhuJawa commented Oct 19, 2022

This PR does the following:

@VibhuJawa VibhuJawa added this to the 22.12 milestone Oct 19, 2022
@VibhuJawa VibhuJawa self-assigned this Oct 19, 2022
@VibhuJawa VibhuJawa added non-breaking Non-breaking change bug Something isn't working labels Oct 19, 2022
Comment on lines +21 to +25
class CuFeatureStorage:
    """
    Storage for node/edge feature data.
    """

Member Author

@VibhuJawa VibhuJawa Oct 19, 2022


Removed from the graph_store file; see below:

class CuFeatureStorage:
    """Storage for node/edge feature data.

    Either subclassing this class or implementing the same set of interfaces
    is fine. DGL simply uses duck-typing to implement its sampling pipeline.
    """

    def __init__(
        self, pg, columns, storage_type, backend_lib="torch", indices_offset=0
    ):
        self.pg = pg
        self.columns = columns
        if backend_lib == "torch":
            from torch.utils.dlpack import from_dlpack
        elif backend_lib == "tf":
            from tensorflow.experimental.dlpack import from_dlpack
        elif backend_lib == "cupy":
            from cupy import from_dlpack
        else:
            raise NotImplementedError(
                "Only pytorch, tensorflow and cupy backends are currently supported"
            )
        if storage_type not in ["edge", "node"]:
            raise NotImplementedError("Only edge and node storage is supported")
        self.storage_type = storage_type
        self.from_dlpack = from_dlpack
        self.indices_offset = indices_offset

    def fetch(self, indices, device=None, pin_memory=False, **kwargs):
        """Fetch the features of the given node/edge IDs to the
        given device.

        Parameters
        ----------
        indices : Tensor
            Node or edge IDs.
        device : Device
            Device context.
        pin_memory :

        Returns
        -------
        Tensor
            Feature data stored in PyTorch Tensor.
        """
        # Default implementation uses synchronous fetch.
        indices = cp.asarray(indices)
        if isinstance(self.pg, MGPropertyGraph):
            # dask_cudf loc breaks if we provide cudf series/cupy array
            # https://github.com/rapidsai/cudf/issues/11877
            indices = indices.get()
        else:
            indices = cudf.Series(indices)

        indices = indices + self.indices_offset

        if self.storage_type == "node":
            subset_df = self.pg.get_vertex_data(
                vertex_ids=indices, columns=self.columns
            )
        else:
            subset_df = self.pg.get_edge_data(edge_ids=indices, columns=self.columns)

        subset_df = subset_df[self.columns]
        if isinstance(subset_df, dask_cudf.DataFrame):
            subset_df = subset_df.compute()

        if len(subset_df) == 0:
            raise ValueError(f"{indices=} not found in FeatureStorage")

        cap = subset_df.to_dlpack()
        tensor = self.from_dlpack(cap)
        del cap
        if device:
            if not isinstance(tensor, cp.ndarray):
                # Can't transfer to a different device for cupy tensors
                tensor = tensor.to(device)
        return tensor
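As a minimal illustration of the fetch pattern above (offset the incoming indices, look up the rows, and fail loudly on an empty result), here is a pure-Python sketch; `SimpleFeatureStorage` is a hypothetical stand-in for illustration only, not part of this PR or cuGraph:

```python
# Hypothetical stand-in for CuFeatureStorage: a dict plays the role of the
# PropertyGraph and plain lists play the role of device tensors.
class SimpleFeatureStorage:
    def __init__(self, table, columns, indices_offset=0):
        self.table = table              # {row_id: {column: value}}
        self.columns = columns
        self.indices_offset = indices_offset

    def fetch(self, indices):
        # Shift the caller's ids by the offset, as fetch() does above.
        indices = [i + self.indices_offset for i in indices]
        rows = [self.table[i] for i in indices if i in self.table]
        # Mirror the empty-lookup ValueError in CuFeatureStorage.fetch.
        if not rows:
            raise ValueError(f"{indices=} not found in FeatureStorage")
        return [[row[c] for c in self.columns] for row in rows]

store = SimpleFeatureStorage(
    {10: {"x": 1.0}, 11: {"x": 2.0}}, ["x"], indices_offset=10
)
print(store.fetch([0, 1]))  # [[1.0], [2.0]]
```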

Member Author

@VibhuJawa VibhuJawa left a comment


Added comments to help reviewers see where each piece of code is coming from.

Comment on lines +17 to +22
def _update_feature_map(
    pg_feature_map, feat_name_obj, contains_vector_features, columns
):
    """
    Update the existing feature map `pg_feature_map` based on `feat_name_obj`
    """
Member Author


Just moved from the graph_store file

def _update_feature_map(
    pg_feature_map, feat_name_obj, contains_vector_features, columns
):
    if contains_vector_features:
        if feat_name_obj is None:
            raise ValueError(
                "feature name must be provided when wrapping"
                + " multiple columns under a single feature name"
                + " or a feature map"
            )
        if isinstance(feat_name_obj, str):
            pg_feature_map[feat_name_obj] = columns
        elif isinstance(feat_name_obj, dict):
            covered_columns = []
            for col in feat_name_obj.keys():
                current_cols = feat_name_obj[col]
                # Handle strings too
                if isinstance(current_cols, str):
                    current_cols = [current_cols]
                covered_columns = covered_columns + current_cols

            if set(covered_columns) != set(columns):
                raise ValueError(
                    f"All the columns {columns} not covered in {covered_columns} "
                    f"Please check the feature_map {feat_name_obj} provided"
                )

            for key, cols in feat_name_obj.items():
                if isinstance(cols, str):
                    cols = [cols]
                pg_feature_map[key] = cols
        else:
            raise ValueError(f"{feat_name_obj} should be str or dict")
    else:
        if feat_name_obj:
            raise ValueError(
                f"feat_name {feat_name_obj} is only valid when "
                "wrapping multiple columns under feature names"
            )
        for col in columns:
            pg_feature_map[col] = [col]
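Since `_update_feature_map` needs no RAPIDS dependencies, its three branches are easy to exercise standalone. The sketch below copies the helper and shows the resulting feature maps for each branch:

```python
def _update_feature_map(
    pg_feature_map, feat_name_obj, contains_vector_features, columns
):
    """Update `pg_feature_map` in place based on `feat_name_obj`."""
    if contains_vector_features:
        if feat_name_obj is None:
            raise ValueError(
                "feature name must be provided when wrapping"
                " multiple columns under a single feature name"
                " or a feature map"
            )
        if isinstance(feat_name_obj, str):
            pg_feature_map[feat_name_obj] = columns
        elif isinstance(feat_name_obj, dict):
            covered_columns = []
            for col in feat_name_obj.keys():
                current_cols = feat_name_obj[col]
                # Handle strings too
                if isinstance(current_cols, str):
                    current_cols = [current_cols]
                covered_columns = covered_columns + current_cols
            if set(covered_columns) != set(columns):
                raise ValueError(
                    f"All the columns {columns} not covered in {covered_columns} "
                    f"Please check the feature_map {feat_name_obj} provided"
                )
            for key, cols in feat_name_obj.items():
                if isinstance(cols, str):
                    cols = [cols]
                pg_feature_map[key] = cols
        else:
            raise ValueError(f"{feat_name_obj} should be str or dict")
    else:
        if feat_name_obj:
            raise ValueError(
                f"feat_name {feat_name_obj} is only valid when "
                "wrapping multiple columns under feature names"
            )
        for col in columns:
            pg_feature_map[col] = [col]

# No vector features: each column becomes its own single-column feature.
fm = {}
_update_feature_map(fm, None, False, ["age", "rank"])
print(fm)   # {'age': ['age'], 'rank': ['rank']}

# One feature name wrapping several columns.
fm2 = {}
_update_feature_map(fm2, "feat", True, ["x", "y"])
print(fm2)  # {'feat': ['x', 'y']}

# A feature map; every column must be covered or a ValueError is raised.
fm3 = {}
_update_feature_map(fm3, {"a": "x", "b": ["y", "z"]}, True, ["x", "y", "z"])
print(fm3)  # {'a': ['x'], 'b': ['y', 'z']}
```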

vid_n = PropertyGraph.vertex_col_name


def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=False):
Member Author


Just moved from this file:

def get_subgraph_from_edgelist(edge_list, is_mg, reverse_edges=False):
    if reverse_edges:
        edge_list = edge_list.rename(columns={src_n: dst_n, dst_n: src_n})

    subgraph = cugraph.MultiGraph(directed=True)
    if is_mg:
        # FIXME: Can not switch to renumber = False
        # for MNMG algos
        # Remove when https://github.com/rapidsai/cugraph/issues/2437
        # lands
        create_subgraph_f = subgraph.from_dask_cudf_edgelist
        renumber = True
    else:
        # Note: We have to keep renumber = False
        # to handle cases when the seed_nodes are not present in the subgraph
        create_subgraph_f = subgraph.from_cudf_edgelist
        renumber = False

    create_subgraph_f(
        edge_list,
        source=src_n,
        destination=dst_n,
        edge_attr=eid_n,
        renumber=renumber,
        # FIXME: renumber=False is not supported for MNMG algos
        legacy_renum_only=True,
    )
    return subgraph

Comment on lines 103 to 133
def sample_single_sg(
    sg,
    sample_f,
    start_list,
    start_list_dtype,
    start_list_range,
    fanout,
    with_replacement,
):
    if isinstance(start_list, dict):
        start_list = cudf.concat(list(start_list.values()))

    # Uniform sampling fails when the dtype
    # of the seed list is not the same as the node dtype
    start_list = start_list.astype(start_list_dtype)

    # Filter start list by range
    # https://github.com/rapidsai/cugraph/blob/branch-22.12/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
    start_list = start_list[
        (start_list >= start_list_range[0]) & (start_list <= start_list_range[1])
    ]

    sampled_df = sample_f(
        sg,
        start_list=start_list,
        fanout_vals=[fanout],
        with_replacement=with_replacement,
        # FIXME: is_edge_ids=True does not seem to do anything
        # issue: https://github.com/rapidsai/cugraph/issues/2562
    )
    return sampled_df
Member Author


Moved from graph_store:

def sample_single_sg(
    sg, sample_f, start_list, start_list_dtype, fanout, with_replacement
):
    if isinstance(start_list, dict):
        start_list = cudf.concat(list(start_list.values()))
    # Uniform sampling fails when the dtype
    # of the seed list is not the same as the node dtype
    start_list = start_list.astype(start_list_dtype)
    sampled_df = sample_f(
        sg,
        start_list=start_list,
        fanout_vals=[fanout],
        with_replacement=with_replacement,
        # FIXME: is_edge_ids=True does not seem to do anything
        # issue: https://github.com/rapidsai/cugraph/issues/2562
    )
    return sampled_df

Comment on lines +152 to +164
def create_dlpack_d(d):
    dlpack_d = {}
    for k, df in d.items():
        if len(df) == 0:
            dlpack_d[k] = (None, None, None)
        else:
            dlpack_d[k] = (
                df[src_n].to_dlpack(),
                df[dst_n].to_dlpack(),
                df[eid_n].to_dlpack(),
            )

    return dlpack_d
Member Author


Moved from cugraph_store and renamed to create_dlpack_d.

def return_dlpack_d(d):
    dlpack_d = {}
    for k, df in d.items():
        if len(df) == 0:
            dlpack_d[k] = (None, None, None)
        else:
            dlpack_d[k] = (
                df[src_n].to_dlpack(),
                df[dst_n].to_dlpack(),
                df[eid_n].to_dlpack(),
            )
    return dlpack_d

Comment on lines +144 to +149
def _convert_can_etype_s_to_tup(canonical_etype_s):
    src_type, etype, dst_type = canonical_etype_s.split(",")
    src_type = src_type[2:-1]
    dst_type = dst_type[2:-2]
    etype = etype[2:-1]
    return (src_type, etype, dst_type)
Member Author


Moved from graph_store; see below:

def _convert_can_etype_s_to_tup(canonical_etype_s):
    src_type, etype, dst_type = canonical_etype_s.split(",")
    src_type = src_type[2:-1]
    dst_type = dst_type[2:-2]
    etype = etype[2:-1]
    return (src_type, etype, dst_type)

Comment on lines +136 to +141
def _edge_types_contains_canonical_etype(can_etype, edge_types, edge_dir):
    src_type, _, dst_type = can_etype
    if edge_dir == "in":
        return dst_type in edge_types
    else:
        return src_type in edge_types
Member Author


Moved from graph_store:

def _edge_types_contains_canonical_etype(can_etype, edge_types, edge_dir):
    src_type, _, dst_type = can_etype
    if edge_dir == "in":
        return dst_type in edge_types
    else:
        return src_type in edge_types

Comment on lines +171 to +183
# FIXME: Remove after we have consistent naming
# https://github.com/rapidsai/cugraph/issues/2618
sg_columns = sg.edgelist.edgelist_df.columns
if "src" in sg_columns:
    # src for single node graph
    sg_node_dtype = sg.edgelist.edgelist_df["src"].dtype
elif src_n in sg_columns:
    # _SRC_ for multi-node graphs
    sg_node_dtype = sg.edgelist.edgelist_df[src_n].dtype
else:
    raise ValueError(f"Source column {src_n} not found in the subgraph")

return sg_node_dtype
Member Author


Moved from cugraph_store and put in a function.

sg_columns = sg.edgelist.edgelist_df.columns
if "src" in sg_columns:
    # src for single node graph
    self._sg_node_dtype = sg.edgelist.edgelist_df["src"].dtype
elif src_n in sg_columns:
    # _SRC_ for multi-node graphs
    self._sg_node_dtype = sg.edgelist.edgelist_df[src_n].dtype
else:
    raise ValueError(f"Source column {src_n} not found in the subgraph")

Comment on lines +64 to +72
def sample_multiple_sgs(
    sgs,
    sample_f,
    start_list_d,
    start_list_dtype,
    edge_dir,
    fanout,
    with_replacement,
):
Member Author


Moved from graph_store:

def sample_multiple_sgs(
    sgs,
    sample_f,
    start_list_d,
    start_list_dtype,
    edge_dir,
    fanout,
    with_replacement,
):
    start_list_types = list(start_list_d.keys())
    output_dfs = []
    for can_etype, sg in sgs.items():
        can_etype = _convert_can_etype_s_to_tup(can_etype)
        if _edge_types_contains_canonical_etype(can_etype, start_list_types, edge_dir):
            if edge_dir == "in":
                subset_type = can_etype[2]
            else:
                subset_type = can_etype[0]
            output = sample_single_sg(
                sg,
                sample_f,
                start_list_d[subset_type],
                start_list_dtype,
                fanout,
                with_replacement,
            )
            output_dfs.append(output)

    if len(output_dfs) == 0:
        empty_df = cudf.DataFrame({"sources": [], "destinations": [], "indices": []})
        return empty_df.astype(cp.int32)

    if isinstance(output_dfs[0], dask_cudf.DataFrame):
        return dask_cudf.concat(output_dfs, ignore_index=True)
    else:
        return cudf.concat(output_dfs, ignore_index=True)

Comment on lines +103 to +111
def sample_single_sg(
    sg,
    sample_f,
    start_list,
    start_list_dtype,
    start_list_range,
    fanout,
    with_replacement,
):
Member Author


Moved from:

def sample_single_sg(
    sg, sample_f, start_list, start_list_dtype, fanout, with_replacement
):
    if isinstance(start_list, dict):
        start_list = cudf.concat(list(start_list.values()))
    # Uniform sampling fails when the dtype
    # of the seed list is not the same as the node dtype
    start_list = start_list.astype(start_list_dtype)
    sampled_df = sample_f(
        sg,
        start_list=start_list,
        fanout_vals=[fanout],
        with_replacement=with_replacement,
        # FIXME: is_edge_ids=True does not seem to do anything
        # issue: https://github.com/rapidsai/cugraph/issues/2562
    )
    return sampled_df

@VibhuJawa VibhuJawa changed the title [WIP]This PR fixes out of index errors encountered when running sampling on out of index frames [WIP]Fix out of index errors encountered with sampling on out of index samples Oct 19, 2022
Comment on lines +118 to +124
# Filter start list by range
# to ensure the seeds are within the index values
# see below:
# https://github.com/rapidsai/cugraph/blob/branch-22.12/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
start_list = start_list[
    (start_list >= start_list_range[0]) & (start_list <= start_list_range[1])
]
Member Author


This is the logic change in this PR to fix #2828. Also see the test added in this PR.
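The fix drops any seed that falls outside the subgraph's vertex range before the sampler is called, so the C++ prim never sees an out-of-index id. A pure-Python sketch of the same filter (a list standing in for a cudf Series; `filter_start_list` is a hypothetical name for illustration):

```python
def filter_start_list(start_list, start_list_range):
    # Keep only seeds inside the inclusive [lo, hi] vertex range.
    lo, hi = start_list_range
    return [seed for seed in start_list if lo <= seed <= hi]

print(filter_start_list([0, 5, 42, 99, 1000], (0, 99)))  # [0, 5, 42, 99]
```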

@VibhuJawa VibhuJawa changed the title [WIP]Fix out of index errors encountered with sampling on out of index samples [REVIEW]Fix out of index errors encountered with sampling on out of index samples Oct 20, 2022
@VibhuJawa VibhuJawa marked this pull request as ready for review October 20, 2022 01:18
@VibhuJawa VibhuJawa requested a review from a team as a code owner October 20, 2022 01:18
Contributor

@rlratzel rlratzel left a comment


Just a couple of minor questions/suggestions.

VibhuJawa and others added 3 commits October 20, 2022 08:41
Co-authored-by: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
Co-authored-by: Rick Ratzel <3039903+rlratzel@users.noreply.github.com>
@rlratzel
Contributor

@gpucibot merge
