From 155759839e1b291738aacb2d2c1c4415585c6d8a Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Wed, 11 Mar 2020 22:16:17 -0500 Subject: [PATCH 1/6] datatype cast in pagerank and for viewing --- .../link_analysis/pagerank_wrapper.pyx | 11 ++-- python/cugraph/structure/graph.py | 12 ++--- python/cugraph/structure/graph_wrapper.pyx | 50 ++++++++++--------- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/python/cugraph/link_analysis/pagerank_wrapper.pyx b/python/cugraph/link_analysis/pagerank_wrapper.pyx index 87b479b329a..1091f9a545d 100644 --- a/python/cugraph/link_analysis/pagerank_wrapper.pyx +++ b/python/cugraph/link_analysis/pagerank_wrapper.pyx @@ -40,6 +40,9 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. if not input_graph.transposedadjlist: input_graph.view_transposed_adj_list() + [offsets, indices] = graph_wrapper.datatype_cast([input_graph.transposedadjlist.offsets, input_graph.transposedadjlist.indices], [np.int32]) + [weights] = graph_wrapper.datatype_cast([input_graph.transposedadjlist.weights], [np.float32, np.float64]) + num_verts = input_graph.number_of_vertices() num_edges = input_graph.number_of_edges() @@ -68,12 +71,12 @@ def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100, tol=1. cdef uintptr_t c_pers_val = NULL cdef sz = 0 - cdef uintptr_t c_offsets = input_graph.transposedadjlist.offsets.__cuda_array_interface__['data'][0] - cdef uintptr_t c_indices = input_graph.transposedadjlist.indices.__cuda_array_interface__['data'][0] + cdef uintptr_t c_offsets = offsets.__cuda_array_interface__['data'][0] + cdef uintptr_t c_indices = indices.__cuda_array_interface__['data'][0] cdef uintptr_t c_weights = NULL - if input_graph.transposedadjlist.weights: - c_weights = input_graph.transposedadjlist.weights.__cuda_array_interface__['data'][0] + if weights is not None: + c_weights = weights.__cuda_array_interface__['data'][0] cdef GraphCSC[int,int,float] graph_float cdef GraphCSC[int,int,double] graph_double diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index 951a51fd7d0..25bef063e4e 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -31,18 +31,17 @@ class EdgeList: def __init__(self, source, destination, edge_attr=None, renumber_map=None): self.renumber_map = renumber_map - df = cudf.DataFrame() - df['src'] = source - df['dst'] = destination + self.edgelist_df = cudf.DataFrame() + self.edgelist_df['src'] = source + self.edgelist_df['dst'] = destination self.weights = False if edge_attr is not None: self.weights = True if type(edge_attr) is dict: for k in edge_attr.keys(): - df[k] = edge_attr[k] + sel.edgelist_df[k] = edge_attr[k] else: - df['weights'] = edge_attr - self.edgelist_df = df + self.edgelist_df['weights'] = edge_attr class AdjList: def __init__(self, offsets, indices, value=None): @@ -243,6 +242,7 @@ def view_edge_list(self): self.edge_count = len(edgelist_df) else: edgelist_df = self.edgelist.edgelist_df + if self.renumbered: if isinstance(self.edgelist.renumber_map, cudf.DataFrame): df = cudf.DataFrame() diff --git a/python/cugraph/structure/graph_wrapper.pyx b/python/cugraph/structure/graph_wrapper.pyx index 4adfa0dfaae..010d9b8ddd4 100644 --- a/python/cugraph/structure/graph_wrapper.pyx +++ b/python/cugraph/structure/graph_wrapper.pyx @@ -139,14 +139,12 @@ def get_edge_list(graph_ptr): nelem=col_size, dtype=np_dtype_from_gdf_column(g.edgeList.edge_data)) value_col = cudf.Series(value_data) - return source_col, dest_col, value_col def add_adj_list(graph_ptr, offset_col, index_col, value_col=None): cdef uintptr_t graph = graph_ptr cdef Graph * g = graph - cdef gdf_column c_offset_col = get_gdf_column_view(offset_col) cdef gdf_column c_index_col = get_gdf_column_view(index_col) cdef gdf_column c_value_col @@ -156,7 +154,6 @@ def add_adj_list(graph_ptr, offset_col, index_col, value_col=None): else: c_value_col = get_gdf_column_view(value_col) c_value_col_ptr = &c_value_col - c_graph.adj_list_view(g, &c_offset_col, &c_index_col, @@ -169,7 +166,6 @@ def get_adj_list(graph_ptr): offset_col_size = g.adjList.offsets.size index_col_size = g.adjList.indices.size - cdef uintptr_t offset_col_data = g.adjList.offsets.data cdef uintptr_t index_col_data = g.adjList.indices.data cdef uintptr_t value_col_data = NULL @@ -212,7 +208,9 @@ def view_edge_list(input_graph): if input_graph.adjlist is None: raise Exception('Graph is Empty') else: - add_adj_list(graph, input_graph.adjlist.offsets, input_graph.adjlist.indices, input_graph.adjlist.weights) + [offsets, indices] = datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32]) + [weights] = datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64]) + add_adj_list(graph, offsets, indices, weights) c_graph.add_edge_list(g) source, dest, value = get_edge_list(graph) input_graph.edgelist = input_graph.EdgeList(source, dest, value) @@ -224,10 +222,12 @@ def view_adj_list(input_graph): if input_graph.edgelist is None: raise Exception('Graph is Empty') else: - if len(input_graph.edgelist.edgelist_df.columns)>2: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst'], input_graph.edgelist.edgelist_df['weights']) + [src, dst] = datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + if input_graph.edgelist.weights: + [weights] = datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) + add_edge_list(graph, src, dst, weights) else: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']) + add_edge_list(graph, src, dst) c_graph.add_adj_list(g) offsets, indices, values = get_adj_list(graph) input_graph.adjlist = input_graph.AdjList(offsets, indices, values) @@ -237,20 +237,25 @@ def view_transposed_adj_list(input_graph): cdef Graph * g = graph if input_graph.transposedadjlist is None: if input_graph.edgelist is None: - raise Exception('Graph is Empty') - else: - if len(input_graph.edgelist.edgelist_df.columns)>2: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst'], input_graph.edgelist.edgelist_df['weights']) + if input_graph.adjlist is None: + raise Exception('Graph is Empty') else: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']) - c_graph.add_transposed_adj_list(g) - offsets, indices, values = get_transposed_adj_list(graph) - input_graph.transposedadjlist = input_graph.transposedAdjList(offsets, indices, values) + view_edge_list(input_graph) + [src, dst] = datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) + if input_graph.edgelist.weights: + [weights] = datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) + add_edge_list(graph, src, dst, weights) + else: + add_edge_list(graph, src, dst) + c_graph.add_transposed_adj_list(g) + offsets, indices, values = get_transposed_adj_list(graph) + input_graph.transposedadjlist = input_graph.transposedAdjList(offsets, indices, values) def add_transposed_adj_list(graph_ptr, offset_col, index_col, value_col=None): cdef uintptr_t graph = graph_ptr cdef Graph * g = graph + [offset_col, index_col] = datatype_cast([offset_col, index_col], [np.int32]) cdef gdf_column c_offset_col = get_gdf_column_view(offset_col) cdef gdf_column c_index_col = get_gdf_column_view(index_col) cdef gdf_column c_value_col @@ -258,6 +263,7 @@ def add_transposed_adj_list(graph_ptr, offset_col, index_col, value_col=None): if value_col is None: c_value_col_ptr = NULL else: + [value_col] = datatype_cast([value_col], [np.float32, np.float64]) c_value_col = get_gdf_column_view(value_col) c_value_col_ptr = &c_value_col @@ -357,15 +363,11 @@ def get_two_hop_neighbors(input_graph): def number_of_vertices(input_graph): cdef uintptr_t graph = allocate_cpp_graph() cdef Graph * g = graph - - if input_graph.adjlist: - add_adj_list(graph, input_graph.adjlist.offsets, input_graph.adjlist.indices, input_graph.adjlist.weights) + if input_graph.edgelist.weights: + add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst'], input_graph.edgelist.edgelist_df['weights']) else: - if input_graph.edgelist.weights: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst'], input_graph.edgelist.edgelist_df['weights']) - else: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']) - c_graph.number_of_vertices(g) + add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']) + c_graph.number_of_vertices(g) return g.numberOfVertices From d40fe65a5029bb0c014d5f0d7fa30fbf593d9741 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Wed, 11 Mar 2020 22:34:23 -0500 Subject: [PATCH 2/6] add typecast for number of vertices --- python/cugraph/structure/graph_wrapper.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cugraph/structure/graph_wrapper.pyx b/python/cugraph/structure/graph_wrapper.pyx index 010d9b8ddd4..16e0c99b84b 100644 --- a/python/cugraph/structure/graph_wrapper.pyx +++ b/python/cugraph/structure/graph_wrapper.pyx @@ -255,7 +255,6 @@ def add_transposed_adj_list(graph_ptr, offset_col, index_col, value_col=None): cdef uintptr_t graph = graph_ptr cdef Graph * g = graph - [offset_col, index_col] = datatype_cast([offset_col, index_col], [np.int32]) cdef gdf_column c_offset_col = get_gdf_column_view(offset_col) cdef gdf_column c_index_col = get_gdf_column_view(index_col) cdef gdf_column c_value_col @@ -263,7 +262,6 @@ def add_transposed_adj_list(graph_ptr, offset_col, index_col, value_col=None): if value_col is None: c_value_col_ptr = NULL else: - [value_col] = datatype_cast([value_col], [np.float32, np.float64]) c_value_col = get_gdf_column_view(value_col) c_value_col_ptr = &c_value_col @@ -363,10 +361,12 @@ def get_two_hop_neighbors(input_graph): def number_of_vertices(input_graph): cdef uintptr_t graph = allocate_cpp_graph() cdef Graph * g = graph + [src, dst] = datatype_cast([input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']], [np.int32]) if input_graph.edgelist.weights: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst'], input_graph.edgelist.edgelist_df['weights']) + [weights] = datatype_cast([input_graph.edgelist.edgelist_df['weights']], [np.float32, np.float64]) + add_edge_list(graph, src, dst, weights) else: - add_edge_list(graph, input_graph.edgelist.edgelist_df['src'], input_graph.edgelist.edgelist_df['dst']) + add_edge_list(graph, src, dst) c_graph.number_of_vertices(g) return g.numberOfVertices From 0b8eefb3df8f95a381459a9cb7fbed806497acc1 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Fri, 13 Mar 2020 15:12:20 -0500 Subject: [PATCH 3/6] add multi col unrenumbering for two_hop --- python/cugraph/structure/graph.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index 25bef063e4e..7e56dcf5071 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -408,10 +408,17 @@ def get_two_hop_neighbors(self): """ df = graph_wrapper.get_two_hop_neighbors(self) if self.renumbered is True: - df['first'] = self.edgelist.renumber_map[df['first']].\ - reset_index(drop=True) - df['second'] = self.edgelist.renumber_map[df['second']].\ - reset_index(drop=True) + if isinstance(self.edgelist.renumber_map, cudf.DataFrame): + n_cols = len(self.edgelist.renumber_map.columns) - 1 + unrenumbered_df_ = df.merge(self.edgelist.renumber_map, left_on='first', right_on='id', how='left').drop(['id', 'first']) + unrenumbered_df = unrenumbered_df_.merge(self.edgelist.renumber_map, left_on='second', right_on='id', how='left').drop(['id', 'second']) + unrenumbered_df.columns = ['first_'+str(i) for i in range(n_cols)]+['second_'+str(i) for i in range(n_cols)] + df = unrenumbered_df + else: + df['first'] = self.edgelist.renumber_map[df['first']].\ + reset_index(drop=True) + df['second'] = self.edgelist.renumber_map[df['second']].\ + reset_index(drop=True) return df def number_of_vertices(self): From 356fd65bbf9a77697e8a83600a268e26bc038488 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Fri, 13 Mar 2020 16:42:56 -0500 Subject: [PATCH 4/6] style fixes, changelog --- CHANGELOG.md | 1 + python/cugraph/structure/graph.py | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1361e345ba6..c853ef56779 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - PR #738 Move tests directory up a level - PR #739 Updated Notebooks - PR #740 added utility to extract paths from SSSP/BFS results +- PR #747 updated viewing of graph, datatypecasting and two hop neighbor unrenumbering for multi column ## Bug Fixes - PR #697 Updated versions in conda environments. diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index 7e56dcf5071..b3442d9d36f 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -39,7 +39,7 @@ def __init__(self, source, destination, edge_attr=None, self.weights = True if type(edge_attr) is dict: for k in edge_attr.keys(): - sel.edgelist_df[k] = edge_attr[k] + self.edgelist_df[k] = edge_attr[k] else: self.edgelist_df['weights'] = edge_attr @@ -410,9 +410,19 @@ def get_two_hop_neighbors(self): if self.renumbered is True: if isinstance(self.edgelist.renumber_map, cudf.DataFrame): n_cols = len(self.edgelist.renumber_map.columns) - 1 - unrenumbered_df_ = df.merge(self.edgelist.renumber_map, left_on='first', right_on='id', how='left').drop(['id', 'first']) - unrenumbered_df = unrenumbered_df_.merge(self.edgelist.renumber_map, left_on='second', right_on='id', how='left').drop(['id', 'second']) - unrenumbered_df.columns = ['first_'+str(i) for i in range(n_cols)]+['second_'+str(i) for i in range(n_cols)] + unrenumbered_df_ = df.merge(self.edgelist.renumber_map, + left_on='first', right_on='id', + how='left').\ + drop(['id', 'first']) + unrenumbered_df = unrenumbered_df_.merge(self.edgelist. + renumber_map, + left_on='second', + right_on='id', + how='left').\ + drop(['id', 'second']) + unrenumbered_df.columns = ['first_' + str(i) + for i in range(n_cols)]\ + + ['second_' + str(i) for i in range(n_cols)] df = unrenumbered_df else: df['first'] = self.edgelist.renumber_map[df['first']].\ From 31b0a239d2bc8cd67ffc2441555282e8f3064fbb Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Mon, 16 Mar 2020 12:27:57 -0500 Subject: [PATCH 5/6] fix for graph comparison failures --- python/cugraph/tests/test_graph.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index 67c313dcb08..b8dd0bf9d17 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -105,8 +105,8 @@ def compare_graphs(nx_graph, cu_graph): if len(edgelist_df.columns) > 2: df0 = cudf.from_pandas(nx.to_pandas_edgelist(nx_graph)) - df0 = df0.sort_values(by=['source', 'target']) - df1 = df.sort_values(by=['source', 'target']) + df0 = df0.sort_values(by=['source', 'target']).reset_index(drop=True) + df1 = df.sort_values(by=['source', 'target']).reset_index(drop=True) if not df0['weight'].equals(df1['weight']): return False @@ -482,7 +482,6 @@ def test_networkx_compatibility(managed, pool, graph_file): df['source'] = pd.Series(M['0']) df['target'] = pd.Series(M['1']) df['weight'] = pd.Series(M.weight) - gdf = cudf.from_pandas(df) Gnx = nx.from_pandas_edgelist(df, @@ -495,7 +494,6 @@ def test_networkx_compatibility(managed, pool, graph_file): destination='target', edge_attr='weight', create_using=cugraph.DiGraph) - assert compare_graphs(Gnx, G) Gnx.clear() From f12776a86ea62e8316f4a1ad28bf89c8d1b57170 Mon Sep 17 00:00:00 2001 From: Ishika Roy Date: Mon, 16 Mar 2020 12:30:32 -0500 Subject: [PATCH 6/6] add changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a7e01f6b27..200ef8efb10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,7 @@ - PR #733 Fixed multi-column renumbering issues with indexes - PR #746 Dask + Distributed 2.12.0+ - PR #753 ECG Error +- PR #758 Fix for graph comparison failure # cuGraph 0.12.0 (04 Feb 2020)