From 6b9234903a62b818d536598df59020f8e99e330c Mon Sep 17 00:00:00 2001 From: John Purviance Date: Thu, 3 Dec 2020 14:22:28 -0500 Subject: [PATCH] Add support for shortest_path_length and fix graph vertex checks(#1278) Adds the ability to get the length/cost of path from source to destination(s). Closely follows [`networkx.shortest_path_length`](https://networkx.org/documentation/latest/reference/algorithms/generated/networkx.algorithms.shortest_paths.generic.shortest_path_length.html#networkx.algorithms.shortest_paths.generic.shortest_path_length). Similarities: 1) Takes an optional target vertex 2) If only source is provided, a `cudf` dataframe is returned with columns: `[vertex, distance]` (similar to `networkx` dictionary return) 3) If source and target are specified the exact length is returned or `sys.float_info.max` if the vertex is not reachable. Differences: 1) Requires that source be provided, as opposed to `networkx` 2) Method of graph traversal is not an option. Fixes: 1) Fixes #806 2) `sssp` and `cugraph.Graph.has_node` vertex checks. Added support for checking for vertexes that are not a part of the graph. 
In the past, `TypeError` was raised when doing a comparison check (as opposed to `ValueError`) Authors: - John Purviance - BradReesWork Approvers: - Alex Fender URL: https://github.com/rapidsai/cugraph/pull/1278 --- CHANGELOG.md | 1 + python/cugraph/__init__.py | 1 + python/cugraph/structure/graph.py | 2 +- python/cugraph/tests/test_graph.py | 9 ++ python/cugraph/tests/test_paths.py | 177 +++++++++++++++++++++++++++ python/cugraph/traversal/__init__.py | 9 +- python/cugraph/traversal/sssp.py | 98 +++++++++++++++ 7 files changed, 293 insertions(+), 4 deletions(-) create mode 100644 python/cugraph/tests/test_paths.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 286a12db5d8..c5a6f5b9455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - PR #1274 Add generic from_edgelist() and from_adjlist() APIs - PR #1279 Add self loop check variable in graph - PR #1277 SciPy sparse matrix input support for WCC, SCC, SSSP, and BFS +- PR #1278 Add support for shortest_path_length and fix graph vertex checks ## Improvements - PR #1227 Pin cmake policies to cmake 3.17 version diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index 7249eecab05..d752c868237 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -77,6 +77,7 @@ sssp, shortest_path, filter_unreachable, + shortest_path_length ) from cugraph.tree import minimum_spanning_tree, maximum_spanning_tree diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index 375e095a0ec..53c3a4e656c 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -1322,7 +1322,7 @@ def has_node(self, n): return (ddf == n).any().any().compute() if self.renumbered: tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) - return tmp[0] >= 0 + return tmp[0] is not cudf.NA and tmp[0] >= 0 else: df = self.edgelist.edgelist_df[["src", "dst"]] return (df == n).any().any() diff --git a/python/cugraph/tests/test_graph.py 
b/python/cugraph/tests/test_graph.py index a912ecfa41a..d8d5a504070 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -606,6 +606,15 @@ def test_has_node(graph_file): assert G.has_node(n) +def test_invalid_has_node(): + df = cudf.DataFrame([[1, 2]], columns=["src", "dst"]) + G = cugraph.Graph() + G.from_cudf_edgelist(df, source="src", destination="dst") + assert not G.has_node(-1) + assert not G.has_node(0) + assert not G.has_node(G.number_of_nodes() + 1) + + @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_bipartite_api(graph_file): # This test only tests the functionality of adding set of nodes and diff --git a/python/cugraph/tests/test_paths.py b/python/cugraph/tests/test_paths.py new file mode 100644 index 00000000000..7467d024051 --- /dev/null +++ b/python/cugraph/tests/test_paths.py @@ -0,0 +1,177 @@ +import cudf +import cugraph +from cupy.sparse import coo_matrix as cupy_coo_matrix +import cupy +import networkx as nx +import pytest +import sys +from tempfile import NamedTemporaryFile + +CONNECTED_GRAPH = """1,5,3 +1,4,1 +1,2,1 +1,6,2 +1,7,2 +4,5,1 +2,3,1 +7,6,2 +""" + +DISCONNECTED_GRAPH = CONNECTED_GRAPH + "8,9,4" + + +@pytest.fixture +def graphs(request): + with NamedTemporaryFile(mode="w+", suffix=".csv") as graph_tf: + graph_tf.writelines(request.param) + graph_tf.seek(0) + + nx_G = nx.read_weighted_edgelist(graph_tf.name, delimiter=',') + cudf_df = cudf.read_csv(graph_tf.name, + names=["src", "dst", "data"], + delimiter=",", + dtype=["int32", "int32", "float64"]) + cugraph_G = cugraph.Graph() + cugraph_G.from_cudf_edgelist( + cudf_df, source="src", + destination="dst", edge_attr="data") + + # construct cupy coo_matrix graph + i = [] + j = [] + weights = [] + for index in range(cudf_df.shape[0]): + vertex1 = cudf_df.iloc[index]["src"] + vertex2 = cudf_df.iloc[index]["dst"] + weight = cudf_df.iloc[index]["data"] + i += [vertex1, vertex2] + j += [vertex2, vertex1] + weights += [weight, weight] + i = 
cupy.array(i) + j = cupy.array(j) + weights = cupy.array(weights) + largest_vertex = max(cupy.amax(i), cupy.amax(j)) + cupy_df = cupy_coo_matrix( + (weights, (i, j)), + shape=(largest_vertex + 1, largest_vertex + 1)) + + yield cugraph_G, nx_G, cupy_df + + +@pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) +def test_connected_graph_shortest_path_length(graphs): + cugraph_G, nx_G, cupy_df = graphs + + path_1_to_1_length = cugraph.shortest_path_length(cugraph_G, 1, 1) + assert path_1_to_1_length == 0.0 + assert path_1_to_1_length == nx.shortest_path_length( + nx_G, "1", target="1", weight="weight") + assert path_1_to_1_length == cugraph.shortest_path_length(nx_G, "1", "1") + assert path_1_to_1_length == cugraph.shortest_path_length(cupy_df, 1, 1) + + path_1_to_5_length = cugraph.shortest_path_length(cugraph_G, 1, 5) + assert path_1_to_5_length == 2.0 + assert path_1_to_5_length == nx.shortest_path_length( + nx_G, "1", target="5", weight="weight") + assert path_1_to_5_length == cugraph.shortest_path_length(nx_G, "1", "5") + assert path_1_to_5_length == cugraph.shortest_path_length(cupy_df, 1, 5) + + path_1_to_3_length = cugraph.shortest_path_length(cugraph_G, 1, 3) + assert path_1_to_3_length == 2.0 + assert path_1_to_3_length == nx.shortest_path_length( + nx_G, "1", target="3", weight="weight") + assert path_1_to_3_length == cugraph.shortest_path_length(nx_G, "1", "3") + assert path_1_to_3_length == cugraph.shortest_path_length(cupy_df, 1, 3) + + path_1_to_6_length = cugraph.shortest_path_length(cugraph_G, 1, 6) + assert path_1_to_6_length == 2.0 + assert path_1_to_6_length == nx.shortest_path_length( + nx_G, "1", target="6", weight="weight") + assert path_1_to_6_length == cugraph.shortest_path_length(nx_G, "1", "6") + assert path_1_to_6_length == cugraph.shortest_path_length(cupy_df, 1, 6) + + +@pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) +def test_shortest_path_length_invalid_source(graphs): + cugraph_G, nx_G, cupy_df = 
graphs + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cugraph_G, -1, 1) + + with pytest.raises(ValueError): + cugraph.shortest_path_length(nx_G, "-1", "1") + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cupy_df, -1, 1) + + +@pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) +def test_shortest_path_length_invalid_target(graphs): + cugraph_G, nx_G, cupy_df = graphs + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cugraph_G, 1, 10) + + with pytest.raises(ValueError): + cugraph.shortest_path_length(nx_G, "1", "10") + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cupy_df, 1, 10) + + +@pytest.mark.parametrize("graphs", [CONNECTED_GRAPH], indirect=True) +def test_shortest_path_length_invalid_vertexes(graphs): + cugraph_G, nx_G, cupy_df = graphs + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cugraph_G, 0, 42) + + with pytest.raises(ValueError): + cugraph.shortest_path_length(nx_G, "0", "42") + + with pytest.raises(ValueError): + cugraph.shortest_path_length(cupy_df, 0, 42) + + +@pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) +def test_shortest_path_length_no_path(graphs): + cugraph_G, nx_G, cupy_df = graphs + + path_1_to_8 = cugraph.shortest_path_length(cugraph_G, 1, 8) + assert path_1_to_8 == sys.float_info.max + assert path_1_to_8 == cugraph.shortest_path_length(nx_G, "1", "8") + assert path_1_to_8 == cugraph.shortest_path_length(cupy_df, 1, 8) + + +@pytest.mark.parametrize("graphs", [DISCONNECTED_GRAPH], indirect=True) +def test_shortest_path_length_no_target(graphs): + cugraph_G, nx_G, cupy_df = graphs + + cugraph_path_1_to_all = cugraph.shortest_path_length(cugraph_G, 1) + nx_path_1_to_all = nx.shortest_path_length( + nx_G, source="1", weight="weight") + nx_gpu_path_1_to_all = cugraph.shortest_path_length(nx_G, "1") + cupy_path_1_to_all = cugraph.shortest_path_length(cupy_df, 1) + + # Cast networkx graph on cugraph vertex column 
type from str to int. + # SSSP preserves vertex type, convert for comparison + nx_gpu_path_1_to_all["vertex"] = \ + nx_gpu_path_1_to_all["vertex"].astype("int32") + + assert cugraph_path_1_to_all == nx_gpu_path_1_to_all + assert cugraph_path_1_to_all == cupy_path_1_to_all + + # results for vertex 8 and 9 are not returned + assert cugraph_path_1_to_all.shape[0] == len(nx_path_1_to_all) + 2 + + for index in range(cugraph_path_1_to_all.shape[0]): + + vertex = str(cugraph_path_1_to_all["vertex"][index].item()) + distance = cugraph_path_1_to_all["distance"][index].item() + + # verify cugraph against networkx + if vertex in {'8', '9'}: + # Networkx does not return distances for these vertexes. + assert distance == sys.float_info.max + else: + assert distance == nx_path_1_to_all[vertex] diff --git a/python/cugraph/traversal/__init__.py b/python/cugraph/traversal/__init__.py index 52a1b9e2cfb..58e37a7add0 100644 --- a/python/cugraph/traversal/__init__.py +++ b/python/cugraph/traversal/__init__.py @@ -13,6 +13,9 @@ from cugraph.traversal.bfs import bfs from cugraph.traversal.bfs import bfs_edges -from cugraph.traversal.sssp import sssp -from cugraph.traversal.sssp import shortest_path -from cugraph.traversal.sssp import filter_unreachable +from cugraph.traversal.sssp import ( + sssp, + shortest_path, + filter_unreachable, + shortest_path_length +) \ No newline at end of file diff --git a/python/cugraph/traversal/sssp.py b/python/cugraph/traversal/sssp.py index cc5ee9fbdcd..3736db7ce59 100644 --- a/python/cugraph/traversal/sssp.py +++ b/python/cugraph/traversal/sssp.py @@ -156,12 +156,26 @@ def sssp(G, Parameters ---------- +<<<<<<< HEAD graph : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix Graph or matrix object, which should contain the connectivity information. Edge weights, if present, should be single or double precision floating point values. source : int Index of the source vertex. 
+======= + graph : cuGraph.Graph, NetworkX.Graph, or CuPy sparse COO matrix + cuGraph graph descriptor with connectivity information. Edge weights, + if present, should be single or double precision floating point values. + + source : Dependant on graph type. Index of the source vertex. + + If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix: + int + + If graph is an instance of a NetworkX.Graph: + str +>>>>>>> Document shortest_path_length and sssp behavior Returns ------- @@ -214,6 +228,10 @@ def sssp(G, if G.renumbered: source = G.lookup_internal_vertex_id(cudf.Series([source]))[0] + if source is cudf.NA: + raise ValueError( + "Starting vertex should be between 0 to number of vertices") + df = sssp_wrapper.sssp(G, source) if G.renumbered: @@ -268,3 +286,83 @@ def shortest_path(G, """ return sssp(G, source, method, directed, return_predecessors, unweighted, overwrite, indices) + + +def shortest_path_length(G, source, target=None): + """ + Compute the distance from a source vertex to one or all vertexes in graph. + Uses Single Source Shortest Path (SSSP). + + Parameters + ---------- + graph : cuGraph.Graph, NetworkX.Graph, or CuPy sparse COO matrix + cuGraph graph descriptor with connectivity information. Edge weights, + if present, should be single or double precision floating point values. + + source : Dependant on graph type. Index of the source vertex. + + If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix: + int + + If graph is an instance of a NetworkX.Graph: + str + + target: Dependant on graph type. Vertex to find distance to. + + If graph is an instance of cuGraph.Graph or CuPy sparse COO matrix: + int + + If graph is an instance of a NetworkX.Graph: + str + + Returns + ------- + Return value type is based on the input type. 
+ + If target is None, returns: + + cudf.DataFrame + df['vertex'] + vertex id + + df['distance'] + gives the path distance from the starting vertex + + If target is not None, returns: + + Distance from source to target vertex. + """ + + # verify target is in graph before traversing + if target is not None: + if not hasattr(G, "has_node"): + # G is a cupy coo_matrix. Extract maximum possible vertex value + as_matrix = G.toarray() + if target < 0 or target >= max(as_matrix.shape[0], + as_matrix.shape[1]): + raise ValueError("Graph does not contain target vertex") + elif not G.has_node(target): + # G is an instance of cugraph or networkx graph + raise ValueError("Graph does not contain target vertex") + + df = sssp(G, source) + + if isinstance(df, tuple): + # cupy path, df is tuple of (distance, predecessor) + if target: + return df[0][target-1] + results = cudf.DataFrame() + results["vertex"] = range(df[0].shape[0]) + results["distance"] = df[0] + return results + + else: + # cugraph and networkx path + if target: + target_distance = df.loc[df["vertex"] == target] + return target_distance.iloc[0]["distance"] + + results = cudf.DataFrame() + results["vertex"] = df["vertex"] + results["distance"] = df["distance"] + return results