diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index f886acf0862..9b6668ddd09 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -65,6 +65,7 @@ class EXPERIMENTAL__PropertyGraph: edge_id_col_name = "_EDGE_ID_" vertex_id_col_name = "_VERTEX_ID_" weight_col_name = "_WEIGHT_" + _default_type_name = "" def __init__(self): # The dataframe containing the properties for each vertex. @@ -135,30 +136,10 @@ def __init__(self): # Cached property values self.__num_vertices = None + self.__vertex_type_value_counts = None + self.__edge_type_value_counts = None # PropertyGraph read-only attributes - @property - def num_vertices(self): - if self.__num_vertices is not None: - return self.__num_vertices - - self.__num_vertices = 0 - vert_sers = self.__get_all_vertices_series() - if vert_sers: - if self.__series_type is cudf.Series: - self.__num_vertices = cudf.concat(vert_sers).nunique() - else: - self.__num_vertices = pd.concat(vert_sers).nunique() - - return self.__num_vertices - - @property - def num_edges(self): - if self.__edge_prop_dataframe is not None: - return len(self.__edge_prop_dataframe) - else: - return 0 - @property def edges(self): if self.__edge_prop_dataframe is not None: @@ -195,45 +176,99 @@ def _vertex_prop_dataframe(self): def _edge_prop_dataframe(self): return self.__edge_prop_dataframe - def get_num_vertices(self, type=None): - """Return the number of vertices of a given type. + @property + def _vertex_type_value_counts(self): + if self.__vertex_prop_dataframe is None: + return + if self.__vertex_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__vertex_type_value_counts = ( + self.__vertex_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + ) + return self.__vertex_type_value_counts - If type is None, return the total number of vertices. 
+ @property + def _edge_type_value_counts(self): + if self.__edge_prop_dataframe is None: + return + if self.__edge_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__edge_type_value_counts = ( + self.__edge_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + ) + return self.__edge_type_value_counts + + def get_num_vertices(self, type=None, *, include_edge_data=True): + """Return the number of all vertices or vertices of a given type. - Vertex types are set by using the `type_name` argument in - `add_vertex_data`. + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of vertices, + otherwise return the number of vertices of the specified type. + include_edge_data : bool (default True) + If True, include vertices that were added in vertex and edge data. + If False, only include vertices that were added in vertex data. + Note that vertices that only exist in edge data are assumed to have + the default type. 
See Also -------- - PropertyGraph.num_vertices PropertyGraph.get_num_edges """ if type is None: - return self.num_vertices + if not include_edge_data: + if self.__vertex_prop_dataframe is None: + return 0 + return len(self.__vertex_prop_dataframe) + if self.__num_vertices is not None: + return self.__num_vertices + self.__num_vertices = 0 + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + self.__num_vertices = cudf.concat(vert_sers).nunique() + else: + self.__num_vertices = pd.concat(vert_sers).nunique() + return self.__num_vertices + value_counts = self._vertex_type_value_counts + if type == self._default_type_name and include_edge_data: + # The default type, "", can refer to both vertex and edge data + if self.__vertex_prop_dataframe is None: + return self.get_num_vertices() + return ( + self.get_num_vertices() + - len(self.__vertex_prop_dataframe) + + (value_counts[type] if type in value_counts else 0) + ) if self.__vertex_prop_dataframe is None: return 0 - # This counts duplicates - return (self.__vertex_prop_dataframe[self.type_col_name] == type).sum() + return value_counts[type] if type in value_counts else 0 def get_num_edges(self, type=None): - """Return the number of edges of a given type. - - If type is None, return the total number of edges. + """Return the number of all edges or edges of a given type. - Edge types are set by using the `type_name` argument in - `add_edge_data`. + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of edges, + otherwise return the number of edges of the specified type. 
See Also -------- - PropertyGraph.num_edges PropertyGraph.get_num_vertices """ if type is None: - return self.num_edges + if self.__edge_prop_dataframe is not None: + return len(self.__edge_prop_dataframe) + else: + return 0 if self.__edge_prop_dataframe is None: return 0 - # This counts duplicates - return (self.__edge_prop_dataframe[self.type_col_name] == type).sum() + value_counts = self._edge_type_value_counts + return value_counts[type] if type in value_counts else 0 def get_vertices(self, selection=None): """ @@ -274,7 +309,7 @@ def add_vertex_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about users, type_name might be "users". If not specified, the type of properties will be added as - None or NA + the empty string, "". property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. If not specified, all @@ -297,6 +332,8 @@ def add_vertex_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -321,6 +358,7 @@ def add_vertex_data(self, # Clear the cached value for num_vertices since more could be added in # this method. self.__num_vertices = None + self.__vertex_type_value_counts = None # Could update instead # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. @@ -391,7 +429,7 @@ def add_edge_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about transactions, type_name might be "transactions". If not specified, the type of properties - will be added as None or NA + will be added as the empty string "". 
property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. If not specified, all @@ -418,6 +456,8 @@ def add_edge_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -442,6 +482,7 @@ def add_edge_data(self, # Clear the cached value for num_vertices since more could be added in # this method. self.__num_vertices = None + self.__edge_type_value_counts = None # Could update instead default_edge_columns = [self.src_col_name, self.dst_col_name, diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 31176ce3bdf..78178a06398 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -281,10 +281,8 @@ def test_add_vertex_data(df_type): vertex_col_name="merchant_id", property_columns=None) - assert pG.num_vertices == 5 assert pG.get_num_vertices() == 5 assert pG.get_num_vertices('merchants') == 5 - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 expected_props = merchants[0].copy() assert sorted(pG.vertex_property_names) == sorted(expected_props) @@ -293,7 +291,7 @@ def test_add_vertex_data(df_type): @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_num_vertices(df_type): """ - Ensures num_vertices is correct after various additions of specific data. + Ensures get_num_vertices is correct after various additions of data. 
""" from cugraph.experimental import PropertyGraph @@ -302,9 +300,8 @@ def test_num_vertices(df_type): data=merchants[1]) pG = PropertyGraph() - assert pG.num_vertices == 0 + assert pG.get_num_vertices() == 0 assert pG.get_num_vertices('unknown_type') == 0 - assert pG.num_edges == 0 assert pG.get_num_edges('unknown_type') == 0 pG.add_vertex_data(merchants_df, type_name="merchants", @@ -313,14 +310,11 @@ def test_num_vertices(df_type): # Test caching - the second retrieval should always be faster st = time.time() - assert pG.num_vertices == 5 assert pG.get_num_vertices() == 5 compute_time = time.time() - st - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 st = time.time() - assert pG.num_vertices == 5 assert pG.get_num_vertices() == 5 cache_retrieval_time = time.time() - st assert cache_retrieval_time < compute_time @@ -333,11 +327,9 @@ def test_num_vertices(df_type): vertex_col_name="user_id", property_columns=None) - assert pG.num_vertices == 9 assert pG.get_num_vertices() == 9 assert pG.get_num_vertices('merchants') == 5 assert pG.get_num_vertices('users') == 4 - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 # The taxpayers table does not add new vertices, it only adds properties to @@ -351,16 +343,88 @@ def test_num_vertices(df_type): vertex_col_name="payer_id", property_columns=None) - assert pG.num_vertices == 9 assert pG.get_num_vertices() == 9 assert pG.get_num_vertices('merchants') == 5 assert pG.get_num_vertices('users') == 4 - assert pG.get_num_vertices('taxpayers') == 7 # These nodes have two types assert pG.get_num_vertices('unknown_type') == 0 - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices_include_edge_data(df_type): + """ + Ensures get_num_vertices is correct after various additions of data. 
+ """ + from cugraph.experimental import PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + pG = PropertyGraph() + assert pG.get_num_vertices(include_edge_data=False) == 0 + assert pG.get_num_vertices("", include_edge_data=False) == 0 + + pG.add_edge_data(df_type(columns=transactions[0], + data=transactions[1]), + type_name="transactions", + vertex_col_names=("user_id", "merchant_id"), + property_columns=None) + + assert pG.get_num_vertices(include_edge_data=False) == 0 + assert pG.get_num_vertices("", include_edge_data=False) == 0 + assert pG.get_num_vertices(include_edge_data=True) == 7 + assert pG.get_num_vertices("", include_edge_data=True) == 7 + pG.add_vertex_data(df_type(columns=merchants[0], + data=merchants[1]), + # type_name="merchants", # Use default! + vertex_col_name="merchant_id", + property_columns=None) + assert pG.get_num_vertices(include_edge_data=False) == 5 + assert pG.get_num_vertices("", include_edge_data=False) == 5 + assert pG.get_num_vertices(include_edge_data=True) == 9 + assert pG.get_num_vertices("", include_edge_data=True) == 9 + pG.add_vertex_data(df_type(columns=users[0], + data=users[1]), + type_name="users", + vertex_col_name="user_id", + property_columns=None) + assert pG.get_num_vertices(include_edge_data=False) == 9 + assert pG.get_num_vertices("", include_edge_data=False) == 5 + assert pG.get_num_vertices("users", include_edge_data=False) == 4 + # All vertices now have vertex data, so this should match + assert pG.get_num_vertices(include_edge_data=True) == 9 + assert pG.get_num_vertices("", include_edge_data=True) == 5 + assert pG.get_num_vertices("users", include_edge_data=True) == 4 + + +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices_with_properties(df_type): + """ + Checks that get_num_vertices(include_edge_data=False) counts only the + vertices that have properties, as opposed to get_num_vertices() which also + includes 
all verts in the graph edgelist. + """ + from cugraph.experimental import PropertyGraph + + pG = PropertyGraph() + df = df_type({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + + assert pG.get_num_vertices() == 6 + assert pG.get_num_vertices(include_edge_data=False) == 0 + + df = df_type({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + pG.add_vertex_data(df, vertex_col_name="vertex") + + assert pG.get_num_vertices() == 6 + assert pG.get_num_vertices(include_edge_data=False) == 2 + + @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_null_data(df_type): """ @@ -370,9 +434,7 @@ def test_null_data(df_type): pG = PropertyGraph() - assert pG.num_vertices == 0 assert pG.get_num_vertices() == 0 - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 assert sorted(pG.vertex_property_names) == sorted([]) @@ -395,10 +457,8 @@ def test_add_vertex_data_prop_columns(df_type): vertex_col_name="merchant_id", property_columns=expected_props) - assert pG.num_vertices == 5 assert pG.get_num_vertices() == 5 assert pG.get_num_vertices('merchants') == 5 - assert pG.num_edges == 0 assert pG.get_num_edges() == 0 assert sorted(pG.vertex_property_names) == sorted(expected_props) @@ -460,11 +520,9 @@ def test_add_edge_data(df_type): vertex_col_names=("user_id", "merchant_id"), property_columns=None) - assert pG.num_vertices == 7 assert pG.get_num_vertices() == 7 # 'transactions' is edge type, not vertex type assert pG.get_num_vertices('transactions') == 0 - assert pG.num_edges == 4 assert pG.get_num_edges() == 4 assert pG.get_num_edges('transactions') == 4 expected_props = ["merchant_id", "user_id", @@ -490,11 +548,9 @@ def test_add_edge_data_prop_columns(df_type): vertex_col_names=("user_id", "merchant_id"), property_columns=expected_props) - assert pG.num_vertices == 7 assert pG.get_num_vertices() == 7 # 'transactions' is edge type, not vertex type assert 
pG.get_num_vertices('transactions') == 0 - assert pG.num_edges == 4 assert pG.get_num_edges() == 4 assert pG.get_num_edges('transactions') == 4 assert sorted(pG.edge_property_names) == sorted(expected_props) @@ -919,7 +975,6 @@ def test_graph_edge_data_added(dataset1_PropertyGraph): len(dataset1["relationships"][-1]) + \ len(dataset1["referrals"][-1]) - assert pG.num_edges == expected_num_edges assert pG.get_num_edges() == expected_num_edges assert ( pG.get_num_edges("transactions") == len(dataset1["transactions"][-1]) @@ -1100,10 +1155,7 @@ def test_dgl_use_case(): def bench_num_vertices(gpubenchmark, dataset1_PropertyGraph): pG = dataset1_PropertyGraph - def get_num_vertices(): - return pG.num_vertices - - assert gpubenchmark(get_num_vertices) == 9 + assert gpubenchmark(pG.get_num_vertices) == 9 def bench_get_vertices(gpubenchmark, dataset1_PropertyGraph):