Skip to content

Commit

Permalink
added get_chunks and updated heatmap (improved speedups), updated docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Schefflera-Arboricola committed Feb 29, 2024
1 parent 3ad8068 commit 94de49a
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 56 deletions.
61 changes: 20 additions & 41 deletions nx_parallel/algorithms/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,51 +2,26 @@
from joblib import Parallel, delayed
import nx_parallel as nxp

__all__ = ["square_clustering_no_chunk", "square_clustering_chunk"]
__all__ = [
"square_clustering",
]


def square_clustering_no_chunk(G, nodes=None):
"""The squares clustering coefficient for nodes for all nodes
are computed in parallel over all available CPU cores.
def square_clustering(G, nodes=None, get_chunks="chunks"):
"""The nodes are chunked into `node_chunks` and then the square clustering
coefficient for all `node_chunks` are computed in parallel over all available
CPU cores.
Parameters
------------
get_chunks : str, function (default = "chunks")
A function that takes in a list of all the nodes (or nbunch) as input and
returns an iterable `node_chunks`. The default chunking is done by slicing the
`nodes` into `n` chunks, where `n` is the number of CPU cores.
networkx.square_clustering: https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.cluster.square_clustering.html
"""

def _compute_clustering(v):
clustering = 0
potential = 0
for u, w in combinations(G[v], 2):
squares = len((set(G[u]) & set(G[w])) - {v})
clustering += squares
degm = squares + 1
if w in G[u]:
degm += 1
potential += (len(G[u]) - degm) + (len(G[w]) - degm) + squares
if potential > 0:
clustering /= potential
return (v, clustering)

if hasattr(G, "graph_object"):
G = G.graph_object

if nodes is None:
node_iter = G
else:
node_iter = G.nbunch_iter(nodes)

total_cores = nxp.cpu_count()

result = Parallel(n_jobs=total_cores)(
delayed(_compute_clustering)(v) for v in node_iter
)
clustering = dict(result)

if nodes in G:
return clustering[nodes]
return clustering


def square_clustering_chunk(G, nodes=None):
def _compute_clustering_chunk(node_iter_chunk):
result_chunk = []
for v in node_iter_chunk:
Expand All @@ -73,8 +48,12 @@ def _compute_clustering_chunk(node_iter_chunk):
node_iter = list(G.nbunch_iter(nodes))

total_cores = nxp.cpu_count()
num_in_chunk = max(min(len(node_iter) // total_cores, 10), 1)
node_iter_chunks = nxp.chunks(node_iter, num_in_chunk)

if get_chunks == "chunks":
num_in_chunk = max(len(node_iter) // total_cores, 1)
node_iter_chunks = nxp.chunks(node_iter, num_in_chunk)
else:
node_iter_chunks = get_chunks(node_iter)

result = Parallel(n_jobs=total_cores)(
delayed(_compute_clustering_chunk)(node_iter_chunk)
Expand Down
4 changes: 2 additions & 2 deletions nx_parallel/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
tournament_is_strongly_connected,
)
from nx_parallel.algorithms.vitality import closeness_vitality
# from nx_parallel.algorithms.cluster import square_clustering
from nx_parallel.algorithms.cluster import square_clustering

__all__ = ["Dispatcher", "ParallelGraph"]

Expand Down Expand Up @@ -51,7 +51,7 @@ class Dispatcher:
all_pairs_bellman_ford_path = all_pairs_bellman_ford_path

# Clustering
# square_clustering = square_clustering
square_clustering = square_clustering

# =============================

Expand Down
Binary file removed timing/heatmap_square_clustering_chunk_timing.png
Binary file not shown.
Binary file modified timing/heatmap_square_clustering_timing.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 11 additions & 13 deletions timing/timing_individual_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
number_of_nodes_list = [10, 50, 100, 300, 500]
pList = [1, 0.8, 0.6, 0.4, 0.2]
weighted = False
currFun = nxp.square_clustering_chunk
currFun = nx.square_clustering
for p in pList:
for num in number_of_nodes_list:
# create original and parallel graphs
Expand All @@ -30,21 +30,19 @@

# time both versions and update heatmapDF
t1 = time.time()
c1 = nxp.square_clustering_chunk(H)
if isinstance(c1, types.GeneratorType):
d = dict(c1)
c = nx.square_clustering(H)
if isinstance(c, types.GeneratorType):
d = dict(c)
t2 = time.time()
newTime = t2 - t1
parallelTime = t2 - t1
t1 = time.time()
c2 = nxp.square_clustering_no_chunk(H)
if isinstance(c2, types.GeneratorType):
d = dict(c2)
c = nx.square_clustering(G)
if isinstance(c, types.GeneratorType):
d = dict(c)
t2 = time.time()
oldTime = t2 - t1
timesFaster = oldTime / newTime
stdTime = t2 - t1
timesFaster = stdTime / parallelTime
heatmapDF.at[num, p] = timesFaster
print(num, p, timesFaster)
print(c1 == c2)
print("Finished " + str(currFun))

# Code to create for row of heatmap specifically for tournaments
Expand Down Expand Up @@ -77,7 +75,7 @@
plt.xticks(rotation=45)
plt.yticks(rotation=20)
plt.title(
"Small Scale Demo: Times Speedups of " + currFun.__name__ + " - chunk vs no chunk"
"Small Scale Demo: Times Speedups of " + currFun.__name__ + " compared to NetworkX"
)
plt.xlabel("Number of Vertices")
plt.ylabel("Edge Probability")
Expand Down

0 comments on commit 94de49a

Please sign in to comment.