Commit c611719

Merge pull request #774 from rlratzel/branch-0.13-digraphcheck

[REVIEW] [BUG] Raise TypeError if a DiGraph is used with spectral*Clustering()

BradReesWork authored Mar 20, 2020
2 parents 857c234 + 905a1e0 commit c611719

Showing 5 changed files with 104 additions and 35 deletions.
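
For context, a minimal sketch of the user-facing behavior this commit introduces (illustrative only, not part of the diff below; it assumes the cuGraph 0.13 Python API, where the public spectral*Clustering() functions delegate to the Cython wrappers patched here, and uses a made-up two-triangle edge list):

```python
import cudf
import cugraph

# Two triangles joined by one edge -- a small, made-up example graph.
df = cudf.DataFrame({'src': [0, 1, 2, 3, 4, 5, 2],
                     'dst': [1, 2, 0, 4, 5, 3, 3]})

DG = cugraph.DiGraph()
DG.from_cudf_edgelist(df, source='src', destination='dst')

# After this change, spectral clustering rejects DiGraph inputs up front.
try:
    cugraph.spectralBalancedCutClustering(DG, 2)
except TypeError as err:
    print(err)  # "DiGraph objects are not supported"

# An undirected Graph built from the same edge list is still accepted.
G = cugraph.Graph()
G.from_cudf_edgelist(df, source='src', destination='dst')
clusters = cugraph.spectralBalancedCutClustering(G, 2)  # DataFrame with 'vertex' and 'cluster'
```
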
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -38,6 +38,7 @@
- PR #758 Fix for graph comparison failure
- PR #761 Added flag to not treat deprecation warnings as errors, for now
- PR #771 Added unrenumbering in wcc and scc. Updated tests to compare vertices of largest component
- PR #774 Raise TypeError if a DiGraph is used with spectral*Clustering()

# cuGraph 0.12.0 (04 Feb 2020)

49 changes: 26 additions & 23 deletions cpp/src/nvgraph/kmeans.cu
@@ -124,7 +124,7 @@ namespace {
// Perform reduction on warp
for(i=WARP_SIZE/2; i>0; i/=2)
dist_private += utils::shfl_down(dist_private, i, 2*i);

// Write result to global memory
if(threadIdx.x == 0)
atomicFPAdd(dists+IDX(gidz,gidy,n), dist_private);
@@ -136,11 +136,11 @@ namespace {
// Move to another centroid
gidy += blockDim.y*gridDim.y;
}

// Move to another vector entry
bidx += gridDim.x;
}

}

/// Find closest centroid to observation vectors
@@ -198,7 +198,7 @@ namespace {

// Increment cluster sizes
atomicAdd(clusterSizes+code_min, 1);

// Move to another row
i += blockDim.x*gridDim.x;

@@ -251,7 +251,7 @@ namespace {
dists_old[i] = dist_new_private;
codes_old[i] = code_new;
}

// Move to another row
i += blockDim.x*gridDim.x;
}
@@ -314,7 +314,7 @@ namespace {
// Observation vector is determined by global y-index
gidy = threadIdx.y + blockIdx.y*blockDim.y;
while(gidy < k) {

// Get cluster size from global memory
clusterSize_private = clusterSizes[gidy];

@@ -359,7 +359,7 @@ namespace {
const ValueType_ * __restrict__ obs,
ValueType_ * __restrict__ dists,
ValueType_ * __restrict__ centroid) {

using namespace thrust;

// Cumulative sum of distances
@@ -377,7 +377,7 @@ namespace {
CHECK_CUDA(cudaMemcpy(&distsSum, distsCumSum+n-1,
sizeof(ValueType_),
cudaMemcpyDeviceToHost));

// Randomly choose observation vector
// Probabilities are proportional to square of distance to closest
// centroid (see k-means++ algorithm)
@@ -440,7 +440,7 @@ namespace {
// Random number generator
thrust::default_random_engine rng(123456);
thrust::uniform_real_distribution<ValueType_> uniformDist(0,1);

// -------------------------------------------------------
// Implementation
// -------------------------------------------------------
@@ -451,7 +451,7 @@ namespace {
blockDim_warp.z = BSIZE_DIV_WSIZE;
gridDim_warp.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535);
gridDim_warp.y = 1;
gridDim_warp.z
gridDim_warp.z
= min((n+BSIZE_DIV_WSIZE-1)/BSIZE_DIV_WSIZE, 65535);
gridDim_block.x = min((n+BLOCK_SIZE-1)/BLOCK_SIZE, 65535);
gridDim_block.y = 1;
@@ -475,9 +475,9 @@

// Choose remaining centroids
for(i=1; i<k; ++i) {

// Choose ith centroid
if(chooseNewCentroid(n, d, k, uniformDist(rng),obs, dists, centroids+IDX(0,i,d)))
if(chooseNewCentroid(n, d, k, uniformDist(rng),obs, dists, centroids+IDX(0,i,d)))
WARNING("error in k-means++ (could not pick centroid)");

// Compute distances from ith centroid
@@ -716,7 +716,7 @@ namespace nvgraph {
IndexType_ * __restrict__ work_int,
ValueType_ * residual_host,
IndexType_ * iters_host) {

// -------------------------------------------------------
// Variable declarations
// -------------------------------------------------------
@@ -764,7 +764,7 @@ namespace nvgraph {
cudaMemcpyHostToDevice));
if(updateCentroids(n, d, k, obs, codes,
clusterSizes, centroids,
work, work_int))
work, work_int))
WARNING("could not compute k-means centroids");
dim3 blockDim, gridDim;
blockDim.x = WARP_SIZE;
@@ -779,7 +779,7 @@ namespace nvgraph {
centroids,
work);
cudaCheckError();
*residual_host = thrust::reduce(thrust::device_pointer_cast(work),
*residual_host = thrust::reduce(thrust::device_pointer_cast(work),
thrust::device_pointer_cast(work+n));
cudaCheckError();
return NVGRAPH_OK;
@@ -808,14 +808,14 @@ namespace nvgraph {

// Choose initial cluster centroids
if(initializeCentroids(n, d, k, obs, centroids, codes,
clusterSizes, work))
clusterSizes, work))
WARNING("could not initialize k-means centroids");

// Apply k-means iteration until convergence
for(iter=0; iter<maxiter; ++iter) {

// Update cluster centroids
if(updateCentroids(n, d, k, obs, codes,
if(updateCentroids(n, d, k, obs, codes,
clusterSizes, centroids,
work, work_int)) WARNING("could not update k-means centroids");

@@ -826,14 +826,18 @@ namespace nvgraph {
WARNING("could not assign observation vectors to k-means clusters");

// Reinitialize empty clusters with new centroids
IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
thrust::device_pointer_cast(clusterSizes+k), 0) - thrust::device_pointer_cast(clusterSizes));

// FIXME: emptyCentroid never reaches k (infinite loop) under certain
// conditions, such as if obs is corrupt (as seen as a result of a
// DataFrame column of NULL edge vals used to create the Graph)
while(emptyCentroid < k) {
if(chooseNewCentroid(n, d, k, uniformDist(rng), obs, work, centroids+IDX(0,emptyCentroid,d)))
WARNING("could not replace empty centroid");
if(assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host))
WARNING("could not assign observation vectors to k-means clusters");
emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
thrust::device_pointer_cast(clusterSizes+k), 0) - thrust::device_pointer_cast(clusterSizes));
cudaCheckError();
}
@@ -915,15 +919,15 @@ namespace nvgraph {
Vector<ValueType_> centroids(d*k, stream);
Vector<ValueType_> work(n*max(k,d), stream);
Vector<IndexType_> work_int(2*d*n, stream);

// Perform k-means
return kmeans<IndexType_,ValueType_>(n, d, k, tol, maxiter,
obs, codes,
obs, codes,
clusterSizes.raw(),
centroids.raw(),
work.raw(), work_int.raw(),
&residual, &iters);

}


@@ -948,4 +952,3 @@ namespace nvgraph {
}
//#endif //NVGRAPH_PARTITION
//#endif //debug

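
Aside from trailing-whitespace cleanup, the substantive addition in kmeans.cu is the FIXME above the empty-centroid reinitialization loop. The sketch below is a Python model of that loop, for illustration only (the helper callables are hypothetical stand-ins for the CUDA routines chooseNewCentroid and assignCentroids): if the observation matrix is corrupt, e.g. all distances are NaN because the Graph was built from NULL edge values, reassignment never populates the replacement centroid, a zero cluster size survives every pass, and the CUDA loop never terminates. A pass bound like the one below is the kind of guard the FIXME suggests is missing.

```python
def fix_empty_clusters(cluster_sizes, choose_new_centroid, assign_centroids,
                       max_passes=100):
    """Illustrative model of the nvgraph reinitialization loop.

    choose_new_centroid() and assign_centroids() are hypothetical stand-ins for
    the CUDA helpers; assign_centroids() is expected to update cluster_sizes in
    place. max_passes is a guard that the real loop currently lacks.
    """
    for _ in range(max_passes):
        if 0 not in cluster_sizes:     # equivalent to emptyCentroid == k
            return True                # every cluster has at least one member
        choose_new_centroid()          # replace the first empty centroid
        assign_centroids()             # reassign observations, refresh cluster_sizes
    return False                       # corrupt input: empty cluster never filled
```
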
19 changes: 12 additions & 7 deletions python/cugraph/community/spectral_clustering_wrapper.pyx
@@ -26,6 +26,7 @@ from libc.stdint cimport uintptr_t
from libc.stdlib cimport calloc, malloc, free
from libc.float cimport FLT_MAX_EXP

import cugraph
import cudf
import cudf._lib as libcudf
import rmm
@@ -45,6 +46,9 @@ def spectralBalancedCutClustering(input_graph,
cdef uintptr_t graph = graph_wrapper.allocate_cpp_graph()
cdef Graph * g = <Graph*> graph

if isinstance(input_graph, cugraph.DiGraph):
raise TypeError("DiGraph objects are not supported")

if input_graph.adjlist:
[offsets, indices] = graph_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32])
[weights] = graph_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64])
@@ -73,7 +77,6 @@

# Set the vertex identifiers
g.adjList.get_vertex_identifiers(&c_identifier_col)


balancedCutClustering_nvgraph(g,
num_clusters,
Expand All @@ -83,7 +86,6 @@ def spectralBalancedCutClustering(input_graph,
kmean_tolerance,
kmean_max_iter,
&c_cluster_col)


if input_graph.renumbered:
df = unrenumber(input_graph.edgelist.renumber_map, df, 'vertex')
@@ -103,6 +105,9 @@ def spectralModularityMaximizationClustering(input_graph,
cdef uintptr_t graph = graph_wrapper.allocate_cpp_graph()
cdef Graph * g = <Graph*> graph

if isinstance(input_graph, cugraph.DiGraph):
raise TypeError("DiGraph objects are not supported")

if input_graph.adjlist:
graph_wrapper.add_adj_list(graph, input_graph.adjlist.offsets, input_graph.adjlist.indices, input_graph.adjlist.weights)
else:
@@ -127,7 +132,7 @@

# Set the vertex identifiers
g.adjList.get_vertex_identifiers(&c_identifier_col)


spectralModularityMaximization_nvgraph(g,
num_clusters,
@@ -137,7 +142,7 @@
kmean_tolerance,
kmean_max_iter,
&c_cluster_col)


if input_graph.renumbered:
df = unrenumber(input_graph.edgelist.renumber_map, df, 'vertex')
@@ -165,7 +170,7 @@ def analyzeClustering_modularity(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_modularity_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score

def analyzeClustering_edge_cut(input_graph, n_clusters, clustering):
@@ -189,7 +194,7 @@ def analyzeClustering_edge_cut(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_edge_cut_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score

def analyzeClustering_ratio_cut(input_graph, n_clusters, clustering):
@@ -213,5 +218,5 @@ def analyzeClustering_ratio_cut(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_ratio_cut_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score
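
The same isinstance guard is added to both clustering entry points, so spectralModularityMaximizationClustering rejects DiGraph inputs as well, while the analyzeClustering_* scoring functions are left unchanged. Below is a short sketch of the accepted, undirected path (illustrative only; it assumes the 0.13 Python API and a made-up weighted edge list, since modularity maximization needs edge weights):

```python
import cudf
import cugraph

# Made-up weighted edge list: two triangles joined by one bridge edge.
cu_M = cudf.DataFrame({'0': [0, 1, 2, 3, 4, 5, 2],
                       '1': [1, 2, 0, 4, 5, 3, 3],
                       '2': [1.0] * 7})

G = cugraph.Graph()   # undirected; a cugraph.DiGraph here would raise TypeError
G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

df = cugraph.spectralModularityMaximizationClustering(G, 2)
score = cugraph.analyzeClustering_modularity(G, 2, df['cluster'])
```
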
38 changes: 34 additions & 4 deletions python/cugraph/tests/test_balanced_cut.py
@@ -70,9 +70,9 @@ def test_edge_cut_clustering(managed, pool, graph_file, partitions):
'''row_offsets = cudf.Series(M.indptr)
col_indices = cudf.Series(M.indices)
G_adj = cugraph.DiGraph()
G_adj = cugraph.Graph()
G_adj.from_cudf_adjlist(row_offsets, col_indices)'''
G_edge = cugraph.DiGraph()
G_edge = cugraph.Graph()
G_edge.from_cudf_edgelist(cu_M, source='0', destination='1')

# Get the edge_cut score for partitioning versus random assignment
@@ -118,10 +118,10 @@ def test_edge_cut_clustering_with_edgevals(managed, pool,
col_indices = cudf.Series(M.indices)
val = cudf.Series(M.data)
G_adj = cugraph.DiGraph()
G_adj = cugraph.Graph()
G_adj.from_cudf_adjlist(row_offsets, col_indices, val)
'''
G_edge = cugraph.DiGraph()
G_edge = cugraph.Graph()
G_edge.from_cudf_edgelist(cu_M, source='0', destination='1',
edge_attr='2')

@@ -141,3 +141,33 @@ def test_edge_cut_clustering_with_edgevals(managed, pool,
# assignment
print(cu_score, rand_score)
assert cu_score < rand_score


# Test to ensure DiGraph objs are not accepted
# Test all combinations of default/managed and pooled/non-pooled allocation
@pytest.mark.parametrize('managed, pool',
list(product([False, True], [False, True])))
def test_digraph_rejected(managed, pool):
gc.collect()

rmm.reinitialize(
managed_memory=managed,
pool_allocator=pool,
initial_pool_size=2 << 27
)

assert(rmm.is_initialized())

df = cudf.DataFrame()
df['src'] = cudf.Series(range(10))
df['dst'] = cudf.Series(range(10))
df['val'] = cudf.Series(range(10))

G = cugraph.DiGraph()
G.from_cudf_edgelist(df, source="src",
destination="dst",
edge_attr="val",
renumber=False)

with pytest.raises(Exception):
cugraph_call(G, 2)
32 changes: 31 additions & 1 deletion python/cugraph/tests/test_modularity.py
@@ -65,7 +65,7 @@ def test_modularity_clustering(managed, pool, graph_file, partitions):

# Read in the graph and get a cugraph object
cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
G = cugraph.DiGraph()
G = cugraph.Graph()
G.from_cudf_edgelist(cu_M, source='0', destination='1',
edge_attr='2')

@@ -76,3 +76,33 @@ def test_modularity_clustering(managed, pool, graph_file, partitions):
# Assert that the partitioning has better modularity than the random
# assignment
assert cu_score > rand_score


# Test to ensure DiGraph objs are not accepted
# Test all combinations of default/managed and pooled/non-pooled allocation
@pytest.mark.parametrize('managed, pool',
list(product([False, True], [False, True])))
def test_digraph_rejected(managed, pool):
gc.collect()

rmm.reinitialize(
managed_memory=managed,
pool_allocator=pool,
initial_pool_size=2 << 27
)

assert(rmm.is_initialized())

df = cudf.DataFrame()
df['src'] = cudf.Series(range(10))
df['dst'] = cudf.Series(range(10))
df['val'] = cudf.Series(range(10))

G = cugraph.DiGraph()
G.from_cudf_edgelist(df, source="src",
destination="dst",
edge_attr="val",
renumber=False)

with pytest.raises(Exception):
cugraph_call(G, 2)
