Commit c611719

Merge pull request #774 from rlratzel/branch-0.13-digraphcheck

[REVIEW] [BUG] Raise TypeError if a DiGraph is used with spectral*Clustering()

BradReesWork authored Mar 20, 2020
2 parents 857c234 + 905a1e0 commit c611719

Showing 5 changed files with 104 additions and 35 deletions.
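
For context, a minimal sketch of the user-facing behavior this commit introduces (illustrative only, not part of the diff below; it assumes the cuGraph 0.13 Python API, where the public spectral*Clustering() functions delegate to the Cython wrappers patched here, and uses a made-up two-triangle edge list):

```python
import cudf
import cugraph

# Two triangles joined by one edge -- a small, made-up example graph.
df = cudf.DataFrame({'src': [0, 1, 2, 3, 4, 5, 2],
                     'dst': [1, 2, 0, 4, 5, 3, 3]})

DG = cugraph.DiGraph()
DG.from_cudf_edgelist(df, source='src', destination='dst')

# After this change, spectral clustering rejects DiGraph inputs up front.
try:
    cugraph.spectralBalancedCutClustering(DG, 2)
except TypeError as err:
    print(err)  # "DiGraph objects are not supported"

# An undirected Graph built from the same edge list is still accepted.
G = cugraph.Graph()
G.from_cudf_edgelist(df, source='src', destination='dst')
clusters = cugraph.spectralBalancedCutClustering(G, 2)  # DataFrame with 'vertex' and 'cluster'
```
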
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -38,6 +38,7 @@
- PR #758 Fix for graph comparison failure
- PR #761 Added flag to not treat deprecation warnings as errors, for now
- PR #771 Added unrenumbering in wcc and scc. Updated tests to compare vertices of largest component
- PR #774 Raise TypeError if a DiGraph is used with spectral*Clustering()

# cuGraph 0.12.0 (04 Feb 2020)

49 changes: 26 additions & 23 deletions cpp/src/nvgraph/kmeans.cu
@@ -124,7 +124,7 @@ namespace {
// Perform reduction on warp
for(i=WARP_SIZE/2; i>0; i/=2)
dist_private += utils::shfl_down(dist_private, i, 2*i);

// Write result to global memory
if(threadIdx.x == 0)
atomicFPAdd(dists+IDX(gidz,gidy,n), dist_private);
@@ -136,11 +136,11 @@ namespace {
// Move to another centroid
gidy += blockDim.y*gridDim.y;
}

// Move to another vector entry
bidx += gridDim.x;
}

}

/// Find closest centroid to observation vectors
@@ -198,7 +198,7 @@ namespace {

// Increment cluster sizes
atomicAdd(clusterSizes+code_min, 1);

// Move to another row
i += blockDim.x*gridDim.x;

@@ -251,7 +251,7 @@ namespace {
dists_old[i] = dist_new_private;
codes_old[i] = code_new;
}

// Move to another row
i += blockDim.x*gridDim.x;
}
@@ -314,7 +314,7 @@ namespace {
// Observation vector is determined by global y-index
gidy = threadIdx.y + blockIdx.y*blockDim.y;
while(gidy < k) {

// Get cluster size from global memory
clusterSize_private = clusterSizes[gidy];

@@ -359,7 +359,7 @@ namespace {
const ValueType_ * __restrict__ obs,
ValueType_ * __restrict__ dists,
ValueType_ * __restrict__ centroid) {

using namespace thrust;

// Cumulative sum of distances
@@ -377,7 +377,7 @@ namespace {
CHECK_CUDA(cudaMemcpy(&distsSum, distsCumSum+n-1,
sizeof(ValueType_),
cudaMemcpyDeviceToHost));

// Randomly choose observation vector
// Probabilities are proportional to square of distance to closest
// centroid (see k-means++ algorithm)
@@ -440,7 +440,7 @@ namespace {
// Random number generator
thrust::default_random_engine rng(123456);
thrust::uniform_real_distribution<ValueType_> uniformDist(0,1);

// -------------------------------------------------------
// Implementation
// -------------------------------------------------------
@@ -451,7 +451,7 @@ namespace {
blockDim_warp.z = BSIZE_DIV_WSIZE;
gridDim_warp.x = min((d+WARP_SIZE-1)/WARP_SIZE, 65535);
gridDim_warp.y = 1;
gridDim_warp.z
gridDim_warp.z
= min((n+BSIZE_DIV_WSIZE-1)/BSIZE_DIV_WSIZE, 65535);
gridDim_block.x = min((n+BLOCK_SIZE-1)/BLOCK_SIZE, 65535);
gridDim_block.y = 1;
@@ -475,9 +475,9 @@

// Choose remaining centroids
for(i=1; i<k; ++i) {

// Choose ith centroid
if(chooseNewCentroid(n, d, k, uniformDist(rng),obs, dists, centroids+IDX(0,i,d)))
if(chooseNewCentroid(n, d, k, uniformDist(rng),obs, dists, centroids+IDX(0,i,d)))
WARNING("error in k-means++ (could not pick centroid)");

// Compute distances from ith centroid
@@ -716,7 +716,7 @@ namespace nvgraph {
IndexType_ * __restrict__ work_int,
ValueType_ * residual_host,
IndexType_ * iters_host) {

// -------------------------------------------------------
// Variable declarations
// -------------------------------------------------------
@@ -764,7 +764,7 @@ namespace nvgraph {
cudaMemcpyHostToDevice));
if(updateCentroids(n, d, k, obs, codes,
clusterSizes, centroids,
work, work_int))
work, work_int))
WARNING("could not compute k-means centroids");
dim3 blockDim, gridDim;
blockDim.x = WARP_SIZE;
@@ -779,7 +779,7 @@ namespace nvgraph {
centroids,
work);
cudaCheckError();
*residual_host = thrust::reduce(thrust::device_pointer_cast(work),
*residual_host = thrust::reduce(thrust::device_pointer_cast(work),
thrust::device_pointer_cast(work+n));
cudaCheckError();
return NVGRAPH_OK;
@@ -808,14 +808,14 @@ namespace nvgraph {

// Choose initial cluster centroids
if(initializeCentroids(n, d, k, obs, centroids, codes,
clusterSizes, work))
clusterSizes, work))
WARNING("could not initialize k-means centroids");

// Apply k-means iteration until convergence
for(iter=0; iter<maxiter; ++iter) {

// Update cluster centroids
if(updateCentroids(n, d, k, obs, codes,
if(updateCentroids(n, d, k, obs, codes,
clusterSizes, centroids,
work, work_int)) WARNING("could not update k-means centroids");

@@ -826,14 +826,18 @@ namespace nvgraph {
WARNING("could not assign observation vectors to k-means clusters");

// Reinitialize empty clusters with new centroids
IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
IndexType_ emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
thrust::device_pointer_cast(clusterSizes+k), 0) - thrust::device_pointer_cast(clusterSizes));

// FIXME: emptyCentroid never reaches k (infinite loop) under certain
// conditions, such as if obs is corrupt (as seen as a result of a
// DataFrame column of NULL edge vals used to create the Graph)
while(emptyCentroid < k) {
if(chooseNewCentroid(n, d, k, uniformDist(rng), obs, work, centroids+IDX(0,emptyCentroid,d)))
WARNING("could not replace empty centroid");
if(assignCentroids(n, d, k, obs, centroids, work, codes, clusterSizes, residual_host))
WARNING("could not assign observation vectors to k-means clusters");
emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
emptyCentroid = (thrust::find(thrust::device_pointer_cast(clusterSizes),
thrust::device_pointer_cast(clusterSizes+k), 0) - thrust::device_pointer_cast(clusterSizes));
cudaCheckError();
}
@@ -915,15 +919,15 @@ namespace nvgraph {
Vector<ValueType_> centroids(d*k, stream);
Vector<ValueType_> work(n*max(k,d), stream);
Vector<IndexType_> work_int(2*d*n, stream);

// Perform k-means
return kmeans<IndexType_,ValueType_>(n, d, k, tol, maxiter,
obs, codes,
obs, codes,
clusterSizes.raw(),
centroids.raw(),
work.raw(), work_int.raw(),
&residual, &iters);

}


@@ -948,4 +952,3 @@ namespace nvgraph {
}
//#endif //NVGRAPH_PARTITION
//#endif //debug

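
Aside from trailing-whitespace cleanup, the substantive addition in kmeans.cu is the FIXME above the empty-centroid reinitialization loop. The sketch below is a Python model of that loop, for illustration only (the helper callables are hypothetical stand-ins for the CUDA routines chooseNewCentroid and assignCentroids): if the observation matrix is corrupt, e.g. all distances are NaN because the Graph was built from NULL edge values, reassignment never populates the replacement centroid, a zero cluster size survives every pass, and the CUDA loop never terminates. A pass bound like the one below is the kind of guard the FIXME suggests is missing.

```python
def fix_empty_clusters(cluster_sizes, choose_new_centroid, assign_centroids,
                       max_passes=100):
    """Illustrative model of the nvgraph reinitialization loop.

    choose_new_centroid() and assign_centroids() are hypothetical stand-ins for
    the CUDA helpers; assign_centroids() is expected to update cluster_sizes in
    place. max_passes is a guard that the real loop currently lacks.
    """
    for _ in range(max_passes):
        if 0 not in cluster_sizes:     # equivalent to emptyCentroid == k
            return True                # every cluster has at least one member
        choose_new_centroid()          # replace the first empty centroid
        assign_centroids()             # reassign observations, refresh cluster_sizes
    return False                       # corrupt input: empty cluster never filled
```
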
19 changes: 12 additions & 7 deletions python/cugraph/community/spectral_clustering_wrapper.pyx
@@ -26,6 +26,7 @@ from libc.stdint cimport uintptr_t
from libc.stdlib cimport calloc, malloc, free
from libc.float cimport FLT_MAX_EXP

import cugraph
import cudf
import cudf._lib as libcudf
import rmm
@@ -45,6 +46,9 @@ def spectralBalancedCutClustering(input_graph,
cdef uintptr_t graph = graph_wrapper.allocate_cpp_graph()
cdef Graph * g = <Graph*> graph

if isinstance(input_graph, cugraph.DiGraph):
raise TypeError("DiGraph objects are not supported")

if input_graph.adjlist:
[offsets, indices] = graph_wrapper.datatype_cast([input_graph.adjlist.offsets, input_graph.adjlist.indices], [np.int32])
[weights] = graph_wrapper.datatype_cast([input_graph.adjlist.weights], [np.float32, np.float64])
@@ -73,7 +77,6 @@

# Set the vertex identifiers
g.adjList.get_vertex_identifiers(&c_identifier_col)


balancedCutClustering_nvgraph(g,
num_clusters,
Expand All @@ -83,7 +86,6 @@ def spectralBalancedCutClustering(input_graph,
kmean_tolerance,
kmean_max_iter,
&c_cluster_col)


if input_graph.renumbered:
df = unrenumber(input_graph.edgelist.renumber_map, df, 'vertex')
@@ -103,6 +105,9 @@ def spectralModularityMaximizationClustering(input_graph,
cdef uintptr_t graph = graph_wrapper.allocate_cpp_graph()
cdef Graph * g = <Graph*> graph

if isinstance(input_graph, cugraph.DiGraph):
raise TypeError("DiGraph objects are not supported")

if input_graph.adjlist:
graph_wrapper.add_adj_list(graph, input_graph.adjlist.offsets, input_graph.adjlist.indices, input_graph.adjlist.weights)
else:
@@ -127,7 +132,7 @@

# Set the vertex identifiers
g.adjList.get_vertex_identifiers(&c_identifier_col)


spectralModularityMaximization_nvgraph(g,
num_clusters,
@@ -137,7 +142,7 @@
kmean_tolerance,
kmean_max_iter,
&c_cluster_col)


if input_graph.renumbered:
df = unrenumber(input_graph.edgelist.renumber_map, df, 'vertex')
@@ -165,7 +170,7 @@ def analyzeClustering_modularity(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_modularity_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score

def analyzeClustering_edge_cut(input_graph, n_clusters, clustering):
@@ -189,7 +194,7 @@ def analyzeClustering_edge_cut(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_edge_cut_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score

def analyzeClustering_ratio_cut(input_graph, n_clusters, clustering):
@@ -213,5 +218,5 @@ def analyzeClustering_ratio_cut(input_graph, n_clusters, clustering):
cdef gdf_column c_clustering_col = get_gdf_column_view(clustering)
cdef float score
analyzeClustering_ratio_cut_nvgraph(g, n_clusters, &c_clustering_col, &score)

return score
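
The same isinstance guard is added to both clustering entry points, so spectralModularityMaximizationClustering rejects DiGraph inputs as well, while the analyzeClustering_* scoring functions are left unchanged. Below is a short sketch of the accepted, undirected path (illustrative only; it assumes the 0.13 Python API and a made-up weighted edge list, since modularity maximization needs edge weights):

```python
import cudf
import cugraph

# Made-up weighted edge list: two triangles joined by one bridge edge.
cu_M = cudf.DataFrame({'0': [0, 1, 2, 3, 4, 5, 2],
                       '1': [1, 2, 0, 4, 5, 3, 3],
                       '2': [1.0] * 7})

G = cugraph.Graph()   # undirected; a cugraph.DiGraph here would raise TypeError
G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

df = cugraph.spectralModularityMaximizationClustering(G, 2)
score = cugraph.analyzeClustering_modularity(G, 2, df['cluster'])
```
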
38 changes: 34 additions & 4 deletions python/cugraph/tests/test_balanced_cut.py
@@ -70,9 +70,9 @@ def test_edge_cut_clustering(managed, pool, graph_file, partitions):
'''row_offsets = cudf.Series(M.indptr)
col_indices = cudf.Series(M.indices)
G_adj = cugraph.DiGraph()
G_adj = cugraph.Graph()
G_adj.from_cudf_adjlist(row_offsets, col_indices)'''
G_edge = cugraph.DiGraph()
G_edge = cugraph.Graph()
G_edge.from_cudf_edgelist(cu_M, source='0', destination='1')

# Get the edge_cut score for partitioning versus random assignment
@@ -118,10 +118,10 @@ def test_edge_cut_clustering_with_edgevals(managed, pool,
col_indices = cudf.Series(M.indices)
val = cudf.Series(M.data)
G_adj = cugraph.DiGraph()
G_adj = cugraph.Graph()
G_adj.from_cudf_adjlist(row_offsets, col_indices, val)
'''
G_edge = cugraph.DiGraph()
G_edge = cugraph.Graph()
G_edge.from_cudf_edgelist(cu_M, source='0', destination='1',
edge_attr='2')

@@ -141,3 +141,33 @@ def test_edge_cut_clustering_with_edgevals(managed, pool,
# assignment
print(cu_score, rand_score)
assert cu_score < rand_score


# Test to ensure DiGraph objs are not accepted
# Test all combinations of default/managed and pooled/non-pooled allocation
@pytest.mark.parametrize('managed, pool',
list(product([False, True], [False, True])))
def test_digraph_rejected(managed, pool):
gc.collect()

rmm.reinitialize(
managed_memory=managed,
pool_allocator=pool,
initial_pool_size=2 << 27
)

assert(rmm.is_initialized())

df = cudf.DataFrame()
df['src'] = cudf.Series(range(10))
df['dst'] = cudf.Series(range(10))
df['val'] = cudf.Series(range(10))

G = cugraph.DiGraph()
G.from_cudf_edgelist(df, source="src",
destination="dst",
edge_attr="val",
renumber=False)

with pytest.raises(Exception):
cugraph_call(G, 2)
32 changes: 31 additions & 1 deletion python/cugraph/tests/test_modularity.py
@@ -65,7 +65,7 @@ def test_modularity_clustering(managed, pool, graph_file, partitions):

# Read in the graph and get a cugraph object
cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False)
G = cugraph.DiGraph()
G = cugraph.Graph()
G.from_cudf_edgelist(cu_M, source='0', destination='1',
edge_attr='2')

@@ -76,3 +76,33 @@ def test_modularity_clustering(managed, pool, graph_file, partitions):
# Assert that the partitioning has better modularity than the random
# assignment
assert cu_score > rand_score


# Test to ensure DiGraph objs are not accepted
# Test all combinations of default/managed and pooled/non-pooled allocation
@pytest.mark.parametrize('managed, pool',
list(product([False, True], [False, True])))
def test_digraph_rejected(managed, pool):
gc.collect()

rmm.reinitialize(
managed_memory=managed,
pool_allocator=pool,
initial_pool_size=2 << 27
)

assert(rmm.is_initialized())

df = cudf.DataFrame()
df['src'] = cudf.Series(range(10))
df['dst'] = cudf.Series(range(10))
df['val'] = cudf.Series(range(10))

G = cugraph.DiGraph()
G.from_cudf_edgelist(df, source="src",
destination="dst",
edge_attr="val",
renumber=False)

with pytest.raises(Exception):
cugraph_call(G, 2)
