diff --git a/src/GPUGraphsMatrix.jl b/src/GPUGraphsMatrix.jl
index 49dfb03..dc8ff9f 100644
--- a/src/GPUGraphsMatrix.jl
+++ b/src/GPUGraphsMatrix.jl
@@ -205,6 +205,7 @@ mutable struct SparseGPUMatrixSELL{
 } <: AbstractSparseGPUMatrix{Tv,Ti}
     m::Int
     n::Int
+    perm::Vector{Int} # Row permutation to reduce padding (not implemented yet)
     slice_size::Int
     nslices::Int # Number of slices
     nnz::Int # Number of nonzeros
@@ -219,6 +220,7 @@ mutable struct SparseGPUMatrixSELL{
     function SparseGPUMatrixSELL(
         m::Int,
         n::Int,
+        perm::Vector{Int},
         slice_size::Int,
         nslices::Int,
         nnz::Int,
@@ -269,6 +271,7 @@ mutable struct SparseGPUMatrixSELL{
         new{Tv,Ti,typeof(nzval_gpu),typeof(slice_ptr_gpu)}(
             m,
             n,
+            perm,
             slice_size,
             nslices,
             nnz,
@@ -304,6 +307,11 @@ function SparseGPUMatrixSELL(
     max_nnz_per_slice = zeros(Int, n_slices)
     nnz_per_row = diff(rowptr)
+    # Compute optimal permutation of rows to minimize padding (not implemented yet)
+    #perm = reverse!(sortperm(nnz_per_row[:]))
+    perm = collect(1:size(m_t, 2))
+    nnz_per_row = nnz_per_row[perm]
+
     # Compute the maximum number of nonzeros per row for each slice
     n_stored = 0
     for i = 1:n_slices
@@ -333,11 +341,15 @@ function SparseGPUMatrixSELL(
             if row > size(m_t, 2)
                 break
            end
-            start = rowptr[row]
-            end_ = rowptr[row+1] - 1
+
+            start = rowptr[perm[row]]
+            end_ = rowptr[perm[row]+1] - 1
             temp_colval[row-slice_start+1, 1:(end_-start+1)] = colval[start:end_]
             temp_nzval[row-slice_start+1, 1:(end_-start+1)] = nzval[start:end_]
+
+
         end
+
         # Reshape the sub-matrix to make it column-major vector and copy it to final storage
         colval_padded[slice_ptr[slice]:(slice_ptr[slice+1]-1)] =
@@ -346,11 +358,13 @@ function SparseGPUMatrixSELL(
             collect(Iterators.flatten(temp_nzval))
+
+
     end
     SparseGPUMatrixSELL(
         size(m_t, 2),
         size(m_t, 1),
+        perm,
         slice_size,
         n_slices,
         nnz(m_t),
@@ -372,7 +386,7 @@ function SparseGPUMatrixSELL(
 end
-# Base methods for the SparseGPUMatrixCSR type
+# Base methods for the SparseGPUMatrixSELL type
 Base.size(A::SparseGPUMatrixSELL) = (A.m, A.n)
 Base.size(A::SparseGPUMatrixSELL, i::Int) = (i == 1) ? A.m : A.n
 Base.length(A::SparseGPUMatrixSELL) = A.m * A.n
@@ -385,7 +399,7 @@ Base.display(A::SparseGPUMatrixSELL) = show(stdout, A)
 function Base.getindex(A::SparseGPUMatrixSELL, i::Int, j::Int)
-    #@warn "Scalar indexing on a SparseGPUMatrixCSR is slow. For better performance, vectorize the operation."
+    #@warn "Scalar indexing on a SparseGPUMatrixSELL is slow. For better performance, vectorize the operation."
     if i < 1 || i > A.m || j < 1 || j > A.n
         throw(BoundsError(A, (i, j)))
     end
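Note on the SELL constructor change above: the new `perm` field is still the identity permutation, and the commented-out `reverse!(sortperm(nnz_per_row[:]))` hints at the intended strategy of grouping rows of similar length into the same slice so each slice pads less. Below is a minimal CPU-side sketch of that idea, under the assumption that a slice stores `slice_size * max_nnz_in_slice` entries as in the constructor; `sell_padding` and the surrounding names are hypothetical and not part of the package.

```julia
# Hypothetical helper: count the padded entries a SELL layout would store for a
# given row permutation, assuming each slice holds slice_size * (max nnz among its rows).
function sell_padding(rowptr::Vector{<:Integer}, perm::Vector{Int}, slice_size::Int)
    nnz_per_row = diff(rowptr)[perm]
    nrows = length(nnz_per_row)
    padded = 0
    for s in 1:cld(nrows, slice_size)
        rows = ((s - 1) * slice_size + 1):min(s * slice_size, nrows)
        padded += slice_size * maximum(nnz_per_row[rows])
    end
    return padded - sum(nnz_per_row)   # stored entries that are pure padding
end

# Identity permutation (what the constructor currently uses) versus rows sorted
# by decreasing length (what the commented-out line would compute):
# perm_id     = collect(1:length(rowptr) - 1)
# perm_sorted = sortperm(diff(rowptr); rev = true)
# sell_padding(rowptr, perm_sorted, 32) <= sell_padding(rowptr, perm_id, 32)
```

Keeping `perm` inside the struct would then allow mapping permuted rows back to their original indices after an SpMV, which is presumably why the field was added ahead of the actual reordering.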
diff --git a/src/algorithms/shortest_path.jl b/src/algorithms/shortest_path.jl
index bd05a22..0cd13ba 100644
--- a/src/algorithms/shortest_path.jl
+++ b/src/algorithms/shortest_path.jl
@@ -57,6 +57,7 @@ function shortest_path!(
     while true
         iter += one(Ti)
+
         gpu_spmv!(
            next,
            A_T,
@@ -67,10 +68,13 @@
            #mask = updated, Not used yet
        )
        # Diff : where we made progress
-        @. diff = next < dist
-        if reduce(|, diff) == zero(Td)
-            return nothing
+        if iter%8 == 0
+            @. diff = next < dist
+            if reduce(|, diff) == zero(Td)
+                return nothing
+            end
        end
+
        # Update the dist array
        dist .= next
diff --git a/src/spmv.jl b/src/spmv.jl
index d08ae9e..0fd47d0 100644
--- a/src/spmv.jl
+++ b/src/spmv.jl
@@ -340,15 +340,18 @@ end
    add,
    accum,
 )
+    #slice, offset = @index(Global, NTuple)
+    #offset = offset - 1
+    #row = (slice-1) * slice_size + offset + 1
     row = @index(Global, Linear)
-    if mask[row] != mask_zero
-        slice = (row-1) ÷ slice_size + 1
-        offset = (row-1) % slice_size
+    slice = (row-1) ÷ slice_size + 1
+    offset = (row-1) % slice_size
+    if row <= n && mask[row] != mask_zero
         acc = monoid_neutral_element
         for i = (a_slice_ptr[slice]+offset):slice_size:(a_slice_ptr[slice+1]-1)
             col = a_col_val[i]
-            if col == -1
+            if col == -1
                 break
             end
             acc = add(acc, mul(a_nz_val[i], b[col], row, col, col, 1), row, col, col, 1)
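For reference, the indexing used by the reworked SELL SpMV kernel above can be mirrored by a plain CPU loop. This is only a sketch, specialized to the ordinary `+`/`*` semiring with no mask, assuming `-1` is the column padding sentinel (consistent with `const PAD_VAL = -1` in the tests); `sell_spmv_reference!` is a hypothetical helper, not an exported function.

```julia
# CPU sketch of the traversal in the kernel above, for the plain +/* semiring and
# no mask. Entries of a row sit in its slice's column-major block, spaced
# slice_size apart; a column value of -1 marks padding.
function sell_spmv_reference!(y::AbstractVector{T}, slice_ptr, col_val, nz_val,
                              x::AbstractVector, slice_size::Integer) where {T}
    for row in eachindex(y)
        slice = (row - 1) ÷ slice_size + 1
        offset = (row - 1) % slice_size
        acc = zero(T)
        for i in (slice_ptr[slice] + offset):slice_size:(slice_ptr[slice + 1] - 1)
            col = col_val[i]
            col == -1 && break   # remaining entries of this row are padding
            acc += nz_val[i] * x[col]
        end
        y[row] = acc
    end
    return y
end
```

The `row <= n && ...` guard added in the kernel presumably covers the extra work-items launched when the row count is not a multiple of the work-group size; a sequential loop like the one above does not need it.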
diff --git a/test/runtests.jl b/test/runtests.jl
index 6cdb786..5181a12 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -41,9 +41,9 @@ const PAD_VAL = -1
-    #include("structs.jl")
-    #include("spmv.jl")
-    #include("spmm.jl")
-    #include("bfs.jl")
+    include("structs.jl")
+    include("spmv.jl")
+    include("spmm.jl")
+    include("bfs.jl")
     include("shortest_path.jl")
 end
diff --git a/test/scratch.jl b/test/scratch.jl
index f0e410e..4dcff7c 100644
--- a/test/scratch.jl
+++ b/test/scratch.jl
@@ -109,41 +109,46 @@ using HarwellRutherfordBoeing
 MAIN_TYPE = Bool
 #MAIN_TYPE = Float32
+RES_TYPE = ifelse(MAIN_TYPE == Bool, Int32, MAIN_TYPE)
 BACKEND = CUDA.CUDABackend()
-ssmc = ssmc_db();
-orkut_path = fetch_ssmc(ssmc_matrices(ssmc, "SNAP", "Orkut"), format = "RB")[1]
-loaded_matrix = RutherfordBoeingData(joinpath(orkut_path, "com-Orkut.rb"));
-graph = SimpleDiGraph(loaded_matrix.data)
-
-#graph = dorogovtsev_mendes(150)
-A_cpu = transpose(
-    convert(
+use_dataset = true
+graph = dorogovtsev_mendes(150)
+if use_dataset
+    ssmc = ssmc_db();
+    orkut_path = fetch_ssmc(ssmc_matrices(ssmc, "SNAP", "Orkut"), format = "RB")[1]
+    nlpkkt_path = fetch_ssmc(ssmc_matrices(ssmc, "Schenk", "nlpkkt160"), format = "RB")[1]
+
+    #loaded_matrix = RutherfordBoeingData(joinpath(orkut_path, "com-Orkut.rb"));
+    loaded_matrix = RutherfordBoeingData(joinpath(nlpkkt_path, "nlpkkt160.rb"));
+    graph = SimpleDiGraph(loaded_matrix.data)
+else
+    graph = dorogovtsev_mendes(150)
+end
+A_cpu = transpose(convert(
         SparseMatrixCSC{MAIN_TYPE,Int32},
         adjacency_matrix(graph, MAIN_TYPE; dir = :both),
-    ),
-)
-#A2 = convert(
-#    SparseMatrixCSC{Bool,Int32},
-#    adjacency_matrix(graph, Bool; dir = :both),
-#)
-
-#A_T_gpu2 = SparseGPUMatrixCSR(transpose(A2), BACKEND)
+    ))
+
+
+
 SIZE = size(A_cpu, 2)
-SIZE_2 = 32
+A_T = SparseGPUMatrixSELL(A_cpu, BACKEND)
+A_T2 = SparseGPUMatrixCSR(A_cpu, BACKEND)
+
+SIZE_2 = 8
 B_cpu = rand(MAIN_TYPE, SIZE, SIZE_2);
 b_cpu = B_cpu[:, 1];
-C_cpu = zeros(MAIN_TYPE, SIZE, SIZE_2);
+C_cpu = zeros(RES_TYPE, SIZE, SIZE_2);
+
 
-A_T = SparseGPUMatrixSELL(A_cpu, BACKEND)
-A_T2 = SparseGPUMatrixCSR(A_cpu, BACKEND)
 B = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE, SIZE_2);
 copyto!(B, B_cpu);
 b = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE);
 copyto!(b, b_cpu);
-C = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE, SIZE_2);
-c = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE);
+C = KernelAbstractions.zeros(BACKEND, RES_TYPE, SIZE, SIZE_2);
+c = KernelAbstractions.zeros(BACKEND, RES_TYPE, SIZE);
 mask = rand(Bool, SIZE)
 mask_dense = KernelAbstractions.zeros(BACKEND, Bool, SIZE)
@@ -169,7 +174,14 @@ isapprox(C_cpu, C_res)
     end
     CUDA.synchronize()
 end
-c_res = zeros(MAIN_TYPE, SIZE);
+@benchmark begin
+    for _ = 1:SIZE_2
+        gpu_spmv!(c, A_T2, b, mask = mask_dense)
+    end
+    CUDA.synchronize()
+end
+
+c_res = zeros(RES_TYPE, SIZE);
 copyto!(c_res, c);
 c_cpu = A_cpu * b_cpu .* mask;
 isapprox(c_cpu, c_res)
@@ -201,7 +213,7 @@ isapprox(mat_res_cpu, vec_res)
 @benchmark begin
-    mat_res = GPUGraphs.shortest_path(A_T2, convert(Vector{Int32}, range(1, SIZE_2)));
+    mat_res = GPUGraphs.shortest_path(A_T, convert(Vector{Int32}, range(1, SIZE_2)));
     KernelAbstractions.synchronize(BACKEND)
 end
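A short note on the `RES_TYPE` introduced in test/scratch.jl: when `MAIN_TYPE == Bool`, the CPU reference `A_cpu * b_cpu` accumulates Bool products into integers, so the GPU-side result buffers are allocated with an integer element type before the `isapprox` comparison. A small illustration of that standard Julia promotion (not package code):

```julia
using SparseArrays

A = sparse(Bool[1 1; 0 1])   # Bool adjacency-style matrix
b = Bool[true, true]

A * b           # 2-element Vector{Int64}: [2, 1]; Bool matvec accumulates to Int
eltype(A * b)   # Int64 on 64-bit Julia, hence an integer RES_TYPE on the GPU side
```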