-
-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Speed up dense-sparse matmul #38876
Merged
Merged
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
2b9bf0e
Speed up dense-sparse matmul
dkarrasch 08133a8
add one at-simd, minor edits
dkarrasch 48d19a4
improve A_mul_Bq for dense-sparse
dkarrasch 0d8c107
revert ineffective changes
dkarrasch 593d744
shift at-inbounds annotation
dkarrasch File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,10 +34,10 @@ function mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::Union{StridedVe | |
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
for k = 1:size(C, 2) | ||
@inbounds for col = 1:size(A, 2) | ||
for k in 1:size(C, 2) | ||
@inbounds for col in 1:size(A, 2) | ||
αxj = B[col,k] * α | ||
for j = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
for j in nzrange(A, col) | ||
C[rv[j], k] += nzv[j]*αxj | ||
end | ||
end | ||
|
@@ -49,67 +49,38 @@ end | |
*(A::SparseMatrixCSCUnion{TA}, B::AdjOrTransStridedOrTriangularMatrix{Tx}) where {TA,Tx} = | ||
(T = promote_op(matprod, TA, Tx); mul!(similar(B, T, (size(A, 1), size(B, 2))), A, B, true, false)) | ||
|
||
function mul!(C::StridedVecOrMat, adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}, α::Number, β::Number) | ||
A = adjA.parent | ||
size(A, 2) == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(B, 1) || throw(DimensionMismatch()) | ||
size(B, 2) == size(C, 2) || throw(DimensionMismatch()) | ||
nzv = nonzeros(A) | ||
rv = rowvals(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
for k = 1:size(C, 2) | ||
@inbounds for col = 1:size(A, 2) | ||
tmp = zero(eltype(C)) | ||
for j = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
tmp += adjoint(nzv[j])*B[rv[j],k] | ||
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) | ||
@eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}, α::Number, β::Number) | ||
A = xA.parent | ||
size(A, 2) == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(B, 1) || throw(DimensionMismatch()) | ||
size(B, 2) == size(C, 2) || throw(DimensionMismatch()) | ||
nzv = nonzeros(A) | ||
rv = rowvals(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
for k in 1:size(C, 2) | ||
@inbounds for col in 1:size(A, 2) | ||
tmp = zero(eltype(C)) | ||
for j in nzrange(A, col) | ||
tmp += $t(nzv[j])*B[rv[j],k] | ||
end | ||
C[col,k] += tmp * α | ||
end | ||
C[col,k] += tmp * α | ||
end | ||
C | ||
end | ||
C | ||
end | ||
*(adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}, x::StridedVector{Tx}) where {Tx} = | ||
(T = promote_op(matprod, eltype(adjA), Tx); mul!(similar(x, T, size(adjA, 1)), adjA, x, true, false)) | ||
*(adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}, B::AdjOrTransStridedOrTriangularMatrix) = | ||
(T = promote_op(matprod, eltype(adjA), eltype(B)); mul!(similar(B, T, (size(adjA, 1), size(B, 2))), adjA, B, true, false)) | ||
|
||
function mul!(C::StridedVecOrMat, transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}, α::Number, β::Number) | ||
A = transA.parent | ||
size(A, 2) == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(B, 1) || throw(DimensionMismatch()) | ||
size(B, 2) == size(C, 2) || throw(DimensionMismatch()) | ||
nzv = nonzeros(A) | ||
rv = rowvals(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
for k = 1:size(C, 2) | ||
@inbounds for col = 1:size(A, 2) | ||
tmp = zero(eltype(C)) | ||
for j = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
tmp += transpose(nzv[j])*B[rv[j],k] | ||
end | ||
C[col,k] += tmp * α | ||
end | ||
end | ||
C | ||
end | ||
*(transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}, x::StridedVector{Tx}) where {Tx} = | ||
(T = promote_op(matprod, eltype(transA), Tx); mul!(similar(x, T, size(transA, 1)), transA, x, true, false)) | ||
*(transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}, B::AdjOrTransStridedOrTriangularMatrix) = | ||
(T = promote_op(matprod, eltype(transA), eltype(B)); mul!(similar(B, T, (size(transA, 1), size(B, 2))), transA, B, true, false)) | ||
|
||
# For compatibility with dense multiplication API. Should be deleted when dense multiplication | ||
# API is updated to follow BLAS API. | ||
mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}) = | ||
mul!(C, A, B, true, false) | ||
mul!(C::StridedVecOrMat, adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}) = | ||
mul!(C, adjA, B, true, false) | ||
mul!(C::StridedVecOrMat, transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}, B::Union{StridedVector,AdjOrTransStridedOrTriangularMatrix}) = | ||
mul!(C, transA, B, true, false) | ||
|
||
function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, A::AbstractSparseMatrixCSC, α::Number, β::Number) | ||
mX, nX = size(X) | ||
nX == size(A, 1) || throw(DimensionMismatch()) | ||
|
@@ -120,49 +91,50 @@ function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, A::Abs | |
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
@inbounds for multivec_row=1:mX, col = 1:size(A, 2), k=getcolptr(A)[col]:(getcolptr(A)[col+1]-1) | ||
C[multivec_row, col] += α * X[multivec_row, rv[k]] * nzv[k] # perhaps suboptimal position of α? | ||
if X isa StridedOrTriangularMatrix | ||
@inbounds for col in 1:size(A, 2), k in nzrange(A, col) | ||
Aiα = nzv[k] * α | ||
rvk = rv[k] | ||
@simd for multivec_row in 1:mX | ||
C[multivec_row, col] += X[multivec_row, rvk] * Aiα | ||
end | ||
end | ||
else # X isa Adjoint or Transpose | ||
for multivec_row in 1:mX, col in 1:size(A, 2) | ||
@inbounds for k in nzrange(A, col) | ||
C[multivec_row, col] += X[multivec_row, rv[k]] * nzv[k] * α | ||
end | ||
end | ||
end | ||
C | ||
end | ||
*(X::AdjOrTransStridedOrTriangularMatrix, A::SparseMatrixCSCUnion{TvA}) where {TvA} = | ||
(T = promote_op(matprod, eltype(X), TvA); mul!(similar(X, T, (size(X, 1), size(A, 2))), X, A, true, false)) | ||
|
||
function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}, α::Number, β::Number) | ||
A = adjA.parent | ||
mX, nX = size(X) | ||
nX == size(A, 2) || throw(DimensionMismatch()) | ||
mX == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(C, 2) || throw(DimensionMismatch()) | ||
rv = rowvals(A) | ||
nzv = nonzeros(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
@inbounds for col = 1:size(A, 2), k=getcolptr(A)[col]:(getcolptr(A)[col+1]-1), multivec_col=1:mX | ||
C[multivec_col, rv[k]] += α * X[multivec_col, col] * adjoint(nzv[k]) # perhaps suboptimal position of α? | ||
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) | ||
@eval function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, α::Number, β::Number) | ||
A = xA.parent | ||
mX, nX = size(X) | ||
nX == size(A, 2) || throw(DimensionMismatch()) | ||
mX == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(C, 2) || throw(DimensionMismatch()) | ||
rv = rowvals(A) | ||
nzv = nonzeros(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
@inbounds for col in 1:size(A, 2), k in nzrange(A, col) | ||
Aiα = $t(nzv[k]) * α | ||
rvk = rv[k] | ||
@simd for multivec_col in 1:mX | ||
Comment on lines
+126
to
+129
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Loop order is the same as before, but the two line have been hoisted out and the inner most loop |
||
C[multivec_col, rvk] += X[multivec_col, col] * Aiα | ||
end | ||
end | ||
C | ||
end | ||
C | ||
end | ||
*(X::AdjOrTransStridedOrTriangularMatrix, adjA::Adjoint{<:Any,<:AbstractSparseMatrixCSC}) = | ||
(T = promote_op(matprod, eltype(X), eltype(adjA)); mul!(similar(X, T, (size(X, 1), size(adjA, 2))), X, adjA, true, false)) | ||
|
||
function mul!(C::StridedVecOrMat, X::AdjOrTransStridedOrTriangularMatrix, transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}, α::Number, β::Number) | ||
A = transA.parent | ||
mX, nX = size(X) | ||
nX == size(A, 2) || throw(DimensionMismatch()) | ||
mX == size(C, 1) || throw(DimensionMismatch()) | ||
size(A, 1) == size(C, 2) || throw(DimensionMismatch()) | ||
rv = rowvals(A) | ||
nzv = nonzeros(A) | ||
if β != 1 | ||
β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) | ||
end | ||
@inbounds for col = 1:size(A, 2), k=getcolptr(A)[col]:(getcolptr(A)[col+1]-1), multivec_col=1:mX | ||
C[multivec_col, rv[k]] += α * X[multivec_col, col] * transpose(nzv[k]) # perhaps suboptimal position of α? | ||
end | ||
C | ||
end | ||
*(X::AdjOrTransStridedOrTriangularMatrix, transA::Transpose{<:Any,<:AbstractSparseMatrixCSC}) = | ||
(T = promote_op(matprod, eltype(X), eltype(transA)); mul!(similar(X, T, (size(X, 1), size(transA, 2))), X, transA, true, false)) | ||
|
||
|
@@ -896,7 +868,7 @@ function ldiv!(D::Diagonal{T}, A::AbstractSparseMatrixCSC{T}) where {T} | |
for i=1:length(b) | ||
iszero(b[i]) && throw(SingularException(i)) | ||
end | ||
@inbounds for col = 1:size(A, 2), p = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
@inbounds for col in 1:size(A, 2), p in nzrange(A, col) | ||
nonz[p] = b[Arowval[p]] \ nonz[p] | ||
end | ||
A | ||
|
@@ -916,7 +888,7 @@ function triu(S::AbstractSparseMatrixCSC{Tv,Ti}, k::Integer=0) where {Tv,Ti} | |
colptr[col] = 1 | ||
end | ||
for col = max(k+1,1) : n | ||
for c1 = getcolptr(S)[col] : getcolptr(S)[col+1]-1 | ||
for c1 in nzrange(S, col) | ||
rowvals(S)[c1] > col - k && break | ||
nnz += 1 | ||
end | ||
|
@@ -927,7 +899,7 @@ function triu(S::AbstractSparseMatrixCSC{Tv,Ti}, k::Integer=0) where {Tv,Ti} | |
A = SparseMatrixCSC(m, n, colptr, rowval, nzval) | ||
for col = max(k+1,1) : n | ||
c1 = getcolptr(S)[col] | ||
for c2 = getcolptr(A)[col] : getcolptr(A)[col+1]-1 | ||
for c2 in nzrange(A, col) | ||
rowvals(A)[c2] = rowvals(S)[c1] | ||
nonzeros(A)[c2] = nonzeros(S)[c1] | ||
c1 += 1 | ||
|
@@ -981,7 +953,7 @@ function sparse_diff1(S::AbstractSparseMatrixCSC{Tv,Ti}) where {Tv,Ti} | |
for col = 1 : n | ||
last_row = 0 | ||
last_val = 0 | ||
for k = getcolptr(S)[col] : getcolptr(S)[col+1]-1 | ||
for k in nzrange(S, col) | ||
row = rowvals(S)[k] | ||
val = nonzeros(S)[k] | ||
if row > 1 | ||
|
@@ -1124,7 +1096,7 @@ function opnorm(A::AbstractSparseMatrixCSC, p::Real=2) | |
nA::Tsum = 0 | ||
for j=1:n | ||
colSum::Tsum = 0 | ||
for i = getcolptr(A)[j]:getcolptr(A)[j+1]-1 | ||
for i in nzrange(A, j) | ||
colSum += abs(nonzeros(A)[i]) | ||
end | ||
nA = max(nA, colSum) | ||
|
@@ -1469,7 +1441,7 @@ function mul!(C::AbstractSparseMatrixCSC, A::AbstractSparseMatrixCSC, D::Diagona | |
Cnzval = nonzeros(C) | ||
Anzval = nonzeros(A) | ||
resize!(Cnzval, length(Anzval)) | ||
for col = 1:n, p = getcolptr(A)[col]:(getcolptr(A)[col+1]-1) | ||
for col in 1:n, p in nzrange(A, col) | ||
@inbounds Cnzval[p] = Anzval[p] * b[col] | ||
end | ||
C | ||
|
@@ -1484,7 +1456,7 @@ function mul!(C::AbstractSparseMatrixCSC, D::Diagonal, A::AbstractSparseMatrixCS | |
Anzval = nonzeros(A) | ||
Arowval = rowvals(A) | ||
resize!(Cnzval, length(Anzval)) | ||
for col = 1:n, p = getcolptr(A)[col]:(getcolptr(A)[col+1]-1) | ||
for col in 1:n, p in nzrange(A, col) | ||
@inbounds Cnzval[p] = b[Arowval[p]] * Anzval[p] | ||
end | ||
C | ||
|
@@ -1520,7 +1492,7 @@ function rmul!(A::AbstractSparseMatrixCSC, D::Diagonal) | |
m, n = size(A) | ||
(n == size(D, 1)) || throw(DimensionMismatch()) | ||
Anzval = nonzeros(A) | ||
@inbounds for col = 1:n, p = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
@inbounds for col in 1:n, p in nzrange(A, col) | ||
Anzval[p] = Anzval[p] * D.diag[col] | ||
end | ||
return A | ||
|
@@ -1531,7 +1503,7 @@ function lmul!(D::Diagonal, A::AbstractSparseMatrixCSC) | |
(m == size(D, 2)) || throw(DimensionMismatch()) | ||
Anzval = nonzeros(A) | ||
Arowval = rowvals(A) | ||
@inbounds for col = 1:n, p = getcolptr(A)[col]:(getcolptr(A)[col + 1] - 1) | ||
@inbounds for col in 1:n, p in nzrange(A, col) | ||
Anzval[p] = D.diag[Arowval[p]] * Anzval[p] | ||
end | ||
return A | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This branch is new, and loop order rearranged for optimal access pattern both in
X
andC
, plus the@simd
in the innermost loop. According to nanosoldier, this results in a significant improvement.