From 3e5979400c5f5b2e00392e4e709e2774aabec3ad Mon Sep 17 00:00:00 2001
From: Alexis Montoison <35051714+amontoison@users.noreply.github.com>
Date: Mon, 13 Nov 2023 01:52:05 -0600
Subject: [PATCH] [CUSPARSE] Support in-place triangular solves with CUDA v12.x
 (#2164)

---
 lib/cusparse/generic.jl               |   8 +-
 lib/cusparse/interfaces.jl            | 195 +++++++++++++-------------
 test/libraries/cusparse/interfaces.jl | 107 +++++++++-----
 3 files changed, 178 insertions(+), 132 deletions(-)

diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
index 09e7ae031c..87410f6208 100644
--- a/lib/cusparse/generic.jl
+++ b/lib/cusparse/generic.jl
@@ -535,13 +535,17 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
 
+    # Check if we solve a triangular system in-place with transb != 'N'.
+    # In that case we need to update the descriptor of C such that it represents Bᵀ.
+    is_C_transposed = (B === C) && (transb != 'N')
+
     if isa(A, CuSparseMatrixCSC) && transa == 'C' && T <: Complex
         throw(ArgumentError("Backward and forward sweeps with the adjoint of a complex CSC matrix is not supported. Use a CSR or COO matrix instead."))
     end
 
     mA,nA = size(A)
     mB,nB = size(B)
-    mC,nC = size(C)
+    mC,nC = !is_C_transposed ? size(C) : reverse(size(C))
 
     (mA != nA) && throw(DimensionMismatch("A must be square, but has dimensions ($mA,$nA)!"))
     (mC != mA) && throw(DimensionMismatch("C must have $mA rows, but has $mC rows"))
@@ -566,7 +570,7 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa
     cusparseSpMatSetAttribute(descA, 'D', cusparse_diag, Csize_t(sizeof(cusparse_diag)))
 
     descB = CuDenseMatrixDescriptor(B)
-    descC = CuDenseMatrixDescriptor(C)
+    descC = CuDenseMatrixDescriptor(C, transposed=is_C_transposed)
 
     spsm_desc = CuSparseSpSMDescriptor()
     function bufferSize()
diff --git a/lib/cusparse/interfaces.jl b/lib/cusparse/interfaces.jl
index 4dc3d767f5..3715a8d45a 100644
--- a/lib/cusparse/interfaces.jl
+++ b/lib/cusparse/interfaces.jl
@@ -208,12 +208,26 @@ end
 
 # triangular
 for SparseMatrixType in (:CuSparseMatrixBSR,)
-    if VERSION >= v"1.10-"
+    if VERSION ≥ v"1.10-"
     @eval begin
-        LinearAlgebra.generic_trimatdiv!(C::DenseCuVector{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::AbstractVector{T}) where {T<:BlasFloat} =
+        LinearAlgebra.generic_trimatdiv!(C::DenseCuVector{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::DenseCuVector{T}) where {T<:BlasFloat} =
             sv2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', uploc, isunitc, one(T), A, C === B ? C : copyto!(C, B), 'O')
-        LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::AbstractMatrix{T}) where {T<:BlasFloat} =
+        LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::DenseCuMatrix{T}) where {T<:BlasFloat} =
             sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), A, C === B ? C : copyto!(C, B), 'O')
+        function LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::AdjOrTrans{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
+            transb = LinearAlgebra.wrapper_char(B)
+            (transb == 'C') && (T <: Complex) && throw(ErrorException("This operation is not supported by the current CUDA version."))
+            C !== parent(B) && copyto!(C, B)
+            sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), A, C, 'O')
+        end
+        function LinearAlgebra.generic_trimatdiv!(C::Transpose{T,<:DenseCuMatrix{T}}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::Transpose{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
+            B === C || throw(ErrorException("This operation is only supported if B and C are identical."))
+            sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'T', uploc, isunitc, one(T), A, parent(B), 'O')
+        end
+        function LinearAlgebra.generic_trimatdiv!(C::Adjoint{T,<:DenseCuMatrix{T}}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::Adjoint{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
+            B === C || throw(ErrorException("This operation is only supported if B and C are identical."))
+            sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'C', uploc, isunitc, one(T), A, parent(B), 'O')
+        end
     end
     else
     ## direct
@@ -231,6 +245,14 @@ for SparseMatrixType in (:CuSparseMatrixBSR,)
             LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType},
                                 B::DenseCuMatrix{T}) where {T<:BlasFloat} =
                 sm2!('N', 'N', $uploc, $isunitc, one(T), parent(A), B, 'O')
+
+            LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType},
+                                B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat} =
+                sm2!('N', 'T', $uploc, $isunitc, one(T), parent(A), parent(B), 'O')
+
+            LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType},
+                                B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasFloat} =
+                sm2!('N', 'C', $uploc, $isunitc, one(T), parent(A), parent(B), 'O')
         end
     end
 
@@ -252,6 +274,14 @@ for SparseMatrixType in (:CuSparseMatrixBSR,)
                 LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}},
                                     B::DenseCuMatrix{T}) where {T<:BlasFloat} =
                     sm2!($transa, 'N', $uploc, $isunitc, one(T), parent(parent(A)), B, 'O')
+
+                LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}},
+                                    B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat} =
+                    sm2!($transa, 'T', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), 'O')
+
+                LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}},
+                                    B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasFloat} =
+                    sm2!($transa, 'C', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), 'O')
             end
         end
     end
@@ -259,7 +289,7 @@ for SparseMatrixType in (:CuSparseMatrixBSR,)
 end # SparseMatrixType loop
 
 for SparseMatrixType in (:CuSparseMatrixCOO, :CuSparseMatrixCSR, :CuSparseMatrixCSC)
-    if VERSION >= v"1.10-"
+    if VERSION ≥ v"1.10-"
     @eval begin
         function LinearAlgebra.generic_trimatdiv!(C::DenseCuVector{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::DenseCuVector{T}) where {T<:BlasFloat}
             if CUSPARSE.version() ≥ v"12.0"
@@ -272,31 +302,40 @@ for SparseMatrixType in (:CuSparseMatrixCOO, :CuSparseMatrixCSR, :CuSparseMatrix
         end
         function LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::DenseCuMatrix{T}) where {T<:BlasFloat}
             if CUSPARSE.version() ≥ v"12.0"
-                sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), parent(A), B, C, 'O')
+                sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), A, B, C, 'O')
             else
                 $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
                 C !== B && copyto!(C, B)
                 sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), A, C, 'O')
             end
         end
-        function LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::AdjOrTrans{<:T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
-            CUSPARSE.version() < v"12.0" && throw(ErrorException("This operation is not supported by the current CUDA version."))
+        function LinearAlgebra.generic_trimatdiv!(C::DenseCuMatrix{T}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::AdjOrTrans{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
             transb = LinearAlgebra.wrapper_char(B)
-            transb == 'C' && throw(ErrorException("adjoint rhs is not supported"))
-            sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', transb, uploc, isunitc, one(T), A, parent(B), C, 'O')
+            (transb == 'C') && (T <: Complex) && throw(ErrorException("This operation is not supported by the current CUDA version."))
+            if CUSPARSE.version() ≥ v"12.0"
+                sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', transb, uploc, isunitc, one(T), A, parent(B), C, 'O')
+            else
+                $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                C !== parent(B) && copyto!(C, B)
+                sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'N', uploc, isunitc, one(T), A, C, 'O')
+            end
         end
-        function LinearAlgebra.:(\)(A::LinearAlgebra.UpperOrLowerTriangular{T,<:$SparseMatrixType}, B::DenseCuVector{T}) where {T}
-            C = CuVector{T}(undef, length(B))
-            LinearAlgebra.ldiv!(C, A, B)
+        function LinearAlgebra.generic_trimatdiv!(C::Transpose{T,<:DenseCuMatrix{T}}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::Transpose{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
+            (B !== C) && throw(ErrorException("This operation is only supported if B and C are identical."))
+            if CUSPARSE.version() ≥ v"12.0"
+                sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'T', uploc, isunitc, one(T), A, parent(B), parent(C), 'O')
+            else
+                $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'T', uploc, isunitc, one(T), A, parent(B), 'O')
+            end
         end
-    end
-    
-    for rhs in (:(DenseCuMatrix{T}), :(Transpose{T,<:DenseCuMatrix}), :(Adjoint{T,<:DenseCuMatrix}))
-        @eval begin
-            function LinearAlgebra.:(\)(A::LinearAlgebra.UpperOrLowerTriangular{T,<:$SparseMatrixType}, B::$rhs) where {T}
-                m, n = size(B)
-                C = CuMatrix{T}(undef, m, n)
-                LinearAlgebra.ldiv!(C, A, B)
+        function LinearAlgebra.generic_trimatdiv!(C::Adjoint{T,<:DenseCuMatrix{T}}, uploc, isunitc, tfun::Function, A::$SparseMatrixType{T}, B::Adjoint{T,<:DenseCuMatrix{T}}) where {T<:BlasFloat}
+            (B !== C) && throw(ErrorException("This operation is only supported if B and C are identical."))
+            if CUSPARSE.version() ≥ v"12.0"
+                sm!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'C', uploc, isunitc, one(T), A, parent(B), parent(C), 'O')
+            else
+                $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                sm2!(tfun === identity ? 'N' : tfun === transpose ? 'T' : 'C', 'C', uploc, isunitc, one(T), A, parent(B), 'O')
             end
         end
     end
@@ -308,59 +347,40 @@ for SparseMatrixType in (:CuSparseMatrixCOO, :CuSparseMatrixCSR, :CuSparseMatrix
                                 (:UnitUpperTriangular, 'U', 'U'))
         @eval begin
             # Left division with vectors
-            function LinearAlgebra.ldiv!(C::DenseCuVector{T},
-                                A::$t{T,<:$SparseMatrixType},
-                                B::DenseCuVector{T}) where {T<:BlasFloat}
+            function LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType}, B::DenseCuVector{T}) where {T<:BlasFloat}
                 if CUSPARSE.version() ≥ v"12.0"
-                    sv!('N', $uploc, $isunitc, one(T), parent(A), B, C, 'O')
+                    sv!('N', $uploc, $isunitc, one(T), parent(A), B, B, 'O')
                 else
                     $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                    copyto!(C, B)
-                    sv2!('N', $uploc, $isunitc, one(T), parent(A), C, 'O')
+                    sv2!('N', $uploc, $isunitc, one(T), parent(A), B, 'O')
                 end
             end
 
             # Left division with matrices
-            function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                A::$t{T,<:$SparseMatrixType},
-                                B::DenseCuMatrix{T}) where {T<:BlasFloat}
+            function LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType}, B::DenseCuMatrix{T}) where {T<:BlasFloat}
                 if CUSPARSE.version() ≥ v"12.0"
-                    sm!('N', 'N', $uploc, $isunitc, one(T), parent(A), B, C, 'O')
+                    sm!('N', 'N', $uploc, $isunitc, one(T), parent(A), B, B, 'O')
                 else
                     $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                    copyto!(C, B)
-                    sm2!('N', 'N', $uploc, $isunitc, one(T), parent(A), C, 'O')
+                    sm2!('N', 'N', $uploc, $isunitc, one(T), parent(A), B, 'O')
                 end
             end
 
-            function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                A::$t{T,<:$SparseMatrixType},
-                                B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat}
-                CUSPARSE.version() < v"12.0" && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                sm!('N', 'T', $uploc, $isunitc, one(T), parent(A), parent(B), C, 'O')
-            end
-
-            # transb = 'C' is not supported.
-            function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                A::$t{T,<:$SparseMatrixType},
-                                B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasReal}
-                CUSPARSE.version() < v"12.0" && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                sm!('N', 'T', $uploc, $isunitc, one(T), parent(A), parent(B), C, 'O')
-            end
-
-            function LinearAlgebra.:(\)(A::$t{T,<:$SparseMatrixType}, B::DenseCuVector{T}) where {T}
-                m = length(B)
-                C = CuVector{T}(undef, m)
-                LinearAlgebra.ldiv!(C, A, B)
+            function LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType}, B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat}
+                if CUSPARSE.version() ≥ v"12.0"
+                    sm!('N', 'T', $uploc, $isunitc, one(T), parent(A), parent(B), parent(B), 'O')
+                else
+                    $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                    sm2!('N', 'T', $uploc, $isunitc, one(T), parent(A), parent(B), 'O')
+                end
             end
-        end
 
-        for rhs in (:(DenseCuMatrix{T}), :(Transpose{T,<:DenseCuMatrix}), :(Adjoint{T,<:DenseCuMatrix}))
-            @eval begin
-                function LinearAlgebra.:(\)(A::$t{T,<:$SparseMatrixType}, B::$rhs) where {T}
-                    m, n = size(B)
-                    C = CuMatrix{T}(undef, m, n)
-                    LinearAlgebra.ldiv!(C, A, B)
+            function LinearAlgebra.ldiv!(A::$t{T,<:$SparseMatrixType}, B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasFloat}
+                if CUSPARSE.version() ≥ v"12.0"
+                    sm!('N', 'C', $uploc, $isunitc, one(T), parent(A), parent(B), parent(B), 'O')
+                else
+                    $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                    sm2!('N', 'C', $uploc, $isunitc, one(T), parent(A), parent(B), 'O')
                 end
             end
         end
@@ -376,57 +396,40 @@ for SparseMatrixType in (:CuSparseMatrixCOO, :CuSparseMatrixCSR, :CuSparseMatrix
                               (:Adjoint, 'C'))
             @eval begin
                 # Left division with vectors
-                function LinearAlgebra.ldiv!(C::DenseCuVector{T},
-                                    A::$t{T,<:$opa{T,<:$SparseMatrixType}},
-                                    B::DenseCuVector{T}) where {T<:BlasFloat}
+                function LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::DenseCuVector{T}) where {T<:BlasFloat}
                     if CUSPARSE.version() ≥ v"12.0"
-                        sv!($transa, $uploc, $isunitc, one(T), parent(parent(A)), B, C, 'O')
+                        sv!($transa, $uploc, $isunitc, one(T), parent(parent(A)), B, B, 'O')
                     else
-                        copyto!(C, B)
-                        sv2!($transa, $uploc, $isunitc, one(T), parent(parent(A)), C, 'O')
+                        $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                        sv2!($transa, $uploc, $isunitc, one(T), parent(parent(A)), B, 'O')
                     end
                 end
 
                 # Left division with matrices
-                function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                    A::$t{T,<:$opa{T,<:$SparseMatrixType}},
-                                    B::DenseCuMatrix{T}) where {T<:BlasFloat}
+                function LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::DenseCuMatrix{T}) where {T<:BlasFloat}
                     if CUSPARSE.version() ≥ v"12.0"
-                        sm!($transa, 'N', $uploc, $isunitc, one(T), parent(parent(A)), B, C, 'O')
+                        sm!($transa, 'N', $uploc, $isunitc, one(T), parent(parent(A)), B, B, 'O')
                     else
-                        copyto!(C, B)
-                        sm2!($transa, 'N', $uploc, $isunitc, one(T), parent(parent(A)), C, 'O')
+                        $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                        sm2!($transa, 'N', $uploc, $isunitc, one(T), parent(parent(A)), B, 'O')
                     end
                 end
 
-                function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                    A::$t{T,<:$opa{T,<:$SparseMatrixType}},
-                                    B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat}
-                    CUSPARSE.version() < v"12.0" && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                    sm!($transa, 'T', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), C, 'O')
-                end
-
-                # transb = 'C' is not supported.
-                function LinearAlgebra.ldiv!(C::DenseCuMatrix{T},
-                                    A::$t{T,<:$opa{T,<:$SparseMatrixType}},
-                                    B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasReal}
-                    CUSPARSE.version() < v"12.0" && throw(ErrorException("This operation is not supported by the current CUDA version."))
-                    sm!($transa, 'T', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), C, 'O')
-                end
-
-                function LinearAlgebra.:(\)(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::DenseCuVector{T}) where {T}
-                    m = length(B)
-                    C = CuVector{T}(undef, m)
-                    LinearAlgebra.ldiv!(C, A, B)
+                function LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::Transpose{T,<:DenseCuMatrix}) where {T<:BlasFloat}
+                    if CUSPARSE.version() ≥ v"12.0"
+                        sm!($transa, 'T', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), parent(B), 'O')
+                    else
+                        $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                        sm2!($transa, 'T', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), 'O')
+                    end
                 end
-            end
 
-            for rhs in (:(DenseCuMatrix{T}), :(Transpose{T,<:DenseCuMatrix}), :(Adjoint{T,<:DenseCuMatrix}))
-                @eval begin
-                    function LinearAlgebra.:(\)(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::$rhs) where {T}
-                        m, n = size(B)
-                        C = CuMatrix{T}(undef, m, n)
-                        LinearAlgebra.ldiv!(C, A, B)
+                function LinearAlgebra.ldiv!(A::$t{T,<:$opa{T,<:$SparseMatrixType}}, B::Adjoint{T,<:DenseCuMatrix}) where {T<:BlasFloat}
+                    if CUSPARSE.version() ≥ v"12.0"
+                        sm!($transa, 'C', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), parent(B), 'O')
+                    else
+                        $SparseMatrixType == CuSparseMatrixCOO && throw(ErrorException("This operation is not supported by the current CUDA version."))
+                        sm2!($transa, 'C', $uploc, $isunitc, one(T), parent(parent(A)), parent(B), 'O')
                     end
                 end
             end
diff --git a/test/libraries/cusparse/interfaces.jl b/test/libraries/cusparse/interfaces.jl
index 0cb61c8c22..1238bb96f0 100644
--- a/test/libraries/cusparse/interfaces.jl
+++ b/test/libraries/cusparse/interfaces.jl
@@ -358,8 +358,8 @@ using LinearAlgebra, SparseArrays
 
     for triangle in [LowerTriangular, UnitLowerTriangular, UpperTriangular, UnitUpperTriangular]
         @testset "ldiv!($triangle(CuSparseMatrixBSR), CuVector) -- $elty" for elty in [Float32,Float64,ComplexF32,ComplexF64]
-            for opa in [identity, transpose, adjoint]
-                A = A = rand(elty, 10, 10)
+            @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                A = rand(elty, 10, 10)
                 A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
                 A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
                 A = sparse(A)
@@ -373,8 +373,8 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$triangle(CuSparseMatrixBSR) \\ CuVector -- $elty" for elty in [Float32,Float64,ComplexF32,ComplexF64]
-            for opa in [identity, transpose, adjoint]
-                A = A = rand(elty, 10, 10)
+            @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                A = rand(elty, 10, 10)
                 A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
                 A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
                 A = sparse(A)
@@ -388,32 +388,38 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "ldiv!($triangle(CuSparseMatrixBSR), CuMatrix) -- $elty" for elty in [Float32,Float64,ComplexF32,ComplexF64]
-            for opa in [identity, transpose, adjoint]
-                A = A = rand(elty, 10, 10)
-                A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
-                A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
-                A = sparse(A)
-                B = rand(elty, 10, 2)
-                dA = CuSparseMatrixBSR(A, 1)
-                dB = CuArray(B)
-                ldiv!(triangle(opa(A)), B)
-                ldiv!(triangle(opa(dA)), dB)
-                @test B ≈ collect(dB)
+            @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                @testset "opb = $opb" for opb in [identity, transpose, adjoint]
+                    elty <: Complex && opb == adjoint && continue
+                    A = rand(elty, 10, 10)
+                    A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
+                    A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
+                    A = sparse(A)
+                    B = opb == identity ? rand(elty, 10, 2) : rand(elty, 2, 10)
+                    dA = CuSparseMatrixBSR(A, 1)
+                    dB = CuArray(B)
+                    ldiv!(triangle(opa(A)), opb(B))
+                    ldiv!(triangle(opa(dA)), opb(dB))
+                    @test B ≈ collect(dB)
+                end
             end
         end
 
         @testset "$triangle(CuSparseMatrixBSR) \\ CuMatrix -- $elty" for elty in [Float32,Float64,ComplexF32,ComplexF64]
-            for opa in [identity, transpose, adjoint]
-                A = A = rand(elty, 10, 10)
-                A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
-                A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
-                A = sparse(A)
-                B = rand(elty, 10, 2)
-                dA = CuSparseMatrixBSR(A, 1)
-                dB = CuArray(B)
-                C = triangle(opa(A)) \ B
-                dC = triangle(opa(dA)) \ dB
-                @test C ≈ collect(dC)
+            @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                @testset "opb = $opb" for opb in [identity, transpose, adjoint]
+                    elty <: Complex && opb == adjoint && continue
+                    A = rand(elty, 10, 10)
+                    A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
+                    A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
+                    A = sparse(A)
+                    B = opb == identity ? rand(elty, 10, 2) : rand(elty, 2, 10)
+                    dA = CuSparseMatrixBSR(A, 1)
+                    dB = CuArray(B)
+                    C = triangle(opa(A)) \ opb(B)
+                    dC = triangle(opa(dA)) \ opb(dB)
+                    @test C ≈ collect(dC)
+                end
             end
         end
     end
@@ -421,8 +427,24 @@ using LinearAlgebra, SparseArrays
     for SparseMatrixType in (CuSparseMatrixCOO, CuSparseMatrixCSC, CuSparseMatrixCSR)
         SparseMatrixType == CuSparseMatrixCOO && CUSPARSE.version() < v"12.0" && continue
         for triangle in [LowerTriangular, UnitLowerTriangular, UpperTriangular, UnitUpperTriangular]
+            @testset "ldiv!($triangle($SparseMatrixType), CuVector) -- $elty" for elty in [Float64,ComplexF64]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                    SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
+                    A = A = rand(elty, 10, 10)
+                    A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
+                    A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
+                    A = sparse(A)
+                    y = rand(elty, 10)
+                    dA = SparseMatrixType(A)
+                    dy = CuArray(y)
+                    ldiv!(triangle(opa(A)), y)
+                    ldiv!(triangle(opa(dA)), dy)
+                    @test y ≈ collect(dy)
+                 end
+            end
+
             @testset "ldiv!(CuVector, $triangle($SparseMatrixType), CuVector) -- $elty" for elty in [Float64,ComplexF64]
-                for opa in [identity, transpose, adjoint]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
                     SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                     A = A = rand(elty, 10, 10)
                     A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
@@ -440,7 +462,7 @@ using LinearAlgebra, SparseArrays
             end
 
             @testset "$triangle($SparseMatrixType) \\ CuVector -- $elty" for elty in [Float64,ComplexF64]
-                for opa in [identity, transpose, adjoint]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
                     SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                     A = rand(elty, 10, 10)
                     A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
@@ -455,11 +477,29 @@ using LinearAlgebra, SparseArrays
                  end
             end
 
+            @testset "ldiv!($triangle($SparseMatrixType), CuMatrix) -- $elty" for elty in [Float64,ComplexF64]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                    @testset "opb = $opb" for opb in [identity, transpose, adjoint]
+                        SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
+                        elty <: Complex && opb == adjoint && continue
+                        A = rand(elty, 10, 10)
+                        A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
+                        A = triangle in (UnitLowerTriangular, UnitUpperTriangular) ? A - Diagonal(A) + I : A
+                        A = sparse(A)
+                        B = opb == identity ? rand(elty, 10, 2) : rand(elty, 2, 10)
+                        dA = SparseMatrixType(A)
+                        dB = CuArray(B)
+                        ldiv!(triangle(opa(A)), opb(B))
+                        ldiv!(triangle(opa(dA)), opb(dB))
+                        @test B ≈ collect(dB)
+                    end
+                end
+            end
+
             @testset "ldiv!(CuMatrix, $triangle($SparseMatrixType), CuMatrix) -- $elty" for elty in [Float64,ComplexF64]
-                for opa in [identity, transpose, adjoint]
-                    for opb in [identity, transpose, adjoint]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                    @testset "opb = $opb" for opb in [identity, transpose, adjoint]
                         SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
-                        opb != identity && CUSPARSE.version() < v"12.0" && continue
                         elty <: Complex && opb == adjoint && continue
                         A = rand(elty, 10, 10)
                         A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)
@@ -478,10 +518,9 @@ using LinearAlgebra, SparseArrays
             end
 
             @testset "$triangle($SparseMatrixType) \\ CuMatrix -- $elty" for elty in [Float64,ComplexF64]
-                for opa in [identity, transpose, adjoint]
-                    for opb in [identity, transpose, adjoint]
+                @testset "opa = $opa" for opa in [identity, transpose, adjoint]
+                    @testset "opb = $opb" for opb in [identity, transpose, adjoint]
                         SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
-                        opb != identity && CUSPARSE.version() < v"12.0" && continue
                         elty <: Complex && opb == adjoint && continue
                         A = rand(elty, 10, 10)
                         A = triangle in (UnitLowerTriangular, LowerTriangular) ? tril(A) : triu(A)