[NDTensors] Add AMDGPU.jl (ROCm) based extension for NDTensors (#1325)
wbernoudy authored Mar 21, 2024
1 parent f4ad958 commit 093d339
Showing 26 changed files with 273 additions and 25 deletions.
2 changes: 2 additions & 0 deletions NDTensors/Project.toml
@@ -37,12 +37,14 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
NDTensorsCUDAExt = "CUDA"
NDTensorsMetalExt = "Metal"
NDTensorsOctavianExt = "Octavian"
NDTensorsTBLISExt = "TBLIS"
NDTensorsAMDGPUExt = "AMDGPU"

[compat]
Accessors = "0.1.33"
11 changes: 11 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl
@@ -0,0 +1,11 @@
module NDTensorsAMDGPUExt

include("copyto.jl")
include("set_types.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")

end
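
For orientation, here is a minimal usage sketch of the new extension (an illustration, not part of the commit; it assumes AMDGPU.jl is installed and a ROCm-capable GPU is visible to Julia):

# Loading AMDGPU.jl alongside NDTensors activates NDTensorsAMDGPUExt via the
# [extensions] entry added to Project.toml above.
using NDTensors
using AMDGPU
using NDTensors.AMDGPUExtensions: roc

x = rand(Float64, 10, 10)
xroc = roc(x)  # adapts to a ROCArray while keeping the Float64 element type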
31 changes: 31 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl
@@ -0,0 +1,31 @@
using NDTensors: NDTensors, EmptyStorage, adapt_storagetype, emptytype
using NDTensors.AMDGPUExtensions: AMDGPUExtensions, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
using NDTensors.TypeParameterAccessors:
default_type_parameter,
set_type_parameter,
set_type_parameters,
type_parameter,
type_parameters
using Adapt: Adapt, adapt
using AMDGPU: AMDGPU, ROCArray, ROCVector
using Functors: fmap

function AMDGPUExtensions.roc(xs; storagemode=default_type_parameter(ROCArray, storagemode))
return fmap(x -> adapt(ROCArrayAdaptor{storagemode}(), x), xs)
end

function Adapt.adapt_storage(adaptor::ROCArrayAdaptor, xs::AbstractArray)
new_parameters = (type_parameters(xs, (eltype, ndims))..., storagemode(adaptor))
roctype = set_type_parameters(ROCArray, (eltype, ndims, storagemode), new_parameters)
return isbits(xs) ? xs : adapt(roctype, xs)
end

function NDTensors.adapt_storagetype(
adaptor::ROCArrayAdaptor, xs::Type{EmptyStorage{ElT,StoreT}}
) where {ElT,StoreT}
roctype = set_type_parameters(
ROCVector, (eltype, storagemode), (ElT, storagemode(adaptor))
)
return emptytype(adapt_storagetype(roctype, StoreT))
end
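
A sketch of what the custom adaptor provides (illustrative; assumes a ROCm device and uses the default HIP buffer storage mode):

using Adapt: adapt
using AMDGPU: AMDGPU, ROCArray
using NDTensors.AMDGPUExtensions: ROCArrayAdaptor

x = rand(Float64, 4, 4)
# The storage mode travels as the adaptor's type parameter, and the element type
# of `x` is preserved rather than being demoted to Float32.
y = adapt(ROCArrayAdaptor{AMDGPU.Mem.HIPBuffer}(), x)
typeof(y)  # ROCArray{Float64, 2, AMDGPU.Mem.HIPBuffer}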
35 changes: 35 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl
@@ -0,0 +1,35 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint
using AMDGPU: ROCArray

# Same definition as for `MtlArray`.
function Base.copy(src::Exposed{<:ROCArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:ROCArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

function Base.copyto!(dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:LinearAlgebra.Transpose}
)
copyto!(expose(transpose(dest)), expose(parent(src)))
return unexpose(dest)
end
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl
@@ -0,0 +1,23 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using NDTensors.GPUArraysCoreExtensions: cpu
using AMDGPU: AMDGPU, ROCArray
using GPUArraysCore: @allowscalar

function Base.getindex(E::Exposed{<:ROCArray})
return @allowscalar unexpose(E)[]
end

function Base.setindex!(E::Exposed{<:ROCArray}, x::Number)
@allowscalar unexpose(E)[] = x
return unexpose(E)
end

function Base.getindex(E::Exposed{<:ROCArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

Base.any(f, E::Exposed{<:ROCArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:ROCArray})
return Base.print_array(io, expose(cpu(E)))
end
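
These methods make the occasional scalar read or write on a ROC-backed array explicit instead of erroring; a rough sketch (illustrative; assumes a ROCm device):

using AMDGPU: ROCArray
using NDTensors.Expose: expose

a = ROCArray(zeros(Float64, 1, 1))
E = expose(a)
E[] = 1.0  # the setindex! above wraps the scalar write in @allowscalar
E[]        # the getindex above wraps the scalar read in @allowscalar; returns 1.0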
22 changes: 22 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl
@@ -0,0 +1,22 @@
using NDTensors.AMDGPUExtensions: roc
using NDTensors.Expose: Expose, Exposed, expose, ql, ql_positive
using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.TypeParameterAccessors: unwrap_array_type
using LinearAlgebra: svd
using Adapt: adapt
using AMDGPU: ROCMatrix

function LinearAlgebra.svd(A::Exposed{<:ROCMatrix}; kwargs...)
U, S, V = svd(cpu(A))
return roc.((U, S, V))
end

## TODO: AMDGPU.jl does not currently provide `ql`, so fall back to a CPU implementation.
function Expose.ql(A::Exposed{<:ROCMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:ROCMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
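
With these definitions, factorizations not handled on the device are computed on the CPU and the factors are moved back; a sketch (illustrative; assumes a ROCm device):

using LinearAlgebra: svd
using AMDGPU: ROCArray
using NDTensors.Expose: expose, ql

A = ROCArray(rand(Float64, 8, 8))
U, S, V = svd(expose(A))  # SVD runs on the CPU; U, S, V come back as ROCArrays
Q, L = ql(expose(A))      # likewise for the QL factorization defined above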
45 changes: 45 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/mul.jl
@@ -0,0 +1,45 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!
using AMDGPU: ROCArray

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

# Fix an issue where AMDGPU.jl does not recognize
# Transpose{Reshape{Adjoint{ROCArray}}} as a ROCArray and falls back to generic matmul.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray},
AM::Exposed{<:ROCArray},
BM::Exposed{
<:ROCArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
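
These specializations rewrite the call so that the BLAS path can be used instead of the generic fallback; a sketch of the transposed-destination case (illustrative; assumes a ROCm device):

using LinearAlgebra: mul!
using AMDGPU: ROCArray
using NDTensors.Expose: expose

A = ROCArray(rand(Float32, 4, 4))
B = ROCArray(rand(Float32, 4, 4))
C = ROCArray(rand(Float32, 4, 4))
# Writing into a transposed destination is rewritten as C = Bᵀ * Aᵀ, so the
# underlying mul! sees a plain ROCArray destination.
mul!(expose(transpose(C)), expose(A), expose(B), true, false)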
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl
@@ -0,0 +1,23 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using AMDGPU: ROCArray

function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray}, Esrc::Exposed{<:ROCArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end

# There is an issue in AMDGPU.jl where `.=` can fail when Edest is a ReshapedArray
# wrapping an Adjoint, so instead force Esrc into the shape of parent(Edest).
function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}},
Esrc::Exposed{<:ROCArray},
perm,
f,
)
Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest)))
parent(Edest) .= f.(parent(Edest), Aperm)
return unexpose(Edest)
end
11 changes: 11 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl
@@ -0,0 +1,11 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
using AMDGPU: AMDGPU, ROCArray

function TypeParameterAccessors.default_type_parameters(::Type{<:ROCArray})
return (Float64, 1, AMDGPU.Mem.HIPBuffer)
end
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(eltype)) = Position(1)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(ndims)) = Position(2)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(storagemode)) = Position(3)
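
These accessor definitions let generic NDTensors code query and set ROCArray type parameters by name; for example (type-level queries only, no GPU allocation needed, though AMDGPU.jl must be loaded so the extension is active):

using AMDGPU: AMDGPU, ROCArray
using NDTensors.GPUArraysCoreExtensions: storagemode
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, default_type_parameters

default_type_parameters(ROCArray)                       # (Float64, 1, AMDGPU.Mem.HIPBuffer)
TypeParameterAccessors.position(ROCArray, eltype)       # Position(1)
TypeParameterAccessors.position(ROCArray, storagemode)  # Position(3)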
13 changes: 13 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
@@ -41,3 +41,16 @@ function NDTensors.svd_catch_error(A::CuMatrix, ::CUDA.CUSOLVER.QRAlgorithm)
end
return USV
end

using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.Expose: Expose, expose, ql, ql_positive
using NDTensors.TypeParameterAccessors: unwrap_array_type
## TODO: CUDA.jl does not currently provide `ql`, so fall back to a CPU implementation.
function Expose.ql(A::Exposed{<:CuMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:CuMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
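
The same CPU fallback is mirrored here for CUDA; a sketch of the call (illustrative; assumes CUDA.jl and an NVIDIA GPU):

using CUDA: CuArray
using NDTensors.Expose: expose, ql, ql_positive

A = CuArray(rand(Float64, 6, 6))
Q, L = ql(expose(A))             # runs on the CPU, results are adapted back to CuArrays
Qp, Lp = ql_positive(expose(A))  # variant that makes the diagonal of L non-negative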
2 changes: 1 addition & 1 deletion NDTensors/ext/NDTensorsCUDAExt/set_types.jl
@@ -1,7 +1,7 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
## TODO remove TypeParameterAccessors when SetParameters is removed

function TypeParameterAccessors.position(::Type{<:CuArray}, ::typeof(eltype))
return Position(1)
end
1 change: 0 additions & 1 deletion NDTensors/src/adapt.jl
@@ -2,7 +2,6 @@ using .GPUArraysCoreExtensions: GPUArraysCoreExtensions
adapt_structure(to, x::TensorStorage) = setdata(x, adapt(to, data(x)))
adapt_structure(to, x::Tensor) = setstorage(x, adapt(to, storage(x)))

## use unwrap cpu here because Expose is included before NDTensors
function GPUArraysCoreExtensions.cpu(eltype::Type{<:Number}, x)
return fmap(x -> adapt(Array{eltype}, x), x)
end
4 changes: 3 additions & 1 deletion NDTensors/src/imports.jl
@@ -29,6 +29,7 @@ for lib in [
:UnspecifiedTypes,
:TypeParameterAccessors,
:GPUArraysCoreExtensions,
:AMDGPUExtensions,
:CUDAExtensions,
:MetalExtensions,
:Expose,
@@ -58,9 +59,10 @@ using Base.Cartesian: @nexprs

using Base.Threads: @spawn

using .AMDGPUExtensions: roc
using .CUDAExtensions: cu
using .MetalExtensions: mtl
using .GPUArraysCoreExtensions: cpu
using .MetalExtensions: mtl

import Base:
# Types
2 changes: 2 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml
@@ -0,0 +1,2 @@
style = "blue"
indent = 2
4 changes: 4 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl
@@ -0,0 +1,4 @@
module AMDGPUExtensions
include("roc.jl")

end
14 changes: 14 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/roc.jl
@@ -0,0 +1,14 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
# Implemented in NDTensorsAMDGPUExt
function roc end

## Here we need a ROCArrayAdaptor to prevent conversion of 64-bit numbers to 32-bit.
## We cannot write `adapt(ROCVector, x)` because this
## will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(ROCVector{<:Any, <:Any, Buffertype})` fails to work properly.
struct ROCArrayAdaptor{B} end

function TypeParameterAccessors.position(::Type{<:ROCArrayAdaptor}, ::typeof(storagemode))
return Position(1)
end
9 changes: 9 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl
@@ -0,0 +1,9 @@
@eval module $(gensym())
using Test: @testset, @test
using NDTensors.AMDGPUExtensions: roc, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
@testset "roc and ROCArrayAdaptor" begin
@test roc isa Function
@test storagemode(ROCArrayAdaptor{1}) == 1
end
end
4 changes: 2 additions & 2 deletions NDTensors/src/lib/CUDAExtensions/src/cuda.jl
@@ -1,9 +1,9 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors
using NDTensors.GPUArraysCoreExtensions: storagemode
# Implemented in `ITensorGPU` and NDTensorCUDA
# Implemented in `ITensorGPU` and NDTensorsCUDAExt
function cu end

## Here we need an NDTensorCuArrayAdaptor because the CuArrayAdaptor provided by CUDA
## Here we need our own CuArrayAdaptor because the CuArrayAdaptor provided by CUDA
## converts 64 bit numbers to 32 bit. We cannot write `adapt(CuVector, x)` because this
## Will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(CuVector{<:Any, <:Any, Buffertype})` fails to work properly
20 changes: 2 additions & 18 deletions NDTensors/src/linearalgebra/linearalgebra.jl
@@ -389,11 +389,6 @@ function ql_positive(M::AbstractMatrix)
# TODO: Change to `isgpu`, or better yet rewrite
# in terms of broadcasting and linear algebra
# like `qr_positive`.
iscuda = iscu(M)
if iscuda
cutype = unwrap_array_type(M)
M = NDTensors.cpu(M)
end
sparseQ, L = ql(M)
Q = convert(typeof(L), sparseQ)
nr, nc = size(L)
@@ -407,10 +402,6 @@ function ql_positive(M::AbstractMatrix)
end
end
end
if iscuda
Q = adapt(cutype, Q)
L = adapt(cutype, L)
end
return (Q, L)
end

@@ -423,23 +414,16 @@ function ql(A::AbstractMatrix)
T = eltype(A)
AA = similar(A, LinearAlgebra._qreltype(T), size(A))
copyto!(expose(AA), expose(A))
iscuda = iscu(AA)
if iscuda
cutype = unwrap_array_type(AA)
AA = NDTensors.cpu(AA)
end
Q, L = ql!(AA)
if iscuda
Q = adapt(cutype, Q)
L = adapt(cutype, L)
end
return (Q, L)
end
#
# This is where the low level call to lapack actually occurs. Most of the work is
# about unpacking Q and L from the A matrix.
#
function ql!(A::StridedMatrix{<:LAPACK.BlasFloat})
## TODO: is this check really necessary here? We could create an Expose function if
## we need this function on GPU.
if iscu(A)
throw("Error: ql is not implemented in CUDA.jl")
end
4 changes: 4 additions & 0 deletions NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
@@ -5,14 +5,18 @@ module TestITensorDMRG
using ITensors
using NDTensors
using NDTensors.CUDAExtensions: cu
using NDTensors.AMDGPUExtensions: roc
using Random

reference_energies = Dict([
(4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885)
])

is_broken(dev, elt::Type, conserve_qns::Val) = false
## Disable blocksparse GPU testing on the CUDA and ROCm backends while
## we work on the blocksparse backend. In the future these tests will work too.
is_broken(dev::typeof(cu), elt::Type, conserve_qns::Val{true}) = true
is_broken(dev::typeof(roc), elt::Type, conserve_qns::Val{true}) = true

include("dmrg.jl")
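
Within this test module, the Val-based is_broken dispatch only marks the QN-conserving blocksparse runs as broken on the GPU backends; for example (a sketch evaluated inside TestITensorDMRG, using the imports above):

is_broken(roc, Float64, Val(true))   # true:  QN-conserving blocksparse DMRG is skipped on ROCm
is_broken(roc, Float64, Val(false))  # false: dense DMRG on ROCm is still tested
is_broken(cu, Float64, Val(true))    # true:  same for the CUDA backend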
