[NDTensors] Add AMDGPU.jl (ROCm) based extension for NDTensors (#1325)
wbernoudy authored Mar 21, 2024
1 parent f4ad958 commit 093d339
Showing 26 changed files with 273 additions and 25 deletions.
2 changes: 2 additions & 0 deletions NDTensors/Project.toml
@@ -37,12 +37,14 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
TBLIS = "48530278-0828-4a49-9772-0f3830dfa1e9"
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
NDTensorsCUDAExt = "CUDA"
NDTensorsMetalExt = "Metal"
NDTensorsOctavianExt = "Octavian"
NDTensorsTBLISExt = "TBLIS"
NDTensorsAMDGPUExt = "AMDGPU"

[compat]
Accessors = "0.1.33"
11 changes: 11 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/NDTensorsAMDGPUExt.jl
@@ -0,0 +1,11 @@
module NDTensorsAMDGPUExt

include("copyto.jl")
include("set_types.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")

end
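
For orientation, here is a minimal usage sketch of the new extension (an illustration, not part of the commit; it assumes AMDGPU.jl is installed and a ROCm-capable GPU is visible to Julia):

# Loading AMDGPU.jl alongside NDTensors activates NDTensorsAMDGPUExt via the
# [extensions] entry added to Project.toml above.
using NDTensors
using AMDGPU
using NDTensors.AMDGPUExtensions: roc

x = rand(Float64, 10, 10)
xroc = roc(x)  # adapts to a ROCArray while keeping the Float64 element type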
31 changes: 31 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/adapt.jl
@@ -0,0 +1,31 @@
using NDTensors: NDTensors, EmptyStorage, adapt_storagetype, emptytype
using NDTensors.AMDGPUExtensions: AMDGPUExtensions, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
using NDTensors.TypeParameterAccessors:
default_type_parameter,
set_type_parameter,
set_type_parameters,
type_parameter,
type_parameters
using Adapt: Adapt, adapt
using AMDGPU: AMDGPU, ROCArray, ROCVector
using Functors: fmap

function AMDGPUExtensions.roc(xs; storagemode=default_type_parameter(ROCArray, storagemode))
return fmap(x -> adapt(ROCArrayAdaptor{storagemode}(), x), xs)
end

function Adapt.adapt_storage(adaptor::ROCArrayAdaptor, xs::AbstractArray)
new_parameters = (type_parameters(xs, (eltype, ndims))..., storagemode(adaptor))
roctype = set_type_parameters(ROCArray, (eltype, ndims, storagemode), new_parameters)
return isbits(xs) ? xs : adapt(roctype, xs)
end

function NDTensors.adapt_storagetype(
adaptor::ROCArrayAdaptor, xs::Type{EmptyStorage{ElT,StoreT}}
) where {ElT,StoreT}
roctype = set_type_parameters(
ROCVector, (eltype, storagemode), (ElT, storagemode(adaptor))
)
return emptytype(adapt_storagetype(roctype, StoreT))
end
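
A sketch of what the custom adaptor provides (illustrative; assumes a ROCm device and uses the default HIP buffer storage mode):

using Adapt: adapt
using AMDGPU: AMDGPU, ROCArray
using NDTensors.AMDGPUExtensions: ROCArrayAdaptor

x = rand(Float64, 4, 4)
# The storage mode travels as the adaptor's type parameter, and the element type
# of `x` is preserved rather than being demoted to Float32.
y = adapt(ROCArrayAdaptor{AMDGPU.Mem.HIPBuffer}(), x)
typeof(y)  # ROCArray{Float64, 2, AMDGPU.Mem.HIPBuffer}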
35 changes: 35 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/copyto.jl
@@ -0,0 +1,35 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint
using AMDGPU: ROCArray

# Same definition as for `MtlArray`.
function Base.copy(src::Exposed{<:ROCArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:ROCArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

function Base.copyto!(dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end

function Base.copyto!(
dest::Exposed{<:ROCArray}, src::Exposed{<:ROCArray,<:LinearAlgebra.Transpose}
)
copyto!(expose(transpose(dest)), expose(parent(src)))
return unexpose(dest)
end
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/indexing.jl
@@ -0,0 +1,23 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using NDTensors.GPUArraysCoreExtensions: cpu
using AMDGPU: AMDGPU, ROCArray
using GPUArraysCore: @allowscalar

function Base.getindex(E::Exposed{<:ROCArray})
return @allowscalar unexpose(E)[]
end

function Base.setindex!(E::Exposed{<:ROCArray}, x::Number)
@allowscalar unexpose(E)[] = x
return unexpose(E)
end

function Base.getindex(E::Exposed{<:ROCArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

Base.any(f, E::Exposed{<:ROCArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:ROCArray})
return Base.print_array(io, expose(cpu(E)))
end
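
These methods make the occasional scalar read or write on a ROC-backed array explicit instead of erroring; a rough sketch (illustrative; assumes a ROCm device):

using AMDGPU: ROCArray
using NDTensors.Expose: expose

a = ROCArray(zeros(Float64, 1, 1))
E = expose(a)
E[] = 1.0  # the setindex! above wraps the scalar write in @allowscalar
E[]        # the getindex above wraps the scalar read in @allowscalar; returns 1.0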
22 changes: 22 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/linearalgebra.jl
@@ -0,0 +1,22 @@
using NDTensors.AMDGPUExtensions: roc
using NDTensors.Expose: Expose, Exposed, expose, ql, ql_positive
using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.TypeParameterAccessors: unwrap_array_type
using LinearAlgebra: svd
using Adapt: adapt
using AMDGPU: ROCMatrix

function LinearAlgebra.svd(A::Exposed{<:ROCMatrix}; kwargs...)
U, S, V = svd(cpu(A))
return roc.((U, S, V))
end

## TODO: AMDGPU.jl does not currently provide `ql`, so fall back to a CPU implementation.
function Expose.ql(A::Exposed{<:ROCMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:ROCMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
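
With these definitions, factorizations not handled on the device are computed on the CPU and the factors are moved back; a sketch (illustrative; assumes a ROCm device):

using LinearAlgebra: svd
using AMDGPU: ROCArray
using NDTensors.Expose: expose, ql

A = ROCArray(rand(Float64, 8, 8))
U, S, V = svd(expose(A))  # SVD runs on the CPU; U, S, V come back as ROCArrays
Q, L = ql(expose(A))      # likewise for the QL factorization defined above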
45 changes: 45 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/mul.jl
@@ -0,0 +1,45 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!
using AMDGPU: ROCArray

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:ROCArray},
BM::Exposed{<:ROCArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

# Fix an issue where AMDGPU.jl does not recognize
# Transpose{Reshape{Adjoint{ROCArray}}} as a ROCArray and falls back to generic matmul.
function LinearAlgebra.mul!(
CM::Exposed{<:ROCArray},
AM::Exposed{<:ROCArray},
BM::Exposed{
<:ROCArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
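
These specializations rewrite the call so that the BLAS path can be used instead of the generic fallback; a sketch of the transposed-destination case (illustrative; assumes a ROCm device):

using LinearAlgebra: mul!
using AMDGPU: ROCArray
using NDTensors.Expose: expose

A = ROCArray(rand(Float32, 4, 4))
B = ROCArray(rand(Float32, 4, 4))
C = ROCArray(rand(Float32, 4, 4))
# Writing into a transposed destination is rewritten as C = Bᵀ * Aᵀ, so the
# underlying mul! sees a plain ROCArray destination.
mul!(expose(transpose(C)), expose(A), expose(B), true, false)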
23 changes: 23 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/permutedims.jl
@@ -0,0 +1,23 @@
using NDTensors.Expose: Exposed, expose, parent, unexpose
using AMDGPU: ROCArray

function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray}, Esrc::Exposed{<:ROCArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end

# There is an issue in AMDGPU.jl where `.=` can fail when Edest is a ReshapedArray
# wrapping an Adjoint, so instead force Esrc into the shape of parent(Edest).
function Base.permutedims!(
Edest::Exposed{<:ROCArray,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}},
Esrc::Exposed{<:ROCArray},
perm,
f,
)
Aperm = reshape(permutedims(Esrc, perm), size(parent(Edest)))
parent(Edest) .= f.(parent(Edest), Aperm)
return unexpose(Edest)
end
11 changes: 11 additions & 0 deletions NDTensors/ext/NDTensorsAMDGPUExt/set_types.jl
@@ -0,0 +1,11 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
using AMDGPU: AMDGPU, ROCArray

function TypeParameterAccessors.default_type_parameters(::Type{<:ROCArray})
return (Float64, 1, AMDGPU.Mem.HIPBuffer)
end
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(eltype)) = Position(1)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(ndims)) = Position(2)
TypeParameterAccessors.position(::Type{<:ROCArray}, ::typeof(storagemode)) = Position(3)
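
These accessor definitions let generic NDTensors code query and set ROCArray type parameters by name; for example (type-level queries only, no GPU allocation needed, though AMDGPU.jl must be loaded so the extension is active):

using AMDGPU: AMDGPU, ROCArray
using NDTensors.GPUArraysCoreExtensions: storagemode
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, default_type_parameters

default_type_parameters(ROCArray)                       # (Float64, 1, AMDGPU.Mem.HIPBuffer)
TypeParameterAccessors.position(ROCArray, eltype)       # Position(1)
TypeParameterAccessors.position(ROCArray, storagemode)  # Position(3)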
13 changes: 13 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/linearalgebra.jl
@@ -41,3 +41,16 @@ function NDTensors.svd_catch_error(A::CuMatrix, ::CUDA.CUSOLVER.QRAlgorithm)
end
return USV
end

using NDTensors.GPUArraysCoreExtensions: cpu
using NDTensors.Expose: Expose, expose, ql, ql_positive
using NDTensors.TypeParameterAccessors: unwrap_array_type
## TODO: CUDA.jl does not currently provide `ql`, so fall back to a CPU implementation.
function Expose.ql(A::Exposed{<:CuMatrix})
Q, L = ql(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
function Expose.ql_positive(A::Exposed{<:CuMatrix})
Q, L = ql_positive(expose(cpu(A)))
return adapt(unwrap_array_type(A), Matrix(Q)), adapt(unwrap_array_type(A), L)
end
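
The same CPU fallback is mirrored here for CUDA; a sketch of the call (illustrative; assumes CUDA.jl and an NVIDIA GPU):

using CUDA: CuArray
using NDTensors.Expose: expose, ql, ql_positive

A = CuArray(rand(Float64, 6, 6))
Q, L = ql(expose(A))             # runs on the CPU, results are adapted back to CuArrays
Qp, Lp = ql_positive(expose(A))  # variant that makes the diagonal of L non-negative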
2 changes: 1 addition & 1 deletion NDTensors/ext/NDTensorsCUDAExt/set_types.jl
@@ -1,7 +1,7 @@
# TypeParameterAccessors definitions
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
## TODO remove TypeParameterAccessors when SetParameters is removed

function TypeParameterAccessors.position(::Type{<:CuArray}, ::typeof(eltype))
return Position(1)
end
1 change: 0 additions & 1 deletion NDTensors/src/adapt.jl
@@ -2,7 +2,6 @@ using .GPUArraysCoreExtensions: GPUArraysCoreExtensions
adapt_structure(to, x::TensorStorage) = setdata(x, adapt(to, data(x)))
adapt_structure(to, x::Tensor) = setstorage(x, adapt(to, storage(x)))

## use unwrap cpu here because Expose is included before NDTensors
function GPUArraysCoreExtensions.cpu(eltype::Type{<:Number}, x)
return fmap(x -> adapt(Array{eltype}, x), x)
end
4 changes: 3 additions & 1 deletion NDTensors/src/imports.jl
@@ -29,6 +29,7 @@ for lib in [
:UnspecifiedTypes,
:TypeParameterAccessors,
:GPUArraysCoreExtensions,
:AMDGPUExtensions,
:CUDAExtensions,
:MetalExtensions,
:Expose,
@@ -58,9 +59,10 @@ using Base.Cartesian: @nexprs

using Base.Threads: @spawn

using .AMDGPUExtensions: roc
using .CUDAExtensions: cu
using .MetalExtensions: mtl
using .GPUArraysCoreExtensions: cpu
using .MetalExtensions: mtl

import Base:
# Types
2 changes: 2 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/.JuliaFormatter.toml
@@ -0,0 +1,2 @@
style = "blue"
indent = 2
4 changes: 4 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/AMDGPUExtensions.jl
@@ -0,0 +1,4 @@
module AMDGPUExtensions
include("roc.jl")

end
14 changes: 14 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/src/roc.jl
@@ -0,0 +1,14 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors, Position
using NDTensors.GPUArraysCoreExtensions: storagemode
# Implemented in NDTensorsAMDGPUExt
function roc end

## Here we need a ROCArrayAdaptor to prevent conversion of 64-bit numbers to 32-bit.
## We cannot write `adapt(ROCVector, x)` because this
## will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(ROCVector{<:Any, <:Any, Buffertype})` fails to work properly.
struct ROCArrayAdaptor{B} end

function TypeParameterAccessors.position(::Type{<:ROCArrayAdaptor}, ::typeof(storagemode))
return Position(1)
end
9 changes: 9 additions & 0 deletions NDTensors/src/lib/AMDGPUExtensions/test/runtests.jl
@@ -0,0 +1,9 @@
@eval module $(gensym())
using Test: @testset, @test
using NDTensors.AMDGPUExtensions: roc, ROCArrayAdaptor
using NDTensors.GPUArraysCoreExtensions: storagemode
@testset "roc and ROCArrayAdaptor" begin
@test roc isa Function
@test storagemode(ROCArrayAdaptor{1}) == 1
end
end
4 changes: 2 additions & 2 deletions NDTensors/src/lib/CUDAExtensions/src/cuda.jl
@@ -1,9 +1,9 @@
using NDTensors.TypeParameterAccessors: TypeParameterAccessors
using NDTensors.GPUArraysCoreExtensions: storagemode
# Implemented in `ITensorGPU` and NDTensorCUDA
# Implemented in `ITensorGPU` and NDTensorsCUDAExt
function cu end

## Here we need an NDTensorCuArrayAdaptor because the CuArrayAdaptor provided by CUDA
## Here we need our own CuArrayAdaptor because the CuArrayAdaptor provided by CUDA
## converts 64 bit numbers to 32 bit. We cannot write `adapt(CuVector, x)` because this
## Will not allow us to properly utilize the buffer preference without changing the value of
## default_buffertype. Also `adapt(CuVector{<:Any, <:Any, Buffertype})` fails to work properly
20 changes: 2 additions & 18 deletions NDTensors/src/linearalgebra/linearalgebra.jl
@@ -389,11 +389,6 @@ function ql_positive(M::AbstractMatrix)
# TODO: Change to `isgpu`, or better yet rewrite
# in terms of broadcasting and linear algebra
# like `qr_positive`.
iscuda = iscu(M)
if iscuda
cutype = unwrap_array_type(M)
M = NDTensors.cpu(M)
end
sparseQ, L = ql(M)
Q = convert(typeof(L), sparseQ)
nr, nc = size(L)
@@ -407,10 +402,6 @@ function ql_positive(M::AbstractMatrix)
end
end
end
if iscuda
Q = adapt(cutype, Q)
L = adapt(cutype, L)
end
return (Q, L)
end

@@ -423,23 +414,16 @@ function ql(A::AbstractMatrix)
T = eltype(A)
AA = similar(A, LinearAlgebra._qreltype(T), size(A))
copyto!(expose(AA), expose(A))
iscuda = iscu(AA)
if iscuda
cutype = unwrap_array_type(AA)
AA = NDTensors.cpu(AA)
end
Q, L = ql!(AA)
if iscuda
Q = adapt(cutype, Q)
L = adapt(cutype, L)
end
return (Q, L)
end
#
# This is where the low level call to lapack actually occurs. Most of the work is
# about unpacking Q and L from the A matrix.
#
function ql!(A::StridedMatrix{<:LAPACK.BlasFloat})
## TODO: is this check really necessary here? We could create an Expose function if
## we need this function on GPU.
if iscu(A)
throw("Error: ql is not implemented in CUDA.jl")
end
4 changes: 4 additions & 0 deletions NDTensors/test/ITensors/TestITensorDMRG/TestITensorDMRG.jl
@@ -5,14 +5,18 @@ module TestITensorDMRG
using ITensors
using NDTensors
using NDTensors.CUDAExtensions: cu
using NDTensors.AMDGPUExtensions: roc
using Random

reference_energies = Dict([
(4, -1.6160254037844384), (8, -3.374932598687889), (10, -4.258035207282885)
])

is_broken(dev, elt::Type, conserve_qns::Val) = false
## Disable blocksparse GPU testing on the CUDA and ROCm backends while
## we work on the blocksparse backend. In the future these tests will work too.
is_broken(dev::typeof(cu), elt::Type, conserve_qns::Val{true}) = true
is_broken(dev::typeof(roc), elt::Type, conserve_qns::Val{true}) = true

include("dmrg.jl")
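
Within this test module, the Val-based is_broken dispatch only marks the QN-conserving blocksparse runs as broken on the GPU backends; for example (a sketch evaluated inside TestITensorDMRG, using the imports above):

is_broken(roc, Float64, Val(true))   # true:  QN-conserving blocksparse DMRG is skipped on ROCm
is_broken(roc, Float64, Val(false))  # false: dense DMRG on ROCm is still tested
is_broken(cu, Float64, Val(true))    # true:  same for the CUDA backend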
