Closed
Description
Each different permutation costs ~0.6s on my device, which is too bad for contracting a tensor network. This is because the host function unrolls the permutation order:
GPUArrays.jl/src/host/linalg.jl
Line 195 in e1856fe
The following version is compiler-friendly, but not runtime-efficient.
using CUDA, Random
using CUDA: @cartesianidx, AbstractGPUArray, gpu_call
using LinearAlgebra: permutedims!
# Scatter one element: `dest[I[perm[1]], …, I[perm[N]]] = src[I]`.
#
# `ntuple` with a `Val(N)` length is unrolled by the compiler exactly like
# the former `@generated` version, so the permuted index tuple is built with
# no heap allocation — but without the extra compile-latency of
# generated-function machinery (use `@generated` only when dispatch truly
# requires it).
#
# As in the original, `@inbounds` covers only the reads `I[perm[d]]`;
# the `dest`/`src` accesses themselves remain bounds-checked.
@inline function map_index(dest, src, I, perm::NTuple{N,T}) where {N,T}
    idx = ntuple(d -> @inbounds(I[perm[d]]), Val(N))
    dest[idx...] = src[I]
end
"""
    mypermutedims!(dest, src, perm)

Permute the dimensions of `src` according to `perm`, writing the result
into `dest` (whose dimensions are validated against `src` and `perm` via
`Base.checkdims_perm`). Returns `dest`.
"""
function mypermutedims!(dest::AbstractGPUArray, src::AbstractGPUArray,
                        perm::NTuple)
    Base.checkdims_perm(dest, src, perm)
    # One GPU work item per source element: read `src` at its Cartesian
    # index and scatter into the permuted slot of `dst`.
    function scatter_permuted(ctx, dst, source, p)
        ci = @cartesianidx source
        map_index(dst, source, ci, p)
        return
    end
    gpu_call(scatter_permuted, dest, src, perm)
    return dest
end
using BenchmarkTools
x = CUDA.randn(fill(2, 18)...);
y = zero(x);
p = (randperm(18)...,)
@btime CUDA.@sync permutedims!($y, $x, $p);
142.905 μs (97 allocations: 3.41 KiB)
@btime CUDA.@sync mypermutedims!($y, $x, $p);
400.497 μs (413 allocations: 13.14 KiB) # too bad
x = CUDA.randn(80, 80, 80);
y = zero(x);
p = (2,3,1)
using BenchmarkTools
@btime CUDA.@sync permutedims!($y, $x, $p);
100.064 μs (53 allocations: 1.91 KiB)
@btime CUDA.@sync mypermutedims!($y, $x, $p);
130.855 μs (217 allocations: 7.02 KiB) # this looks not too bad
I would appreciate some advice on how to improve the `permutedims!`
implementation.