diff --git a/.gitignore b/.gitignore
index d0a0746..56b59ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ docs/src/tutorial/QCBM.md
 _*.dat
 *.swp
 __pycache__/
+.vscode/
 
 Manifest.toml
 
diff --git a/Project.toml b/Project.toml
index 2142042..14c82e3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "CuYao"
 uuid = "b48ca7a8-dd42-11e8-2b8e-1b7706800275"
-version = "0.3.1"
+version = "0.3.2"
 
 [deps]
 BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
@@ -16,13 +16,13 @@ Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c"
 
 [compat]
 BitBasis = "0.7"
-CUDA = "3.6"
+CUDA = "3.8"
 LuxurySparse = "0.6"
 Reexport = "0.2, 1"
 StaticArrays = "0.12, 1"
 StatsBase = "0.33"
 TupleTools = "1"
-Yao = "0.6"
+Yao = "0.7"
 julia = "1"
 
 [extras]
diff --git a/src/GPUReg.jl b/src/GPUReg.jl
index f373753..9cbff2b 100644
--- a/src/GPUReg.jl
+++ b/src/GPUReg.jl
@@ -6,37 +6,40 @@ import Yao: expect
 
 export cpu, cu, GPUReg
-cu(reg::ArrayReg{B}) where B = ArrayReg{B}(CuArray(reg.state))
-cpu(reg::ArrayReg{B}) where B = ArrayReg{B}(collect(reg.state))
-const GPUReg{B, T, MT} = ArrayReg{B, T, MT} where MT<:CuArray
+cu(reg::ArrayReg{D}) where D = ArrayReg{D}(CuArray(reg.state))
+cpu(reg::ArrayReg{D}) where D = ArrayReg{D}(Array(reg.state))
+cu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(CuArray(reg.state), reg.nbatch)
+cpu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(Array(reg.state), reg.nbatch)
+const GPUReg{D, T, MT} = AbstractArrayReg{D, T, MT} where MT<:DenseCuArray
 
 function batch_normalize!(s::DenseCuArray, p::Real=2)
     p!=2 && throw(ArgumentError("p must be 2!"))
     s./=norm2(s, dims=1)
-    s
+    return s
 end
 
 @inline function tri2ij(l::Int)
     i = ceil(Int, sqrt(2*l+0.25)-0.5)
     j = l-i*(i-1)÷2
-    i+1,j
+    return i+1,j
 end
 
 ############### MEASURE ##################
-function measure(::ComputationalBasis, reg::GPUReg{1}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1)
+function measure(::ComputationalBasis, reg::ArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T}
     _measure(rng, reg |> probs |> Vector, nshots)
 end
 
 # TODO: optimize the batch dimension using parallel sampling
-function measure(::ComputationalBasis, reg::GPUReg{B}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where B
+function measure(::ComputationalBasis, reg::BatchedArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T}
     regm = reg |> rank3
     pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
-    _measure(rng, pl |> Matrix, nshots)
+    return _measure(rng, pl |> Matrix, nshots)
 end
 
-function measure!(::RemoveMeasured, ::ComputationalBasis, reg::GPUReg{B}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where B
+function measure!(::RemoveMeasured, ::ComputationalBasis, reg::GPUReg{D}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where D
     regm = reg |> rank3
-    nregm = similar(regm, 1<<nremain(reg), B)
+    B = size(regm, 3)
+    nregm = similar(regm, 1<<nremain(reg), B)
     pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
     pl_cpu = pl |> Matrix
     res_cpu = map(ib->_measure(rng, view(pl_cpu, :, ib), 1)[], 1:B)
@@ -51,11 +54,12 @@ function measure!(::RemoveMeasured, ::ComputationalBasis, reg::GPUReg{B}, ::AllL
     end
     gpu_call(kernel, nregm, regm, res, pl)
     reg.state = reshape(nregm,1,:)
-    B == 1 ? Array(res)[] : res
+    return reg isa ArrayReg ? Array(res)[] : res
 end
 
-function measure!(::NoPostProcess, ::ComputationalBasis, reg::GPUReg{B, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {B, T}
+function measure!(::NoPostProcess, ::ComputationalBasis, reg::GPUReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T}
     regm = reg |> rank3
+    B = size(regm, 3)
     pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
     pl_cpu = pl |> Matrix
     res_cpu = map(ib->_measure(rng, view(pl_cpu, :, ib), 1)[], 1:B)
@@ -70,17 +74,18 @@ function measure!(::NoPostProcess, ::ComputationalBasis, reg::GPUReg{B, T}, ::Al
         return
     end
     gpu_call(kernel, regm, res, pl)
-    B == 1 ? Array(res)[] : res
+    return reg isa ArrayReg ? Array(res)[] : res
 end
 
 function YaoBase.measure!(
     ::NoPostProcess,
     bb::BlockedBasis,
-    reg::GPUReg{B,T},
+    reg::GPUReg{D,T},
     ::AllLocs;
     rng::AbstractRNG = Random.GLOBAL_RNG,
-) where {B,T}
+) where {D,T}
     state = @inbounds (reg|>rank3)[bb.perm, :, :] # permute to make eigen values sorted
+    B = size(state, 3)
     pl = dropdims(mapreduce(abs2, +, state, dims=2), dims=2)
     pl_cpu = pl |> Matrix
     pl_block = zeros(eltype(pl), nblocks(bb), B)
@@ -106,11 +111,12 @@ function YaoBase.measure!(
     _state = reshape(state, 1 << nactive(reg), :)
     rstate = reshape(reg.state, 1 << nactive(reg), :)
     @inbounds rstate[bb.perm, :] .= _state
-    return B == 1 ? bb.values[res_cpu[]] : CuArray(bb.values[res_cpu])
+    return reg isa ArrayReg ? bb.values[res_cpu[]] : CuArray(bb.values[res_cpu])
 end
 
-function measure!(rst::ResetTo, ::ComputationalBasis, reg::GPUReg{B, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {B, T}
+function measure!(rst::ResetTo, ::ComputationalBasis, reg::GPUReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T}
     regm = reg |> rank3
+    B = size(regm, 3)
     pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2)
     pl_cpu = pl |> Matrix
     res_cpu = map(ib->_measure(rng, view(pl_cpu, :, ib), 1)[], 1:B)
@@ -128,10 +134,10 @@ function measure!(rst::ResetTo, ::ComputationalBasis, reg::GPUReg{B, T}, ::AllLo
     end
 
     gpu_call(kernel, regm, res, pl, rst.x)
-    B == 1 ? Array(res)[] : res
+    return reg isa ArrayReg ? Array(res)[] : res
 end
 
-import Yao.YaoArrayRegister: insert_qubits!, join
+import Yao.YaoArrayRegister: insert_qudits!, join
 function YaoBase.batched_kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1 ,T2}
     res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3))
     CI = Base.CartesianIndices(res)
@@ -145,7 +151,7 @@ function YaoBase.batched_kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T
     end
     gpu_call(kernel, res, A, B)
-    res
+    return res
 end
 
 """
     batched_kron!(C, A, B)
 
 Performs batched Kronecker products in-place on the GPU. The results are
 stored in 'C', overwriting the existing values of 'C'.
""" -function YaoBase.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArray) where {T1 ,T2, T3} +function YaoBase.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArray) where {T3} @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch()) @boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch()) CI = Base.CartesianIndices(C) @@ -168,23 +174,24 @@ function YaoBase.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArr end gpu_call(kernel, C, A, B) - C + return C end -function join(reg1::GPUReg{B}, reg2::GPUReg{B}) where {B} +function join(reg1::GPUReg{D}, reg2::GPUReg{D}) where {D} + @assert nbatch(reg1) == nbatch(reg2) s1 = reg1 |> rank3 s2 = reg2 |> rank3 state = YaoBase.batched_kron(s1, s2) - ArrayReg{B}(copy(reshape(state, size(state, 1), :))) + return arrayreg(copy(reshape(state, size(state, 1), :)); nlevel=D, nbatch=nbatch(reg1)) end -export insert_qubits! -function insert_qubits!(reg::GPUReg{B}, loc::Int; nqubits::Int=1) where B +export insert_qudits! +function insert_qudits!(reg::GPUReg{D}, loc::Int; nqudits::Int=1) where D na = nactive(reg) focus!(reg, 1:loc-1) - reg2 = join(zero_state(nqubits; nbatch=B) |> cu, reg) |> relax! |> focus!((1:na+nqubits)...) + reg2 = join(zero_state(nqudits; nbatch=nbatch(reg)) |> cu, reg) |> relax! |> focus!((1:na+nqudits)...) reg.state = reg2.state - reg + return reg end #= diff --git a/src/gpuapplys.jl b/src/gpuapplys.jl index 294e72c..31c9e18 100644 --- a/src/gpuapplys.jl +++ b/src/gpuapplys.jl @@ -7,7 +7,7 @@ gpu_compatible(A::AbstractVecOrMat) = A |> staticize gpu_compatible(A::StaticArray) = A ###################### unapply! ############################ -function instruct!(state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} +function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} U0 = gpu_compatible(U0) # reorder a unirary matrix. D, kf = un_kernel(log2dim1(state), clocs, cvals, U0, locs) @@ -15,18 +15,18 @@ function instruct!(state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, I gpu_call(kf, state; elements=D*size(state,2)) state end -instruct!(state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state -instruct!(state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(state, U0 |> Matrix, locs, clocs, cvals) +instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state +instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals) ################## General U1 apply! ################### for MT in [:SDDiagonal, :SDPermMatrix, :AbstractMatrix, :SDSparseMatrixCSC] - @eval function instruct!(state::DenseCuVecOrMat, U1::$MT, ibit::Tuple{Int}) + @eval function instruct!(::Val{2}, state::DenseCuVecOrMat, U1::$MT, ibit::Tuple{Int}) D,kf = u1_kernel(log2dim1(state), U1, ibit...) 
         gpu_call(kf, state; elements=D*size(state,2))
         state
     end
 end
-instruct!(state::DenseCuVecOrMat, U::IMatrix, locs::Tuple{Int}) = state
+instruct!(::Val{2}, state::DenseCuVecOrMat, U::IMatrix, locs::Tuple{Int}) = state
 
 ################## XYZ #############
 using Yao.ConstGate: S, T, Sdag, Tdag
@@ -50,26 +50,26 @@ for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
     end
 
     @eval begin
-        function YaoBase.instruct!(state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
+        function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C
             _instruct!(state, g, locs)
         end
 
-        function YaoBase.instruct!(state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int})
+        function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int})
             _instruct!(state, g, locs)
         end
 
-        function YaoBase.instruct!(state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
+        function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, loc::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C
             _instruct!(state, g, loc, clocs, cvals)
         end
 
-        function YaoBase.instruct!(state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, loc::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
+        function YaoBase.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, loc::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int})
             _instruct!(state, vg, loc, cloc, cval)
         end
     end
 end
 
-function instruct!(state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int})
+function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int})
     b1, b2 = locs
     mask1 = bmask(b1)
     mask2 = bmask(b2)
@@ -92,7 +92,7 @@ end
 # parametrized swap gate
 using Yao.ConstGate: SWAPGate
 
-function instruct!(state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real)
+function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real)
     D, kf = pswap_kernel(log2dim1(state),locs..., θ)
     gpu_call(kf, state; elements=D*size(state,2))
     state
@@ -107,6 +107,7 @@ end
 
 for RG in [:Rx, :Ry, :Rz]
     @eval function instruct!(
+        ::Val{2},
         state::DenseCuVecOrMat{T},
         ::Val{$(QuoteNode(RG))},
         locs::Tuple{Int},
diff --git a/test/GPUReg.jl b/test/GPUReg.jl
index cd1c116..e490209 100644
--- a/test/GPUReg.jl
+++ b/test/GPUReg.jl
@@ -50,12 +50,12 @@ end
     res = measure!(ResetTo(3), greg1)
     @test all(measure(greg1, nshots=10) .== 3)
     @test greg1 |> isnormalized
-    @test all(select.(greg0 |> cpu, res |> myvec) .|> normalize! .≈ select.(greg1 |> cpu, 3))
+    @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), 3))
 
     greg1 = greg |> copy |> focus!(1,4,3)
     greg0 = copy(greg1)
     res = measure!(greg1)
-    @test all(select.(greg0 |> cpu, res |> myvec) .|> normalize! .≈ select.(greg1 |> cpu, res|>myvec))
+    @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), res|>myvec))
 end
 
 @test join(rand_state(3) |> cu, rand_state(3) |> cu) |> nactive == 6
@@ -64,12 +64,12 @@ end
 
 @testset "insert_qubits!" begin
     reg = rand_state(5; nbatch=10)
-    res = insert_qubits!(reg |> cu, 3; nqubits=2) |> cpu
-    @test insert_qubits!(reg, 3; nqubits=2) ≈ res
+    res = insert_qudits!(reg |> cu, 3; nqudits=2) |> cpu
+    @test insert_qudits!(reg, 3; nqudits=2) ≈ res
 
     reg = rand_state(5, nbatch=10) |>focus!(2,3)
-    res = insert_qubits!(reg |> cu, 3; nqubits=2) |> cpu
-    @test insert_qubits!(reg, 3; nqubits=2) ≈ res
+    res = insert_qudits!(reg |> cu, 3; nqudits=2) |> cpu
+    @test insert_qudits!(reg, 3; nqudits=2) ≈ res
 end
 
 @testset "cuda-op-measures" begin
diff --git a/test/gpuapplys.jl b/test/gpuapplys.jl
index f2d0906..af479d5 100644
--- a/test/gpuapplys.jl
+++ b/test/gpuapplys.jl
@@ -21,8 +21,8 @@ using CUDA
             mat(ComplexF32, put(2,2=>P0))
         ]
 
-        @test instruct!(v1 |> CuArray, UN, (3,1)) |> Vector ≈ instruct!(v1 |> copy, UN, (3,1))
-        @test instruct!(vn |> CuArray, UN, (3,1)) |> Matrix ≈ instruct!(vn |> copy, UN, (3,1))
+        @test instruct!(Val(2),v1 |> CuArray, UN, (3,1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, UN, (3,1))
+        @test instruct!(Val(2),vn |> CuArray, UN, (3,1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, UN, (3,1))
     end
     # sparse matrix like P0, P1 et al. are not implemented.
 end
 
@@ -34,8 +34,8 @@ end
     v1 = randn(ComplexF32, N)
     vn = randn(ComplexF32, N, 333)
 
-    @test instruct!(v1 |> CuArray, Val(:SWAP), (3,5)) |> Vector ≈ instruct!(v1 |> copy, Val(:SWAP), (3,5))
-    @test instruct!(vn |> CuArray, Val(:SWAP), (3,5)) |> Matrix ≈ instruct!(vn |> copy, Val(:SWAP), (3,5))
+    @test instruct!(Val(2),v1 |> CuArray, Val(:SWAP), (3,5)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(:SWAP), (3,5))
+    @test instruct!(Val(2),vn |> CuArray, Val(:SWAP), (3,5)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(:SWAP), (3,5))
 end
 
 @testset "gpu instruct! 1bit" begin
@@ -46,8 +46,8 @@ end
     vn = randn(ComplexF32, N, 333)
 
     for U1 in [mat(H), mat(Z), mat(I2), mat(ConstGate.P0), mat(X), mat(Y)]
-        @test instruct!(v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(v1 |> copy, U1, (3,))
-        @test instruct!(vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(vn |> copy, U1, (3,))
+        @test instruct!(Val(2),v1 |> CuArray, U1, (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, U1, (3,))
+        @test instruct!(Val(2),vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, U1, (3,))
     end
     # sparse matrix like P0, P1 et al. are not implemented.
 end
 
@@ -60,11 +60,11 @@ end
     vn = randn(ComplexF32, N, 333)
 
     for G in [:X, :Y, :Z, :T, :H, :Tdag, :S, :Sdag]
-        @test instruct!(v1 |> CuArray, Val(G), (3,)) |> Vector ≈ instruct!(v1 |> copy, Val(G), (3,))
-        @test instruct!(vn |> CuArray, Val(G), (3,)) |> Matrix ≈ instruct!(vn |> copy, Val(G), (3,))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,))
         if G != :H
-            @test instruct!(v1 |> CuArray, Val(G), (1,3,4)) |> Vector ≈ instruct!(v1 |> copy, Val(G), (1,3,4))
-            @test instruct!(vn |> CuArray, Val(G),(1,3,4)) |> Matrix ≈ instruct!(vn |> copy, Val(G), (1,3,4))
+            @test instruct!(Val(2),v1 |> CuArray, Val(G), (1,3,4)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (1,3,4))
+            @test instruct!(Val(2),vn |> CuArray, Val(G),(1,3,4)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (1,3,4))
         end
     end
 end
@@ -77,10 +77,10 @@ end
     vn = randn(ComplexF32, N, 333)
 
     for G in [:X, :Y, :Z, :T, :Tdag, :S, :Sdag]
-        @test instruct!(v1 |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Vector ≈ instruct!(v1 |> copy, Val(G), (3,), (4,5), (0, 1))
-        @test instruct!(vn |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Matrix ≈ instruct!(vn |> copy, Val(G), (3,), (4,5), (0, 1))
-        @test instruct!(v1 |> CuArray, Val(G), (3,), (1,), (1,)) |> Vector ≈ instruct!(v1 |> copy, Val(G),(3,), (1,), (1,))
-        @test instruct!(vn |> CuArray, Val(G), (3,), (1,), (1,)) |> Matrix ≈ instruct!(vn |> copy, Val(G),(3,), (1,), (1,))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (1,), (1,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G),(3,), (1,), (1,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (1,), (1,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G),(3,), (1,), (1,))
     end
 end
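
Reviewer note (not part of the patch): a minimal usage sketch of the migrated API, assuming Yao 0.7, CUDA.jl 3.8, and a CUDA-capable device. Only names that appear in this diff are used; the register size, batch count, and variable names are illustrative.

```julia
using Yao, CuYao, CUDA

# Upload a batched register to the GPU. Batching now lives in the register
# type (BatchedArrayReg) instead of the old ArrayReg{B} type parameter.
reg = rand_state(5; nbatch=10) |> cu

# measure! now decides scalar-vs-vector results from the register type
# (ArrayReg vs BatchedArrayReg) rather than the old `B == 1` check, so a
# batched register yields one outcome per batch entry.
res = measure!(reg)

# insert_qubits! is renamed to insert_qudits!; for qubit registers D == 2.
insert_qudits!(reg, 3; nqudits=2)

# Low-level instruct! takes Val(D) as its leading argument after this change.
v = CuArray(randn(ComplexF32, 1 << 5))
instruct!(Val(2), v, Val(:X), (3,))

# Download back to host memory.
reg |> cpu
```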