diff --git a/.github/workflows/runic.yml b/.github/workflows/runic.yml new file mode 100644 index 00000000..aa6d3bfd --- /dev/null +++ b/.github/workflows/runic.yml @@ -0,0 +1,25 @@ +name: Runic formatting +on: + push: + branches: + - 'master' + - 'release-' + tags: + - '*' + pull_request: +jobs: + runic: + name: Runic + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: "nightly" # Only nightly have the -m flag currently + - uses: julia-actions/cache@v2 + - name: Install Runic + run: | + julia --color=yes --project=@runic -e 'using Pkg; Pkg.add(url = "https://github.com/fredrikekre/Runic.jl")' + - name: Run Runic + run: | + git ls-files -z -- '*.jl' | xargs -0 julia --project=@runic -m Runic --check --diff diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 74276c41..b153f60e 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -36,11 +36,12 @@ let static = BenchmarkGroup() for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576) dtype[N] = @benchmarkable begin kernel = saxpy_kernel!($BACKEND, 1024) - kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z)) - end setup=( + kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z)) + end setup = ( X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N)); Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N)); - Z = KernelAbstractions.zeros($BACKEND, $T, $N)) + Z = KernelAbstractions.zeros($BACKEND, $T, $N) + ) end static["$T"] = dtype end @@ -53,11 +54,12 @@ let default = BenchmarkGroup() for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576) dtype[N] = @benchmarkable begin kernel = saxpy_kernel!($BACKEND) - kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z)) - end setup=( + kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z)) + end setup = ( X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N)); Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N)); - Z = KernelAbstractions.zeros($BACKEND, $T, $N)) + Z = KernelAbstractions.zeros($BACKEND, $T, $N) + ) end default["$T"] = dtype end diff --git a/docs/make.jl b/docs/make.jl index da1e4179..61c50655 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,17 +7,17 @@ function main() ci = get(ENV, "CI", "") == "true" makedocs(; - modules=[KernelAbstractions], - authors="JuliaGPU and contributors", - repo="https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}", - sitename="KernelAbstractions.jl", - format=Documenter.HTML(; - prettyurls=ci, - canonical="https://juliagpu.github.io/KernelAbstractions.jl", - assets=String[], + modules = [KernelAbstractions], + authors = "JuliaGPU and contributors", + repo = "https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}", + sitename = "KernelAbstractions.jl", + format = Documenter.HTML(; + prettyurls = ci, + canonical = "https://juliagpu.github.io/KernelAbstractions.jl", + assets = String[], ), - warnonly=[:missing_docs], - pages=[ + warnonly = [:missing_docs], + pages = [ "Home" => "index.md", "Quickstart" => "quickstart.md", "Writing kernels" => "kernels.md", @@ -34,13 +34,13 @@ function main() "Extras" => [ "extras/unrolling.md", ], # Extras - "Notes for implementations" => "implementations.md" + "Notes for implementations" => "implementations.md", ], # pages ) if ci deploydocs(; - repo="github.com/JuliaGPU/KernelAbstractions.jl", + repo = "github.com/JuliaGPU/KernelAbstractions.jl", push_preview = true, ) end diff --git a/examples/histogram.jl b/examples/histogram.jl index a4013481..311cff76 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -30,7 +30,7 @@ end # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 @uniform max_element = 1 - for min_element = 1:gs:N + for min_element in 1:gs:N # Setting shared_histogram to 0 @inbounds shared_histogram[lid] = 0 @@ -38,20 +38,20 @@ end max_element = min_element + gs if max_element > N - max_element = N+1 + max_element = N + 1 end # Defining bin on shared memory and writing to it if possible bin = input[tid] if bin >= min_element && bin < max_element - bin -= min_element-1 + bin -= min_element - 1 @atomic shared_histogram[bin] += 1 end @synchronize() - if ((lid+min_element-1) <= N) - @atomic histogram_output[lid+min_element-1] += shared_histogram[lid] + if ((lid + min_element - 1) <= N) + @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid] end end @@ -62,7 +62,7 @@ function histogram!(histogram_output, input) backend = get_backend(histogram_output) # Need static block size kernel! = histogram_kernel!(backend, (256,)) - kernel!(histogram_output, input, ndrange=size(input)) + kernel!(histogram_output, input, ndrange = size(input)) end function move(backend, input) @@ -75,9 +75,9 @@ end if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend) @test_skip false else - rand_input = [rand(1:128) for i = 1:1000] - linear_input = [i for i = 1:1024] - all_two = [2 for i = 1:512] + rand_input = [rand(1:128) for i in 1:1000] + linear_input = [i for i in 1:1024] + all_two = [2 for i in 1:512] histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) diff --git a/examples/matmul.jl b/examples/matmul.jl index 11d5c152..4ade3f37 100644 --- a/examples/matmul.jl +++ b/examples/matmul.jl @@ -7,11 +7,11 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # creating a temporary sum variable for matrix multiplication tmp_sum = zero(eltype(output)) - for k = 1:size(a)[2] - tmp_sum += a[i,k] * b[k, j] + for k in 1:size(a)[2] + tmp_sum += a[i, k] * b[k, j] end - output[i,j] = tmp_sum + output[i, j] = tmp_sum end # Creating a wrapper kernel for launching with error checks @@ -22,14 +22,14 @@ function matmul!(output, a, b) end backend = KernelAbstractions.get_backend(a) kernel! = matmul_kernel!(backend) - kernel!(output, a, b, ndrange=size(output)) + kernel!(output, a, b, ndrange = size(output)) end a = rand!(allocate(backend, Float32, 256, 123)) b = rand!(allocate(backend, Float32, 123, 45)) output = KernelAbstractions.zeros(backend, Float32, 256, 45) -matmul!(output, a,b) +matmul!(output, a, b) KernelAbstractions.synchronize(backend) -@test isapprox(output, a*b) +@test isapprox(output, a * b) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 8602769c..f826caec 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -12,7 +12,7 @@ function mycopy!(A, B) @assert get_backend(B) == backend kernel = copy_kernel!(backend) - kernel(A, B, ndrange=length(A)) + kernel(A, B, ndrange = length(A)) end A = KernelAbstractions.zeros(backend, Float64, 128, 128) diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 3ded81f2..e3239081 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -12,7 +12,7 @@ function mycopy_static!(A, B) @assert get_backend(B) == backend kernel = copy_kernel!(backend, 32, size(A)) # if size(A) varies this will cause recompilation - kernel(A, B, ndrange=size(A)) + kernel(A, B, ndrange = size(A)) end A = KernelAbstractions.zeros(backend, Float64, 128, 128) diff --git a/examples/mpi.jl b/examples/mpi.jl index d0825157..2f5e690d 100644 --- a/examples/mpi.jl +++ b/examples/mpi.jl @@ -43,8 +43,8 @@ function main(backend) comm = MPI.COMM_WORLD MPI.Barrier(comm) - dst_rank = mod(MPI.Comm_rank(comm)+1, MPI.Comm_size(comm)) - src_rank = mod(MPI.Comm_rank(comm)-1, MPI.Comm_size(comm)) + dst_rank = mod(MPI.Comm_rank(comm) + 1, MPI.Comm_size(comm)) + src_rank = mod(MPI.Comm_rank(comm) - 1, MPI.Comm_size(comm)) T = Int64 M = 10 @@ -59,8 +59,10 @@ function main(backend) KernelAbstractions.synchronize(backend) - recv_task, send_task = exchange!(h_send_buf, d_recv_buf, h_recv_buf, - src_rank, dst_rank, comm) + recv_task, send_task = exchange!( + h_send_buf, d_recv_buf, h_recv_buf, + src_rank, dst_rank, comm, + ) cooperative_wait(recv_task) cooperative_wait(send_task) diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl index bca184fa..20ea6a0c 100644 --- a/examples/naive_transpose.jl +++ b/examples/naive_transpose.jl @@ -17,7 +17,7 @@ function naive_transpose!(a, b) @assert get_backend(b) == backend groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024 kernel! = naive_transpose_kernel!(backend, groupsize) - kernel!(a, b, ndrange=size(a)) + kernel!(a, b, ndrange = size(a)) end # resolution of grid will be res*res @@ -27,7 +27,6 @@ res = 1024 b = rand!(allocate(backend, Float32, res, res)) a = KernelAbstractions.zeros(backend, Float32, res, res) -naive_transpose!(a,b) +naive_transpose!(a, b) KernelAbstractions.synchronize(backend) @test a == transpose(b) - diff --git a/examples/numa_aware.jl b/examples/numa_aware.jl index d2185085..f970d558 100644 --- a/examples/numa_aware.jl +++ b/examples/numa_aware.jl @@ -18,8 +18,10 @@ end Estimate the memory bandwidth (GB/s) by performing a time measurement of a SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s). """ -function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32, - init = :parallel) +function measure_membw( + backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32, + init = :parallel, + ) bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY flops = 2 * N # num flops in SAXY @@ -37,10 +39,10 @@ function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtyp kernel = saxpy_kernel($backend, $workgroup_size, $(size(Y))) kernel($a, $X, $Y, ndrange = $(size(Y))) KernelAbstractions.synchronize($backend) - end evals=2 samples=10 + end evals = 2 samples = 10 - mem_rate = bytes * 1e-9 / t # GB/s - flop_rate = flops * 1e-9 / t # GFLOP/s + mem_rate = bytes * 1.0e-9 / t # GB/s + flop_rate = flops * 1.0e-9 / t # GFLOP/s if verbose println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits = 2)) @@ -51,7 +53,7 @@ end # Static should be much better (on a system with multiple NUMA domains) measure_membw(CPU()); -measure_membw(CPU(; static=true)); +measure_membw(CPU(; static = true)); # The following has significantly worse performance (even on systems with a single memory domain)! # measure_membw(CPU(); init=:serial); diff --git a/examples/performance.jl b/examples/performance.jl index 4e1dbd9d..8faf3cae 100644 --- a/examples/performance.jl +++ b/examples/performance.jl @@ -15,8 +15,8 @@ const BLOCK_ROWS = 8 # Simple variants @kernel function simple_copy_kernel!(output, @Const(input)) - I, J = @index(Global, NTuple) - @inbounds output[I, J] = input[I, J] + I, J = @index(Global, NTuple) + @inbounds output[I, J] = input[I, J] end @kernel function simple_transpose_kernel!(output, @Const(input)) @@ -26,16 +26,18 @@ end # Local memory variants -@kernel function lmem_copy_kernel!(output, @Const(input), - ::Val{BANK}=Val(1)) where BANK +@kernel function lmem_copy_kernel!( + output, @Const(input), + ::Val{BANK} = Val(1), + ) where {BANK} I, J = @index(Global, NTuple) - i, j = @index(Local, NTuple) + i, j = @index(Local, NTuple) N = @uniform @groupsize()[1] M = @uniform @groupsize()[2] # +1 to avoid bank conflicts on shared memory - tile = @localmem eltype(output) (N+BANK, M) + tile = @localmem eltype(output) (N + BANK, M) @inbounds tile[i, j] = input[I, J] @@ -44,106 +46,112 @@ end @inbounds output[I, J] = tile[i, j] end -@kernel function lmem_transpose_kernel!(output, @Const(input), - ::Val{BANK}=Val(1)) where BANK +@kernel function lmem_transpose_kernel!( + output, @Const(input), + ::Val{BANK} = Val(1), + ) where {BANK} gi, gj = @index(Group, NTuple) - i, j = @index(Local, NTuple) + i, j = @index(Local, NTuple) N = @uniform @groupsize()[1] M = @uniform @groupsize()[2] - + # +1 to avoid bank conflicts on shared memory - tile = @localmem eltype(output) (N+BANK, M) + tile = @localmem eltype(output) (N + BANK, M) # Manually calculate global indexes # Later on we need to pivot the group index - I = (gi-1) * N + i - J = (gj-1) * M + j + I = (gi - 1) * N + i + J = (gj - 1) * M + j @inbounds tile[i, j] = input[I, J] @synchronize # Pivot the group index - I = (gj-1) * M + i - J = (gi-1) * N + j + I = (gj - 1) * M + i + J = (gi - 1) * N + j @inbounds output[I, J] = tile[j, i] end # Local Memory + process multiple elements per lane -@kernel function coalesced_copy_kernel!(output, @Const(input), - ::Val{BANK}=Val(1)) where BANK +@kernel function coalesced_copy_kernel!( + output, @Const(input), + ::Val{BANK} = Val(1), + ) where {BANK} gi, gj = @index(Group, NTuple) - i, j = @index(Local, NTuple) + i, j = @index(Local, NTuple) - TILE_DIM = @uniform @groupsize()[1] + TILE_DIM = @uniform @groupsize()[1] BLOCK_ROWS = @uniform @groupsize()[2] # +1 to avoid bank conflicts on shared memory - tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM) + tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) # Can't use @index(Global), because we use a smaller ndrange - I = (gi-1) * TILE_DIM + i - J = (gj-1) * TILE_DIM + j + I = (gi - 1) * TILE_DIM + i + J = (gj - 1) * TILE_DIM + j - @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1) - @inbounds tile[i, j+k] = input[I, J+k] + @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1) + @inbounds tile[i, j + k] = input[I, J + k] end @synchronize - @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1) - @inbounds output[I, J+k] = tile[i, j+k] + @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1) + @inbounds output[I, J + k] = tile[i, j + k] end end -@kernel function coalesced_transpose_kernel!(output, @Const(input), - ::Val{BANK}=Val(1)) where BANK +@kernel function coalesced_transpose_kernel!( + output, @Const(input), + ::Val{BANK} = Val(1), + ) where {BANK} gi, gj = @index(Group, NTuple) - i, j = @index(Local, NTuple) + i, j = @index(Local, NTuple) - TILE_DIM = @uniform @groupsize()[1] + TILE_DIM = @uniform @groupsize()[1] BLOCK_ROWS = @uniform @groupsize()[2] # +1 to avoid bank conflicts on shared memory - tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM) + tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) # Can't use @index(Global), because we use a smaller ndrange - I = (gi-1) * TILE_DIM + i - J = (gj-1) * TILE_DIM + j + I = (gi - 1) * TILE_DIM + i + J = (gj - 1) * TILE_DIM + j - @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1) - @inbounds tile[i, j+k] = input[I, J+k] + @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1) + @inbounds tile[i, j + k] = input[I, J + k] end @synchronize # Transpose block offsets - I = (gj-1) * TILE_DIM + i - J = (gi-1) * TILE_DIM + j + I = (gj - 1) * TILE_DIM + i + J = (gi - 1) * TILE_DIM + j - @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1) - @inbounds output[I, J+k] = tile[j+k, i] + @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1) + @inbounds output[I, J + k] = tile[j + k, i] end end # Benchmark simple -for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TILE_DIM)) - for (name, kernel) in ( - ("copy", simple_copy_kernel!(backend, block_dims)), - ("transpose", simple_transpose_kernel!(backend, block_dims)), - ) +for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM)) + for (name, kernel) in ( + ("copy", simple_copy_kernel!(backend, block_dims)), + ("transpose", simple_transpose_kernel!(backend, block_dims)), + ) NVTX.@range "Simple $name $block_dims" let input = rand!(allocate(backend, T, N, N)) output = similar(input) # compile kernel - kernel(output, input, ndrange=size(output)) + kernel(output, input, ndrange = size(output)) for rep in 1:nreps - kernel(output, input, ndrange=size(output)) + kernel(output, input, ndrange = size(output)) end KernelAbstractions.synchronize(backend) end @@ -151,19 +159,19 @@ for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TI end # Benchmark localmem -for (name, kernel) in ( - ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))), - ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))), - ) +for (name, kernel) in ( + ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))), + ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))), + ) for bank in (true, false) NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let input = rand!(allocate(backend, T, N, N)) output = similar(input) # compile kernel - kernel(output, input, Val(Int(bank)), ndrange=size(output)) + kernel(output, input, Val(Int(bank)), ndrange = size(output)) for rep in 1:nreps - kernel(output, input, Val(Int(bank)), ndrange=size(output)) + kernel(output, input, Val(Int(bank)), ndrange = size(output)) end KernelAbstractions.synchronize(backend) end @@ -171,10 +179,10 @@ for (name, kernel) in ( end # Benchmark localmem + multiple elements per lane -for (name, kernel) in ( - ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))), - ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))), - ) +for (name, kernel) in ( + ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))), + ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))), + ) for bank in (true, false) NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let input = rand!(allocate(backend, T, N, N)) @@ -187,9 +195,9 @@ for (name, kernel) in ( ndrange = (N, div(N, block_factor)) # compile kernel - kernel(output, input, Val(Int(bank)), ndrange=ndrange) + kernel(output, input, Val(Int(bank)), ndrange = ndrange) for rep in 1:nreps - kernel(output, input, Val(Int(bank)), ndrange=ndrange) + kernel(output, input, Val(Int(bank)), ndrange = ndrange) end KernelAbstractions.synchronize(backend) end diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index ae0760eb..dbd5ad75 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -6,68 +6,70 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # const TILE_DIM = 32 -@kernel function coalesced_matmul_kernel!(output, @Const(input1), @Const(input2), N, R, M, - ::Val{BANK}=Val(1)) where BANK - gi, gj = @index(Group, NTuple) - i, j = @index(Local, NTuple) - - TILE_DIM = @uniform @groupsize()[1] - - # +1 to avoid bank conflicts on shared memory - tile1 = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM) - tile2 = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM) - - # private variable for tile output - outval = @private eltype(output) 1 - @inbounds outval[1] = -zero(eltype(output)) - - @uniform N = size(output, 1) - # number of tiles depends on inner dimension - @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM) - - # loop over all tiles needed for this calculation - for t in 0:NUM_TILES-1 - # Can't use @index(Global), because we use a smaller ndrange - I = (gi-1) * TILE_DIM + i - J = (gj-1) * TILE_DIM + j - - # load inputs into tiles, with bounds checking for non-square matrices - if I <= N && t*TILE_DIM + j <= R - @inbounds tile1[i, j] = input1[I, t*TILE_DIM + j] - else - @inbounds tile1[i, j] = 0.0 - end - if t*TILE_DIM + i <= R && J <= M - @inbounds tile2[i, j] = input2[t*TILE_DIM + i, J] - else - @inbounds tile2[i, j] = 0.0 - end - - # wait for all tiles to be loaded - @synchronize - - # get global values again - I = (gi-1) * TILE_DIM + i - J = (gj-1) * TILE_DIM + j - - # calculate value of spot in output, use temporary value to allow for vectorization - out = zero(eltype(output)) - @simd for k in 1:TILE_DIM - @inbounds out += tile1[i, k] * tile2[k, j] - end - outval[1] += out - - @synchronize - end - - # get global indices again - I = (gi-1) * TILE_DIM + i - J = (gj-1) * TILE_DIM + j - - # save if inbounds - if I <= N && J <= M - @inbounds output[I, J] = outval[1] - end +@kernel function coalesced_matmul_kernel!( + output, @Const(input1), @Const(input2), N, R, M, + ::Val{BANK} = Val(1), + ) where {BANK} + gi, gj = @index(Group, NTuple) + i, j = @index(Local, NTuple) + + TILE_DIM = @uniform @groupsize()[1] + + # +1 to avoid bank conflicts on shared memory + tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) + tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM) + + # private variable for tile output + outval = @private eltype(output) 1 + @inbounds outval[1] = -zero(eltype(output)) + + @uniform N = size(output, 1) + # number of tiles depends on inner dimension + @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM) + + # loop over all tiles needed for this calculation + for t in 0:(NUM_TILES - 1) + # Can't use @index(Global), because we use a smaller ndrange + I = (gi - 1) * TILE_DIM + i + J = (gj - 1) * TILE_DIM + j + + # load inputs into tiles, with bounds checking for non-square matrices + if I <= N && t * TILE_DIM + j <= R + @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j] + else + @inbounds tile1[i, j] = 0.0 + end + if t * TILE_DIM + i <= R && J <= M + @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J] + else + @inbounds tile2[i, j] = 0.0 + end + + # wait for all tiles to be loaded + @synchronize + + # get global values again + I = (gi - 1) * TILE_DIM + i + J = (gj - 1) * TILE_DIM + j + + # calculate value of spot in output, use temporary value to allow for vectorization + out = zero(eltype(output)) + @simd for k in 1:TILE_DIM + @inbounds out += tile1[i, k] * tile2[k, j] + end + outval[1] += out + + @synchronize + end + + # get global indices again + I = (gi - 1) * TILE_DIM + i + J = (gj - 1) * TILE_DIM + j + + # save if inbounds + if I <= N && J <= M + @inbounds output[I, J] = outval[1] + end end N = 1024 @@ -79,7 +81,7 @@ C = KernelAbstractions.zeros(backend, Float32, N, M) kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM)) -kern(C, A, B, N, R, M, ndrange=size(C)) +kern(C, A, B, N, R, M, ndrange = size(C)) KernelAbstractions.synchronize(backend) -@test isapprox(A*B, C) +@test isapprox(A * B, C) diff --git a/examples/utils.jl b/examples/utils.jl index ea3d3db3..5e93299b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -7,4 +7,3 @@ if Base.find_package("CUDA") !== nothing else const backend = CPU() end - diff --git a/ext/EnzymeExt.jl b/ext/EnzymeExt.jl index 04eecd87..aee0d296 100644 --- a/ext/EnzymeExt.jl +++ b/ext/EnzymeExt.jl @@ -29,10 +29,10 @@ import KernelAbstractions: synchronize function EnzymeCore.compiler_job_from_backend( - b::Backend, - @nospecialize(F::Type), - @nospecialize(TT::Type) -) + b::Backend, + @nospecialize(F::Type), + @nospecialize(TT::Type), + ) error( "EnzymeCore.compiler_job_from_backend is not yet implemented for $(typeof(b)), please file an issue.", ) @@ -54,12 +54,12 @@ function gpu_fwd(ctx, f, args...) end function EnzymeRules.forward( - func::Const{<:Kernel{CPU}}, - ::Type{Const{Nothing}}, - args...; - ndrange = nothing, - workgroupsize = nothing, -) + func::Const{<:Kernel{CPU}}, + ::Type{Const{Nothing}}, + args...; + ndrange = nothing, + workgroupsize = nothing, + ) kernel = func.val f = kernel.f fwd_kernel = similar(kernel, cpu_fwd) @@ -68,12 +68,12 @@ function EnzymeRules.forward( end function EnzymeRules.forward( - func::Const{<:Kernel{<:GPU}}, - ::Type{Const{Nothing}}, - args...; - ndrange = nothing, - workgroupsize = nothing, -) + func::Const{<:Kernel{<:GPU}}, + ::Type{Const{Nothing}}, + args...; + ndrange = nothing, + workgroupsize = nothing, + ) kernel = func.val f = kernel.f fwd_kernel = similar(kernel, gpu_fwd) @@ -87,23 +87,23 @@ _enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) = mkcontext(kernel, ndrange, iterspace) _augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) = - AugmentedReturn{Nothing,Nothing,Tuple{Array,typeof(arg_refs),typeof(tape_type)}}( - nothing, - nothing, - (subtape, arg_refs, tape_type), - ) + AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}( + nothing, + nothing, + (subtape, arg_refs, tape_type), +) _augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) = - AugmentedReturn{Nothing,Nothing,Any}(nothing, nothing, (subtape, arg_refs, tape_type)) + AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type)) function _create_tape_kernel( - kernel::Kernel{CPU}, - ModifiedBetween, - FT, - ctxTy, - ndrange, - iterspace, - args2..., -) + kernel::Kernel{CPU}, + ModifiedBetween, + FT, + ctxTy, + ndrange, + iterspace, + args2..., + ) TapeType = EnzymeCore.tape_type( ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), FT, @@ -117,14 +117,14 @@ function _create_tape_kernel( end function _create_tape_kernel( - kernel::Kernel{<:GPU}, - ModifiedBetween, - FT, - ctxTy, - ndrange, - iterspace, - args2..., -) + kernel::Kernel{<:GPU}, + ModifiedBetween, + FT, + ctxTy, + ndrange, + iterspace, + args2..., + ) # For peeking at the TapeType we need to first construct a correct compilation job # this requires the use of the device side representation of arguments. # So we convert the arguments here, this is a bit wasteful since the `aug_kernel` call @@ -154,13 +154,13 @@ _create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev) _create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev) function cpu_aug_fwd( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., -) where {ModifiedBetween,FT,TapeType} + ctx, + f::FT, + ::Val{ModifiedBetween}, + subtape, + ::Val{TapeType}, + args..., + ) where {ModifiedBetween, FT, TapeType} # A2 = Const{Nothing} -- since f->Nothing forward, _ = EnzymeCore.autodiff_thunk( ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), @@ -178,13 +178,13 @@ function cpu_aug_fwd( end function cpu_rev( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., -) where {ModifiedBetween,FT,TapeType} + ctx, + f::FT, + ::Val{ModifiedBetween}, + subtape, + ::Val{TapeType}, + args..., + ) where {ModifiedBetween, FT, TapeType} _, reverse = EnzymeCore.autodiff_thunk( ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), Const{Core.Typeof(f)}, @@ -200,13 +200,13 @@ end # GPU support function gpu_aug_fwd( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., -) where {ModifiedBetween,FT,TapeType} + ctx, + f::FT, + ::Val{ModifiedBetween}, + subtape, + ::Val{TapeType}, + args..., + ) where {ModifiedBetween, FT, TapeType} # A2 = Const{Nothing} -- since f->Nothing forward, _ = EnzymeCore.autodiff_deferred_thunk( ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), @@ -225,13 +225,13 @@ function gpu_aug_fwd( end function gpu_rev( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., -) where {ModifiedBetween,FT,TapeType} + ctx, + f::FT, + ::Val{ModifiedBetween}, + subtape, + ::Val{TapeType}, + args..., + ) where {ModifiedBetween, FT, TapeType} # XXX: TapeType and A2 as args to autodiff_deferred_thunk _, reverse = EnzymeCore.autodiff_deferred_thunk( ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), @@ -248,13 +248,13 @@ function gpu_rev( end function EnzymeRules.augmented_primal( - config::Config, - func::Const{<:Kernel}, - ::Type{Const{Nothing}}, - args::Vararg{Any,N}; - ndrange = nothing, - workgroupsize = nothing, -) where {N} + config::Config, + func::Const{<:Kernel}, + ::Type{Const{Nothing}}, + args::Vararg{Any, N}; + ndrange = nothing, + workgroupsize = nothing, + ) where {N} kernel = func.val f = kernel.f @@ -306,14 +306,14 @@ function EnzymeRules.augmented_primal( end function EnzymeRules.reverse( - config::Config, - func::Const{<:Kernel}, - ::Type{<:EnzymeCore.Annotation}, - tape, - args::Vararg{Any,N}; - ndrange = nothing, - workgroupsize = nothing, -) where {N} + config::Config, + func::Const{<:Kernel}, + ::Type{<:EnzymeCore.Annotation}, + tape, + args::Vararg{Any, N}; + ndrange = nothing, + workgroupsize = nothing, + ) where {N} subtape, arg_refs, tape_type = tape args2 = ntuple(Val(N)) do i @@ -359,22 +359,22 @@ end # synchronize rule and then synchronize where the launch was. However, with the current # kernel semantics this ensures correctness for now. function EnzymeRules.augmented_primal( - config::Config, - func::Const{typeof(synchronize)}, - ::Type{Const{Nothing}}, - backend::T, -) where {T<:EnzymeCore.Annotation} + config::Config, + func::Const{typeof(synchronize)}, + ::Type{Const{Nothing}}, + backend::T, + ) where {T <: EnzymeCore.Annotation} synchronize(backend.val) return AugmentedReturn(nothing, nothing, nothing) end function EnzymeRules.reverse( - config::Config, - func::Const{typeof(synchronize)}, - ::Type{Const{Nothing}}, - tape, - backend, -) + config::Config, + func::Const{typeof(synchronize)}, + ::Type{Const{Nothing}}, + tape, + backend, + ) # noop for now return (nothing,) end diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index d1f21b32..a2cea14f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -53,7 +53,7 @@ synchronize(backend) ``` """ macro kernel(expr) - __kernel(expr, #=generate_cpu=#true, #=force_inbounds=#false) + __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false) end """ @@ -75,18 +75,20 @@ macro kernel(ex...) else generate_cpu = true force_inbounds = false - for i = 1:length(ex)-1 + for i in 1:(length(ex) - 1) if ex[i] isa Expr && ex[i].head == :(=) && - ex[i].args[1] == :cpu && ex[i].args[2] isa Bool + ex[i].args[1] == :cpu && ex[i].args[2] isa Bool generate_cpu = ex[i].args[2] elseif ex[i] isa Expr && ex[i].head == :(=) && - ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool + ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool force_inbounds = ex[i].args[2] else - error("Configuration should be of form:\n"* - "* `cpu=true`\n"* - "* `inbounds=false`\n"* - "got `", ex[i], "`") + error( + "Configuration should be of form:\n" * + "* `cpu=true`\n" * + "* `inbounds=false`\n" * + "got `", ex[i], "`", + ) end end __kernel(ex[end], generate_cpu, force_inbounds) @@ -303,7 +305,7 @@ This is a unified print statement. """ macro print(items...) - args = Union{Val,Expr,Symbol}[] + args = Union{Val, Expr, Symbol}[] items = [items...] while true @@ -330,7 +332,7 @@ macro print(items...) end quote - $__print($(map(esc,args)...)) + $__print($(map(esc, args)...)) end end @@ -373,8 +375,8 @@ macro index(locale, args...) if length(args) >= 1 if args[1] === :Cartesian || - args[1] === :Linear || - args[1] === :NTuple + args[1] === :Linear || + args[1] === :NTuple indexkind = args[1] args = args[2:end] else @@ -439,7 +441,7 @@ Instantiate a CPU (multi-threaded) backend. """ struct CPU <: Backend static::Bool - CPU(;static::Bool=false) = new(static) + CPU(; static::Bool = false) = new(static) end """ @@ -493,7 +495,7 @@ allocate(backend::Backend, T, dims::Tuple) = throw(MethodError(allocate, (backen Allocate a storage array appropriate for the computational backend filled with zeros. """ zeros(backend::Backend, T, dims...) = zeros(backend, T, dims) -function zeros(backend::Backend, ::Type{T}, dims::Tuple) where T +function zeros(backend::Backend, ::Type{T}, dims::Tuple) where {T} data = allocate(backend, T, dims...) fill!(data, zero(T)) return data @@ -505,7 +507,7 @@ end Allocate a storage array appropriate for the computational backend filled with ones. """ ones(backend::Backend, T, dims...) = ones(backend, T, dims) -function ones(backend::Backend, ::Type{T}, dims::Tuple) where T +function ones(backend::Backend, ::Type{T}, dims::Tuple) where {T} data = allocate(backend, T, dims) fill!(data, one(T)) return data @@ -589,7 +591,7 @@ in a workgroup. ``` As well as the on-device functionality. """ -struct Kernel{Backend, WorkgroupSize<:_Size, NDRange<:_Size, Fun} +struct Kernel{Backend, WorkgroupSize <: _Size, NDRange <: _Size, Fun} backend::Backend f::Fun end @@ -599,7 +601,7 @@ function Base.similar(kernel::Kernel{D, WS, ND}, f::F) where {D, WS, ND, F} end workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize -ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange +ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange} = NDRange backend(kernel::Kernel) = kernel.backend """ @@ -610,7 +612,7 @@ Partition a kernel for the given ndrange and workgroupsize. static_workgroupsize = KernelAbstractions.workgroupsize(kernel) if ndrange === nothing && static_ndrange <: DynamicSize || - workgroupsize === nothing && static_workgroupsize <: DynamicSize + workgroupsize === nothing && static_workgroupsize <: DynamicSize errmsg = """ Can not partition kernel! @@ -662,7 +664,7 @@ Partition a kernel for the given ndrange and workgroupsize. return iterspace, dynamic end -function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend<:Union{CPU,GPU}, S<:_Size, NDRange<:_Size, XPUName} +function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName} return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name) end @@ -719,7 +721,7 @@ __size(i::Int) = Tuple{i} Convert arguments to the device side representation. """ -argconvert(k::Kernel{T}, arg) where T = +argconvert(k::Kernel{T}, arg) where {T} = error("Don't know how to convert arguments for Kernel{$T}") # Enzyme support @@ -764,7 +766,7 @@ PrecompileTools.@compile_workload begin end if !isdefined(Base, :get_extension) -using Requires + using Requires end @static if !isdefined(Base, :get_extension) diff --git a/src/compiler.jl b/src/compiler.jl index 0f7b7a71..c60bc83b 100644 --- a/src/compiler.jl +++ b/src/compiler.jl @@ -20,12 +20,12 @@ struct CompilerMetadata{StaticNDRange, CheckBounds, I, NDRange, Iterspace} end end -@inline __iterspace(cm::CompilerMetadata) = cm.iterspace +@inline __iterspace(cm::CompilerMetadata) = cm.iterspace @inline __groupindex(cm::CompilerMetadata) = cm.groupindex @inline __groupsize(cm::CompilerMetadata) = size(workitems(__iterspace(cm))) @inline __dynamic_checkbounds(::CompilerMetadata{NDRange, CB}) where {NDRange, CB} = CB <: DynamicCheck -@inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:StaticSize} = CartesianIndices(get(NDRange)) -@inline __ndrange(cm::CompilerMetadata{NDRange}) where {NDRange<:DynamicSize} = cm.ndrange +@inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange <: StaticSize} = CartesianIndices(get(NDRange)) +@inline __ndrange(cm::CompilerMetadata{NDRange}) where {NDRange <: DynamicSize} = cm.ndrange @inline __workitems_iterspace(ctx) = workitems(__iterspace(ctx)) @inline groupsize(ctx) = __groupsize(ctx) diff --git a/src/cpu.jl b/src/cpu.jl index 539a1e33..c4afec5c 100644 --- a/src/cpu.jl +++ b/src/cpu.jl @@ -3,15 +3,15 @@ import UnsafeAtomicsLLVM unsafe_free!(::AbstractArray) = return synchronize(::CPU) = nothing -allocate(::CPU, ::Type{T}, dims::Tuple) where T = Array{T}(undef, dims) +allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims) -function zeros(backend::CPU, ::Type{T}, dims::Tuple) where T +function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T} arr = allocate(backend, T, dims) kernel = init_kernel(backend) - kernel(arr, zero, T,ndrange = length(arr)) + kernel(arr, zero, T, ndrange = length(arr)) return arr end -function ones(backend::CPU, ::Type{T}, dims::Tuple) where T +function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T} arr = allocate(backend, T, dims) kernel = init_kernel(backend) kernel(arr, one, T; ndrange = length(arr)) @@ -36,7 +36,7 @@ end functional(::CPU) = true -function (obj::Kernel{CPU})(args...; ndrange=nothing, workgroupsize=nothing, ) +function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing) ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize) if length(blocks(iterspace)) == 0 @@ -74,7 +74,7 @@ end ndrange = (ndrange,) end if workgroupsize isa Integer - workgroupsize = (workgroupsize, ) + workgroupsize = (workgroupsize,) end if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing @@ -121,12 +121,12 @@ end function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic) # compute this thread's iterations - f = 1 + ((tid-1) * len) + f = 1 + ((tid - 1) * len) l = f + len - 1 # distribute remaining iterations evenly if rem > 0 if tid <= rem - f = f + (tid-1) + f = f + (tid - 1) l = l + tid else f = f + rem @@ -134,7 +134,7 @@ function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic) end end # run this thread's iterations - for i = f:l + for i in f:l block = @inbounds blocks(iterspace)[i] ctx = mkcontext(obj, block, ndrange, iterspace, dynamic) obj.f(ctx, args...) @@ -142,7 +142,7 @@ function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic) return nothing end -function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where Dynamic +function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic} return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace) end @@ -206,13 +206,13 @@ end # Base.view creates a boundscheck which captures A # https://github.com/JuliaLang/julia/issues/39308 -@inline function aview(A, I::Vararg{Any, N}) where N - J = Base.to_indices(A, I) - Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...) +@inline function aview(A, I::Vararg{Any, N}) where {N} + J = Base.to_indices(A, I) + Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...) end -@inline function Base.getindex(A::ScratchArray{N}, idx) where N - return @inbounds aview(A.data, ntuple(_->:, Val(N))..., idx) +@inline function Base.getindex(A::ScratchArray{N}, idx) where {N} + return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx) end # Argument conversion diff --git a/src/macros.jl b/src/macros.jl index a659551d..55b4bab4 100644 --- a/src/macros.jl +++ b/src/macros.jl @@ -10,7 +10,7 @@ function find_return(stmt) end # XXX: Proper errors -function __kernel(expr, generate_cpu=true, force_inbounds=false) +function __kernel(expr, generate_cpu = true, force_inbounds = false) def = splitdef(expr) name = def[:name] args = def[:args] @@ -53,7 +53,7 @@ function __kernel(expr, generate_cpu=true, force_inbounds=false) Core.@__doc__ $name(dev) = $name(dev, $DynamicSize(), $DynamicSize()) $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize()) $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range)) - function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S<:$_Size, NDRange<:$_Size} + function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size} if $isgpu(dev) return $construct(dev, sz, range, $gpu_name) else @@ -87,7 +87,7 @@ function transform_gpu!(def, constargs, force_inbounds) body = def[:body] if force_inbounds body = quote - @inbounds $(body) + @inbounds $(body) end end body = quote @@ -96,7 +96,8 @@ function transform_gpu!(def, constargs, force_inbounds) end return nothing end - def[:body] = Expr(:let, + def[:body] = Expr( + :let, Expr(:block, let_constargs...), body, ) @@ -129,18 +130,19 @@ function transform_cpu!(def, constargs, force_inbounds) end push!(new_stmts, Expr(:popaliasscope)) push!(new_stmts, :(return nothing)) - def[:body] = Expr(:let, + def[:body] = Expr( + :let, Expr(:block, let_constargs...), - Expr(:block, new_stmts...) + Expr(:block, new_stmts...), ) end struct WorkgroupLoop - indicies :: Vector{Any} - stmts :: Vector{Any} - allocations :: Vector{Any} - private_allocations :: Vector{Any} - private :: Set{Symbol} + indicies::Vector{Any} + stmts::Vector{Any} + allocations::Vector{Any} + private_allocations::Vector{Any} + private::Set{Symbol} end is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_)) @@ -160,17 +162,19 @@ function find_sync(stmt) end # TODO proper handling of LineInfo -function split(stmts, - indicies = Any[], private = Set{Symbol}()) +function split( + stmts, + indicies = Any[], private = Set{Symbol}(), + ) # 1. Split the code into blocks separated by `@synchronize` # 2. Aggregate `@index` expressions # 3. Hoist allocations # 4. Hoist uniforms - current = Any[] + current = Any[] allocations = Any[] private_allocations = Any[] - new_stmts = Any[] + new_stmts = Any[] for stmt in stmts has_sync = find_sync(stmt) if has_sync @@ -178,7 +182,7 @@ function split(stmts, push!(new_stmts, emit(loop)) allocations = Any[] private_allocations = Any[] - current = Any[] + current = Any[] is_sync(stmt) && continue # Recurse into scope constructs @@ -210,7 +214,7 @@ function split(stmts, if @capture(rhs, @index(args__)) push!(indicies, stmt) continue - elseif @capture(rhs, @localmem(args__) | @uniform(args__) ) + elseif @capture(rhs, @localmem(args__) | @uniform(args__)) push!(allocations, stmt) continue elseif @capture(rhs, @private(T_, dims_)) @@ -255,14 +259,14 @@ function emit(loop) for stmt in loop.private_allocations if @capture(stmt, lhs_ = rhs_) - push!(stmts, :($lhs = ntuple(_->$rhs, $N))) + push!(stmts, :($lhs = ntuple(_ -> $rhs, $N))) else error("@private $stmt not an assignment") end end # don't emit empty loops - if !(isempty(loop.stmts) || all(s->s isa LineNumberNode, loop.stmts)) + if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts)) body = Expr(:block, loop.stmts...) body = postwalk(body) do expr if @capture(expr, lhs_ = rhs_) diff --git a/src/nditeration.jl b/src/nditeration.jl index d7598ae2..ab3fd4ec 100644 --- a/src/nditeration.jl +++ b/src/nditeration.jl @@ -12,13 +12,13 @@ struct NoDynamicCheck end abstract type _Size end struct DynamicSize <: _Size end struct StaticSize{S} <: _Size - function StaticSize{S}() where S + function StaticSize{S}() where {S} new{S::Tuple{Vararg{Int}}}() end end -@pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}() -@pure StaticSize(s::Int...) = StaticSize{s}() +@pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}() +@pure StaticSize(s::Int...) = StaticSize{s}() @pure StaticSize(s::Type{<:Tuple}) = StaticSize{tuple(s.parameters...)}() # Some @pure convenience functions for `StaticSize` @@ -59,10 +59,10 @@ struct NDRange{N, StaticBlocks, StaticWorkitems, DynamicBlock, DynamicWorkitems} end end -@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:DynamicSize} = range.workitems::CartesianIndices{N} -@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:StaticSize} = CartesianIndices(get(W))::CartesianIndices{N} -@inline blocks(range::NDRange{N, B}) where {N,B<:DynamicSize} = range.blocks::CartesianIndices{N} -@inline blocks(range::NDRange{N, B}) where {N,B<:StaticSize} = CartesianIndices(get(B))::CartesianIndices{N} +@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: DynamicSize} = range.workitems::CartesianIndices{N} +@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: StaticSize} = CartesianIndices(get(W))::CartesianIndices{N} +@inline blocks(range::NDRange{N, B}) where {N, B <: DynamicSize} = range.blocks::CartesianIndices{N} +@inline blocks(range::NDRange{N, B}) where {N, B <: StaticSize} = CartesianIndices(get(B))::CartesianIndices{N} import Base.iterate @inline iterate(range::NDRange) = iterate(blocks(range)) @@ -70,12 +70,12 @@ import Base.iterate Base.length(range::NDRange) = length(blocks(range)) -@inline function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N +@inline function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N} nI = ntuple(Val(N)) do I Base.@_inline_meta stride = size(workitems(ndrange), I) gidx = groupidx.I[I] - (gidx-1)*stride + idx.I[I] + (gidx - 1) * stride + idx.I[I] end CartesianIndex(nI) end @@ -84,11 +84,11 @@ Base.@propagate_inbounds function expand(ndrange::NDRange, groupidx::Integer, id expand(ndrange, blocks(ndrange)[groupidx], workitems(ndrange)[idx]) end -Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where N +Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where {N} expand(ndrange, groupidx, workitems(ndrange)[idx]) end -Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::Integer, idx::CartesianIndex{N}) where N +Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::Integer, idx::CartesianIndex{N}) where {N} expand(ndrange, blocks(ndrange)[groupidx], idx) end diff --git a/src/reflection.jl b/src/reflection.jl index e0be71c6..da3ba1fb 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -4,7 +4,7 @@ export @ka_code_typed, @ka_code_llvm using UUIDs const Cthulhu = Base.PkgId(UUID("f68482b8-f384-11e8-15f7-abe071a5a75f"), "Cthulhu") -function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, interactive=false, kwargs...) +function ka_code_typed(kernel, argtypes; ndrange = nothing, workgroupsize = nothing, interactive = false, kwargs...) # get the iterspace and dynamic of a kernel ndrange, workgroupsize, iterspace, dynamic = KernelAbstractions.launch_config(kernel, ndrange, workgroupsize) @@ -24,7 +24,7 @@ function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, if interactive # call Cthulhu without introducing a dependency on Cthulhu mod = Base.get(Base.loaded_modules, Cthulhu, nothing) - mod===nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.") + mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.") descend_code_typed = getfield(mod, :descend_code_typed) return descend_code_typed(typeof(kernel.f), (typeof(ctx), argtypes...); kwargs...) else @@ -33,11 +33,11 @@ function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, end -function ka_code_llvm(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, kwargs...) - ka_code_llvm(stdout, kernel, argtypes; ndrange=ndrange, workgroupsize=nothing, kwargs...) +function ka_code_llvm(kernel, argtypes; ndrange = nothing, workgroupsize = nothing, kwargs...) + ka_code_llvm(stdout, kernel, argtypes; ndrange = ndrange, workgroupsize = nothing, kwargs...) end -function ka_code_llvm(io::IO, kernel, argtypes; ndrange=nothing, workgroupsize=nothing, kwargs...) +function ka_code_llvm(io::IO, kernel, argtypes; ndrange = nothing, workgroupsize = nothing, kwargs...) # get the iterspace and dynamic of a kernel ndrange, workgroupsize, iterspace, dynamic = KernelAbstractions.launch_config(kernel, ndrange, workgroupsize) @@ -60,7 +60,7 @@ function format_ex(ex0) args = gensym(:args) old_args = nothing kern = nothing - for i = 1:length(ex0) + for i in 1:length(ex0) if ex0[i].head == :call # inside kernel() expr while length(ex0[i].args) > 2 @@ -122,7 +122,7 @@ macro ka_code_typed(ex0...) quote local $(esc(args)) = $(old_args) # e.g. translate CuArray to CuBackendArray - $(esc(args)) = map(x->argconvert($kern, x), $(esc(args))) + $(esc(args)) = map(x -> argconvert($kern, x), $(esc(args))) local results = $thecall if results !== nothing @@ -155,7 +155,7 @@ macro ka_code_llvm(ex0...) quote local $(esc(args)) = $(old_args) - if isa($kern, Kernel{G} where {G<:GPU}) + if isa($kern, Kernel{G} where {G <: GPU}) # does not support GPU kernels error("@ka_code_llvm does not support GPU kernels") end diff --git a/test/compiler.jl b/test/compiler.jl index 51050650..cf86386e 100644 --- a/test/compiler.jl +++ b/test/compiler.jl @@ -44,33 +44,33 @@ function compiler_testsuite(backend, ArrayT) @test KernelAbstractions.__index_Global_NTuple(ctx, CartesianIndex(1)) == (1,) A = ArrayT{Int}(undef, 1) - let (CI, rt) = @ka_code_typed literal_pow(backend())(A, ndrange=1) + let (CI, rt) = @ka_code_typed literal_pow(backend())(A, ndrange = 1) # test that there is no invoke of overdub @test !any(check_for_overdub, CI.code) end A = ArrayT{Float32}(undef, 1) - let (CI, rt) = @ka_code_typed square(backend())(A, A, ndrange=1) + let (CI, rt) = @ka_code_typed square(backend())(A, A, ndrange = 1) # test that there is no invoke of overdub @test !any(check_for_overdub, CI.code) end A = ArrayT{Float32}(undef, 1) B = ArrayT{Float32}(undef, 1) - let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange=1) + let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1) # test that there is no invoke of overdub @test !any(check_for_overdub, CI.code) end A = ArrayT{Float32}(undef, 1) B = ArrayT{Int32}(undef, 1) - let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange=1) + let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1) # test that there is no invoke of overdub @test !any(check_for_overdub, CI.code) end A = ArrayT{Int}(undef, 1) - let (CI, rt) = @ka_code_typed checked(backend())(A, 1, 2, ndrange=1) + let (CI, rt) = @ka_code_typed checked(backend())(A, 1, 2, ndrange = 1) # test that there is no invoke of overdub @test !any(check_for_overdub, CI.code) end diff --git a/test/convert.jl b/test/convert.jl index 95a9a822..a87f4801 100644 --- a/test/convert.jl +++ b/test/convert.jl @@ -47,20 +47,20 @@ function convert_testsuite(backend, ArrayT) ET = KernelAbstractions.supports_float64(backend()) ? Float64 : Float32 N = 32 - d_A = ArrayT([rand(ET)*3 for i = 1:N]) + d_A = ArrayT([rand(ET) * 3 for i in 1:N]) # 30 because we have 10 integer types and we have 3 operations d_B = ArrayT(zeros(ET, N, 30)) @testset "convert test" begin kernel = convert_kernel!(backend(), 4) - kernel(d_A, d_B, ndrange=(N),) + kernel(d_A, d_B, ndrange = (N)) synchronize(backend()) - for i = 1:10 - @test d_B[:,i] == ceil.(d_A) - @test d_B[:,i+10] == floor.(d_A) - @test d_B[:,i+20] == round.(d_A) + for i in 1:10 + @test d_B[:, i] == ceil.(d_A) + @test d_B[:, i + 10] == floor.(d_A) + @test d_B[:, i + 20] == round.(d_A) end end end diff --git a/test/examples.jl b/test/examples.jl index 9b1d48f7..a3cb04d1 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -1,4 +1,4 @@ -function find_sources(path::String, sources=String[]) +function find_sources(path::String, sources = String[]) if isdir(path) for entry in readdir(path) find_sources(joinpath(path, entry), sources) @@ -10,21 +10,21 @@ function find_sources(path::String, sources=String[]) end function examples_testsuite(backend_str) -@testset "examples" begin - examples_dir = joinpath(@__DIR__, "..", "examples") - examples = find_sources(examples_dir) - filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) - if backend_str == "ROCM" - filter!(file -> occursin("# INCLUDE ROCM", String(read(file))), examples) - end + @testset "examples" begin + examples_dir = joinpath(@__DIR__, "..", "examples") + examples = find_sources(examples_dir) + filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) + if backend_str == "ROCM" + filter!(file -> occursin("# INCLUDE ROCM", String(read(file))), examples) + end - @testset "$(basename(example))" for example in examples - @eval module $(gensym()) + @testset "$(basename(example))" for example in examples + @eval module $(gensym()) backend_str = $backend_str include($example) + end + @test true end - @test true - end -end + end end diff --git a/test/extensions/enzyme.jl b/test/extensions/enzyme.jl index 3d8a5082..01403776 100644 --- a/test/extensions/enzyme.jl +++ b/test/extensions/enzyme.jl @@ -9,7 +9,7 @@ end function square_caller(A, backend) kernel = square!(backend) - kernel(A, ndrange=size(A)) + kernel(A, ndrange = size(A)) KernelAbstractions.synchronize(backend) end @@ -21,11 +21,11 @@ end function mul_caller(A, B, backend) kernel = mul!(backend) - kernel(A, B, ndrange=size(A)) + kernel(A, B, ndrange = size(A)) KernelAbstractions.synchronize(backend) end -function enzyme_testsuite(backend, ArrayT, supports_reverse=true) +function enzyme_testsuite(backend, ArrayT, supports_reverse = true) @testset "kernels" begin A = ArrayT{Float64}(undef, 64) dA = ArrayT{Float64}(undef, 64) diff --git a/test/localmem.jl b/test/localmem.jl index 65efea32..2aa87239 100644 --- a/test/localmem.jl +++ b/test/localmem.jl @@ -38,7 +38,7 @@ function localmem_testsuite(backend, ArrayT) @testset "kernels" begin @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16)) A = ArrayT{Int}(undef, 64) - kernel!(A, ndrange=size(A)) + kernel!(A, ndrange = size(A)) synchronize(backend()) @test all(A[1:16] .== 16:-1:1) @test all(A[17:32] .== 16:-1:1) diff --git a/test/nditeration.jl b/test/nditeration.jl index 1647c5a3..1310945f 100644 --- a/test/nditeration.jl +++ b/test/nditeration.jl @@ -3,86 +3,86 @@ using KernelAbstractions.NDIteration using Test function nditeration_testsuite() -@testset "iteration" begin - let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32))); - @test length(ndrange) == 256*256 - @test all(p->p[1]==p[2], zip(ndrange, CartesianIndices((256, 256)))) - end - let ndrange = NDRange{2, StaticSize{(256,256)}, DynamicSize}(nothing, CartesianIndices((32, 32))); - @test length(ndrange) == 256*256 - @test all(p->p[1]==p[2], zip(ndrange, CartesianIndices((256, 256)))) + @testset "iteration" begin + let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32))); + @test length(ndrange) == 256 * 256 + @test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256)))) + end + let ndrange = NDRange{2, StaticSize{(256, 256)}, DynamicSize}(nothing, CartesianIndices((32, 32))); + @test length(ndrange) == 256 * 256 + @test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256)))) + end end -end -# GPU scenario where we get a linear index into workitems/blocks -function linear_iteration(ndrange) - idx = Array{CartesianIndex{2}}(undef, length(blocks(ndrange)) * length(workitems(ndrange))) - for i in 1:length(blocks(ndrange)) - for j in 1:length(workitems(ndrange)) - I = j + (i-1) * length(workitems(ndrange)) - idx[I] = expand(ndrange, i, j) + # GPU scenario where we get a linear index into workitems/blocks + function linear_iteration(ndrange) + idx = Array{CartesianIndex{2}}(undef, length(blocks(ndrange)) * length(workitems(ndrange))) + for i in 1:length(blocks(ndrange)) + for j in 1:length(workitems(ndrange)) + I = j + (i - 1) * length(workitems(ndrange)) + idx[I] = expand(ndrange, i, j) + end end + return idx end - return idx -end -function check(idx, offset, offset_x, offset_y, Dim_x, Dim_y) - N = Dim_x * Dim_y - all(p->p[1]==p[2], zip(idx[offset*N .+ 1:N], CartesianIndices((offset_x*Dim_x .+ 1:Dim_x, offset_y*Dim_y .+ 1:Dim_y)))) -end + function check(idx, offset, offset_x, offset_y, Dim_x, Dim_y) + N = Dim_x * Dim_y + all(p -> p[1] == p[2], zip(idx[(offset * N .+ 1):N], CartesianIndices(((offset_x * Dim_x .+ 1):Dim_x, (offset_y * Dim_y .+ 1):Dim_y)))) + end -@testset "linear_iteration" begin - Dim_x = 32 - Dim_y = 32 - let ndrange = NDRange{2, StaticSize{(4,4)}, StaticSize{(Dim_x,Dim_y)}}(); - idx = linear_iteration(ndrange) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + @testset "linear_iteration" begin + Dim_x = 32 + Dim_y = 32 + let ndrange = NDRange{2, StaticSize{(4, 4)}, StaticSize{(Dim_x, Dim_y)}}(); + idx = linear_iteration(ndrange) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end - end - let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4,4)), CartesianIndices((Dim_x, Dim_y))); - idx = linear_iteration(ndrange) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4)), CartesianIndices((Dim_x, Dim_y))); + idx = linear_iteration(ndrange) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end - end - Dim_x = 32 - Dim_y = 1 - let ndrange = NDRange{2, StaticSize{(4,4*32)}, StaticSize{(Dim_x,Dim_y)}}(); - idx = linear_iteration(ndrange) - N = length(workitems(ndrange)) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + Dim_x = 32 + Dim_y = 1 + let ndrange = NDRange{2, StaticSize{(4, 4 * 32)}, StaticSize{(Dim_x, Dim_y)}}(); + idx = linear_iteration(ndrange) + N = length(workitems(ndrange)) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end - end - let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4,4*32)), CartesianIndices((Dim_x, Dim_y))); - idx = linear_iteration(ndrange) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4 * 32)), CartesianIndices((Dim_x, Dim_y))); + idx = linear_iteration(ndrange) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end - end - Dim_x = 1 - Dim_y = 32 - let ndrange = NDRange{2, StaticSize{(4*32,4)}, StaticSize{(Dim_x,Dim_y)}}(); - idx = linear_iteration(ndrange) - N = length(workitems(ndrange)) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + Dim_x = 1 + Dim_y = 32 + let ndrange = NDRange{2, StaticSize{(4 * 32, 4)}, StaticSize{(Dim_x, Dim_y)}}(); + idx = linear_iteration(ndrange) + N = length(workitems(ndrange)) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end - end - let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4*32,4)), CartesianIndices((Dim_x, Dim_y))); - idx = linear_iteration(ndrange) - for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) - I = Tuple(I) - @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y) + let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4 * 32, 4)), CartesianIndices((Dim_x, Dim_y))); + idx = linear_iteration(ndrange) + for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange)) + I = Tuple(I) + @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y) + end end end end -end diff --git a/test/print_test.jl b/test/print_test.jl index 9d4b653b..79b52730 100644 --- a/test/print_test.jl +++ b/test/print_test.jl @@ -1,8 +1,8 @@ using KernelAbstractions, Test @kernel function kernel_print() - I = @index(Global) - @print("Hello from thread ", I, "!\n") + I = @index(Global) + @print("Hello from thread ", I, "!\n") end function printing_testsuite(backend) @@ -12,14 +12,14 @@ function printing_testsuite(backend) @testset "print test" begin kernel = kernel_print(backend(), 4) redirect_stdout(devnull) do - kernel(ndrange=(4,)) - synchronize(backend()) + kernel(ndrange = (4,)) + synchronize(backend()) end @test true redirect_stdout(devnull) do - @print("Why this should work\n") - synchronize(backend()) + @print("Why this should work\n") + synchronize(backend()) end @test true end diff --git a/test/private.jl b/test/private.jl index 8406316f..b8134042 100644 --- a/test/private.jl +++ b/test/private.jl @@ -11,7 +11,7 @@ end priv = @private eltype(A) (1,) I = @index(Global, Linear) @inbounds begin - B[I] = eltype(priv) === eltype(A) + B[I] = eltype(priv) === eltype(A) end end @@ -28,7 +28,7 @@ end end # This is horrible don't write code like this -@kernel function forloop(A, ::Val{N}) where N +@kernel function forloop(A, ::Val{N}) where {N} I = @index(Global, Linear) i = @index(Local, Linear) priv = @private Int (N,) @@ -50,20 +50,20 @@ end priv = @private eltype(A) (1,) @inbounds begin - priv[1] = zero(eltype(A)) - for k in 1:size(A, ndims(A)) - priv[1] += A[I..., k] - end - out[I...] = priv[1] + priv[1] = zero(eltype(A)) + for k in 1:size(A, ndims(A)) + priv[1] += A[I..., k] + end + out[I...] = priv[1] end end function private_testsuite(backend, ArrayT) @testset "kernels" begin - stmt_form(backend(), 16)(ndrange=16) + stmt_form(backend(), 16)(ndrange = 16) synchronize(backend()) A = ArrayT{Int}(undef, 64) - private(backend(), 16)(A, ndrange=size(A)) + private(backend(), 16)(A, ndrange = size(A)) synchronize(backend()) @test all(A[1:16] .== 16:-1:1) @test all(A[17:32] .== 16:-1:1) @@ -72,28 +72,30 @@ function private_testsuite(backend, ArrayT) A = ArrayT{Int}(undef, 64, 64) A .= 1 - forloop(backend())(A, Val(size(A, 2)), ndrange=size(A,1), workgroupsize=size(A,1)) + forloop(backend())(A, Val(size(A, 2)), ndrange = size(A, 1), workgroupsize = size(A, 1)) synchronize(backend()) @test all(A[:, 1] .== 64) @test all(A[:, 2:end] .== 1) B = ArrayT{Bool}(undef, size(A)...) - typetest(backend(), 16)(A, B, ndrange=size(A)) + typetest(backend(), 16)(A, B, ndrange = size(A)) synchronize(backend()) @test all(B) - A = ArrayT{Float32}(ones(64,3)); + A = ArrayT{Float32}(ones(64, 3)); out = ArrayT{Float32}(undef, 64) - reduce_private(backend(), 8)(out, A, ndrange=size(out)) + reduce_private(backend(), 8)(out, A, ndrange = size(out)) synchronize(backend()) - @test all(out .== 3f0) + @test all(out .== 3.0f0) end if backend == CPU @testset "codegen" begin IR = sprint() do io - KernelAbstractions.ka_code_llvm(io, reduce_private(backend(), (8,)), Tuple{ArrayT{Float64,1}, ArrayT{Float64,2}}, - optimize=true, ndrange=(64,)) + KernelAbstractions.ka_code_llvm( + io, reduce_private(backend(), (8,)), Tuple{ArrayT{Float64, 1}, ArrayT{Float64, 2}}, + optimize = true, ndrange = (64,), + ) end @test !occursin("gcframe", IR) end diff --git a/test/reflection.jl b/test/reflection.jl index f2d5bd27..5153c94b 100644 --- a/test/reflection.jl +++ b/test/reflection.jl @@ -19,16 +19,16 @@ function test_typed_kernel_dynamic(backend, backend_str, ArrayT) A = ArrayT(ones(Float32, 1024, 1024)) kernel = mul2(backend()) res = if backend == CPU - @ka_code_typed kernel(A, ndrange=size(A), workgroupsize=16) + @ka_code_typed kernel(A, ndrange = size(A), workgroupsize = 16) else - @ka_code_typed kernel(A, ndrange=size(A), workgroupsize=(32, 32)) + @ka_code_typed kernel(A, ndrange = size(A), workgroupsize = (32, 32)) end if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal" @test_broken isa(res, Pair{Core.CodeInfo, DataType}) else @test isa(res, Pair{Core.CodeInfo, DataType}) end - @test isa(res[1].code, Array{Any,1}) + @test isa(res[1].code, Array{Any, 1}) end function test_typed_kernel_dynamic_no_info(backend, backend_str, ArrayT) @@ -36,13 +36,13 @@ function test_typed_kernel_dynamic_no_info(backend, backend_str, ArrayT) B = similar(A) C = similar(A) kernel = add3(backend()) - res = @ka_code_typed kernel(A, B, C, ndrange=size(A)) + res = @ka_code_typed kernel(A, B, C, ndrange = size(A)) if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal" @test_broken isa(res, Pair{Core.CodeInfo, DataType}) else @test isa(res, Pair{Core.CodeInfo, DataType}) end - @test isa(res[1].code, Array{Any,1}) + @test isa(res[1].code, Array{Any, 1}) end function test_typed_kernel_static(backend, backend_str, ArrayT) @@ -52,13 +52,13 @@ function test_typed_kernel_static(backend, backend_str, ArrayT) else mul2(backend(), (32, 32)) end - res = @ka_code_typed kernel(A, ndrange=size(A)) + res = @ka_code_typed kernel(A, ndrange = size(A)) if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal" @test_broken isa(res, Pair{Core.CodeInfo, DataType}) else @test isa(res, Pair{Core.CodeInfo, DataType}) end - @test isa(res[1].code, Array{Any,1}) + @test isa(res[1].code, Array{Any, 1}) end function test_typed_kernel_no_optimize(backend, backend_str, ArrayT) @@ -68,8 +68,8 @@ function test_typed_kernel_no_optimize(backend, backend_str, ArrayT) else mul2(backend(), (32, 32)) end - res = @ka_code_typed optimize=false kernel(A, ndrange=size(A)) - res_opt = @ka_code_typed kernel(A, ndrange=size(A)) + res = @ka_code_typed optimize = false kernel(A, ndrange = size(A)) + res_opt = @ka_code_typed kernel(A, ndrange = size(A)) # FIXME: Need a better test # @test size(res[1].code) < size(res_opt[1].code) end @@ -82,13 +82,13 @@ function test_expr_kernel(backend, backend_str, ArrayT) else addi(backend(), (32, 32)) end - res = @ka_code_typed kernel(A, C, 1+2, ndrange=size(A)) + res = @ka_code_typed kernel(A, C, 1 + 2, ndrange = size(A)) if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal" @test_broken isa(res, Pair{Core.CodeInfo, DataType}) else @test isa(res, Pair{Core.CodeInfo, DataType}) end - @test isa(res[1].code, Array{Any,1}) + @test isa(res[1].code, Array{Any, 1}) end function reflection_testsuite(backend, backend_str, ArrayT) diff --git a/test/runtests.jl b/test/runtests.jl index d7cca6ca..bf9b445a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using Test include("testsuite.jl") @testset "CPU back-end" begin - struct CPUBackendArray{T,N,A} end # Fake and unused + struct CPUBackendArray{T, N, A} end # Fake and unused Testsuite.testsuite(CPU, "CPU", Base, Array, CPUBackendArray) end @@ -14,33 +14,33 @@ end end A = zeros(Int, Threads.nthreads()) -kern_static(CPU(static=true), (1,))(A, ndrange=length(A)) +kern_static(CPU(static = true), (1,))(A, ndrange = length(A)) @test A == 1:Threads.nthreads() -@kernel cpu=false function my_no_cpu_kernel(a) +@kernel cpu = false function my_no_cpu_kernel(a) end @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel(CPU()) # testing multiple configurations at the same time -@kernel cpu=false inbounds=false function my_no_cpu_kernel2(a) +@kernel cpu = false inbounds = false function my_no_cpu_kernel2(a) end @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel2(CPU()) if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 1 # testing bounds errors - @kernel inbounds=false function my_bounded_kernel(a) + @kernel inbounds = false function my_bounded_kernel(a) idx = @index(Global, Linear) a[idx] = 0 end - @test_throws BoundsError(Int64[],(1,)) my_bounded_kernel(CPU())(Int[], ndrange=1) + @test_throws BoundsError(Int64[], (1,)) my_bounded_kernel(CPU())(Int[], ndrange = 1) end if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 2 - @kernel inbounds=true function my_bounded_kernel(a) + @kernel inbounds = true function my_bounded_kernel(a) idx = @index(Global, Linear) a[idx] = 0 end - @test nothing == my_inbounds_kernel(CPU())(Int[], ndrange=1) + @test nothing == my_inbounds_kernel(CPU())(Int[], ndrange = 1) end struct NewBackend <: KernelAbstractions.GPU end diff --git a/test/specialfunctions.jl b/test/specialfunctions.jl index a6855d4f..26198646 100644 --- a/test/specialfunctions.jl +++ b/test/specialfunctions.jl @@ -20,37 +20,37 @@ end function specialfunctions_testsuite(Backend) backend = Backend() @testset "special functions: gamma" begin - x = Float32[1.0,2.0,3.0,5.5] + x = Float32[1.0, 2.0, 3.0, 5.5] cx = allocate(backend, Float32, length(x)) KernelAbstractions.copyto!(backend, cx, x) cy = similar(cx) - gamma_knl(backend)(cy, cx; ndrange=length(x)) + gamma_knl(backend)(cy, cx; ndrange = length(x)) synchronize(backend) @test Array(cy) ≈ SpecialFunctions.gamma.(x) end @testset "special functions: erf" begin - x = Float32[-1.0,-0.5,0.0,1e-3,1.0,2.0,5.5] + x = Float32[-1.0, -0.5, 0.0, 1.0e-3, 1.0, 2.0, 5.5] cx = allocate(backend, Float32, length(x)) KernelAbstractions.copyto!(backend, cx, x) cy = similar(cx) - erf_knl(backend)(cy, cx; ndrange=length(x)) + erf_knl(backend)(cy, cx; ndrange = length(x)) synchronize(backend) @test Array(cy) ≈ SpecialFunctions.erf.(x) end @testset "special functions: erfc" begin - x = Float32[-1.0,-0.5,0.0,1e-3,1.0,2.0,5.5] + x = Float32[-1.0, -0.5, 0.0, 1.0e-3, 1.0, 2.0, 5.5] cx = allocate(backend, Float32, length(x)) KernelAbstractions.copyto!(backend, cx, x) cy = similar(cx) - erfc_knl(backend)(cy, cx; ndrange=length(x)) + erfc_knl(backend)(cy, cx; ndrange = length(x)) synchronize(backend) @test Array(cy) ≈ SpecialFunctions.erfc.(x) end -end \ No newline at end of file +end diff --git a/test/test.jl b/test/test.jl index 9bf6f849..7269c297 100644 --- a/test/test.jl +++ b/test/test.jl @@ -8,288 +8,292 @@ using Adapt identity(x) = x function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; skip_tests = Set{String}()) -@conditional_testset "partition" skip_tests begin - backend = Backend() - let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, DynamicSize, typeof(identity)}(backend, identity) - iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing) - @test length(blocks(iterspace)) == 2 - @test dynamic isa NoDynamicCheck - - iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), nothing) - @test length(blocks(iterspace)) == 3 - @test dynamic isa DynamicCheck - - iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), (64,)) - @test length(blocks(iterspace)) == 3 - @test dynamic isa DynamicCheck - - @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), (65,)) - @test KernelAbstractions.backend(kernel) == backend - end - let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, StaticSize{(128,)}, typeof(identity)}(backend, identity) - iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing) - @test length(blocks(iterspace)) == 2 - @test dynamic isa NoDynamicCheck + @conditional_testset "partition" skip_tests begin + backend = Backend() + let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, DynamicSize, typeof(identity)}(backend, identity) + iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing) + @test length(blocks(iterspace)) == 2 + @test dynamic isa NoDynamicCheck + + iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), nothing) + @test length(blocks(iterspace)) == 3 + @test dynamic isa DynamicCheck + + iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), (64,)) + @test length(blocks(iterspace)) == 3 + @test dynamic isa DynamicCheck + + @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), (65,)) + @test KernelAbstractions.backend(kernel) == backend + end + let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, StaticSize{(128,)}, typeof(identity)}(backend, identity) + iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing) + @test length(blocks(iterspace)) == 2 + @test dynamic isa NoDynamicCheck - iterspace, dynamic = KernelAbstractions.partition(kernel, nothing, nothing) - @test length(blocks(iterspace)) == 2 - @test dynamic isa NoDynamicCheck + iterspace, dynamic = KernelAbstractions.partition(kernel, nothing, nothing) + @test length(blocks(iterspace)) == 2 + @test dynamic isa NoDynamicCheck - @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), nothing) - @test KernelAbstractions.backend(kernel) == backend + @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), nothing) + @test KernelAbstractions.backend(kernel) == backend + end end -end -@kernel function index_linear_global(A) - I = @index(Global, Linear) - A[I] = I -end -@kernel function index_linear_local(A) - I = @index(Global, Linear) - i = @index(Local, Linear) - A[I] = i -end -@kernel function index_linear_group(A) - I = @index(Global, Linear) - i = @index(Group, Linear) - A[I] = i -end -@kernel function index_cartesian_global(A) - I = @index(Global, Cartesian) - A[I] = I -end -@kernel function index_cartesian_local(A) - I = @index(Global, Cartesian) - i = @index(Local, Cartesian) - A[I] = i -end -@kernel function index_cartesian_group(A) - I = @index(Global, Cartesian) - i = @index(Group, Cartesian) - A[I] = i -end + @kernel function index_linear_global(A) + I = @index(Global, Linear) + A[I] = I + end + @kernel function index_linear_local(A) + I = @index(Global, Linear) + i = @index(Local, Linear) + A[I] = i + end + @kernel function index_linear_group(A) + I = @index(Global, Linear) + i = @index(Group, Linear) + A[I] = i + end + @kernel function index_cartesian_global(A) + I = @index(Global, Cartesian) + A[I] = I + end + @kernel function index_cartesian_local(A) + I = @index(Global, Cartesian) + i = @index(Local, Cartesian) + A[I] = i + end + @kernel function index_cartesian_group(A) + I = @index(Global, Cartesian) + i = @index(Group, Cartesian) + A[I] = i + end -@conditional_testset "get_backend" skip_tests begin - backend = Backend() - backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false} - @test backend isa backendT - - x = allocate(backend, Float32, 5) - A = allocate(backend, Float32, 5, 5) - @test @inferred(KernelAbstractions.get_backend(A)) isa backendT - @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT - @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT - @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT -end + @conditional_testset "get_backend" skip_tests begin + backend = Backend() + backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false} + @test backend isa backendT + + x = allocate(backend, Float32, 5) + A = allocate(backend, Float32, 5, 5) + @test @inferred(KernelAbstractions.get_backend(A)) isa backendT + @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT + @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT + @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT + end -@conditional_testset "sparse" skip_tests begin - backend = Backend() - backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false} - @test backend isa backendT + @conditional_testset "sparse" skip_tests begin + backend = Backend() + backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false} + @test backend isa backendT - A = allocate(backend, Float32, 5, 5) - @test @inferred(KernelAbstractions.get_backend(sparse(A))) isa backendT -end + A = allocate(backend, Float32, 5, 5) + @test @inferred(KernelAbstractions.get_backend(sparse(A))) isa backendT + end -@conditional_testset "adapt" skip_tests begin - backend = Backend() - x = allocate(backend, Float32, 5) - @test adapt(CPU(), x) isa Array - y = adapt(backend, Array{Float32}(undef, 5)) - @test typeof(y) == typeof(x) -end + @conditional_testset "adapt" skip_tests begin + backend = Backend() + x = allocate(backend, Float32, 5) + @test adapt(CPU(), x) isa Array + y = adapt(backend, Array{Float32}(undef, 5)) + @test typeof(y) == typeof(x) + end -# TODO: add test for _group and _local_cartesian -@conditional_testset "indextest" skip_tests begin - backend = Backend() - A = allocate(backend, Int, 16, 16) - index_linear_global(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A .== LinearIndices(A)) - - A = allocate(backend, Int, 8) - index_linear_local(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A .== 1:8) - - A = allocate(backend, Int, 16) - index_linear_local(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A[1:8] .== 1:8) - @test all(A[9:16] .== 1:8) - - A = allocate(backend, Int, 8, 2) - index_linear_local(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A[1:8] .== 1:8) - @test all(A[9:16] .== 1:8) - - A = allocate(backend, CartesianIndex{2}, 16, 16) - index_cartesian_global(backend, 8)(A, ndrange=size(A)) - synchronize(backend) - @test all(A .== CartesianIndices(A)) - - A = allocate(backend, CartesianIndex{1}, 16, 16) - index_cartesian_global(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A[:] .== CartesianIndices((length(A),))) - - # Non-multiplies of the workgroupsize - A = allocate(backend, Int, 7, 7) - index_linear_global(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A .== LinearIndices(A)) - - A = allocate(backend, Int, 5) - index_linear_local(backend, 8)(A, ndrange=length(A)) - synchronize(backend) - @test all(A .== 1:5) -end + # TODO: add test for _group and _local_cartesian + @conditional_testset "indextest" skip_tests begin + backend = Backend() + A = allocate(backend, Int, 16, 16) + index_linear_global(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A .== LinearIndices(A)) + + A = allocate(backend, Int, 8) + index_linear_local(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A .== 1:8) + + A = allocate(backend, Int, 16) + index_linear_local(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A[1:8] .== 1:8) + @test all(A[9:16] .== 1:8) + + A = allocate(backend, Int, 8, 2) + index_linear_local(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A[1:8] .== 1:8) + @test all(A[9:16] .== 1:8) + + A = allocate(backend, CartesianIndex{2}, 16, 16) + index_cartesian_global(backend, 8)(A, ndrange = size(A)) + synchronize(backend) + @test all(A .== CartesianIndices(A)) + + A = allocate(backend, CartesianIndex{1}, 16, 16) + index_cartesian_global(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A[:] .== CartesianIndices((length(A),))) + + # Non-multiplies of the workgroupsize + A = allocate(backend, Int, 7, 7) + index_linear_global(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A .== LinearIndices(A)) + + A = allocate(backend, Int, 5) + index_linear_local(backend, 8)(A, ndrange = length(A)) + synchronize(backend) + @test all(A .== 1:5) + end -@kernel function constarg(A, @Const(B)) - I = @index(Global) - @inbounds A[I] = B[I] -end + @kernel function constarg(A, @Const(B)) + I = @index(Global) + @inbounds A[I] = B[I] + end -@conditional_testset "Const" skip_tests begin - let kernel = constarg(Backend(), 8, (1024,)) - # this is poking at internals - iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}(); - ctx = if Backend == CPU - KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(NoDynamicCheck())) - else - KernelAbstractions.mkcontext(kernel, nothing, iterspace) - end - AT = if Backend == CPU - Array{Float32, 2} - else - BackendArrayT{Float32, 2, 1} # AS 1 - end - IR = sprint() do io + @conditional_testset "Const" skip_tests begin + let kernel = constarg(Backend(), 8, (1024,)) + # this is poking at internals + iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}(); + ctx = if Backend == CPU + KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(NoDynamicCheck())) + else + KernelAbstractions.mkcontext(kernel, nothing, iterspace) + end + AT = if Backend == CPU + Array{Float32, 2} + else + BackendArrayT{Float32, 2, 1} # AS 1 + end + IR = sprint() do io + if backend_str == "CPU" + code_llvm( + io, kernel.f, (typeof(ctx), AT, AT), + optimize = false, raw = true, + ) + else + backend_mod.code_llvm( + io, kernel.f, (typeof(ctx), AT, AT), + kernel = true, optimize = true, + ) + end + end if backend_str == "CPU" - code_llvm(io, kernel.f, (typeof(ctx), AT, AT), - optimize=false, raw=true) + @test occursin("!alias.scope", IR) + @test occursin("!noalias", IR) + elseif backend_str == "CUDA" + @test occursin("@llvm.nvvm.ldg", IR) + elseif backend_str == "ROCM" + @test occursin("addrspace(4)", IR) else - backend_mod.code_llvm(io, kernel.f, (typeof(ctx), AT, AT), - kernel=true, optimize=true) + @test_skip false end end - if backend_str == "CPU" - @test occursin("!alias.scope", IR) - @test occursin("!noalias", IR) - elseif backend_str == "CUDA" - @test occursin("@llvm.nvvm.ldg", IR) - elseif backend_str == "ROCM" - @test occursin("addrspace(4)", IR) - else - @test_skip false - end end -end -@kernel function kernel_val!(a, ::Val{m}) where {m} - I = @index(Global) - @inbounds a[I] = m -end + @kernel function kernel_val!(a, ::Val{m}) where {m} + I = @index(Global) + @inbounds a[I] = m + end -A = KernelAbstractions.zeros(Backend(), Int64, 1024) -kernel_val!(Backend())(A, Val(3), ndrange=size(A)) -synchronize(Backend()) -@test all((a)->a==3, A) + A = KernelAbstractions.zeros(Backend(), Int64, 1024) + kernel_val!(Backend())(A, Val(3), ndrange = size(A)) + synchronize(Backend()) + @test all((a) -> a == 3, A) -@kernel function kernel_empty() - nothing -end + @kernel function kernel_empty() + nothing + end -@conditional_testset "CPU synchronization" skip_tests begin - kernel_empty(CPU(), 1)(ndrange=1) - synchronize(CPU()) -end + @conditional_testset "CPU synchronization" skip_tests begin + kernel_empty(CPU(), 1)(ndrange = 1) + synchronize(CPU()) + end -@conditional_testset "Zero iteration space $Backend" skip_tests begin - kernel_empty(Backend(), 1)(ndrange=1) - kernel_empty(Backend(), 1)(ndrange=0) - synchronize(Backend()) + @conditional_testset "Zero iteration space $Backend" skip_tests begin + kernel_empty(Backend(), 1)(ndrange = 1) + kernel_empty(Backend(), 1)(ndrange = 0) + synchronize(Backend()) - kernel_empty(Backend(), 1)(ndrange=0) - synchronize(Backend()) -end + kernel_empty(Backend(), 1)(ndrange = 0) + synchronize(Backend()) + end -@conditional_testset "return statement" skip_tests begin - try - @eval @kernel function kernel_return() - return + @conditional_testset "return statement" skip_tests begin + try + @eval @kernel function kernel_return() + return + end + @test false + catch e + @test e.error == + ErrorException("Return statement not permitted in a kernel function kernel_return") end - @test false - catch e - @test e.error == - ErrorException("Return statement not permitted in a kernel function kernel_return") end -end -@conditional_testset "fallback test: callable types" skip_tests begin - @eval begin - function f end - @kernel function (a::typeof(f))(x, ::Val{m}) where m - I = @index(Global) - @inbounds x[I] = m - end - @kernel function (a::typeof(f))(x, ::Val{1}) - I = @index(Global) - @inbounds x[I] = 1 + @conditional_testset "fallback test: callable types" skip_tests begin + @eval begin + function f end + @kernel function (a::typeof(f))(x, ::Val{m}) where {m} + I = @index(Global) + @inbounds x[I] = m + end + @kernel function (a::typeof(f))(x, ::Val{1}) + I = @index(Global) + @inbounds x[I] = 1 + end + x = [1, 2, 3] + env = f(CPU())(x, Val(4); ndrange = length(x)) + synchronize(CPU()) + @test x == [4, 4, 4] + + x = [1, 2, 3] + env = f(CPU())(x, Val(1); ndrange = length(x)) + synchronize(CPU()) + @test x == [1, 1, 1] end - x = [1,2,3] - env = f(CPU())(x, Val(4); ndrange=length(x)) - synchronize(CPU()) - @test x == [4,4,4] - - x = [1,2,3] - env = f(CPU())(x, Val(1); ndrange=length(x)) - synchronize(CPU()) - @test x == [1,1,1] end -end -@conditional_testset "priority" skip_tests begin - KernelAbstractions.priority!(Backend(), :normal) - KernelAbstractions.priority!(Backend(), :high) - KernelAbstractions.priority!(Backend(), :low) + @conditional_testset "priority" skip_tests begin + KernelAbstractions.priority!(Backend(), :normal) + KernelAbstractions.priority!(Backend(), :high) + KernelAbstractions.priority!(Backend(), :low) - @test_throws ErrorException KernelAbstractions.priority!(Backend(), :default) -end + @test_throws ErrorException KernelAbstractions.priority!(Backend(), :default) + end -function f(KernelAbstractions.@context, a) - I = @index(Global, Linear) - a[I] = 1 -end -@kernel cpu=false function context_kernel(a) - f(KernelAbstractions.@context, a) -end + function f(KernelAbstractions.@context, a) + I = @index(Global, Linear) + a[I] = 1 + end + @kernel cpu = false function context_kernel(a) + f(KernelAbstractions.@context, a) + end -@testset "No CPU kernel" begin - if !(Backend() isa CPU) - A = KernelAbstractions.zeros(Backend(), Int64, 1024) - context_kernel(Backend())(A, ndrange=size(A)) - synchronize(Backend()) - @test all((a)->a==1, A) - else - @test_throws ErrorException("This kernel is unavailable for backend CPU") context_kernel(Backend()) + @testset "No CPU kernel" begin + if !(Backend() isa CPU) + A = KernelAbstractions.zeros(Backend(), Int64, 1024) + context_kernel(Backend())(A, ndrange = size(A)) + synchronize(Backend()) + @test all((a) -> a == 1, A) + else + @test_throws ErrorException("This kernel is unavailable for backend CPU") context_kernel(Backend()) + end end -end -@testset "functional" begin - @test KernelAbstractions.functional(Backend()) isa Union{Missing, Bool} -end + @testset "functional" begin + @test KernelAbstractions.functional(Backend()) isa Union{Missing, Bool} + end -@testset "CPU default workgroupsize" begin - @test KernelAbstractions.default_cpu_workgroupsize((64,)) == (64,) - @test KernelAbstractions.default_cpu_workgroupsize((1024,)) == (1024,) - @test KernelAbstractions.default_cpu_workgroupsize((2056,)) == (1024,) - @test KernelAbstractions.default_cpu_workgroupsize((64, 64,)) == (64,16) - @test KernelAbstractions.default_cpu_workgroupsize((64, 64, 64,4)) == (64,16,1,1) - @test KernelAbstractions.default_cpu_workgroupsize((64,15)) == (64,15) - @test KernelAbstractions.default_cpu_workgroupsize((5,7,13,17)) == (5,7,13,2) -end + @testset "CPU default workgroupsize" begin + @test KernelAbstractions.default_cpu_workgroupsize((64,)) == (64,) + @test KernelAbstractions.default_cpu_workgroupsize((1024,)) == (1024,) + @test KernelAbstractions.default_cpu_workgroupsize((2056,)) == (1024,) + @test KernelAbstractions.default_cpu_workgroupsize((64, 64)) == (64, 16) + @test KernelAbstractions.default_cpu_workgroupsize((64, 64, 64, 4)) == (64, 16, 1, 1) + @test KernelAbstractions.default_cpu_workgroupsize((64, 15)) == (64, 15) + @test KernelAbstractions.default_cpu_workgroupsize((5, 7, 13, 17)) == (5, 7, 13, 2) + end end diff --git a/test/testsuite.jl b/test/testsuite.jl index cd78e76d..242d2e85 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -4,19 +4,24 @@ using ..KernelAbstractions using ..Test # We can't add test-dependencies withouth breaking backend packages -const Pkg = Base.require(Base.PkgId( - Base.UUID("44cfe95a-1eb2-52ea-b672-e2afdf69b78f"), "Pkg")) +const Pkg = Base.require( + Base.PkgId( + Base.UUID("44cfe95a-1eb2-52ea-b672-e2afdf69b78f"), "Pkg", + ), +) macro conditional_testset(name, skip_tests, expr) - esc(quote - @testset $name begin - if $name ∉ $skip_tests - $expr - else - @test_skip false + esc( + quote + @testset $name begin + if $name ∉ $skip_tests + $expr + else + @test_skip false + end end - end - end) + end, + ) end diff --git a/test/unroll.jl b/test/unroll.jl index 6b33fa76..7bbbab4d 100644 --- a/test/unroll.jl +++ b/test/unroll.jl @@ -3,37 +3,37 @@ using KernelAbstractions.Extras using StaticArrays @kernel function kernel_unroll!(a) - @unroll for i in 1:5 - @inbounds a[i] = i - end + @unroll for i in 1:5 + @inbounds a[i] = i + end end -@kernel function kernel_unroll!(a, ::Val{N}) where N - let M = N+5 - @unroll for i in 6:M - @inbounds a[i-5] = i +@kernel function kernel_unroll!(a, ::Val{N}) where {N} + let M = N + 5 + @unroll for i in 6:M + @inbounds a[i - 5] = i + end + @synchronize end - @synchronize - end end # Check that nested `@unroll` doesn't throw a syntax error @kernel function kernel_unroll2!(A) - @uniform begin - a = MVector{3, Float32}(1, 2, 3) - b = MVector{3, Float32}(3, 2, 1) - c = MMatrix{3, 3, Float32}(undef) - end - I = @index(Global) - @inbounds for m in 1:3 - @unroll for j = 1:3 - @unroll for i = 1:3 - c[1, j] = m * a[1] * b[j] - end + @uniform begin + a = MVector{3, Float32}(1, 2, 3) + b = MVector{3, Float32}(3, 2, 1) + c = MMatrix{3, 3, Float32}(undef) + end + I = @index(Global) + @inbounds for m in 1:3 + @unroll for j in 1:3 + @unroll for i in 1:3 + c[1, j] = m * a[1] * b[j] + end + end + A[I] = c[1, 1] + @synchronize(m % 2 == 0) end - A[I] = c[1, 1] - @synchronize(m % 2 == 0) - end end function unroll_testsuite(backend, ArrayT)