diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..43cfba3
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,164 @@
+steps:
+
+  # CUDA
+  - label: "CUDA - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "CUDA - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # AMDGPU
+  - label: "AMDGPU - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "AMDGPU - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # oneAPI
+  - label: "oneAPI - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "oneAPI - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # Metal
+  - label: "Metal - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "Metal - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
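Every Buildkite step above follows the same pattern: install the backend package, `Pkg.develop` the local checkout, then run the test suite with a backend-selecting flag. The same run can be reproduced locally; a minimal sketch (shown for CUDA, with the other backends differing only in package name and flag):

```julia
import Pkg

Pkg.add("CUDA")          # backend under test: CUDA / AMDGPU / oneAPI / Metal
Pkg.develop(path=".")    # assumes the current directory is the AcceleratedKernels checkout
Pkg.test("AcceleratedKernels", test_args=["--CUDA"])
```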
test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/Project.toml b/Project.toml index 3e8a382..2962cf7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.3.0-DEV" +version = "0.2.1" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" @@ -14,12 +14,12 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8" [compat] -ArgCheck = "2.1" +ArgCheck = "2" DocStringExtensions = "0.9" -GPUArraysCore = "0.2" +GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" -Markdown = "1.10" -OhMyThreads = "0.7.0" +Markdown = "1" +OhMyThreads = "0.7" Polyester = "0.7" -Unrolled = "0.1.5" +Unrolled = "0.1" julia = "1.10" diff --git a/README.md b/README.md index 572c7e9..fcaa327 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,170 @@ [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/stable/) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/dev/) -[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml) [![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation). +Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation) from a unified KernelAbstractions.jl codebase. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <th>AK Backend</th>
+    <th>Julia Version</th>
+    <th>CI Status</th>
+  </tr>
+  <tr>
+    <td>
+
+CPU Single- and Multi-Threaded
+
+    </td>
+    <td>
+
+Julia LTS, Stable, Pre-Release
+
+x86 and x64
+
+Windows, Ubuntu, MacOS
+
+    </td>
+    <td>
+
+[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[CUDA](https://github.com/JuliaGPU/CUDA.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[AMDGPU](https://github.com/JuliaGPU/AMDGPU.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[oneAPI](https://github.com/JuliaGPU/oneAPI.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[Metal](https://github.com/JuliaGPU/Metal.jl)
+
+[Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+</table>
+
 - [1. What's Different?](#1-whats-different)
@@ -43,11 +201,11 @@ Again, this is only possible because of the unique Julia compilation model, the
 
 ## 2. Status
 
-The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely - additionally exposing all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
+The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely and additionally exposes all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
 
-We have an extensive test suite; however, I only ran them locally on the oneAPI (laptop Intel UHD Graphics 620), CUDA (laptop with Nvidia Quadro RTX 4000 and data centre Nvidia A100-40), Metal (Mac M2 and M3), and AMD (data centre AMD MI210) backends. Some kinks might still exist for some platform / OS permutations before a CI is set up.
+We have an extensive randomised test suite that we run on the CPU backend (single- and multi-threaded) on Windows, Ubuntu and MacOS for Julia LTS, Stable and Pre-Release, plus on the CUDA, AMDGPU, oneAPI and Metal backends on the [JuliaGPU buildkite](https://github.com/JuliaGPU/buildkite).
 
-AcceleratedKernels.jl will also be a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
+AcceleratedKernels.jl is also a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
 
 ## 3. Benchmarks
 
@@ -560,8 +718,6 @@ Leave out to test the CPU backend:
 $> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")
 ```
 
-**TODO**: talk with the JuliaGPU team to add library to their [BuildKite agents](https://github.com/JuliaGPU/buildkite) CI.
-
 ## 8. Issues and Debugging
 
 As the compilation pipeline of GPU kernels is different to that of base Julia, error messages also look different - for example, where Julia would insert an exception when a variable name was not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled and instead you'll see some cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc.
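`Pkg.test` forwards `test_args` to the test script as `ARGS`, which is how the `--CUDA` / `--AMDGPU` / `--oneAPI` / `--Metal` flags from the pipeline select a backend. A hypothetical sketch of such gating follows; the actual logic in `test/runtests.jl` may differ:

```julia
# Illustrative only: pick the KernelAbstractions backend from the test flags
if "--CUDA" in ARGS
    using CUDA
    const backend = CUDA.CUDABackend()
elseif "--Metal" in ARGS
    using Metal
    const backend = Metal.MetalBackend()
else
    using KernelAbstractions
    const backend = KernelAbstractions.CPU()
end
```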
diff --git a/prototype/Project.toml b/prototype/Project.toml
index 88d95f6..0463a0b 100644
--- a/prototype/Project.toml
+++ b/prototype/Project.toml
@@ -4,4 +4,5 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
diff --git a/prototype/accumulate_benchmark.jl b/prototype/accumulate_benchmark.jl
new file mode 100644
index 0000000..6b9191a
--- /dev/null
+++ b/prototype/accumulate_benchmark.jl
@@ -0,0 +1,33 @@
+using BenchmarkTools
+using Metal
+import AcceleratedKernels as AK
+
+using Random
+Random.seed!(0)
+
+
+function akacc(v)
+    va = AK.accumulate(+, v, init=zero(eltype(v)), block_size=512)
+    Metal.synchronize()
+    va
+end
+
+
+function baseacc(v)
+    va = accumulate(+, v, init=zero(eltype(v)))
+    Metal.synchronize()
+    va
+end
+
+
+v = MtlArray(rand(1:100, 1_000_000))
+
+# Correctness checks
+va = akacc(v) |> Array
+vb = baseacc(v) |> Array
+# @assert va == vb
+
+# Benchmarks
+println("Base vs AK")
+display(@benchmark baseacc($v))
+display(@benchmark akacc($v))
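The new benchmark can be run from the repository root; a sketch, assuming AcceleratedKernels itself is `Pkg.develop`ed into the `prototype` environment (it is not listed in `prototype/Project.toml`):

```julia
import Pkg
Pkg.activate("prototype")    # the environment that now includes Metal
Pkg.develop(path=".")        # make AcceleratedKernels importable from here
include("prototype/accumulate_benchmark.jl")
```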
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 8b3a27e..6e59e30 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -143,7 +143,6 @@ end
     len = length(v)
     block_size = @groupsize()[1]
-    temp = @localmem eltype(v) (2 * block_size,)
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -155,13 +154,6 @@
     ithread = @index(Local, Linear) - 1
     block_offset = iblock * block_size * 2    # Processing two elements per thread
 
-    # Copy two elements from the main array
-    ai = ithread
-    bi = ithread + block_size
-
-    temp[ai + 1] = block_offset + ai < len ? v[block_offset + ai + 1] : init
-    temp[bi + 1] = block_offset + bi < len ? v[block_offset + bi + 1] : init
-
     # Each block looks back to find running prefix sum
     running_prefix = init
     inspected_block = iblock - 1
@@ -180,14 +172,14 @@ end
         end
     end
 
     # Now we have aggregate prefix of all previous blocks, add it to all our elements
-    temp[ai + 1] = op(running_prefix, temp[ai + 1])
-    temp[bi + 1] = op(running_prefix, temp[bi + 1])
-
+    ai = ithread
     if block_offset + ai < len
-        v[block_offset + ai + 1] = temp[ai + 1]
+        v[block_offset + ai + 1] = op(running_prefix, v[block_offset + ai + 1])
     end
+
+    bi = ithread + block_size
     if block_offset + bi < len
-        v[block_offset + bi + 1] = temp[bi + 1]
+        v[block_offset + bi + 1] = op(running_prefix, v[block_offset + bi + 1])
     end
 
     # Set flag for "aggregate of all prefixes up to this block finished"
@@ -215,6 +207,11 @@ function accumulate!(
     @argcheck block_size > 0
     @argcheck ispow2(block_size)
 
+    # Nothing to accumulate
+    if length(v) == 0
+        return v
+    end
+
     # Each thread will process two elements
     elems_per_block = block_size * 2
     num_blocks = (length(v) + elems_per_block - 1) ÷ elems_per_block
@@ -246,7 +243,7 @@ function accumulate!(
         ndrange=(num_blocks - 1) * block_size)
     end
 
-    nothing
+    return v
 end
@@ -275,8 +272,8 @@ function accumulate!(
         for i in eachindex(v)
            v[i], running = running, op(running, v[i])
         end
-    end
+    end
+
+    return v
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index e3ba362..a530935 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -911,10 +911,10 @@ end
         mbase = minbox_base(vh, dims)
 
         @test eltype(mgpu) === eltype(mcpu) === eltype(mbase)
-        for (i, mgpu_red) in enumerate(Array(mgpu))
-            @test mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]
-            @test mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2]
-        end
+        @test all([
+            (mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2])
+            for (i, mgpu_red) in enumerate(Array(mgpu))
+        ])
     end
 end
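The `accumulate!` changes above are user-visible in two ways: zero-length inputs now return immediately, and the function returns the mutated array rather than `nothing`. A minimal usage sketch on plain CPU arrays, assuming the same `(op, v; init)` calling convention that `AK.accumulate` uses in the benchmark above:

```julia
import AcceleratedKernels as AK

v = Float32[1, 2, 3, 4]
r = AK.accumulate!(+, v, init=0.0f0)   # in-place prefix scan over v
@assert r === v                        # the mutated array is returned, not `nothing`

empty = Float32[]
@assert AK.accumulate!(+, empty, init=0.0f0) === empty   # zero-length early return
```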