From 6fb3b3c7d8ac70fe70c83877d9bb577573ef7291 Mon Sep 17 00:00:00 2001 From: Julian Samaroo Date: Mon, 11 Nov 2024 10:38:09 -0600 Subject: [PATCH 01/10] Add Buildkite CI for CUDA --- .buildkite/pipeline.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .buildkite/pipeline.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 0000000..edbef1f --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,26 @@ +steps: + - label: "CUDA - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "CUDA - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 From e7ea9a4cd274a1758786314b6926f5c336517976 Mon Sep 17 00:00:00 2001 From: anicusan Date: Mon, 11 Nov 2024 17:12:58 +0000 Subject: [PATCH 02/10] added separate GPU CIs for each backend --- .buildkite/CI-AMDGPU.yml | 40 ++++++++++++++++++++++++++++++++++++++++ .buildkite/CI-CUDA.yml | 38 ++++++++++++++++++++++++++++++++++++++ .buildkite/CI-Metal.yml | 40 ++++++++++++++++++++++++++++++++++++++++ .buildkite/CI-oneAPI.yml | 38 ++++++++++++++++++++++++++++++++++++++ .buildkite/pipeline.yml | 26 -------------------------- 5 files changed, 156 insertions(+), 26 deletions(-) create mode 100644 .buildkite/CI-AMDGPU.yml create mode 100644 .buildkite/CI-CUDA.yml create mode 100644 .buildkite/CI-Metal.yml create mode 100644 .buildkite/CI-oneAPI.yml delete mode 100644 .buildkite/pipeline.yml diff --git a/.buildkite/CI-AMDGPU.yml b/.buildkite/CI-AMDGPU.yml new file mode 100644 index 0000000..8bddcaf --- /dev/null +++ b/.buildkite/CI-AMDGPU.yml @@ -0,0 +1,40 @@ +steps: + - 
label: "AMDGPU - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("AMDGPU") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "AMDGPU - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("AMDGPU") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/.buildkite/CI-CUDA.yml b/.buildkite/CI-CUDA.yml new file mode 100644 index 0000000..dd6b785 --- /dev/null +++ b/.buildkite/CI-CUDA.yml @@ -0,0 +1,38 @@ +steps: + - label: "CUDA - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("CUDA") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--CUDA"])' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "CUDA - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("CUDA") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--CUDA"])' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/.buildkite/CI-Metal.yml b/.buildkite/CI-Metal.yml new file mode 100644 index 0000000..d48567c --- 
/dev/null +++ b/.buildkite/CI-Metal.yml @@ -0,0 +1,40 @@ +steps: + - label: "Metal - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("Metal") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "Metal - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("Metal") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/.buildkite/CI-oneAPI.yml b/.buildkite/CI-oneAPI.yml new file mode 100644 index 0000000..6d64bfa --- /dev/null +++ b/.buildkite/CI-oneAPI.yml @@ -0,0 +1,38 @@ +steps: + - label: "oneAPI - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("oneAPI") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + agents: + queue: "juliagpu" + intel: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "oneAPI - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("oneAPI") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + agents: + queue: "juliagpu" + intel: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git 
a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml deleted file mode 100644 index edbef1f..0000000 --- a/.buildkite/pipeline.yml +++ /dev/null @@ -1,26 +0,0 @@ -steps: - - label: "CUDA - Julia v1.10" - plugins: - - JuliaCI/julia#v1: - version: "1.10" - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - - - label: "CUDA - Julia v1.11" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: ~ - - JuliaCI/julia-coverage#v1: - codecov: true - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 From 0e86e1dd0d0f4b00ec5c7b8f3738845fb0dff73f Mon Sep 17 00:00:00 2001 From: anicusan Date: Mon, 11 Nov 2024 17:17:33 +0000 Subject: [PATCH 03/10] triggering from pipeline.yml --- .buildkite/pipeline.yml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .buildkite/pipeline.yml diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 0000000..34939b1 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,5 @@ +steps: + - trigger: ci-cuda + - trigger: ci-amdgpu + - trigger: ci-oneapi + - trigger: ci-metal From 56b689745d8c76b9a335afce2d3f08206ad9f5ed Mon Sep 17 00:00:00 2001 From: anicusan Date: Mon, 11 Nov 2024 17:21:10 +0000 Subject: [PATCH 04/10] you know what, let's keep it simple... 
--- .buildkite/CI-AMDGPU.yml | 40 ---------- .buildkite/CI-CUDA.yml | 38 --------- .buildkite/CI-Metal.yml | 40 ---------- .buildkite/CI-oneAPI.yml | 38 --------- .buildkite/pipeline.yml | 167 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 163 insertions(+), 160 deletions(-) delete mode 100644 .buildkite/CI-AMDGPU.yml delete mode 100644 .buildkite/CI-CUDA.yml delete mode 100644 .buildkite/CI-Metal.yml delete mode 100644 .buildkite/CI-oneAPI.yml diff --git a/.buildkite/CI-AMDGPU.yml b/.buildkite/CI-AMDGPU.yml deleted file mode 100644 index 8bddcaf..0000000 --- a/.buildkite/CI-AMDGPU.yml +++ /dev/null @@ -1,40 +0,0 @@ -steps: - - label: "AMDGPU - Julia v1.10" - plugins: - - JuliaCI/julia#v1: - version: "1.10" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("AMDGPU") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' - agents: - queue: "juliagpu" - rocm: "*" - rocmgpu: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - - - label: "AMDGPU - Julia v1.11" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("AMDGPU") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' - agents: - queue: "juliagpu" - rocm: "*" - rocmgpu: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 diff --git a/.buildkite/CI-CUDA.yml b/.buildkite/CI-CUDA.yml deleted file mode 100644 index dd6b785..0000000 --- a/.buildkite/CI-CUDA.yml +++ /dev/null @@ -1,38 +0,0 @@ -steps: - - label: "CUDA - Julia v1.10" - plugins: - - JuliaCI/julia#v1: - version: "1.10" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("CUDA") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", 
test_args=["--CUDA"])' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - - - label: "CUDA - Julia v1.11" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("CUDA") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--CUDA"])' - agents: - queue: "juliagpu" - cuda: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 diff --git a/.buildkite/CI-Metal.yml b/.buildkite/CI-Metal.yml deleted file mode 100644 index d48567c..0000000 --- a/.buildkite/CI-Metal.yml +++ /dev/null @@ -1,40 +0,0 @@ -steps: - - label: "Metal - Julia v1.10" - plugins: - - JuliaCI/julia#v1: - version: "1.10" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("Metal") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--Metal"])' - agents: - queue: "juliaecosystem" - os: "macos" - arch: "aarch64" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - - - label: "Metal - Julia v1.11" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("Metal") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--Metal"])' - agents: - queue: "juliaecosystem" - os: "macos" - arch: "aarch64" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 diff --git a/.buildkite/CI-oneAPI.yml b/.buildkite/CI-oneAPI.yml deleted file mode 100644 index 6d64bfa..0000000 --- a/.buildkite/CI-oneAPI.yml +++ /dev/null @@ -1,38 +0,0 @@ -steps: - - label: "oneAPI - Julia v1.10" - plugins: - - JuliaCI/julia#v1: - version: "1.10" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("oneAPI") 
- Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' - agents: - queue: "juliagpu" - intel: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 - - - label: "oneAPI - Julia v1.11" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | - julia -e 'using Pkg - - println("--- :julia: Instantiating environment") - Pkg.add("oneAPI") - Pkg.develop(path=".") - - println("+++ :julia: Running tests") - Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' - agents: - queue: "juliagpu" - intel: "*" - if: build.message !~ /\[skip tests\]/ - timeout_in_minutes: 60 diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 34939b1..43cfba3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,5 +1,164 @@ steps: - - trigger: ci-cuda - - trigger: ci-amdgpu - - trigger: ci-oneapi - - trigger: ci-metal + + # CUDA + - label: "CUDA - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("CUDA") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--CUDA"])' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "CUDA - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("CUDA") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--CUDA"])' + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + + # AMDGPU + - label: "AMDGPU - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("AMDGPU") + 
Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "AMDGPU - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("AMDGPU") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])' + agents: + queue: "juliagpu" + rocm: "*" + rocmgpu: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + + # oneAPI + - label: "oneAPI - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("oneAPI") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + agents: + queue: "juliagpu" + intel: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + - label: "oneAPI - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("oneAPI") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])' + agents: + queue: "juliagpu" + intel: "*" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 + + + # Metal + - label: "Metal - Julia v1.10" + plugins: + - JuliaCI/julia#v1: + version: "1.10" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("Metal") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + 
timeout_in_minutes: 60 + + - label: "Metal - Julia v1.11" + plugins: + - JuliaCI/julia#v1: + version: "1.11" + command: | + julia -e 'using Pkg + + println("--- :julia: Instantiating environment") + Pkg.add("Metal") + Pkg.develop(path=".") + + println("+++ :julia: Running tests") + Pkg.test("AcceleratedKernels", test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 From 2a04e41ea39daf7b73fbb283bd1874e6adf4152c Mon Sep 17 00:00:00 2001 From: anicusan Date: Mon, 11 Nov 2024 17:27:49 +0000 Subject: [PATCH 05/10] Waiting on the rest of the ecosystem to update to GPUArraysCore 0.2 - until then, adding 0.1 in too --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 3e8a382..c45b5ff 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8" [compat] ArgCheck = "2.1" DocStringExtensions = "0.9" -GPUArraysCore = "0.2" +GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" Markdown = "1.10" OhMyThreads = "0.7.0" From fe709eb50575fca0597066d7d6790df948bf6093 Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 12 Nov 2024 01:48:02 +0000 Subject: [PATCH 06/10] Trying a dummy read for Metal to see if it creates a data dependency so flags are written after v in accumulate --- prototype/Project.toml | 1 + src/accumulate.jl | 10 ++++++++-- test/runtests.jl | 8 ++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/prototype/Project.toml b/prototype/Project.toml index 88d95f6..0463a0b 100644 --- a/prototype/Project.toml +++ b/prototype/Project.toml @@ -4,4 +4,5 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" PProf = 
"e4faabce-9ead-11e9-39d9-4379958e3056" diff --git a/src/accumulate.jl b/src/accumulate.jl index 8b3a27e..2bc4cbb 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -195,6 +195,7 @@ end # there should be better memory fences to guarantee ordering without # thread synchronization... if ithread == 0 + dummy = v[1] flags[iblock + 1] = ACC_FLAG_A end end @@ -215,6 +216,11 @@ function accumulate!( @argcheck block_size > 0 @argcheck ispow2(block_size) + # Nothing to accumulate + if length(v) == 0 + return v + end + # Each thread will process two elements elems_per_block = block_size * 2 num_blocks = (length(v) + elems_per_block - 1) ÷ elems_per_block @@ -246,7 +252,7 @@ function accumulate!( ndrange=(num_blocks - 1) * block_size) end - nothing + return v end @@ -275,8 +281,8 @@ function accumulate!( for i in eachindex(v) v[i], running = running, op(running, v[i]) end - end + return v end diff --git a/test/runtests.jl b/test/runtests.jl index e3ba362..a530935 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -911,10 +911,10 @@ end mbase = minbox_base(vh, dims) @test eltype(mgpu) === eltype(mcpu) === eltype(mbase) - for (i, mgpu_red) in enumerate(Array(mgpu)) - @test mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1] - @test mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2] - end + @test all([ + (mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2]) + for (i, mgpu_red) in enumerate(Array(mgpu)) + ]) end end From 42cfc550f49937b51f2f632c9b7911a080f9dbba Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 12 Nov 2024 02:09:03 +0000 Subject: [PATCH 07/10] Trying to enforce ordering by a comparison in v. 
Left AK at 0.2.1 to allow compatibility with AMDGPU which now depends on it --- Project.toml | 10 +++++----- src/accumulate.jl | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index c45b5ff..f480d9b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.3.0-DEV" +version = "0.2.1-DEV" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" @@ -14,12 +14,12 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8" [compat] -ArgCheck = "2.1" +ArgCheck = "2" DocStringExtensions = "0.9" GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" -Markdown = "1.10" -OhMyThreads = "0.7.0" +Markdown = "1" +OhMyThreads = "0.7" Polyester = "0.7" -Unrolled = "0.1.5" +Unrolled = "0.1" julia = "1.10" diff --git a/src/accumulate.jl b/src/accumulate.jl index 2bc4cbb..fb8dc46 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -194,8 +194,7 @@ end @synchronize() # This is needed so that the flag is not set before copying into v, but # there should be better memory fences to guarantee ordering without # thread synchronization... - if ithread == 0 - dummy = v[1] + if ithread == 0 && v[1] != typemax(eltype(v)) # This is a hack to enforce ordering of flags AFTER v is written flags[iblock + 1] = ACC_FLAG_A end end From e592e9832415066d314a996a73d1a50a2671fb14 Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 12 Nov 2024 18:23:57 +0000 Subject: [PATCH 08/10] Added accumulate benchmark. Known issue for Metal with accumulate. Added CI badges. 
--- README.md | 156 +++++++++++++++++++++++++++++- prototype/accumulate_benchmark.jl | 33 +++++++ src/accumulate.jl | 20 ++-- 3 files changed, 194 insertions(+), 15 deletions(-) create mode 100644 prototype/accumulate_benchmark.jl diff --git a/README.md b/README.md index 572c7e9..c5f3d69 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,165 @@ [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/stable/) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/dev/) -[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml) [![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
AK BackendJulia VersionCI Status
+ +CPU Single- and Multi-Threaded + + + +Julia LTS, Stable, Pre-Release + + + +[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml) + +
+ +CUDA + + + +Julia v1.10 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +Julia v1.11 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +AMDGPU + + + +Julia v1.10 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +Julia v1.11 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +oneAPI + + + +Julia v1.10 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +Julia v1.11 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +Metal + +[Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10) + + + +Julia v1.10 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ +Julia v1.11 + + + +[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl) + +
+ Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation). diff --git a/prototype/accumulate_benchmark.jl b/prototype/accumulate_benchmark.jl new file mode 100644 index 0000000..6b9191a --- /dev/null +++ b/prototype/accumulate_benchmark.jl @@ -0,0 +1,33 @@ +using BenchmarkTools +using Metal +import AcceleratedKernels as AK + +using Random +Random.seed!(0) + + +function akacc(v) + va = AK.accumulate(+, v, init=zero(eltype(v)), block_size=512) + Metal.synchronize() + va +end + + +function baseacc(v) + va = accumulate(+, v, init=zero(eltype(v))) + Metal.synchronize() + va +end + + +v = MtlArray(rand(1:100, 1_000_000)) + +# Correctness checks +va = akacc(v) |> Array +vb = baseacc(v) |> Array +# @assert va == vb + +# Benchmarks +println("Base vs AK") +display(@benchmark baseacc($v)) +display(@benchmark akacc($v)) diff --git a/src/accumulate.jl b/src/accumulate.jl index fb8dc46..6e59e30 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -143,7 +143,6 @@ end len = length(v) block_size = @groupsize()[1] - temp = @localmem eltype(v) (2 * block_size,) # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -155,13 +154,6 @@ end ithread = @index(Local, Linear) - 1 block_offset = iblock * block_size * 2 # Processing two elements per thread - # Copy two elements from the main array - ai = ithread - bi = ithread + block_size - - temp[ai + 1] = block_offset + ai < len ? v[block_offset + ai + 1] : init - temp[bi + 1] = block_offset + bi < len ? 
v[block_offset + bi + 1] : init - # Each block looks back to find running prefix sum running_prefix = init inspected_block = iblock - 1 @@ -180,21 +172,21 @@ end end # Now we have aggregate prefix of all previous blocks, add it to all our elements - temp[ai + 1] = op(running_prefix, temp[ai + 1]) - temp[bi + 1] = op(running_prefix, temp[bi + 1]) - + ai = ithread if block_offset + ai < len - v[block_offset + ai + 1] = temp[ai + 1] + v[block_offset + ai + 1] = op(running_prefix, v[block_offset + ai + 1]) end + + bi = ithread + block_size if block_offset + bi < len - v[block_offset + bi + 1] = temp[bi + 1] + v[block_offset + bi + 1] = op(running_prefix, v[block_offset + bi + 1]) end # Set flag for "aggregate of all prefixes up to this block finished" @synchronize() # This is needed so that the flag is not set before copying into v, but # there should be better memory fences to guarantee ordering without # thread synchronization... - if ithread == 0 && v[1] != typemax(eltype(v)) # This is a hack to enforce ordering of flags AFTER v is written + if ithread == 0 flags[iblock + 1] = ACC_FLAG_A end end From 925b17e7ff6f528c1e7fbc2432af88bde811433b Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 12 Nov 2024 18:34:40 +0000 Subject: [PATCH 09/10] Updated README on CI --- README.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index c5f3d69..fcaa327 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) +Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation) from a unified KernelAbstractions.jl codebase. 
+ @@ -27,6 +29,10 @@ CPU Single- and Multi-Threaded Julia LTS, Stable, Pre-Release +x86 and x64 + +Windows, Ubuntu, MacOS +
@@ -38,7 +44,7 @@ Julia LTS, Stable, Pre-Release
-CUDA +[CUDA](https://github.com/JuliaGPU/CUDA.jl) @@ -69,7 +75,7 @@ Julia v1.11
-AMDGPU +[AMDGPU](https://github.com/JuliaGPU/AMDGPU.jl) @@ -100,7 +106,7 @@ Julia v1.11
-oneAPI +[oneAPI](https://github.com/JuliaGPU/oneAPI.jl) @@ -131,7 +137,7 @@ Julia v1.11
-Metal +[Metal](https://github.com/JuliaGPU/Metal.jl) [Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10) @@ -163,8 +169,6 @@ Julia v1.11
-Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation). - - [1. What's Different?](#1-whats-different) - [2. Status](#2-status) @@ -197,11 +201,11 @@ Again, this is only possible because of the unique Julia compilation model, the ## 2. Status -The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely - additionally exposing all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues). +The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely - and additionally exposes all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues). -We have an extensive test suite; however, I only ran them locally on the oneAPI (laptop Intel UHD Graphics 620), CUDA (laptop with Nvidia Quadro RTX 4000 and data centre Nvidia A100-40), Metal (Mac M2 and M3), and AMD (data centre AMD MI210) backends. Some kinks might still exist for some platform / OS permutations before a CI is set up. 
+We have an extensive randomised test suite that we run on the CPU (single- and multi-threaded) backend on Windows, Ubuntu and MacOS for Julia LTS, Stable, and Pre-Release, plus the CUDA, AMDGPU, oneAPI and Metal backends on the [JuliaGPU buildkite](https://github.com/JuliaGPU/buildkite). -AcceleratedKernels.jl will also be a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us. +AcceleratedKernels.jl is also a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us. ## 3. Benchmarks @@ -714,8 +718,6 @@ Leave out to test the CPU backend: $> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl") ``` -**TODO**: talk with the JuliaGPU team to add library to their [BuildKite agents](https://github.com/JuliaGPU/buildkite) CI. - ## 8. Issues and Debugging As the compilation pipeline of GPU kernels is different to that of base Julia, error messages also look different - for example, where Julia would insert an exception when a variable name was not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled and instead you'll see some cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc. 
From b5a0f771c3c0dc6b9d39da9835bc9d7e16ee6d1f Mon Sep 17 00:00:00 2001 From: anicusan Date: Tue, 12 Nov 2024 18:35:10 +0000 Subject: [PATCH 10/10] bumped version preparing for release --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f480d9b..2962cf7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.2.1-DEV" +version = "0.2.1" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"