From 6fb3b3c7d8ac70fe70c83877d9bb577573ef7291 Mon Sep 17 00:00:00 2001
From: Julian Samaroo <jpsamaroo@gmail.com>
Date: Mon, 11 Nov 2024 10:38:09 -0600
Subject: [PATCH 01/10] Add Buildkite CI for CUDA

---
 .buildkite/pipeline.yml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 .buildkite/pipeline.yml

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..edbef1f
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,26 @@
+steps:
+  - label: "CUDA - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "CUDA - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60

From e7ea9a4cd274a1758786314b6926f5c336517976 Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Mon, 11 Nov 2024 17:12:58 +0000
Subject: [PATCH 02/10] added separate GPU CIs for each backend

---
 .buildkite/CI-AMDGPU.yml | 40 ++++++++++++++++++++++++++++++++++++++++
 .buildkite/CI-CUDA.yml   | 38 ++++++++++++++++++++++++++++++++++++++
 .buildkite/CI-Metal.yml  | 40 ++++++++++++++++++++++++++++++++++++++++
 .buildkite/CI-oneAPI.yml | 38 ++++++++++++++++++++++++++++++++++++++
 .buildkite/pipeline.yml  | 26 --------------------------
 5 files changed, 156 insertions(+), 26 deletions(-)
 create mode 100644 .buildkite/CI-AMDGPU.yml
 create mode 100644 .buildkite/CI-CUDA.yml
 create mode 100644 .buildkite/CI-Metal.yml
 create mode 100644 .buildkite/CI-oneAPI.yml
 delete mode 100644 .buildkite/pipeline.yml

diff --git a/.buildkite/CI-AMDGPU.yml b/.buildkite/CI-AMDGPU.yml
new file mode 100644
index 0000000..8bddcaf
--- /dev/null
+++ b/.buildkite/CI-AMDGPU.yml
@@ -0,0 +1,40 @@
+steps:
+  - label: "AMDGPU - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "AMDGPU - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
diff --git a/.buildkite/CI-CUDA.yml b/.buildkite/CI-CUDA.yml
new file mode 100644
index 0000000..dd6b785
--- /dev/null
+++ b/.buildkite/CI-CUDA.yml
@@ -0,0 +1,38 @@
+steps:
+  - label: "CUDA - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "CUDA - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
diff --git a/.buildkite/CI-Metal.yml b/.buildkite/CI-Metal.yml
new file mode 100644
index 0000000..d48567c
--- /dev/null
+++ b/.buildkite/CI-Metal.yml
@@ -0,0 +1,40 @@
+steps:
+  - label: "Metal - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "Metal - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
diff --git a/.buildkite/CI-oneAPI.yml b/.buildkite/CI-oneAPI.yml
new file mode 100644
index 0000000..6d64bfa
--- /dev/null
+++ b/.buildkite/CI-oneAPI.yml
@@ -0,0 +1,38 @@
+steps:
+  - label: "oneAPI - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "oneAPI - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
deleted file mode 100644
index edbef1f..0000000
--- a/.buildkite/pipeline.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-steps:
-  - label: "CUDA - Julia v1.10"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.10"
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
-  - label: "CUDA - Julia v1.11"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.11"
-      - JuliaCI/julia-test#v1: ~
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60

From 0e86e1dd0d0f4b00ec5c7b8f3738845fb0dff73f Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Mon, 11 Nov 2024 17:17:33 +0000
Subject: [PATCH 03/10] triggering from pipeline.yml

---
 .buildkite/pipeline.yml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .buildkite/pipeline.yml

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..34939b1
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,5 @@
+steps:
+  - trigger: ci-cuda
+  - trigger: ci-amdgpu
+  - trigger: ci-oneapi
+  - trigger: ci-metal

From 56b689745d8c76b9a335afce2d3f08206ad9f5ed Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Mon, 11 Nov 2024 17:21:10 +0000
Subject: [PATCH 04/10] you know what, let's keep it simple...

---
 .buildkite/CI-AMDGPU.yml |  40 ----------
 .buildkite/CI-CUDA.yml   |  38 ---------
 .buildkite/CI-Metal.yml  |  40 ----------
 .buildkite/CI-oneAPI.yml |  38 ---------
 .buildkite/pipeline.yml  | 167 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 163 insertions(+), 160 deletions(-)
 delete mode 100644 .buildkite/CI-AMDGPU.yml
 delete mode 100644 .buildkite/CI-CUDA.yml
 delete mode 100644 .buildkite/CI-Metal.yml
 delete mode 100644 .buildkite/CI-oneAPI.yml

diff --git a/.buildkite/CI-AMDGPU.yml b/.buildkite/CI-AMDGPU.yml
deleted file mode 100644
index 8bddcaf..0000000
--- a/.buildkite/CI-AMDGPU.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-steps:
-  - label: "AMDGPU - Julia v1.10"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.10"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("AMDGPU")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
-    agents:
-      queue: "juliagpu"
-      rocm: "*"
-      rocmgpu: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
-  - label: "AMDGPU - Julia v1.11"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.11"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("AMDGPU")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
-    agents:
-      queue: "juliagpu"
-      rocm: "*"
-      rocmgpu: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
diff --git a/.buildkite/CI-CUDA.yml b/.buildkite/CI-CUDA.yml
deleted file mode 100644
index dd6b785..0000000
--- a/.buildkite/CI-CUDA.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-steps:
-  - label: "CUDA - Julia v1.10"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.10"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("CUDA")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
-  - label: "CUDA - Julia v1.11"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.11"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("CUDA")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
-    agents:
-      queue: "juliagpu"
-      cuda: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
diff --git a/.buildkite/CI-Metal.yml b/.buildkite/CI-Metal.yml
deleted file mode 100644
index d48567c..0000000
--- a/.buildkite/CI-Metal.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-steps:
-  - label: "Metal - Julia v1.10"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.10"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("Metal")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
-    agents:
-      queue: "juliaecosystem"
-      os: "macos"
-      arch: "aarch64"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
-  - label: "Metal - Julia v1.11"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.11"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("Metal")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
-    agents:
-      queue: "juliaecosystem"
-      os: "macos"
-      arch: "aarch64"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
diff --git a/.buildkite/CI-oneAPI.yml b/.buildkite/CI-oneAPI.yml
deleted file mode 100644
index 6d64bfa..0000000
--- a/.buildkite/CI-oneAPI.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-steps:
-  - label: "oneAPI - Julia v1.10"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.10"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
-    agents:
-      queue: "juliagpu"
-      intel: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
-
-  - label: "oneAPI - Julia v1.11"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: "1.11"
-    command: |
-      julia -e 'using Pkg
-
-                println("--- :julia: Instantiating environment")
-                Pkg.add("oneAPI")
-                Pkg.develop(path=".")
-
-                println("+++ :julia: Running tests")
-                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
-    agents:
-      queue: "juliagpu"
-      intel: "*"
-    if: build.message !~ /\[skip tests\]/
-    timeout_in_minutes: 60
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 34939b1..43cfba3 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,5 +1,164 @@
 steps:
-  - trigger: ci-cuda
-  - trigger: ci-amdgpu
-  - trigger: ci-oneapi
-  - trigger: ci-metal
+
+  # CUDA
+  - label: "CUDA - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "CUDA - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # AMDGPU
+  - label: "AMDGPU - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "AMDGPU - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # oneAPI
+  - label: "oneAPI - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "oneAPI - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # Metal
+  - label: "Metal - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "Metal - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60

From 2a04e41ea39daf7b73fbb283bd1874e6adf4152c Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Mon, 11 Nov 2024 17:27:49 +0000
Subject: [PATCH 05/10] Waiting on the rest of the ecosystem to update to
 GPUArraysCore 0.2 - until then, adding 0.1 in too

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3e8a382..c45b5ff 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,7 +16,7 @@ Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8"
 [compat]
 ArgCheck = "2.1"
 DocStringExtensions = "0.9"
-GPUArraysCore = "0.2"
+GPUArraysCore = "0.1, 0.2"
 KernelAbstractions = "0.9"
 Markdown = "1.10"
 OhMyThreads = "0.7.0"

From fe709eb50575fca0597066d7d6790df948bf6093 Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 12 Nov 2024 01:48:02 +0000
Subject: [PATCH 06/10] Trying a dummy read for Metal to see if it creates a
 data dependency so flags are written after v in accumulate

---
 prototype/Project.toml |  1 +
 src/accumulate.jl      | 10 ++++++++--
 test/runtests.jl       |  8 ++++----
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/prototype/Project.toml b/prototype/Project.toml
index 88d95f6..0463a0b 100644
--- a/prototype/Project.toml
+++ b/prototype/Project.toml
@@ -4,4 +4,5 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 8b3a27e..2bc4cbb 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -195,6 +195,7 @@ end
                         # there should be better memory fences to guarantee ordering without
                         # thread synchronization...
     if ithread == 0
+        dummy = v[1]
         flags[iblock + 1] = ACC_FLAG_A
     end
 end
@@ -215,6 +216,11 @@ function accumulate!(
     @argcheck block_size > 0
     @argcheck ispow2(block_size)
 
+    # Nothing to accumulate
+    if length(v) == 0
+        return v
+    end
+
     # Each thread will process two elements
     elems_per_block = block_size * 2
     num_blocks = (length(v) + elems_per_block - 1) ÷ elems_per_block
@@ -246,7 +252,7 @@ function accumulate!(
                  ndrange=(num_blocks - 1) * block_size)
     end
 
-    nothing
+    return v
 end
 
 
@@ -275,8 +281,8 @@ function accumulate!(
         for i in eachindex(v)
             v[i], running = running, op(running, v[i])
         end
-
     end
+    return v
 end
 
 
diff --git a/test/runtests.jl b/test/runtests.jl
index e3ba362..a530935 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -911,10 +911,10 @@ end
             mbase = minbox_base(vh, dims)
 
             @test eltype(mgpu) === eltype(mcpu) === eltype(mbase)
-            for (i, mgpu_red) in enumerate(Array(mgpu))
-                @test mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]
-                @test mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2]
-            end
+            @test all([
+                (mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2])
+                for (i, mgpu_red) in enumerate(Array(mgpu))
+            ])
         end
     end
 

From 42cfc550f49937b51f2f632c9b7911a080f9dbba Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 12 Nov 2024 02:09:03 +0000
Subject: [PATCH 07/10] Trying to enforce ordering by a comparison in v. Left
 AK at 0.2.1 to allow compatibility with AMDGPU which now depends on it

---
 Project.toml      | 10 +++++-----
 src/accumulate.jl |  3 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index c45b5ff..f480d9b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.3.0-DEV"
+version = "0.2.1-DEV"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
@@ -14,12 +14,12 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8"
 
 [compat]
-ArgCheck = "2.1"
+ArgCheck = "2"
 DocStringExtensions = "0.9"
 GPUArraysCore = "0.1, 0.2"
 KernelAbstractions = "0.9"
-Markdown = "1.10"
-OhMyThreads = "0.7.0"
+Markdown = "1"
+OhMyThreads = "0.7"
 Polyester = "0.7"
-Unrolled = "0.1.5"
+Unrolled = "0.1"
 julia = "1.10"
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 2bc4cbb..fb8dc46 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -194,8 +194,7 @@ end
     @synchronize()      # This is needed so that the flag is not set before copying into v, but
                         # there should be better memory fences to guarantee ordering without
                         # thread synchronization...
-    if ithread == 0
-        dummy = v[1]
+    if ithread == 0 && v[1] != typemax(eltype(v))  # This is a hack to enforce ordering of flags AFTER v is written
         flags[iblock + 1] = ACC_FLAG_A
     end
 end

From e592e9832415066d314a996a73d1a50a2671fb14 Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 12 Nov 2024 18:23:57 +0000
Subject: [PATCH 08/10] Added accumulate benchmark. Known issue for Metal with
 accumulate. Added CI badges.

---
 README.md                         | 156 +++++++++++++++++++++++++++++-
 prototype/accumulate_benchmark.jl |  33 +++++++
 src/accumulate.jl                 |  20 ++--
 3 files changed, 194 insertions(+), 15 deletions(-)
 create mode 100644 prototype/accumulate_benchmark.jl

diff --git a/README.md b/README.md
index 572c7e9..c5f3d69 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,165 @@
 
 [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/stable/)
 [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/dev/)
-[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml)
 [![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
 
+
+<table>
+
+<tr>
+<th>AK Backend</th>
+<th>Julia Version</th>
+<th>CI Status</th>
+</tr>
+
+<tr>
+<td>
+
+CPU Single- and Multi-Threaded
+
+</td>
+<td>
+
+Julia LTS, Stable, Pre-Release
+
+</td>
+<td>
+
+[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml)
+
+</td>
+</tr>
+
+<tr>
+<td rowspan=2>
+
+CUDA
+
+</td>
+<td>
+
+Julia v1.10
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td>
+
+Julia v1.11
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td rowspan=2>
+
+AMDGPU
+
+</td>
+<td>
+
+Julia v1.10
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td>
+
+Julia v1.11
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td rowspan=2>
+
+oneAPI
+
+</td>
+<td>
+
+Julia v1.10
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td>
+
+Julia v1.11
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td rowspan=2>
+
+Metal
+
+[Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10) 
+
+</td>
+<td>
+
+Julia v1.10
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+<tr>
+<td>
+
+Julia v1.11
+
+</td>
+<td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+</td>
+</tr>
+
+</table>
+
 Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation).
 
 
diff --git a/prototype/accumulate_benchmark.jl b/prototype/accumulate_benchmark.jl
new file mode 100644
index 0000000..6b9191a
--- /dev/null
+++ b/prototype/accumulate_benchmark.jl
@@ -0,0 +1,33 @@
+using BenchmarkTools
+using Metal
+import AcceleratedKernels as AK
+
+using Random
+Random.seed!(0)
+
+
+function akacc(v)
+    va = AK.accumulate(+, v, init=zero(eltype(v)), block_size=512)
+    Metal.synchronize()
+    va
+end
+
+
+function baseacc(v)
+    va = accumulate(+, v, init=zero(eltype(v)))
+    Metal.synchronize()
+    va
+end
+
+
+v = MtlArray(rand(1:100, 1_000_000))
+
+# Correctness checks
+va = akacc(v) |> Array
+vb = baseacc(v) |> Array
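+# @assert va == vb  # NOTE(review): left disabled - presumably due to the known Metal accumulate issue (AcceleratedKernels.jl#10); confirm before re-enabling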
+
+# Benchmarks
+println("Base vs AK")
+display(@benchmark baseacc($v))
+display(@benchmark akacc($v))
diff --git a/src/accumulate.jl b/src/accumulate.jl
index fb8dc46..6e59e30 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -143,7 +143,6 @@ end
 
     len = length(v)
     block_size = @groupsize()[1]
-    temp = @localmem eltype(v) (2 * block_size,)
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -155,13 +154,6 @@ end
     ithread = @index(Local, Linear) - 1
     block_offset = iblock * block_size * 2              # Processing two elements per thread
 
-    # Copy two elements from the main array
-    ai = ithread
-    bi = ithread + block_size
-
-    temp[ai + 1] = block_offset + ai < len ? v[block_offset + ai + 1] : init
-    temp[bi + 1] = block_offset + bi < len ? v[block_offset + bi + 1] : init
-
     # Each block looks back to find running prefix sum
     running_prefix = init
     inspected_block = iblock - 1
@@ -180,21 +172,21 @@ end
     end
 
     # Now we have aggregate prefix of all previous blocks, add it to all our elements
-    temp[ai + 1] = op(running_prefix, temp[ai + 1])
-    temp[bi + 1] = op(running_prefix, temp[bi + 1])
-
+    ai = ithread
     if block_offset + ai < len
-        v[block_offset + ai + 1] = temp[ai + 1]
+        v[block_offset + ai + 1] = op(running_prefix, v[block_offset + ai + 1])
     end
+
+    bi = ithread + block_size
     if block_offset + bi < len
-        v[block_offset + bi + 1] = temp[bi + 1]
+        v[block_offset + bi + 1] = op(running_prefix, v[block_offset + bi + 1])
     end
 
     # Set flag for "aggregate of all prefixes up to this block finished"
     @synchronize()      # This is needed so that the flag is not set before copying into v, but
                         # there should be better memory fences to guarantee ordering without
                         # thread synchronization...
-    if ithread == 0 && v[1] != typemax(eltype(v))  # This is a hack to enforce ordering of flags AFTER v is written
+    if ithread == 0
         flags[iblock + 1] = ACC_FLAG_A
     end
 end

From 925b17e7ff6f528c1e7fbc2432af88bde811433b Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 12 Nov 2024 18:34:40 +0000
Subject: [PATCH 09/10] Updated README on CI

---
 README.md | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index c5f3d69..fcaa327 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,8 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
 
+Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation) from a unified KernelAbstractions.jl codebase.
+
 
 <table>
 
@@ -27,6 +29,10 @@ CPU Single- and Multi-Threaded
 
 Julia LTS, Stable, Pre-Release
 
+x86 and x64
+
+Windows, Ubuntu, MacOS
+
 </td>
 <td>
 
@@ -38,7 +44,7 @@ Julia LTS, Stable, Pre-Release
 <tr>
 <td rowspan=2>
 
-CUDA
+[CUDA](https://github.com/JuliaGPU/CUDA.jl)
 
 </td>
 <td>
@@ -69,7 +75,7 @@ Julia v1.11
 <tr>
 <td rowspan=2>
 
-AMDGPU
+[AMDGPU](https://github.com/JuliaGPU/AMDGPU.jl)
 
 </td>
 <td>
@@ -100,7 +106,7 @@ Julia v1.11
 <tr>
 <td rowspan=2>
 
-oneAPI
+[oneAPI](https://github.com/JuliaGPU/oneAPI.jl)
 
 </td>
 <td>
@@ -131,7 +137,7 @@ Julia v1.11
 <tr>
 <td rowspan=2>
 
-Metal
+[Metal](https://github.com/JuliaGPU/Metal.jl)
 
 [Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10) 
 
@@ -163,8 +169,6 @@ Julia v1.11
 
 </table>
 
-Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation).
-
 
 - [1. What's Different?](#1-whats-different)
 - [2. Status](#2-status)
@@ -197,11 +201,11 @@ Again, this is only possible because of the unique Julia compilation model, the
 
 
 ## 2. Status
-The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely - additionally exposing all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
+The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely and additionally exposes all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
 
-We have an extensive test suite; however, I only ran them locally on the oneAPI (laptop Intel UHD Graphics 620), CUDA (laptop with Nvidia Quadro RTX 4000 and data centre Nvidia A100-40), Metal (Mac M2 and M3), and AMD (data centre AMD MI210) backends. Some kinks might still exist for some platform / OS permutations before a CI is set up.
+We have an extensive randomised test suite that we run on the CPU (single- and multi-threaded) backend on Windows, Ubuntu and MacOS for Julia LTS, Stable, and Pre-Release, plus the CUDA, AMDGPU, oneAPI and Metal backends on the [JuliaGPU buildkite](https://github.com/JuliaGPU/buildkite).
 
-AcceleratedKernels.jl will also be a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
+AcceleratedKernels.jl is also a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
 
 
 ## 3. Benchmarks
@@ -714,8 +718,6 @@ Leave out to test the CPU backend:
 $> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")
 ```
 
-**TODO**: talk with the JuliaGPU team to add library to their [BuildKite agents](https://github.com/JuliaGPU/buildkite) CI.
-
 
 ## 8. Issues and Debugging
 As the compilation pipeline of GPU kernels is different to that of base Julia, error messages also look different - for example, where Julia would insert an exception when a variable name was not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled and instead you'll see some cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc.

From b5a0f771c3c0dc6b9d39da9835bc9d7e16ee6d1f Mon Sep 17 00:00:00 2001
From: anicusan <aln705@student.bham.ac.uk>
Date: Tue, 12 Nov 2024 18:35:10 +0000
Subject: [PATCH 10/10] bumped version preparing for release

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f480d9b..2962cf7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.2.1-DEV"
+version = "0.2.1"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"