diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
new file mode 100644
index 0000000..43cfba3
--- /dev/null
+++ b/.buildkite/pipeline.yml
@@ -0,0 +1,164 @@
+steps:
+
+  # CUDA
+  - label: "CUDA - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "CUDA - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("CUDA")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--CUDA"])'
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # AMDGPU
+  - label: "AMDGPU - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "AMDGPU - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("AMDGPU")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--AMDGPU"])'
+    agents:
+      queue: "juliagpu"
+      rocm: "*"
+      rocmgpu: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # oneAPI
+  - label: "oneAPI - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "oneAPI - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("oneAPI")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--oneAPI"])'
+    agents:
+      queue: "juliagpu"
+      intel: "*"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+
+  # Metal
+  - label: "Metal - Julia v1.10"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.10"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
+
+  - label: "Metal - Julia v1.11"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    command: |
+      julia -e 'using Pkg
+
+                println("--- :julia: Instantiating environment")
+                Pkg.add("Metal")
+                Pkg.develop(path=".")
+
+                println("+++ :julia: Running tests")
+                Pkg.test("AcceleratedKernels", test_args=["--Metal"])'
+    agents:
+      queue: "juliaecosystem"
+      os: "macos"
+      arch: "aarch64"
+    if: build.message !~ /\[skip tests\]/
+    timeout_in_minutes: 60
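Every Buildkite step above follows the same pattern: install the backend package, `Pkg.develop` the local checkout, then run the test suite with a backend-selecting flag. The same run can be reproduced locally; a minimal sketch (shown for CUDA, with the other backends differing only in package name and flag):

```julia
import Pkg

Pkg.add("CUDA")          # backend under test: CUDA / AMDGPU / oneAPI / Metal
Pkg.develop(path=".")    # assumes the current directory is the AcceleratedKernels checkout
Pkg.test("AcceleratedKernels", test_args=["--CUDA"])
```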
test_args=["--Metal"])' + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + if: build.message !~ /\[skip tests\]/ + timeout_in_minutes: 60 diff --git a/Project.toml b/Project.toml index 3e8a382..2962cf7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.3.0-DEV" +version = "0.2.1" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" @@ -14,12 +14,12 @@ Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Unrolled = "9602ed7d-8fef-5bc8-8597-8f21381861e8" [compat] -ArgCheck = "2.1" +ArgCheck = "2" DocStringExtensions = "0.9" -GPUArraysCore = "0.2" +GPUArraysCore = "0.1, 0.2" KernelAbstractions = "0.9" -Markdown = "1.10" -OhMyThreads = "0.7.0" +Markdown = "1" +OhMyThreads = "0.7" Polyester = "0.7" -Unrolled = "0.1.5" +Unrolled = "0.1" julia = "1.10" diff --git a/README.md b/README.md index 572c7e9..fcaa327 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,170 @@ [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/stable/) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliagpu.github.io/AcceleratedKernels.jl/dev/) -[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml) [![Aqua QA](https://raw.githubusercontent.com/JuliaTesting/Aqua.jl/master/badge.svg)](https://github.com/JuliaTesting/Aqua.jl) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation). +Parallel algorithm building blocks for the Julia ecosystem, targeting multithreaded CPUs, and GPUs via Intel oneAPI, AMD ROCm, Apple Metal and Nvidia CUDA (and any future backends added to the [JuliaGPU](https://juliagpu.org/) organisation) from a unified KernelAbstractions.jl codebase. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <th>AK Backend</th>
+    <th>Julia Version</th>
+    <th>CI Status</th>
+  </tr>
+  <tr>
+    <td>
+
+CPU Single- and Multi-Threaded
+
+    </td>
+    <td>
+
+Julia LTS, Stable, Pre-Release
+
+x86 and x64
+
+Windows, Ubuntu, MacOS
+
+    </td>
+    <td>
+
+[![CI-CPU](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml/badge.svg)](https://github.com/juliagpu/AcceleratedKernels.jl/actions/workflows/CI-CPU.yml)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[CUDA](https://github.com/JuliaGPU/CUDA.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=CUDA%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[AMDGPU](https://github.com/JuliaGPU/AMDGPU.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=AMDGPU%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[oneAPI](https://github.com/JuliaGPU/oneAPI.jl)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=oneAPI%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td rowspan="2">
+
+[Metal](https://github.com/JuliaGPU/Metal.jl)
+
+[Known Issue](https://github.com/JuliaGPU/AcceleratedKernels.jl/issues/10)
+
+    </td>
+    <td>
+
+Julia v1.10
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.10)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+  <tr>
+    <td>
+
+Julia v1.11
+
+    </td>
+    <td>
+
+[![Build status](https://badge.buildkite.com/5b8c747451b382a6b1ad0a1b566d565bc851fc59515792c62e.svg?step=Metal%20-%20Julia%20v1.11)](https://buildkite.com/julialang/acceleratedkernels-dot-jl)
+
+    </td>
+  </tr>
+</table>
+
 - [1. What's Different?](#1-whats-different)
@@ -43,11 +201,11 @@ Again, this is only possible because of the unique Julia compilation model, the
 
 ## 2. Status
 
-The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely - additionally exposing all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
+The AcceleratedKernels.jl sorters were adopted as the official [AMDGPU algorithms](https://github.com/JuliaGPU/AMDGPU.jl/pull/688)! The API is starting to stabilise; it follows the Julia standard library fairly closely and additionally exposes all temporary arrays for memory reuse. For any new ideas / requests, please join the conversation on [Julia Discourse](https://discourse.julialang.org/t/ann-acceleratedkernels-jl-cross-architecture-parallel-algorithms-for-julias-gpu-backends/119698/16) or post [an issue](https://github.com/juliagpu/AcceleratedKernels.jl/issues).
 
-We have an extensive test suite; however, I only ran them locally on the oneAPI (laptop Intel UHD Graphics 620), CUDA (laptop with Nvidia Quadro RTX 4000 and data centre Nvidia A100-40), Metal (Mac M2 and M3), and AMD (data centre AMD MI210) backends. Some kinks might still exist for some platform / OS permutations before a CI is set up.
+We have an extensive randomised test suite that we run on the CPU backend (single- and multi-threaded) on Windows, Ubuntu and MacOS for Julia LTS, Stable and Pre-Release, plus on the CUDA, AMDGPU, oneAPI and Metal backends on the [JuliaGPU buildkite](https://github.com/JuliaGPU/buildkite).
 
-AcceleratedKernels.jl will also be a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
+AcceleratedKernels.jl is also a fundamental building block of applications developed at [EvoPhase](https://evophase.co.uk/), so it will see continuous heavy use with industry backing. Long-term stability, performance improvements and support are priorities for us.
 
 ## 3. Benchmarks
 
@@ -560,8 +718,6 @@ Leave out to test the CPU backend:
 $> julia -e 'import Pkg; Pkg.test("AcceleratedKernels.jl")
 ```
 
-**TODO**: talk with the JuliaGPU team to add library to their [BuildKite agents](https://github.com/JuliaGPU/buildkite) CI.
-
 ## 8. Issues and Debugging
 
 As the compilation pipeline of GPU kernels is different to that of base Julia, error messages also look different - for example, where Julia would insert an exception when a variable name was not defined (e.g. we had a typo), a GPU kernel throwing exceptions cannot be compiled and instead you'll see some cascading errors like `"[...] compiling [...] resulted in invalid LLVM IR"` caused by `"Reason: unsupported use of an undefined name"` resulting in `"Reason: unsupported dynamic function invocation"`, etc.
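`Pkg.test` forwards `test_args` to the test script as `ARGS`, which is how the `--CUDA` / `--AMDGPU` / `--oneAPI` / `--Metal` flags from the pipeline select a backend. A hypothetical sketch of such gating follows; the actual logic in `test/runtests.jl` may differ:

```julia
# Illustrative only: pick the KernelAbstractions backend from the test flags
if "--CUDA" in ARGS
    using CUDA
    const backend = CUDA.CUDABackend()
elseif "--Metal" in ARGS
    using Metal
    const backend = Metal.MetalBackend()
else
    using KernelAbstractions
    const backend = KernelAbstractions.CPU()
end
```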
diff --git a/prototype/Project.toml b/prototype/Project.toml
index 88d95f6..0463a0b 100644
--- a/prototype/Project.toml
+++ b/prototype/Project.toml
@@ -4,4 +4,5 @@ Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 PProf = "e4faabce-9ead-11e9-39d9-4379958e3056"
diff --git a/prototype/accumulate_benchmark.jl b/prototype/accumulate_benchmark.jl
new file mode 100644
index 0000000..6b9191a
--- /dev/null
+++ b/prototype/accumulate_benchmark.jl
@@ -0,0 +1,33 @@
+using BenchmarkTools
+using Metal
+import AcceleratedKernels as AK
+
+using Random
+Random.seed!(0)
+
+
+function akacc(v)
+    va = AK.accumulate(+, v, init=zero(eltype(v)), block_size=512)
+    Metal.synchronize()
+    va
+end
+
+
+function baseacc(v)
+    va = accumulate(+, v, init=zero(eltype(v)))
+    Metal.synchronize()
+    va
+end
+
+
+v = MtlArray(rand(1:100, 1_000_000))
+
+# Correctness checks
+va = akacc(v) |> Array
+vb = baseacc(v) |> Array
+# @assert va == vb
+
+# Benchmarks
+println("Base vs AK")
+display(@benchmark baseacc($v))
+display(@benchmark akacc($v))
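The new benchmark can be run from the repository root; a sketch, assuming AcceleratedKernels itself is `Pkg.develop`ed into the `prototype` environment (it is not listed in `prototype/Project.toml`):

```julia
import Pkg
Pkg.activate("prototype")    # the environment that now includes Metal
Pkg.develop(path=".")        # make AcceleratedKernels importable from here
include("prototype/accumulate_benchmark.jl")
```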
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 8b3a27e..6e59e30 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -143,7 +143,6 @@ end
     len = length(v)
     block_size = @groupsize()[1]
-    temp = @localmem eltype(v) (2 * block_size,)
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -155,13 +154,6 @@
     ithread = @index(Local, Linear) - 1
     block_offset = iblock * block_size * 2    # Processing two elements per thread
 
-    # Copy two elements from the main array
-    ai = ithread
-    bi = ithread + block_size
-
-    temp[ai + 1] = block_offset + ai < len ? v[block_offset + ai + 1] : init
-    temp[bi + 1] = block_offset + bi < len ? v[block_offset + bi + 1] : init
-
     # Each block looks back to find running prefix sum
     running_prefix = init
     inspected_block = iblock - 1
@@ -180,14 +172,14 @@ end
         end
     end
 
     # Now we have aggregate prefix of all previous blocks, add it to all our elements
-    temp[ai + 1] = op(running_prefix, temp[ai + 1])
-    temp[bi + 1] = op(running_prefix, temp[bi + 1])
-
+    ai = ithread
     if block_offset + ai < len
-        v[block_offset + ai + 1] = temp[ai + 1]
+        v[block_offset + ai + 1] = op(running_prefix, v[block_offset + ai + 1])
     end
+
+    bi = ithread + block_size
     if block_offset + bi < len
-        v[block_offset + bi + 1] = temp[bi + 1]
+        v[block_offset + bi + 1] = op(running_prefix, v[block_offset + bi + 1])
     end
 
     # Set flag for "aggregate of all prefixes up to this block finished"
@@ -215,6 +207,11 @@ function accumulate!(
     @argcheck block_size > 0
     @argcheck ispow2(block_size)
 
+    # Nothing to accumulate
+    if length(v) == 0
+        return v
+    end
+
     # Each thread will process two elements
     elems_per_block = block_size * 2
     num_blocks = (length(v) + elems_per_block - 1) ÷ elems_per_block
@@ -246,7 +243,7 @@ function accumulate!(
         ndrange=(num_blocks - 1) * block_size)
     end
 
-    nothing
+    return v
 end
@@ -275,8 +272,8 @@ function accumulate!(
         for i in eachindex(v)
            v[i], running = running, op(running, v[i])
         end
-    end
+    end
+
+    return v
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index e3ba362..a530935 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -911,10 +911,10 @@ end
         mbase = minbox_base(vh, dims)
 
         @test eltype(mgpu) === eltype(mcpu) === eltype(mbase)
-        for (i, mgpu_red) in enumerate(Array(mgpu))
-            @test mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]
-            @test mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2]
-        end
+        @test all([
+            (mgpu_red[1] ≈ mcpu[i][1] ≈ mbase[i][1]) && (mgpu_red[2] ≈ mcpu[i][2] ≈ mbase[i][2])
+            for (i, mgpu_red) in enumerate(Array(mgpu))
+        ])
     end
 end
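The `accumulate!` changes above are user-visible in two ways: zero-length inputs now return immediately, and the function returns the mutated array rather than `nothing`. A minimal usage sketch on plain CPU arrays, assuming the same `(op, v; init)` calling convention that `AK.accumulate` uses in the benchmark above:

```julia
import AcceleratedKernels as AK

v = Float32[1, 2, 3, 4]
r = AK.accumulate!(+, v, init=0.0f0)   # in-place prefix scan over v
@assert r === v                        # the mutated array is returned, not `nothing`

empty = Float32[]
@assert AK.accumulate!(+, empty, init=0.0f0) === empty   # zero-length early return
```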