diff --git a/.github/workflows/runic.yml b/.github/workflows/runic.yml
new file mode 100644
index 000000000..aa6d3bfd6
--- /dev/null
+++ b/.github/workflows/runic.yml
@@ -0,0 +1,25 @@
+name: Runic formatting
+on:
+  push:
+    branches:
+      - 'master'
+      - 'release-'
+    tags:
+      - '*'
+  pull_request:
+jobs:
+  runic:
+    name: Runic
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: "nightly" # Only nightly have the -m flag currently
+      - uses: julia-actions/cache@v2
+      - name: Install Runic
+        run: |
+          julia --color=yes --project=@runic -e 'using Pkg; Pkg.add(url = "https://github.com/fredrikekre/Runic.jl")'
+      - name: Run Runic
+        run: |
+          git ls-files -z -- '*.jl' | xargs -0 julia --project=@runic -m Runic --check --diff
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 74276c41a..b153f60ee 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -36,11 +36,12 @@ let static = BenchmarkGroup()
         for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
             dtype[N] = @benchmarkable begin
                 kernel = saxpy_kernel!($BACKEND, 1024)
-                kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
-            end setup=(
+                kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
+            end setup = (
                 X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
                 Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
-                Z = KernelAbstractions.zeros($BACKEND, $T, $N))
+                Z = KernelAbstractions.zeros($BACKEND, $T, $N)
+            )
         end
         static["$T"] = dtype
     end
@@ -53,11 +54,12 @@ let default = BenchmarkGroup()
         for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
             dtype[N] = @benchmarkable begin
                 kernel = saxpy_kernel!($BACKEND)
-                kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
-            end setup=(
+                kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
+            end setup = (
                 X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
                 Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
-                Z = KernelAbstractions.zeros($BACKEND, $T, $N))
+                Z = KernelAbstractions.zeros($BACKEND, $T, $N)
+            )
         end
         default["$T"] = dtype
     end
diff --git a/docs/make.jl b/docs/make.jl
index da1e41792..61c506556 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -7,17 +7,17 @@ function main()
     ci = get(ENV, "CI", "") == "true"
 
     makedocs(;
-        modules=[KernelAbstractions],
-        authors="JuliaGPU and contributors",
-        repo="https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
-        sitename="KernelAbstractions.jl",
-        format=Documenter.HTML(;
-            prettyurls=ci,
-            canonical="https://juliagpu.github.io/KernelAbstractions.jl",
-            assets=String[],
+        modules = [KernelAbstractions],
+        authors = "JuliaGPU and contributors",
+        repo = "https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
+        sitename = "KernelAbstractions.jl",
+        format = Documenter.HTML(;
+            prettyurls = ci,
+            canonical = "https://juliagpu.github.io/KernelAbstractions.jl",
+            assets = String[],
         ),
-        warnonly=[:missing_docs],
-        pages=[
+        warnonly = [:missing_docs],
+        pages = [
             "Home" => "index.md",
             "Quickstart" => "quickstart.md",
             "Writing kernels" => "kernels.md",
@@ -34,13 +34,13 @@ function main()
             "Extras" => [
                 "extras/unrolling.md",
             ], # Extras
-            "Notes for implementations" => "implementations.md"
+            "Notes for implementations" => "implementations.md",
         ], # pages
     )
 
     if ci
         deploydocs(;
-            repo="github.com/JuliaGPU/KernelAbstractions.jl",
+            repo = "github.com/JuliaGPU/KernelAbstractions.jl",
             push_preview = true,
         )
     end
diff --git a/examples/histogram.jl b/examples/histogram.jl
index a4013481c..311cff761 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -30,7 +30,7 @@ end
     # possible to get a value of 312, then we will have 2 separate shmem blocks,
     # one from 1->256, and another from 256->512
     @uniform max_element = 1
-    for min_element = 1:gs:N
+    for min_element in 1:gs:N
 
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
@@ -38,20 +38,20 @@ end
 
         max_element = min_element + gs
         if max_element > N
-            max_element = N+1
+            max_element = N + 1
         end
 
         # Defining bin on shared memory and writing to it if possible
         bin = input[tid]
         if bin >= min_element && bin < max_element
-            bin -= min_element-1
+            bin -= min_element - 1
             @atomic shared_histogram[bin] += 1
         end
 
         @synchronize()
 
-        if ((lid+min_element-1) <= N)
-            @atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
+        if ((lid + min_element - 1) <= N)
+            @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
         end
 
     end
@@ -62,7 +62,7 @@ function histogram!(histogram_output, input)
     backend = get_backend(histogram_output)
     # Need static block size
     kernel! = histogram_kernel!(backend, (256,))
-    kernel!(histogram_output, input, ndrange=size(input))
+    kernel!(histogram_output, input, ndrange = size(input))
 end
 
 function move(backend, input)
@@ -75,9 +75,9 @@ end
     if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
         @test_skip false
     else
-        rand_input = [rand(1:128) for i = 1:1000]
-        linear_input = [i for i = 1:1024]
-        all_two = [2 for i = 1:512]
+        rand_input = [rand(1:128) for i in 1:1000]
+        linear_input = [i for i in 1:1024]
+        all_two = [2 for i in 1:512]
 
         histogram_rand_baseline = create_histogram(rand_input)
         histogram_linear_baseline = create_histogram(linear_input)
diff --git a/examples/matmul.jl b/examples/matmul.jl
index 11d5c1520..4ade3f374 100644
--- a/examples/matmul.jl
+++ b/examples/matmul.jl
@@ -7,11 +7,11 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 
     # creating a temporary sum variable for matrix multiplication
     tmp_sum = zero(eltype(output))
-    for k = 1:size(a)[2]
-        tmp_sum += a[i,k] * b[k, j]
+    for k in 1:size(a)[2]
+        tmp_sum += a[i, k] * b[k, j]
     end
 
-    output[i,j] = tmp_sum
+    output[i, j] = tmp_sum
 end
 
 # Creating a wrapper kernel for launching with error checks
@@ -22,14 +22,14 @@ function matmul!(output, a, b)
     end
     backend = KernelAbstractions.get_backend(a)
     kernel! = matmul_kernel!(backend)
-    kernel!(output, a, b, ndrange=size(output)) 
+    kernel!(output, a, b, ndrange = size(output))
 end
 
 a = rand!(allocate(backend, Float32, 256, 123))
 b = rand!(allocate(backend, Float32, 123, 45))
 output = KernelAbstractions.zeros(backend, Float32, 256, 45)
 
-matmul!(output, a,b)
+matmul!(output, a, b)
 KernelAbstractions.synchronize(backend)
 
-@test isapprox(output, a*b)
+@test isapprox(output, a * b)
diff --git a/examples/memcopy.jl b/examples/memcopy.jl
index 8602769c3..f826caecc 100644
--- a/examples/memcopy.jl
+++ b/examples/memcopy.jl
@@ -12,7 +12,7 @@ function mycopy!(A, B)
     @assert get_backend(B) == backend
 
     kernel = copy_kernel!(backend)
-    kernel(A, B, ndrange=length(A))
+    kernel(A, B, ndrange = length(A))
 end
 
 A = KernelAbstractions.zeros(backend, Float64, 128, 128)
diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl
index 3ded81f2c..e32390816 100644
--- a/examples/memcopy_static.jl
+++ b/examples/memcopy_static.jl
@@ -12,7 +12,7 @@ function mycopy_static!(A, B)
     @assert get_backend(B) == backend
 
     kernel = copy_kernel!(backend, 32, size(A)) # if size(A) varies this will cause recompilation
-    kernel(A, B, ndrange=size(A))
+    kernel(A, B, ndrange = size(A))
 end
 
 A = KernelAbstractions.zeros(backend, Float64, 128, 128)
diff --git a/examples/mpi.jl b/examples/mpi.jl
index d0825157b..2f5e690d3 100644
--- a/examples/mpi.jl
+++ b/examples/mpi.jl
@@ -43,8 +43,8 @@ function main(backend)
     comm = MPI.COMM_WORLD
     MPI.Barrier(comm)
 
-    dst_rank = mod(MPI.Comm_rank(comm)+1, MPI.Comm_size(comm))
-    src_rank = mod(MPI.Comm_rank(comm)-1, MPI.Comm_size(comm))
+    dst_rank = mod(MPI.Comm_rank(comm) + 1, MPI.Comm_size(comm))
+    src_rank = mod(MPI.Comm_rank(comm) - 1, MPI.Comm_size(comm))
 
     T = Int64
     M = 10
@@ -59,8 +59,10 @@ function main(backend)
 
     KernelAbstractions.synchronize(backend)
 
-    recv_task, send_task = exchange!(h_send_buf, d_recv_buf, h_recv_buf,
-                                       src_rank, dst_rank, comm)
+    recv_task, send_task = exchange!(
+        h_send_buf, d_recv_buf, h_recv_buf,
+        src_rank, dst_rank, comm,
+    )
 
     cooperative_wait(recv_task)
     cooperative_wait(send_task)
diff --git a/examples/naive_transpose.jl b/examples/naive_transpose.jl
index bca184fa1..20ea6a0c0 100644
--- a/examples/naive_transpose.jl
+++ b/examples/naive_transpose.jl
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
     @assert get_backend(b) == backend
     groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
     kernel! = naive_transpose_kernel!(backend, groupsize)
-    kernel!(a, b, ndrange=size(a))
+    kernel!(a, b, ndrange = size(a))
 end
 
 # resolution of grid will be res*res
@@ -27,7 +27,6 @@ res = 1024
 b = rand!(allocate(backend, Float32, res, res))
 a = KernelAbstractions.zeros(backend, Float32, res, res)
 
-naive_transpose!(a,b)
+naive_transpose!(a, b)
 KernelAbstractions.synchronize(backend)
 @test a == transpose(b)
-
diff --git a/examples/numa_aware.jl b/examples/numa_aware.jl
index d2185085d..f970d5589 100644
--- a/examples/numa_aware.jl
+++ b/examples/numa_aware.jl
@@ -18,8 +18,10 @@ end
 Estimate the memory bandwidth (GB/s) by performing a time measurement of a
 SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s).
 """
-function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
-                       init = :parallel)
+function measure_membw(
+        backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
+        init = :parallel,
+    )
     bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY
     flops = 2 * N # num flops in SAXY
 
@@ -37,10 +39,10 @@ function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtyp
         kernel = saxpy_kernel($backend, $workgroup_size, $(size(Y)))
         kernel($a, $X, $Y, ndrange = $(size(Y)))
         KernelAbstractions.synchronize($backend)
-    end evals=2 samples=10
+    end evals = 2 samples = 10
 
-    mem_rate = bytes * 1e-9 / t # GB/s
-    flop_rate = flops * 1e-9 / t # GFLOP/s
+    mem_rate = bytes * 1.0e-9 / t # GB/s
+    flop_rate = flops * 1.0e-9 / t # GFLOP/s
 
     if verbose
         println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits = 2))
@@ -51,7 +53,7 @@ end
 
 # Static should be much better (on a system with multiple NUMA domains)
 measure_membw(CPU());
-measure_membw(CPU(; static=true));
+measure_membw(CPU(; static = true));
 
 # The following has significantly worse performance (even on systems with a single memory domain)!
 # measure_membw(CPU(); init=:serial);
diff --git a/examples/performance.jl b/examples/performance.jl
index 4e1dbd9d4..8faf3caee 100644
--- a/examples/performance.jl
+++ b/examples/performance.jl
@@ -15,8 +15,8 @@ const BLOCK_ROWS = 8
 # Simple variants
 
 @kernel function simple_copy_kernel!(output, @Const(input))
-  I, J = @index(Global, NTuple)
-  @inbounds output[I, J] = input[I, J]
+    I, J = @index(Global, NTuple)
+    @inbounds output[I, J] = input[I, J]
 end
 
 @kernel function simple_transpose_kernel!(output, @Const(input))
@@ -26,16 +26,18 @@ end
 
 # Local memory variants
 
-@kernel function lmem_copy_kernel!(output, @Const(input), 
-                                   ::Val{BANK}=Val(1)) where BANK
+@kernel function lmem_copy_kernel!(
+        output, @Const(input),
+        ::Val{BANK} = Val(1),
+    ) where {BANK}
     I, J = @index(Global, NTuple)
-    i, j = @index(Local,  NTuple)
+    i, j = @index(Local, NTuple)
 
     N = @uniform @groupsize()[1]
     M = @uniform @groupsize()[2]
 
     # +1 to avoid bank conflicts on shared memory
-    tile = @localmem eltype(output) (N+BANK, M)
+    tile = @localmem eltype(output) (N + BANK, M)
 
     @inbounds tile[i, j] = input[I, J]
 
@@ -44,106 +46,112 @@ end
     @inbounds output[I, J] = tile[i, j]
 end
 
-@kernel function lmem_transpose_kernel!(output, @Const(input),
-                                       ::Val{BANK}=Val(1)) where BANK
+@kernel function lmem_transpose_kernel!(
+        output, @Const(input),
+        ::Val{BANK} = Val(1),
+    ) where {BANK}
     gi, gj = @index(Group, NTuple)
-    i, j = @index(Local,  NTuple)
+    i, j = @index(Local, NTuple)
 
     N = @uniform @groupsize()[1]
     M = @uniform @groupsize()[2]
-    
+
     # +1 to avoid bank conflicts on shared memory
-    tile = @localmem eltype(output) (N+BANK, M) 
+    tile = @localmem eltype(output) (N + BANK, M)
 
     # Manually calculate global indexes
     # Later on we need to pivot the group index
-    I = (gi-1) * N + i
-    J = (gj-1) * M + j
+    I = (gi - 1) * N + i
+    J = (gj - 1) * M + j
 
     @inbounds tile[i, j] = input[I, J]
 
     @synchronize
 
     # Pivot the group index
-    I = (gj-1) * M + i
-    J = (gi-1) * N + j
+    I = (gj - 1) * M + i
+    J = (gi - 1) * N + j
 
     @inbounds output[I, J] = tile[j, i]
 end
 
 # Local Memory + process multiple elements per lane
 
-@kernel function coalesced_copy_kernel!(output, @Const(input), 
-                                        ::Val{BANK}=Val(1)) where BANK
+@kernel function coalesced_copy_kernel!(
+        output, @Const(input),
+        ::Val{BANK} = Val(1),
+    ) where {BANK}
     gi, gj = @index(Group, NTuple)
-    i, j   = @index(Local, NTuple)
+    i, j = @index(Local, NTuple)
 
-    TILE_DIM   = @uniform @groupsize()[1]
+    TILE_DIM = @uniform @groupsize()[1]
     BLOCK_ROWS = @uniform @groupsize()[2]
 
     # +1 to avoid bank conflicts on shared memory
-    tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
+    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
 
     # Can't use @index(Global), because we use a smaller ndrange
-    I = (gi-1) * TILE_DIM + i
-    J = (gj-1) * TILE_DIM + j
+    I = (gi - 1) * TILE_DIM + i
+    J = (gj - 1) * TILE_DIM + j
 
-    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1)
-        @inbounds tile[i, j+k] = input[I, J+k]
+    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
+        @inbounds tile[i, j + k] = input[I, J + k]
     end
 
     @synchronize
 
-    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1)
-        @inbounds output[I, J+k] = tile[i, j+k]
+    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
+        @inbounds output[I, J + k] = tile[i, j + k]
     end
 end
 
-@kernel function coalesced_transpose_kernel!(output, @Const(input), 
-                                             ::Val{BANK}=Val(1)) where BANK
+@kernel function coalesced_transpose_kernel!(
+        output, @Const(input),
+        ::Val{BANK} = Val(1),
+    ) where {BANK}
     gi, gj = @index(Group, NTuple)
-    i, j   = @index(Local, NTuple)
+    i, j = @index(Local, NTuple)
 
-    TILE_DIM   = @uniform @groupsize()[1]
+    TILE_DIM = @uniform @groupsize()[1]
     BLOCK_ROWS = @uniform @groupsize()[2]
 
     # +1 to avoid bank conflicts on shared memory
-    tile = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
+    tile = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
 
     # Can't use @index(Global), because we use a smaller ndrange
-    I = (gi-1) * TILE_DIM + i
-    J = (gj-1) * TILE_DIM + j
+    I = (gi - 1) * TILE_DIM + i
+    J = (gj - 1) * TILE_DIM + j
 
-    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1)
-        @inbounds tile[i, j+k] = input[I, J+k]
+    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
+        @inbounds tile[i, j + k] = input[I, J + k]
     end
 
     @synchronize
 
     # Transpose block offsets
-    I = (gj-1) * TILE_DIM + i
-    J = (gi-1) * TILE_DIM + j
+    I = (gj - 1) * TILE_DIM + i
+    J = (gi - 1) * TILE_DIM + j
 
-    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM-1)
-        @inbounds output[I, J+k] = tile[j+k, i]
+    @unroll for k in 0:BLOCK_ROWS:(TILE_DIM - 1)
+        @inbounds output[I, J + k] = tile[j + k, i]
     end
 end
 
 # Benchmark simple
 
-for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TILE_DIM))
-    for (name, kernel) in ( 
-                            ("copy",      simple_copy_kernel!(backend, block_dims)),
-                            ("transpose", simple_transpose_kernel!(backend, block_dims)),
-                          )
+for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM * TILE_DIM, 1), (1, TILE_DIM * TILE_DIM))
+    for (name, kernel) in (
+            ("copy", simple_copy_kernel!(backend, block_dims)),
+            ("transpose", simple_transpose_kernel!(backend, block_dims)),
+        )
         NVTX.@range "Simple $name $block_dims" let
             input = rand!(allocate(backend, T, N, N))
             output = similar(input)
 
             # compile kernel
-            kernel(output, input, ndrange=size(output))
+            kernel(output, input, ndrange = size(output))
             for rep in 1:nreps
-                kernel(output, input, ndrange=size(output))
+                kernel(output, input, ndrange = size(output))
             end
             KernelAbstractions.synchronize(backend)
         end
@@ -151,19 +159,19 @@ for block_dims in ((TILE_DIM, TILE_DIM), (TILE_DIM*TILE_DIM, 1), (1, TILE_DIM*TI
 end
 
 # Benchmark localmem
-for (name, kernel) in ( 
-                        ("copy",      lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
-                        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
-                      )
+for (name, kernel) in (
+        ("copy", lmem_copy_kernel!(backend, (TILE_DIM, TILE_DIM))),
+        ("transpose", lmem_transpose_kernel!(backend, (TILE_DIM, TILE_DIM))),
+    )
     for bank in (true, false)
         NVTX.@range "Localmem $name ($TILE_DIM, $TILE_DIM) bank=$bank" let
             input = rand!(allocate(backend, T, N, N))
             output = similar(input)
 
             # compile kernel
-            kernel(output, input, Val(Int(bank)), ndrange=size(output))
+            kernel(output, input, Val(Int(bank)), ndrange = size(output))
             for rep in 1:nreps
-                kernel(output, input, Val(Int(bank)), ndrange=size(output))
+                kernel(output, input, Val(Int(bank)), ndrange = size(output))
             end
             KernelAbstractions.synchronize(backend)
         end
@@ -171,10 +179,10 @@ for (name, kernel) in (
 end
 
 # Benchmark localmem + multiple elements per lane
-for (name, kernel) in ( 
-                        ("copy",      coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
-                        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
-                      )
+for (name, kernel) in (
+        ("copy", coalesced_copy_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
+        ("transpose", coalesced_transpose_kernel!(backend, (TILE_DIM, BLOCK_ROWS))),
+    )
     for bank in (true, false)
         NVTX.@range "Localmem + multiple elements $name ($TILE_DIM, $BLOCK_ROWS) bank=$bank" let
             input = rand!(allocate(backend, T, N, N))
@@ -187,9 +195,9 @@ for (name, kernel) in (
             ndrange = (N, div(N, block_factor))
 
             # compile kernel
-            kernel(output, input, Val(Int(bank)), ndrange=ndrange)
+            kernel(output, input, Val(Int(bank)), ndrange = ndrange)
             for rep in 1:nreps
-                kernel(output, input, Val(Int(bank)), ndrange=ndrange)
+                kernel(output, input, Val(Int(bank)), ndrange = ndrange)
             end
             KernelAbstractions.synchronize(backend)
         end
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index ae0760eb9..dbd5ad75a 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -6,68 +6,70 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 
 const TILE_DIM = 32
 
-@kernel function coalesced_matmul_kernel!(output, @Const(input1), @Const(input2), N, R, M,
-                                          ::Val{BANK}=Val(1)) where BANK
-     gi, gj = @index(Group, NTuple)
-     i, j   = @index(Local, NTuple)
-
-     TILE_DIM = @uniform @groupsize()[1]
-
-     # +1 to avoid bank conflicts on shared memory
-     tile1 = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
-     tile2 = @localmem eltype(output) (TILE_DIM+BANK, TILE_DIM)
-
-     # private variable for tile output
-     outval = @private eltype(output) 1
-     @inbounds outval[1] = -zero(eltype(output))
-
-     @uniform N = size(output, 1)
-     # number of tiles depends on inner dimension
-     @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
-
-     # loop over all tiles needed for this calculation
-     for t in 0:NUM_TILES-1
-         # Can't use @index(Global), because we use a smaller ndrange
-         I = (gi-1) * TILE_DIM + i
-         J = (gj-1) * TILE_DIM + j
-
-         # load inputs into tiles, with bounds checking for non-square matrices
-         if I <= N && t*TILE_DIM + j <= R
-             @inbounds tile1[i, j] = input1[I, t*TILE_DIM + j]
-         else
-             @inbounds tile1[i, j] = 0.0
-         end
-         if t*TILE_DIM + i <= R && J <= M
-             @inbounds tile2[i, j] = input2[t*TILE_DIM + i, J]
-         else
-             @inbounds tile2[i, j] = 0.0
-         end
-
-         # wait for all tiles to be loaded
-         @synchronize
-
-         # get global values again
-         I = (gi-1) * TILE_DIM + i
-         J = (gj-1) * TILE_DIM + j
-
-         # calculate value of spot in output, use temporary value to allow for vectorization
-         out = zero(eltype(output))
-         @simd for k in 1:TILE_DIM
-             @inbounds out += tile1[i, k] * tile2[k, j]
-         end
-         outval[1] += out
-
-         @synchronize
-     end
-
-     # get global indices again
-     I = (gi-1) * TILE_DIM + i
-     J = (gj-1) * TILE_DIM + j
-
-     # save if inbounds
-     if I <= N && J <= M
-         @inbounds output[I, J] = outval[1]
-     end
+@kernel function coalesced_matmul_kernel!(
+        output, @Const(input1), @Const(input2), N, R, M,
+        ::Val{BANK} = Val(1),
+    ) where {BANK}
+    gi, gj = @index(Group, NTuple)
+    i, j = @index(Local, NTuple)
+
+    TILE_DIM = @uniform @groupsize()[1]
+
+    # +1 to avoid bank conflicts on shared memory
+    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+
+    # private variable for tile output
+    outval = @private eltype(output) 1
+    @inbounds outval[1] = -zero(eltype(output))
+
+    @uniform N = size(output, 1)
+    # number of tiles depends on inner dimension
+    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+
+    # loop over all tiles needed for this calculation
+    for t in 0:(NUM_TILES - 1)
+        # Can't use @index(Global), because we use a smaller ndrange
+        I = (gi - 1) * TILE_DIM + i
+        J = (gj - 1) * TILE_DIM + j
+
+        # load inputs into tiles, with bounds checking for non-square matrices
+        if I <= N && t * TILE_DIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        else
+            @inbounds tile1[i, j] = 0.0
+        end
+        if t * TILE_DIM + i <= R && J <= M
+            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+        else
+            @inbounds tile2[i, j] = 0.0
+        end
+
+        # wait for all tiles to be loaded
+        @synchronize
+
+        # get global values again
+        I = (gi - 1) * TILE_DIM + i
+        J = (gj - 1) * TILE_DIM + j
+
+        # calculate value of spot in output, use temporary value to allow for vectorization
+        out = zero(eltype(output))
+        @simd for k in 1:TILE_DIM
+            @inbounds out += tile1[i, k] * tile2[k, j]
+        end
+        outval[1] += out
+
+        @synchronize
+    end
+
+    # get global indices again
+    I = (gi - 1) * TILE_DIM + i
+    J = (gj - 1) * TILE_DIM + j
+
+    # save if inbounds
+    if I <= N && J <= M
+        @inbounds output[I, J] = outval[1]
+    end
 end
 
 N = 1024
@@ -79,7 +81,7 @@ C = KernelAbstractions.zeros(backend, Float32, N, M)
 
 kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange=size(C))
+kern(C, A, B, N, R, M, ndrange = size(C))
 KernelAbstractions.synchronize(backend)
 
-@test isapprox(A*B, C)
+@test isapprox(A * B, C)
diff --git a/examples/utils.jl b/examples/utils.jl
index ea3d3db3a..5e93299b1 100644
--- a/examples/utils.jl
+++ b/examples/utils.jl
@@ -7,4 +7,3 @@ if Base.find_package("CUDA") !== nothing
 else
     const backend = CPU()
 end
-
diff --git a/ext/EnzymeExt.jl b/ext/EnzymeExt.jl
index 04eecd872..aee0d2967 100644
--- a/ext/EnzymeExt.jl
+++ b/ext/EnzymeExt.jl
@@ -29,10 +29,10 @@ import KernelAbstractions:
     synchronize
 
 function EnzymeCore.compiler_job_from_backend(
-    b::Backend,
-    @nospecialize(F::Type),
-    @nospecialize(TT::Type)
-)
+        b::Backend,
+        @nospecialize(F::Type),
+        @nospecialize(TT::Type),
+    )
     error(
         "EnzymeCore.compiler_job_from_backend is not yet implemented for $(typeof(b)), please file an issue.",
     )
@@ -54,12 +54,12 @@ function gpu_fwd(ctx, f, args...)
 end
 
 function EnzymeRules.forward(
-    func::Const{<:Kernel{CPU}},
-    ::Type{Const{Nothing}},
-    args...;
-    ndrange = nothing,
-    workgroupsize = nothing,
-)
+        func::Const{<:Kernel{CPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
     kernel = func.val
     f = kernel.f
     fwd_kernel = similar(kernel, cpu_fwd)
@@ -68,12 +68,12 @@ function EnzymeRules.forward(
 end
 
 function EnzymeRules.forward(
-    func::Const{<:Kernel{<:GPU}},
-    ::Type{Const{Nothing}},
-    args...;
-    ndrange = nothing,
-    workgroupsize = nothing,
-)
+        func::Const{<:Kernel{<:GPU}},
+        ::Type{Const{Nothing}},
+        args...;
+        ndrange = nothing,
+        workgroupsize = nothing,
+    )
     kernel = func.val
     f = kernel.f
     fwd_kernel = similar(kernel, gpu_fwd)
@@ -87,23 +87,23 @@ _enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) =
     mkcontext(kernel, ndrange, iterspace)
 
 _augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) =
-    AugmentedReturn{Nothing,Nothing,Tuple{Array,typeof(arg_refs),typeof(tape_type)}}(
-        nothing,
-        nothing,
-        (subtape, arg_refs, tape_type),
-    )
+    AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}(
+    nothing,
+    nothing,
+    (subtape, arg_refs, tape_type),
+)
 _augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) =
-    AugmentedReturn{Nothing,Nothing,Any}(nothing, nothing, (subtape, arg_refs, tape_type))
+    AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type))
 
 function _create_tape_kernel(
-    kernel::Kernel{CPU},
-    ModifiedBetween,
-    FT,
-    ctxTy,
-    ndrange,
-    iterspace,
-    args2...,
-)
+        kernel::Kernel{CPU},
+        ModifiedBetween,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
     TapeType = EnzymeCore.tape_type(
         ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween),
         FT,
@@ -117,14 +117,14 @@ function _create_tape_kernel(
 end
 
 function _create_tape_kernel(
-    kernel::Kernel{<:GPU},
-    ModifiedBetween,
-    FT,
-    ctxTy,
-    ndrange,
-    iterspace,
-    args2...,
-)
+        kernel::Kernel{<:GPU},
+        ModifiedBetween,
+        FT,
+        ctxTy,
+        ndrange,
+        iterspace,
+        args2...,
+    )
     # For peeking at the TapeType we need to first construct a correct compilation job
     # this requires the use of the device side representation of arguments.
     # So we convert the arguments here, this is a bit wasteful since the `aug_kernel` call
@@ -154,13 +154,13 @@ _create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev)
 _create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev)
 
 function cpu_aug_fwd(
-    ctx,
-    f::FT,
-    ::Val{ModifiedBetween},
-    subtape,
-    ::Val{TapeType},
-    args...,
-) where {ModifiedBetween,FT,TapeType}
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
     # A2 = Const{Nothing} -- since f->Nothing
     forward, _ = EnzymeCore.autodiff_thunk(
         ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
@@ -178,13 +178,13 @@ function cpu_aug_fwd(
 end
 
 function cpu_rev(
-    ctx,
-    f::FT,
-    ::Val{ModifiedBetween},
-    subtape,
-    ::Val{TapeType},
-    args...,
-) where {ModifiedBetween,FT,TapeType}
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
     _, reverse = EnzymeCore.autodiff_thunk(
         ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
         Const{Core.Typeof(f)},
@@ -200,13 +200,13 @@ end
 
 # GPU support
 function gpu_aug_fwd(
-    ctx,
-    f::FT,
-    ::Val{ModifiedBetween},
-    subtape,
-    ::Val{TapeType},
-    args...,
-) where {ModifiedBetween,FT,TapeType}
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
     # A2 = Const{Nothing} -- since f->Nothing
     forward, _ = EnzymeCore.autodiff_deferred_thunk(
         ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
@@ -225,13 +225,13 @@ function gpu_aug_fwd(
 end
 
 function gpu_rev(
-    ctx,
-    f::FT,
-    ::Val{ModifiedBetween},
-    subtape,
-    ::Val{TapeType},
-    args...,
-) where {ModifiedBetween,FT,TapeType}
+        ctx,
+        f::FT,
+        ::Val{ModifiedBetween},
+        subtape,
+        ::Val{TapeType},
+        args...,
+    ) where {ModifiedBetween, FT, TapeType}
     # XXX: TapeType and A2 as args to autodiff_deferred_thunk
     _, reverse = EnzymeCore.autodiff_deferred_thunk(
         ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
@@ -248,13 +248,13 @@ function gpu_rev(
 end
 
 function EnzymeRules.augmented_primal(
-    config::Config,
-    func::Const{<:Kernel},
-    ::Type{Const{Nothing}},
-    args::Vararg{Any,N};
-    ndrange = nothing,
-    workgroupsize = nothing,
-) where {N}
+        config::Config,
+        func::Const{<:Kernel},
+        ::Type{Const{Nothing}},
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
     kernel = func.val
     f = kernel.f
 
@@ -306,14 +306,14 @@ function EnzymeRules.augmented_primal(
 end
 
 function EnzymeRules.reverse(
-    config::Config,
-    func::Const{<:Kernel},
-    ::Type{<:EnzymeCore.Annotation},
-    tape,
-    args::Vararg{Any,N};
-    ndrange = nothing,
-    workgroupsize = nothing,
-) where {N}
+        config::Config,
+        func::Const{<:Kernel},
+        ::Type{<:EnzymeCore.Annotation},
+        tape,
+        args::Vararg{Any, N};
+        ndrange = nothing,
+        workgroupsize = nothing,
+    ) where {N}
     subtape, arg_refs, tape_type = tape
 
     args2 = ntuple(Val(N)) do i
@@ -359,22 +359,22 @@ end
 #       synchronize rule and then synchronize where the launch was. However, with the current
 #       kernel semantics this ensures correctness for now.
 function EnzymeRules.augmented_primal(
-    config::Config,
-    func::Const{typeof(synchronize)},
-    ::Type{Const{Nothing}},
-    backend::T,
-) where {T<:EnzymeCore.Annotation}
+        config::Config,
+        func::Const{typeof(synchronize)},
+        ::Type{Const{Nothing}},
+        backend::T,
+    ) where {T <: EnzymeCore.Annotation}
     synchronize(backend.val)
     return AugmentedReturn(nothing, nothing, nothing)
 end
 
 function EnzymeRules.reverse(
-    config::Config,
-    func::Const{typeof(synchronize)},
-    ::Type{Const{Nothing}},
-    tape,
-    backend,
-)
+        config::Config,
+        func::Const{typeof(synchronize)},
+        ::Type{Const{Nothing}},
+        tape,
+        backend,
+    )
     # noop for now
     return (nothing,)
 end
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index d1f21b32f..a2cea14fd 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -53,7 +53,7 @@ synchronize(backend)
 ```
 """
 macro kernel(expr)
-    __kernel(expr, #=generate_cpu=#true, #=force_inbounds=#false)
+    __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
 end
 
 """
@@ -75,18 +75,20 @@ macro kernel(ex...)
     else
         generate_cpu = true
         force_inbounds = false
-        for i = 1:length(ex)-1
+        for i in 1:(length(ex) - 1)
             if ex[i] isa Expr && ex[i].head == :(=) &&
-                ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
+                    ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
                 generate_cpu = ex[i].args[2]
             elseif ex[i] isa Expr && ex[i].head == :(=) &&
-                ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
+                    ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
                 force_inbounds = ex[i].args[2]
             else
-                error("Configuration should be of form:\n"*
-                      "* `cpu=true`\n"*
-                      "* `inbounds=false`\n"*
-                      "got `", ex[i], "`")
+                error(
+                    "Configuration should be of form:\n" *
+                        "* `cpu=true`\n" *
+                        "* `inbounds=false`\n" *
+                        "got `", ex[i], "`",
+                )
             end
         end
         __kernel(ex[end], generate_cpu, force_inbounds)
@@ -303,7 +305,7 @@ This is a unified print statement.
 """
 macro print(items...)
 
-    args = Union{Val,Expr,Symbol}[]
+    args = Union{Val, Expr, Symbol}[]
 
     items = [items...]
     while true
@@ -330,7 +332,7 @@ macro print(items...)
     end
 
     quote
-        $__print($(map(esc,args)...))
+        $__print($(map(esc, args)...))
     end
 end
 
@@ -373,8 +375,8 @@ macro index(locale, args...)
 
     if length(args) >= 1
         if args[1] === :Cartesian ||
-           args[1] === :Linear ||
-           args[1] === :NTuple
+                args[1] === :Linear ||
+                args[1] === :NTuple
             indexkind = args[1]
             args = args[2:end]
         else
@@ -439,7 +441,7 @@ Instantiate a CPU (multi-threaded) backend.
 """
 struct CPU <: Backend
     static::Bool
-    CPU(;static::Bool=false) = new(static)
+    CPU(; static::Bool = false) = new(static)
 end
 
 """
@@ -493,7 +495,7 @@ allocate(backend::Backend, T, dims::Tuple) = throw(MethodError(allocate, (backen
 Allocate a storage array appropriate for the computational backend filled with zeros.
 """
 zeros(backend::Backend, T, dims...) = zeros(backend, T, dims)
-function zeros(backend::Backend, ::Type{T}, dims::Tuple) where T
+function zeros(backend::Backend, ::Type{T}, dims::Tuple) where {T}
     data = allocate(backend, T, dims...)
     fill!(data, zero(T))
     return data
@@ -505,7 +507,7 @@ end
 Allocate a storage array appropriate for the computational backend filled with ones.
 """
 ones(backend::Backend, T, dims...) = ones(backend, T, dims)
-function ones(backend::Backend, ::Type{T}, dims::Tuple) where T
+function ones(backend::Backend, ::Type{T}, dims::Tuple) where {T}
     data = allocate(backend, T, dims)
     fill!(data, one(T))
     return data
@@ -589,7 +591,7 @@ in a workgroup.
     ```
     As well as the on-device functionality.
 """
-struct Kernel{Backend, WorkgroupSize<:_Size, NDRange<:_Size, Fun}
+struct Kernel{Backend, WorkgroupSize <: _Size, NDRange <: _Size, Fun}
     backend::Backend
     f::Fun
 end
@@ -599,7 +601,7 @@ function Base.similar(kernel::Kernel{D, WS, ND}, f::F) where {D, WS, ND, F}
 end
 
 workgroupsize(::Kernel{D, WorkgroupSize}) where {D, WorkgroupSize} = WorkgroupSize
-ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize,NDRange} = NDRange
+ndrange(::Kernel{D, WorkgroupSize, NDRange}) where {D, WorkgroupSize, NDRange} = NDRange
 backend(kernel::Kernel) = kernel.backend
 
 """
@@ -610,7 +612,7 @@ Partition a kernel for the given ndrange and workgroupsize.
     static_workgroupsize = KernelAbstractions.workgroupsize(kernel)
 
     if ndrange === nothing && static_ndrange <: DynamicSize ||
-       workgroupsize === nothing && static_workgroupsize <: DynamicSize
+            workgroupsize === nothing && static_workgroupsize <: DynamicSize
         errmsg = """
             Can not partition kernel!
 
@@ -662,7 +664,7 @@ Partition a kernel for the given ndrange and workgroupsize.
     return iterspace, dynamic
 end
 
-function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend<:Union{CPU,GPU}, S<:_Size, NDRange<:_Size, XPUName}
+function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName}
     return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name)
 end
 
@@ -719,7 +721,7 @@ __size(i::Int) = Tuple{i}
 
 Convert arguments to the device side representation.
 """
-argconvert(k::Kernel{T}, arg) where T =
+argconvert(k::Kernel{T}, arg) where {T} =
     error("Don't know how to convert arguments for Kernel{$T}")
 
 # Enzyme support
@@ -764,7 +766,7 @@ PrecompileTools.@compile_workload begin
 end
 
 if !isdefined(Base, :get_extension)
-using Requires
+    using Requires
 end
 
 @static if !isdefined(Base, :get_extension)
diff --git a/src/compiler.jl b/src/compiler.jl
index 0f7b7a713..c60bc83be 100644
--- a/src/compiler.jl
+++ b/src/compiler.jl
@@ -20,12 +20,12 @@ struct CompilerMetadata{StaticNDRange, CheckBounds, I, NDRange, Iterspace}
     end
 end
 
-@inline __iterspace(cm::CompilerMetadata)  = cm.iterspace
+@inline __iterspace(cm::CompilerMetadata) = cm.iterspace
 @inline __groupindex(cm::CompilerMetadata) = cm.groupindex
 @inline __groupsize(cm::CompilerMetadata) = size(workitems(__iterspace(cm)))
 @inline __dynamic_checkbounds(::CompilerMetadata{NDRange, CB}) where {NDRange, CB} = CB <: DynamicCheck
-@inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:StaticSize}  = CartesianIndices(get(NDRange))
-@inline __ndrange(cm::CompilerMetadata{NDRange}) where {NDRange<:DynamicSize} = cm.ndrange
+@inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange <: StaticSize} = CartesianIndices(get(NDRange))
+@inline __ndrange(cm::CompilerMetadata{NDRange}) where {NDRange <: DynamicSize} = cm.ndrange
 @inline __workitems_iterspace(ctx) = workitems(__iterspace(ctx))
 
 @inline groupsize(ctx) = __groupsize(ctx)
diff --git a/src/cpu.jl b/src/cpu.jl
index 539a1e33f..c4afec5cd 100644
--- a/src/cpu.jl
+++ b/src/cpu.jl
@@ -3,15 +3,15 @@ import UnsafeAtomicsLLVM
 unsafe_free!(::AbstractArray) = return
 synchronize(::CPU) = nothing
 
-allocate(::CPU, ::Type{T}, dims::Tuple) where T = Array{T}(undef, dims)
+allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
 
-function zeros(backend::CPU, ::Type{T}, dims::Tuple) where T
+function zeros(backend::CPU, ::Type{T}, dims::Tuple) where {T}
     arr = allocate(backend, T, dims)
     kernel = init_kernel(backend)
-    kernel(arr, zero, T,ndrange = length(arr))
+    kernel(arr, zero, T, ndrange = length(arr))
     return arr
 end
-function ones(backend::CPU, ::Type{T}, dims::Tuple) where T
+function ones(backend::CPU, ::Type{T}, dims::Tuple) where {T}
     arr = allocate(backend, T, dims)
     kernel = init_kernel(backend)
     kernel(arr, one, T; ndrange = length(arr))
@@ -36,7 +36,7 @@ end
 
 functional(::CPU) = true
 
-function (obj::Kernel{CPU})(args...; ndrange=nothing, workgroupsize=nothing, )
+function (obj::Kernel{CPU})(args...; ndrange = nothing, workgroupsize = nothing)
     ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
 
     if length(blocks(iterspace)) == 0
@@ -74,7 +74,7 @@ end
         ndrange = (ndrange,)
     end
     if workgroupsize isa Integer
-        workgroupsize = (workgroupsize, )
+        workgroupsize = (workgroupsize,)
     end
 
     if KernelAbstractions.workgroupsize(kernel) <: DynamicSize && workgroupsize === nothing
@@ -121,12 +121,12 @@ end
 
 function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
     # compute this thread's iterations
-    f = 1 + ((tid-1) * len)
+    f = 1 + ((tid - 1) * len)
     l = f + len - 1
     # distribute remaining iterations evenly
     if rem > 0
         if tid <= rem
-            f = f + (tid-1)
+            f = f + (tid - 1)
             l = l + tid
         else
             f = f + rem
@@ -134,7 +134,7 @@ function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
         end
     end
     # run this thread's iterations
-    for i = f:l
+    for i in f:l
         block = @inbounds blocks(iterspace)[i]
         ctx = mkcontext(obj, block, ndrange, iterspace, dynamic)
         obj.f(ctx, args...)
@@ -142,7 +142,7 @@ function __thread_run(tid, len, rem, obj, ndrange, iterspace, args, dynamic)
     return nothing
 end
 
-function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
+function mkcontext(kernel::Kernel{CPU}, I, _ndrange, iterspace, ::Dynamic) where {Dynamic}
     return CompilerMetadata{ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
 end
 
@@ -206,13 +206,13 @@ end
 
 # Base.view creates a boundscheck which captures A
 # https://github.com/JuliaLang/julia/issues/39308
-@inline function aview(A, I::Vararg{Any, N}) where N
-     J = Base.to_indices(A, I)
-     Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
+@inline function aview(A, I::Vararg{Any, N}) where {N}
+    J = Base.to_indices(A, I)
+    Base.unsafe_view(Base._maybe_reshape_parent(A, Base.index_ndims(J...)), J...)
 end
 
-@inline function Base.getindex(A::ScratchArray{N}, idx) where N
-    return @inbounds aview(A.data, ntuple(_->:, Val(N))..., idx)
+@inline function Base.getindex(A::ScratchArray{N}, idx) where {N}
+    return @inbounds aview(A.data, ntuple(_ -> :, Val(N))..., idx)
 end
 
 # Argument conversion
diff --git a/src/macros.jl b/src/macros.jl
index a659551d6..55b4bab40 100644
--- a/src/macros.jl
+++ b/src/macros.jl
@@ -10,7 +10,7 @@ function find_return(stmt)
 end
 
 # XXX: Proper errors
-function __kernel(expr, generate_cpu=true, force_inbounds=false)
+function __kernel(expr, generate_cpu = true, force_inbounds = false)
     def = splitdef(expr)
     name = def[:name]
     args = def[:args]
@@ -53,7 +53,7 @@ function __kernel(expr, generate_cpu=true, force_inbounds=false)
             Core.@__doc__ $name(dev) = $name(dev, $DynamicSize(), $DynamicSize())
             $name(dev, size) = $name(dev, $StaticSize(size), $DynamicSize())
             $name(dev, size, range) = $name(dev, $StaticSize(size), $StaticSize(range))
-            function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S<:$_Size, NDRange<:$_Size}
+            function $name(dev::Dev, sz::S, range::NDRange) where {Dev, S <: $_Size, NDRange <: $_Size}
                 if $isgpu(dev)
                     return $construct(dev, sz, range, $gpu_name)
                 else
@@ -87,7 +87,7 @@ function transform_gpu!(def, constargs, force_inbounds)
     body = def[:body]
     if force_inbounds
         body = quote
-           @inbounds $(body)
+            @inbounds $(body)
         end
     end
     body = quote
@@ -96,7 +96,8 @@ function transform_gpu!(def, constargs, force_inbounds)
         end
         return nothing
     end
-    def[:body] = Expr(:let,
+    def[:body] = Expr(
+        :let,
         Expr(:block, let_constargs...),
         body,
     )
@@ -129,18 +130,19 @@ function transform_cpu!(def, constargs, force_inbounds)
     end
     push!(new_stmts, Expr(:popaliasscope))
     push!(new_stmts, :(return nothing))
-    def[:body] = Expr(:let,
+    def[:body] = Expr(
+        :let,
         Expr(:block, let_constargs...),
-        Expr(:block, new_stmts...)
+        Expr(:block, new_stmts...),
     )
 end
 
 struct WorkgroupLoop
-    indicies :: Vector{Any}
-    stmts :: Vector{Any}
-    allocations :: Vector{Any}
-    private_allocations :: Vector{Any}
-    private :: Set{Symbol}
+    indicies::Vector{Any}
+    stmts::Vector{Any}
+    allocations::Vector{Any}
+    private_allocations::Vector{Any}
+    private::Set{Symbol}
 end
 
 is_sync(expr) = @capture(expr, @synchronize() | @synchronize(a_))
@@ -160,17 +162,19 @@ function find_sync(stmt)
 end
 
 # TODO proper handling of LineInfo
-function split(stmts,
-               indicies = Any[], private = Set{Symbol}())
+function split(
+        stmts,
+        indicies = Any[], private = Set{Symbol}(),
+    )
     # 1. Split the code into blocks separated by `@synchronize`
     # 2. Aggregate `@index` expressions
     # 3. Hoist allocations
     # 4. Hoist uniforms
 
-    current     = Any[]
+    current = Any[]
     allocations = Any[]
     private_allocations = Any[]
-    new_stmts   = Any[]
+    new_stmts = Any[]
     for stmt in stmts
         has_sync = find_sync(stmt)
         if has_sync
@@ -178,7 +182,7 @@ function split(stmts,
             push!(new_stmts, emit(loop))
             allocations = Any[]
             private_allocations = Any[]
-            current     = Any[]
+            current = Any[]
             is_sync(stmt) && continue
 
             # Recurse into scope constructs
@@ -210,7 +214,7 @@ function split(stmts,
             if @capture(rhs, @index(args__))
                 push!(indicies, stmt)
                 continue
-            elseif @capture(rhs, @localmem(args__) | @uniform(args__) )
+            elseif @capture(rhs, @localmem(args__) | @uniform(args__))
                 push!(allocations, stmt)
                 continue
             elseif @capture(rhs, @private(T_, dims_))
@@ -255,14 +259,14 @@ function emit(loop)
 
     for stmt in loop.private_allocations
         if @capture(stmt, lhs_ = rhs_)
-            push!(stmts, :($lhs = ntuple(_->$rhs, $N)))
+            push!(stmts, :($lhs = ntuple(_ -> $rhs, $N)))
         else
             error("@private $stmt not an assignment")
         end
     end
 
     # don't emit empty loops
-    if !(isempty(loop.stmts) || all(s->s isa LineNumberNode, loop.stmts))
+    if !(isempty(loop.stmts) || all(s -> s isa LineNumberNode, loop.stmts))
         body = Expr(:block, loop.stmts...)
         body = postwalk(body) do expr
             if @capture(expr, lhs_ = rhs_)
diff --git a/src/nditeration.jl b/src/nditeration.jl
index d7598ae20..ab3fd4ec3 100644
--- a/src/nditeration.jl
+++ b/src/nditeration.jl
@@ -12,13 +12,13 @@ struct NoDynamicCheck end
 abstract type _Size end
 struct DynamicSize <: _Size end
 struct StaticSize{S} <: _Size
-    function StaticSize{S}() where S
+    function StaticSize{S}() where {S}
         new{S::Tuple{Vararg{Int}}}()
     end
 end
 
-@pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}() 
-@pure StaticSize(s::Int...) = StaticSize{s}() 
+@pure StaticSize(s::Tuple{Vararg{Int}}) = StaticSize{s}()
+@pure StaticSize(s::Int...) = StaticSize{s}()
 @pure StaticSize(s::Type{<:Tuple}) = StaticSize{tuple(s.parameters...)}()
 
 # Some @pure convenience functions for `StaticSize`
@@ -59,10 +59,10 @@ struct NDRange{N, StaticBlocks, StaticWorkitems, DynamicBlock, DynamicWorkitems}
     end
 end
 
-@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:DynamicSize} = range.workitems::CartesianIndices{N}
-@inline workitems(range::NDRange{N, B, W}) where {N,B,W<:StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
-@inline blocks(range::NDRange{N, B}) where {N,B<:DynamicSize} = range.blocks::CartesianIndices{N}
-@inline blocks(range::NDRange{N, B}) where {N,B<:StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}
+@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: DynamicSize} = range.workitems::CartesianIndices{N}
+@inline workitems(range::NDRange{N, B, W}) where {N, B, W <: StaticSize} = CartesianIndices(get(W))::CartesianIndices{N}
+@inline blocks(range::NDRange{N, B}) where {N, B <: DynamicSize} = range.blocks::CartesianIndices{N}
+@inline blocks(range::NDRange{N, B}) where {N, B <: StaticSize} = CartesianIndices(get(B))::CartesianIndices{N}
 
 import Base.iterate
 @inline iterate(range::NDRange) = iterate(blocks(range))
@@ -70,12 +70,12 @@ import Base.iterate
 
 Base.length(range::NDRange) = length(blocks(range))
 
-@inline function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where N
+@inline function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
     nI = ntuple(Val(N)) do I
         Base.@_inline_meta
         stride = size(workitems(ndrange), I)
         gidx = groupidx.I[I]
-        (gidx-1)*stride + idx.I[I]
+        (gidx - 1) * stride + idx.I[I]
     end
     CartesianIndex(nI)
 end
@@ -84,11 +84,11 @@ Base.@propagate_inbounds function expand(ndrange::NDRange, groupidx::Integer, id
     expand(ndrange, blocks(ndrange)[groupidx], workitems(ndrange)[idx])
 end
 
-Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where N
+Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::CartesianIndex{N}, idx::Integer) where {N}
     expand(ndrange, groupidx, workitems(ndrange)[idx])
 end
 
-Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::Integer, idx::CartesianIndex{N}) where N
+Base.@propagate_inbounds function expand(ndrange::NDRange{N}, groupidx::Integer, idx::CartesianIndex{N}) where {N}
     expand(ndrange, blocks(ndrange)[groupidx], idx)
 end
 
diff --git a/src/reflection.jl b/src/reflection.jl
index e0be71c61..da3ba1fbf 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -4,7 +4,7 @@ export @ka_code_typed, @ka_code_llvm
 using UUIDs
 const Cthulhu = Base.PkgId(UUID("f68482b8-f384-11e8-15f7-abe071a5a75f"), "Cthulhu")
 
-function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, interactive=false, kwargs...)
+function ka_code_typed(kernel, argtypes; ndrange = nothing, workgroupsize = nothing, interactive = false, kwargs...)
     # get the iterspace and dynamic of a kernel
     ndrange, workgroupsize, iterspace, dynamic = KernelAbstractions.launch_config(kernel, ndrange, workgroupsize)
 
@@ -24,7 +24,7 @@ function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing,
     if interactive
         # call Cthulhu without introducing a dependency on Cthulhu
         mod = Base.get(Base.loaded_modules, Cthulhu, nothing)
-        mod===nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
+        mod === nothing && error("Interactive code reflection requires Cthulhu; please install and load this package first.")
         descend_code_typed = getfield(mod, :descend_code_typed)
         return descend_code_typed(typeof(kernel.f), (typeof(ctx), argtypes...); kwargs...)
     else
@@ -33,11 +33,11 @@ function ka_code_typed(kernel, argtypes; ndrange=nothing, workgroupsize=nothing,
 end
 
 
-function ka_code_llvm(kernel, argtypes; ndrange=nothing, workgroupsize=nothing, kwargs...)
-    ka_code_llvm(stdout, kernel, argtypes;  ndrange=ndrange, workgroupsize=nothing, kwargs...)
+function ka_code_llvm(kernel, argtypes; ndrange = nothing, workgroupsize = nothing, kwargs...)
+    ka_code_llvm(stdout, kernel, argtypes; ndrange = ndrange, workgroupsize = nothing, kwargs...)
 end
 
-function ka_code_llvm(io::IO, kernel, argtypes; ndrange=nothing, workgroupsize=nothing, kwargs...)
+function ka_code_llvm(io::IO, kernel, argtypes; ndrange = nothing, workgroupsize = nothing, kwargs...)
     # get the iterspace and dynamic of a kernel
     ndrange, workgroupsize, iterspace, dynamic = KernelAbstractions.launch_config(kernel, ndrange, workgroupsize)
 
@@ -60,7 +60,7 @@ function format_ex(ex0)
     args = gensym(:args)
     old_args = nothing
     kern = nothing
-    for i = 1:length(ex0)
+    for i in 1:length(ex0)
         if ex0[i].head == :call
             # inside kernel() expr
             while length(ex0[i].args) > 2
@@ -122,7 +122,7 @@ macro ka_code_typed(ex0...)
     quote
         local $(esc(args)) = $(old_args)
         # e.g. translate CuArray to CuBackendArray
-        $(esc(args)) = map(x->argconvert($kern, x), $(esc(args)))
+        $(esc(args)) = map(x -> argconvert($kern, x), $(esc(args)))
 
         local results = $thecall
         if results !== nothing
@@ -155,7 +155,7 @@ macro ka_code_llvm(ex0...)
     quote
         local $(esc(args)) = $(old_args)
 
-        if isa($kern, Kernel{G} where {G<:GPU})
+        if isa($kern, Kernel{G} where {G <: GPU})
             # does not support GPU kernels
             error("@ka_code_llvm does not support GPU kernels")
         end
diff --git a/test/compiler.jl b/test/compiler.jl
index 510506503..cf86386e6 100644
--- a/test/compiler.jl
+++ b/test/compiler.jl
@@ -44,33 +44,33 @@ function compiler_testsuite(backend, ArrayT)
     @test KernelAbstractions.__index_Global_NTuple(ctx, CartesianIndex(1)) == (1,)
 
     A = ArrayT{Int}(undef, 1)
-    let (CI, rt) = @ka_code_typed literal_pow(backend())(A, ndrange=1)
+    let (CI, rt) = @ka_code_typed literal_pow(backend())(A, ndrange = 1)
         # test that there is no invoke of overdub
         @test !any(check_for_overdub, CI.code)
     end
 
     A = ArrayT{Float32}(undef, 1)
-    let (CI, rt) = @ka_code_typed square(backend())(A, A, ndrange=1)
+    let (CI, rt) = @ka_code_typed square(backend())(A, A, ndrange = 1)
         # test that there is no invoke of overdub
         @test !any(check_for_overdub, CI.code)
     end
 
     A = ArrayT{Float32}(undef, 1)
     B = ArrayT{Float32}(undef, 1)
-    let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange=1)
+    let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1)
         # test that there is no invoke of overdub
         @test !any(check_for_overdub, CI.code)
     end
 
     A = ArrayT{Float32}(undef, 1)
     B = ArrayT{Int32}(undef, 1)
-    let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange=1)
+    let (CI, rt) = @ka_code_typed pow(backend())(A, B, ndrange = 1)
         # test that there is no invoke of overdub
         @test !any(check_for_overdub, CI.code)
     end
 
     A = ArrayT{Int}(undef, 1)
-    let (CI, rt) = @ka_code_typed checked(backend())(A, 1, 2, ndrange=1)
+    let (CI, rt) = @ka_code_typed checked(backend())(A, 1, 2, ndrange = 1)
         # test that there is no invoke of overdub
         @test !any(check_for_overdub, CI.code)
     end
diff --git a/test/convert.jl b/test/convert.jl
index 95a9a822f..a87f48016 100644
--- a/test/convert.jl
+++ b/test/convert.jl
@@ -47,20 +47,20 @@ function convert_testsuite(backend, ArrayT)
     ET = KernelAbstractions.supports_float64(backend()) ? Float64 : Float32
 
     N = 32
-    d_A = ArrayT([rand(ET)*3 for i = 1:N])
+    d_A = ArrayT([rand(ET) * 3 for i in 1:N])
 
     # 30 because we have 10 integer types and we have 3 operations
     d_B = ArrayT(zeros(ET, N, 30))
 
     @testset "convert test" begin
         kernel = convert_kernel!(backend(), 4)
-        kernel(d_A, d_B, ndrange=(N),)
+        kernel(d_A, d_B, ndrange = (N))
         synchronize(backend())
 
-        for i = 1:10
-            @test d_B[:,i] == ceil.(d_A)
-            @test d_B[:,i+10] == floor.(d_A)
-            @test d_B[:,i+20] == round.(d_A)
+        for i in 1:10
+            @test d_B[:, i] == ceil.(d_A)
+            @test d_B[:, i + 10] == floor.(d_A)
+            @test d_B[:, i + 20] == round.(d_A)
         end
     end
 end
diff --git a/test/examples.jl b/test/examples.jl
index 9b1d48f7d..a3cb04d12 100644
--- a/test/examples.jl
+++ b/test/examples.jl
@@ -1,4 +1,4 @@
-function find_sources(path::String, sources=String[])
+function find_sources(path::String, sources = String[])
     if isdir(path)
         for entry in readdir(path)
             find_sources(joinpath(path, entry), sources)
@@ -10,21 +10,21 @@ function find_sources(path::String, sources=String[])
 end
 
 function examples_testsuite(backend_str)
-@testset "examples" begin
-    examples_dir = joinpath(@__DIR__, "..", "examples")
-    examples = find_sources(examples_dir)
-    filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples)
-    if backend_str == "ROCM"
-        filter!(file -> occursin("# INCLUDE ROCM", String(read(file))), examples)
-    end
+    @testset "examples" begin
+        examples_dir = joinpath(@__DIR__, "..", "examples")
+        examples = find_sources(examples_dir)
+        filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples)
+        if backend_str == "ROCM"
+            filter!(file -> occursin("# INCLUDE ROCM", String(read(file))), examples)
+        end
 
-    @testset "$(basename(example))" for example in examples
-        @eval module $(gensym())
+        @testset "$(basename(example))" for example in examples
+            @eval module $(gensym())
             backend_str = $backend_str
             include($example)
+            end
+            @test true
         end
-        @test true
-    end
 
-end
+    end
 end
diff --git a/test/extensions/enzyme.jl b/test/extensions/enzyme.jl
index 3d8a5082d..014037762 100644
--- a/test/extensions/enzyme.jl
+++ b/test/extensions/enzyme.jl
@@ -9,7 +9,7 @@ end
 
 function square_caller(A, backend)
     kernel = square!(backend)
-    kernel(A, ndrange=size(A))
+    kernel(A, ndrange = size(A))
     KernelAbstractions.synchronize(backend)
 end
 
@@ -21,11 +21,11 @@ end
 
 function mul_caller(A, B, backend)
     kernel = mul!(backend)
-    kernel(A, B, ndrange=size(A))
+    kernel(A, B, ndrange = size(A))
     KernelAbstractions.synchronize(backend)
 end
 
-function enzyme_testsuite(backend, ArrayT, supports_reverse=true)
+function enzyme_testsuite(backend, ArrayT, supports_reverse = true)
     @testset "kernels" begin
         A = ArrayT{Float64}(undef, 64)
         dA = ArrayT{Float64}(undef, 64)
diff --git a/test/localmem.jl b/test/localmem.jl
index 65efea32f..2aa872393 100644
--- a/test/localmem.jl
+++ b/test/localmem.jl
@@ -38,7 +38,7 @@ function localmem_testsuite(backend, ArrayT)
     @testset "kernels" begin
         @testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16))
             A = ArrayT{Int}(undef, 64)
-            kernel!(A, ndrange=size(A))
+            kernel!(A, ndrange = size(A))
             synchronize(backend())
             @test all(A[1:16] .== 16:-1:1)
             @test all(A[17:32] .== 16:-1:1)
diff --git a/test/nditeration.jl b/test/nditeration.jl
index 1647c5a3f..1310945f0 100644
--- a/test/nditeration.jl
+++ b/test/nditeration.jl
@@ -3,86 +3,86 @@ using KernelAbstractions.NDIteration
 using Test
 
 function nditeration_testsuite()
-@testset "iteration" begin
-    let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32)));
-        @test length(ndrange) == 256*256
-        @test all(p->p[1]==p[2], zip(ndrange, CartesianIndices((256, 256))))
-    end
-    let ndrange = NDRange{2, StaticSize{(256,256)}, DynamicSize}(nothing, CartesianIndices((32, 32)));
-        @test length(ndrange) == 256*256
-        @test all(p->p[1]==p[2], zip(ndrange, CartesianIndices((256, 256))))
+    @testset "iteration" begin
+        let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((256, 256)), CartesianIndices((32, 32)));
+            @test length(ndrange) == 256 * 256
+            @test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256))))
+        end
+        let ndrange = NDRange{2, StaticSize{(256, 256)}, DynamicSize}(nothing, CartesianIndices((32, 32)));
+            @test length(ndrange) == 256 * 256
+            @test all(p -> p[1] == p[2], zip(ndrange, CartesianIndices((256, 256))))
+        end
     end
-end
 
-# GPU scenario where we get a linear index into workitems/blocks
-function linear_iteration(ndrange)
-    idx = Array{CartesianIndex{2}}(undef, length(blocks(ndrange)) * length(workitems(ndrange)))
-    for i in 1:length(blocks(ndrange))
-        for j in 1:length(workitems(ndrange))
-            I = j + (i-1) * length(workitems(ndrange))
-            idx[I] = expand(ndrange, i, j)
+    # GPU scenario where we get a linear index into workitems/blocks
+    function linear_iteration(ndrange)
+        idx = Array{CartesianIndex{2}}(undef, length(blocks(ndrange)) * length(workitems(ndrange)))
+        for i in 1:length(blocks(ndrange))
+            for j in 1:length(workitems(ndrange))
+                I = j + (i - 1) * length(workitems(ndrange))
+                idx[I] = expand(ndrange, i, j)
+            end
         end
+        return idx
     end
-    return idx
-end
 
-function check(idx, offset, offset_x, offset_y, Dim_x, Dim_y)
-    N = Dim_x * Dim_y
-    all(p->p[1]==p[2], zip(idx[offset*N  .+ 1:N], CartesianIndices((offset_x*Dim_x .+ 1:Dim_x, offset_y*Dim_y .+ 1:Dim_y))))
-end
+    function check(idx, offset, offset_x, offset_y, Dim_x, Dim_y)
+        N = Dim_x * Dim_y
+        all(p -> p[1] == p[2], zip(idx[(offset * N .+ 1):N], CartesianIndices(((offset_x * Dim_x .+ 1):Dim_x, (offset_y * Dim_y .+ 1):Dim_y))))
+    end
 
-@testset "linear_iteration" begin
-    Dim_x = 32
-    Dim_y = 32
-    let ndrange = NDRange{2, StaticSize{(4,4)}, StaticSize{(Dim_x,Dim_y)}}();
-        idx = linear_iteration(ndrange)
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+    @testset "linear_iteration" begin
+        Dim_x = 32
+        Dim_y = 32
+        let ndrange = NDRange{2, StaticSize{(4, 4)}, StaticSize{(Dim_x, Dim_y)}}();
+            idx = linear_iteration(ndrange)
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
-    end
-    let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4,4)), CartesianIndices((Dim_x, Dim_y)));
-        idx = linear_iteration(ndrange)
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+        let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4)), CartesianIndices((Dim_x, Dim_y)));
+            idx = linear_iteration(ndrange)
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
-    end
 
-    Dim_x = 32
-    Dim_y = 1
-    let ndrange = NDRange{2, StaticSize{(4,4*32)}, StaticSize{(Dim_x,Dim_y)}}();
-        idx = linear_iteration(ndrange)
-        N = length(workitems(ndrange))
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+        Dim_x = 32
+        Dim_y = 1
+        let ndrange = NDRange{2, StaticSize{(4, 4 * 32)}, StaticSize{(Dim_x, Dim_y)}}();
+            idx = linear_iteration(ndrange)
+            N = length(workitems(ndrange))
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
-    end
-    let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4,4*32)), CartesianIndices((Dim_x, Dim_y)));
-        idx = linear_iteration(ndrange)
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+        let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4, 4 * 32)), CartesianIndices((Dim_x, Dim_y)));
+            idx = linear_iteration(ndrange)
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
-    end
 
-    Dim_x = 1 
-    Dim_y = 32 
-    let ndrange = NDRange{2, StaticSize{(4*32,4)}, StaticSize{(Dim_x,Dim_y)}}();
-        idx = linear_iteration(ndrange)
-        N = length(workitems(ndrange))
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+        Dim_x = 1
+        Dim_y = 32
+        let ndrange = NDRange{2, StaticSize{(4 * 32, 4)}, StaticSize{(Dim_x, Dim_y)}}();
+            idx = linear_iteration(ndrange)
+            N = length(workitems(ndrange))
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
-    end
-    let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4*32,4)), CartesianIndices((Dim_x, Dim_y)));
-        idx = linear_iteration(ndrange)
-        for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
-            I = Tuple(I)
-            @test check(idx, i-1, ntuple(i->I[i]-1, length(I))..., Dim_x, Dim_y)
+        let ndrange = NDRange{2, DynamicSize, DynamicSize}(CartesianIndices((4 * 32, 4)), CartesianIndices((Dim_x, Dim_y)));
+            idx = linear_iteration(ndrange)
+            for (i, I) in zip(1:length(blocks(ndrange)), blocks(ndrange))
+                I = Tuple(I)
+                @test check(idx, i - 1, ntuple(i -> I[i] - 1, length(I))..., Dim_x, Dim_y)
+            end
         end
     end
 end
-end
diff --git a/test/print_test.jl b/test/print_test.jl
index 9d4b653bf..79b527300 100644
--- a/test/print_test.jl
+++ b/test/print_test.jl
@@ -1,8 +1,8 @@
 using KernelAbstractions, Test
 
 @kernel function kernel_print()
-  I = @index(Global)
-  @print("Hello from thread ", I, "!\n")
+    I = @index(Global)
+    @print("Hello from thread ", I, "!\n")
 end
 
 function printing_testsuite(backend)
@@ -12,14 +12,14 @@ function printing_testsuite(backend)
     @testset "print test" begin
         kernel = kernel_print(backend(), 4)
         redirect_stdout(devnull) do
-          kernel(ndrange=(4,))
-          synchronize(backend())
+            kernel(ndrange = (4,))
+            synchronize(backend())
         end
         @test true
 
         redirect_stdout(devnull) do
-          @print("Why this should work\n")
-          synchronize(backend())
+            @print("Why this should work\n")
+            synchronize(backend())
         end
         @test true
     end
diff --git a/test/private.jl b/test/private.jl
index 8406316fa..b81340428 100644
--- a/test/private.jl
+++ b/test/private.jl
@@ -11,7 +11,7 @@ end
     priv = @private eltype(A) (1,)
     I = @index(Global, Linear)
     @inbounds begin
-      B[I] = eltype(priv) === eltype(A)
+        B[I] = eltype(priv) === eltype(A)
     end
 end
 
@@ -28,7 +28,7 @@ end
 end
 
 # This is horrible don't write code like this
-@kernel function forloop(A, ::Val{N}) where N
+@kernel function forloop(A, ::Val{N}) where {N}
     I = @index(Global, Linear)
     i = @index(Local, Linear)
     priv = @private Int (N,)
@@ -50,20 +50,20 @@ end
 
     priv = @private eltype(A) (1,)
     @inbounds begin
-      priv[1] = zero(eltype(A))
-      for k in 1:size(A, ndims(A))
-        priv[1] += A[I..., k]
-      end
-      out[I...] = priv[1]
+        priv[1] = zero(eltype(A))
+        for k in 1:size(A, ndims(A))
+            priv[1] += A[I..., k]
+        end
+        out[I...] = priv[1]
     end
 end
 
 function private_testsuite(backend, ArrayT)
     @testset "kernels" begin
-        stmt_form(backend(), 16)(ndrange=16)
+        stmt_form(backend(), 16)(ndrange = 16)
         synchronize(backend())
         A = ArrayT{Int}(undef, 64)
-        private(backend(), 16)(A, ndrange=size(A))
+        private(backend(), 16)(A, ndrange = size(A))
         synchronize(backend())
         @test all(A[1:16] .== 16:-1:1)
         @test all(A[17:32] .== 16:-1:1)
@@ -72,28 +72,30 @@ function private_testsuite(backend, ArrayT)
 
         A = ArrayT{Int}(undef, 64, 64)
         A .= 1
-        forloop(backend())(A, Val(size(A, 2)), ndrange=size(A,1), workgroupsize=size(A,1))
+        forloop(backend())(A, Val(size(A, 2)), ndrange = size(A, 1), workgroupsize = size(A, 1))
         synchronize(backend())
         @test all(A[:, 1] .== 64)
         @test all(A[:, 2:end] .== 1)
 
         B = ArrayT{Bool}(undef, size(A)...)
-        typetest(backend(), 16)(A, B, ndrange=size(A))
+        typetest(backend(), 16)(A, B, ndrange = size(A))
         synchronize(backend())
         @test all(B)
 
-        A = ArrayT{Float32}(ones(64,3));
+        A = ArrayT{Float32}(ones(64, 3));
         out = ArrayT{Float32}(undef, 64)
-        reduce_private(backend(), 8)(out, A, ndrange=size(out))
+        reduce_private(backend(), 8)(out, A, ndrange = size(out))
         synchronize(backend())
-        @test all(out .== 3f0)
+        @test all(out .== 3.0f0)
     end
 
     if backend == CPU
         @testset "codegen" begin
             IR = sprint() do io
-                KernelAbstractions.ka_code_llvm(io, reduce_private(backend(), (8,)), Tuple{ArrayT{Float64,1}, ArrayT{Float64,2}},
-                                                optimize=true, ndrange=(64,))
+                KernelAbstractions.ka_code_llvm(
+                    io, reduce_private(backend(), (8,)), Tuple{ArrayT{Float64, 1}, ArrayT{Float64, 2}},
+                    optimize = true, ndrange = (64,),
+                )
             end
             @test !occursin("gcframe", IR)
         end
diff --git a/test/reflection.jl b/test/reflection.jl
index f2d5bd278..5153c94b1 100644
--- a/test/reflection.jl
+++ b/test/reflection.jl
@@ -19,16 +19,16 @@ function test_typed_kernel_dynamic(backend, backend_str, ArrayT)
     A = ArrayT(ones(Float32, 1024, 1024))
     kernel = mul2(backend())
     res = if backend == CPU
-        @ka_code_typed kernel(A, ndrange=size(A), workgroupsize=16)
+        @ka_code_typed kernel(A, ndrange = size(A), workgroupsize = 16)
     else
-        @ka_code_typed kernel(A, ndrange=size(A), workgroupsize=(32, 32))
+        @ka_code_typed kernel(A, ndrange = size(A), workgroupsize = (32, 32))
     end
     if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal"
         @test_broken isa(res, Pair{Core.CodeInfo, DataType})
     else
         @test isa(res, Pair{Core.CodeInfo, DataType})
     end
-    @test isa(res[1].code, Array{Any,1})
+    @test isa(res[1].code, Array{Any, 1})
 end
 
 function test_typed_kernel_dynamic_no_info(backend, backend_str, ArrayT)
@@ -36,13 +36,13 @@ function test_typed_kernel_dynamic_no_info(backend, backend_str, ArrayT)
     B = similar(A)
     C = similar(A)
     kernel = add3(backend())
-    res = @ka_code_typed kernel(A, B, C, ndrange=size(A))
+    res = @ka_code_typed kernel(A, B, C, ndrange = size(A))
     if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal"
         @test_broken isa(res, Pair{Core.CodeInfo, DataType})
     else
         @test isa(res, Pair{Core.CodeInfo, DataType})
     end
-    @test isa(res[1].code, Array{Any,1})
+    @test isa(res[1].code, Array{Any, 1})
 end
 
 function test_typed_kernel_static(backend, backend_str, ArrayT)
@@ -52,13 +52,13 @@ function test_typed_kernel_static(backend, backend_str, ArrayT)
     else
         mul2(backend(), (32, 32))
     end
-    res = @ka_code_typed kernel(A, ndrange=size(A))
+    res = @ka_code_typed kernel(A, ndrange = size(A))
     if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal"
         @test_broken isa(res, Pair{Core.CodeInfo, DataType})
     else
         @test isa(res, Pair{Core.CodeInfo, DataType})
     end
-    @test isa(res[1].code, Array{Any,1})
+    @test isa(res[1].code, Array{Any, 1})
 end
 
 function test_typed_kernel_no_optimize(backend, backend_str, ArrayT)
@@ -68,8 +68,8 @@ function test_typed_kernel_no_optimize(backend, backend_str, ArrayT)
     else
         mul2(backend(), (32, 32))
     end
-    res = @ka_code_typed optimize=false kernel(A, ndrange=size(A))
-    res_opt = @ka_code_typed kernel(A, ndrange=size(A))
+    res = @ka_code_typed optimize = false kernel(A, ndrange = size(A))
+    res_opt = @ka_code_typed kernel(A, ndrange = size(A))
     # FIXME: Need a better test
     # @test size(res[1].code) < size(res_opt[1].code)
 end
@@ -82,13 +82,13 @@ function test_expr_kernel(backend, backend_str, ArrayT)
     else
         addi(backend(), (32, 32))
     end
-    res = @ka_code_typed kernel(A, C, 1+2, ndrange=size(A))
+    res = @ka_code_typed kernel(A, C, 1 + 2, ndrange = size(A))
     if backend_str == "CUDA" || backend_str == "ROCM" || backend_str == "oneAPI" || backend_str == "Metal"
         @test_broken isa(res, Pair{Core.CodeInfo, DataType})
     else
         @test isa(res, Pair{Core.CodeInfo, DataType})
     end
-    @test isa(res[1].code, Array{Any,1})
+    @test isa(res[1].code, Array{Any, 1})
 end
 
 function reflection_testsuite(backend, backend_str, ArrayT)
diff --git a/test/runtests.jl b/test/runtests.jl
index d7cca6ca1..bf9b445a5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,7 +4,7 @@ using Test
 include("testsuite.jl")
 
 @testset "CPU back-end" begin
-    struct CPUBackendArray{T,N,A} end # Fake and unused
+    struct CPUBackendArray{T, N, A} end # Fake and unused
     Testsuite.testsuite(CPU, "CPU", Base, Array, CPUBackendArray)
 end
 
@@ -14,33 +14,33 @@ end
 end
 
 A = zeros(Int, Threads.nthreads())
-kern_static(CPU(static=true), (1,))(A, ndrange=length(A))
+kern_static(CPU(static = true), (1,))(A, ndrange = length(A))
 @test A == 1:Threads.nthreads()
 
-@kernel cpu=false function my_no_cpu_kernel(a)
+@kernel cpu = false function my_no_cpu_kernel(a)
 end
 @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel(CPU())
 
 # testing multiple configurations at the same time
-@kernel cpu=false inbounds=false function my_no_cpu_kernel2(a)
+@kernel cpu = false inbounds = false function my_no_cpu_kernel2(a)
 end
 @test_throws ErrorException("This kernel is unavailable for backend CPU") my_no_cpu_kernel2(CPU())
 
 if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 1
     # testing bounds errors
-    @kernel inbounds=false function my_bounded_kernel(a)
+    @kernel inbounds = false function my_bounded_kernel(a)
         idx = @index(Global, Linear)
         a[idx] = 0
     end
-    @test_throws BoundsError(Int64[],(1,)) my_bounded_kernel(CPU())(Int[], ndrange=1)
+    @test_throws BoundsError(Int64[], (1,)) my_bounded_kernel(CPU())(Int[], ndrange = 1)
 end
 
 if Base.JLOptions().check_bounds == 0 || Base.JLOptions().check_bounds == 2
-    @kernel inbounds=true function my_bounded_kernel(a)
+    @kernel inbounds = true function my_bounded_kernel(a)
         idx = @index(Global, Linear)
         a[idx] = 0
     end
-    @test nothing == my_inbounds_kernel(CPU())(Int[], ndrange=1)
+    @test nothing == my_inbounds_kernel(CPU())(Int[], ndrange = 1)
 end
 
 struct NewBackend <: KernelAbstractions.GPU end
diff --git a/test/specialfunctions.jl b/test/specialfunctions.jl
index a6855d4f5..261986460 100644
--- a/test/specialfunctions.jl
+++ b/test/specialfunctions.jl
@@ -20,37 +20,37 @@ end
 function specialfunctions_testsuite(Backend)
     backend = Backend()
     @testset "special functions: gamma" begin
-        x = Float32[1.0,2.0,3.0,5.5]
+        x = Float32[1.0, 2.0, 3.0, 5.5]
 
         cx = allocate(backend, Float32, length(x))
         KernelAbstractions.copyto!(backend, cx, x)
         cy = similar(cx)
 
-        gamma_knl(backend)(cy, cx; ndrange=length(x))
+        gamma_knl(backend)(cy, cx; ndrange = length(x))
         synchronize(backend)
         @test Array(cy) ≈ SpecialFunctions.gamma.(x)
     end
 
     @testset "special functions: erf" begin
-        x = Float32[-1.0,-0.5,0.0,1e-3,1.0,2.0,5.5]
+        x = Float32[-1.0, -0.5, 0.0, 1.0e-3, 1.0, 2.0, 5.5]
 
         cx = allocate(backend, Float32, length(x))
         KernelAbstractions.copyto!(backend, cx, x)
         cy = similar(cx)
 
-        erf_knl(backend)(cy, cx; ndrange=length(x))
+        erf_knl(backend)(cy, cx; ndrange = length(x))
         synchronize(backend)
         @test Array(cy) ≈ SpecialFunctions.erf.(x)
     end
 
     @testset "special functions: erfc" begin
-        x = Float32[-1.0,-0.5,0.0,1e-3,1.0,2.0,5.5]
+        x = Float32[-1.0, -0.5, 0.0, 1.0e-3, 1.0, 2.0, 5.5]
         cx = allocate(backend, Float32, length(x))
         KernelAbstractions.copyto!(backend, cx, x)
         cy = similar(cx)
 
-        erfc_knl(backend)(cy, cx; ndrange=length(x))
+        erfc_knl(backend)(cy, cx; ndrange = length(x))
         synchronize(backend)
         @test Array(cy) ≈ SpecialFunctions.erfc.(x)
     end
-end
\ No newline at end of file
+end
diff --git a/test/test.jl b/test/test.jl
index 9bf6f8493..7269c2977 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -8,288 +8,292 @@ using Adapt
 identity(x) = x
 
 function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; skip_tests = Set{String}())
-@conditional_testset "partition" skip_tests begin
-    backend = Backend()
-    let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, DynamicSize, typeof(identity)}(backend, identity)
-        iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing)
-        @test length(blocks(iterspace)) == 2
-        @test dynamic isa NoDynamicCheck
-
-        iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), nothing)
-        @test length(blocks(iterspace)) == 3
-        @test dynamic isa DynamicCheck
-
-        iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), (64,))
-        @test length(blocks(iterspace)) == 3
-        @test dynamic isa DynamicCheck
-
-        @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), (65,))
-        @test KernelAbstractions.backend(kernel) == backend
-    end
-    let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, StaticSize{(128,)}, typeof(identity)}(backend, identity)
-        iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing)
-        @test length(blocks(iterspace)) == 2
-        @test dynamic isa NoDynamicCheck
+    @conditional_testset "partition" skip_tests begin
+        backend = Backend()
+        let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, DynamicSize, typeof(identity)}(backend, identity)
+            iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing)
+            @test length(blocks(iterspace)) == 2
+            @test dynamic isa NoDynamicCheck
+
+            iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), nothing)
+            @test length(blocks(iterspace)) == 3
+            @test dynamic isa DynamicCheck
+
+            iterspace, dynamic = KernelAbstractions.partition(kernel, (129,), (64,))
+            @test length(blocks(iterspace)) == 3
+            @test dynamic isa DynamicCheck
+
+            @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), (65,))
+            @test KernelAbstractions.backend(kernel) == backend
+        end
+        let kernel = KernelAbstractions.Kernel{typeof(backend), StaticSize{(64,)}, StaticSize{(128,)}, typeof(identity)}(backend, identity)
+            iterspace, dynamic = KernelAbstractions.partition(kernel, (128,), nothing)
+            @test length(blocks(iterspace)) == 2
+            @test dynamic isa NoDynamicCheck
 
-        iterspace, dynamic = KernelAbstractions.partition(kernel, nothing, nothing)
-        @test length(blocks(iterspace)) == 2
-        @test dynamic isa NoDynamicCheck
+            iterspace, dynamic = KernelAbstractions.partition(kernel, nothing, nothing)
+            @test length(blocks(iterspace)) == 2
+            @test dynamic isa NoDynamicCheck
 
-        @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), nothing)
-        @test KernelAbstractions.backend(kernel) == backend
+            @test_throws ErrorException KernelAbstractions.partition(kernel, (129,), nothing)
+            @test KernelAbstractions.backend(kernel) == backend
+        end
     end
-end
 
-@kernel function index_linear_global(A)
-       I = @index(Global, Linear)
-       A[I] = I
-end
-@kernel function index_linear_local(A)
-       I  = @index(Global, Linear)
-       i = @index(Local, Linear)
-       A[I] = i
-end
-@kernel function index_linear_group(A)
-       I  = @index(Global, Linear)
-       i = @index(Group, Linear)
-       A[I] = i
-end
-@kernel function index_cartesian_global(A)
-       I = @index(Global, Cartesian)
-       A[I] = I
-end
-@kernel function index_cartesian_local(A)
-       I = @index(Global, Cartesian)
-       i = @index(Local, Cartesian)
-       A[I] = i
-end
-@kernel function index_cartesian_group(A)
-       I = @index(Global, Cartesian)
-       i = @index(Group, Cartesian)
-       A[I] = i
-end
+    @kernel function index_linear_global(A)
+        I = @index(Global, Linear)
+        A[I] = I
+    end
+    @kernel function index_linear_local(A)
+        I = @index(Global, Linear)
+        i = @index(Local, Linear)
+        A[I] = i
+    end
+    @kernel function index_linear_group(A)
+        I = @index(Global, Linear)
+        i = @index(Group, Linear)
+        A[I] = i
+    end
+    @kernel function index_cartesian_global(A)
+        I = @index(Global, Cartesian)
+        A[I] = I
+    end
+    @kernel function index_cartesian_local(A)
+        I = @index(Global, Cartesian)
+        i = @index(Local, Cartesian)
+        A[I] = i
+    end
+    @kernel function index_cartesian_group(A)
+        I = @index(Global, Cartesian)
+        i = @index(Group, Cartesian)
+        A[I] = i
+    end
 
-@conditional_testset "get_backend" skip_tests begin
-    backend = Backend()
-    backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
-    @test backend isa backendT
-
-    x = allocate(backend, Float32, 5)
-    A = allocate(backend, Float32, 5, 5)
-    @test @inferred(KernelAbstractions.get_backend(A)) isa backendT
-    @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT
-    @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT
-    @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT
-end
+    @conditional_testset "get_backend" skip_tests begin
+        backend = Backend()
+        backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
+        @test backend isa backendT
+
+        x = allocate(backend, Float32, 5)
+        A = allocate(backend, Float32, 5, 5)
+        @test @inferred(KernelAbstractions.get_backend(A)) isa backendT
+        @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT
+        @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT
+        @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT
+    end
 
-@conditional_testset "sparse" skip_tests begin
-    backend = Backend()
-    backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
-    @test backend isa backendT
+    @conditional_testset "sparse" skip_tests begin
+        backend = Backend()
+        backendT = typeof(backend).name.wrapper # To look through CUDABackend{true, false}
+        @test backend isa backendT
 
-    A = allocate(backend, Float32, 5, 5)
-    @test @inferred(KernelAbstractions.get_backend(sparse(A))) isa backendT
-end
+        A = allocate(backend, Float32, 5, 5)
+        @test @inferred(KernelAbstractions.get_backend(sparse(A))) isa backendT
+    end
 
-@conditional_testset "adapt" skip_tests begin
-    backend = Backend()
-    x = allocate(backend, Float32, 5)
-    @test adapt(CPU(), x) isa Array
-    y = adapt(backend, Array{Float32}(undef, 5))
-    @test typeof(y) == typeof(x)
-end
+    @conditional_testset "adapt" skip_tests begin
+        backend = Backend()
+        x = allocate(backend, Float32, 5)
+        @test adapt(CPU(), x) isa Array
+        y = adapt(backend, Array{Float32}(undef, 5))
+        @test typeof(y) == typeof(x)
+    end
 
-# TODO: add test for _group and _local_cartesian
-@conditional_testset "indextest" skip_tests begin
-    backend = Backend()
-    A = allocate(backend, Int, 16, 16)
-    index_linear_global(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A .== LinearIndices(A))
-
-    A = allocate(backend, Int, 8)
-    index_linear_local(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A .== 1:8)
-
-    A = allocate(backend, Int, 16)
-    index_linear_local(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A[1:8] .== 1:8)
-    @test all(A[9:16] .== 1:8)
-
-    A = allocate(backend, Int, 8, 2)
-    index_linear_local(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A[1:8] .== 1:8)
-    @test all(A[9:16] .== 1:8)
-
-    A = allocate(backend, CartesianIndex{2}, 16, 16)
-    index_cartesian_global(backend, 8)(A, ndrange=size(A))
-    synchronize(backend)
-    @test all(A .== CartesianIndices(A))
-
-    A = allocate(backend, CartesianIndex{1}, 16, 16)
-    index_cartesian_global(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A[:] .== CartesianIndices((length(A),)))
-
-    # Non-multiplies of the workgroupsize
-    A = allocate(backend, Int, 7, 7)
-    index_linear_global(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A .== LinearIndices(A))
-
-    A = allocate(backend, Int, 5)
-    index_linear_local(backend, 8)(A, ndrange=length(A))
-    synchronize(backend)
-    @test all(A .== 1:5)
-end
+    # TODO: add test for _group and _local_cartesian
+    @conditional_testset "indextest" skip_tests begin
+        backend = Backend()
+        A = allocate(backend, Int, 16, 16)
+        index_linear_global(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A .== LinearIndices(A))
+
+        A = allocate(backend, Int, 8)
+        index_linear_local(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A .== 1:8)
+
+        A = allocate(backend, Int, 16)
+        index_linear_local(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A[1:8] .== 1:8)
+        @test all(A[9:16] .== 1:8)
+
+        A = allocate(backend, Int, 8, 2)
+        index_linear_local(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A[1:8] .== 1:8)
+        @test all(A[9:16] .== 1:8)
+
+        A = allocate(backend, CartesianIndex{2}, 16, 16)
+        index_cartesian_global(backend, 8)(A, ndrange = size(A))
+        synchronize(backend)
+        @test all(A .== CartesianIndices(A))
+
+        A = allocate(backend, CartesianIndex{1}, 16, 16)
+        index_cartesian_global(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A[:] .== CartesianIndices((length(A),)))
+
+        # Non-multiplies of the workgroupsize
+        A = allocate(backend, Int, 7, 7)
+        index_linear_global(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A .== LinearIndices(A))
+
+        A = allocate(backend, Int, 5)
+        index_linear_local(backend, 8)(A, ndrange = length(A))
+        synchronize(backend)
+        @test all(A .== 1:5)
+    end
 
-@kernel function constarg(A, @Const(B))
-    I = @index(Global)
-    @inbounds A[I] = B[I]
-end
+    @kernel function constarg(A, @Const(B))
+        I = @index(Global)
+        @inbounds A[I] = B[I]
+    end
 
-@conditional_testset "Const" skip_tests begin
-    let kernel = constarg(Backend(), 8, (1024,))
-        # this is poking at internals
-        iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}();
-        ctx = if Backend == CPU
-            KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(NoDynamicCheck()))
-        else
-            KernelAbstractions.mkcontext(kernel, nothing, iterspace)
-        end
-        AT = if Backend == CPU
-            Array{Float32, 2}
-        else
-            BackendArrayT{Float32, 2, 1} # AS 1
-        end
-        IR = sprint() do io
+    @conditional_testset "Const" skip_tests begin
+        let kernel = constarg(Backend(), 8, (1024,))
+            # this is poking at internals
+            iterspace = NDRange{1, StaticSize{(128,)}, StaticSize{(8,)}}();
+            ctx = if Backend == CPU
+                KernelAbstractions.mkcontext(kernel, 1, nothing, iterspace, Val(NoDynamicCheck()))
+            else
+                KernelAbstractions.mkcontext(kernel, nothing, iterspace)
+            end
+            AT = if Backend == CPU
+                Array{Float32, 2}
+            else
+                BackendArrayT{Float32, 2, 1} # AS 1
+            end
+            IR = sprint() do io
+                if backend_str == "CPU"
+                    code_llvm(
+                        io, kernel.f, (typeof(ctx), AT, AT),
+                        optimize = false, raw = true,
+                    )
+                else
+                    backend_mod.code_llvm(
+                        io, kernel.f, (typeof(ctx), AT, AT),
+                        kernel = true, optimize = true,
+                    )
+                end
+            end
             if backend_str == "CPU"
-                code_llvm(io, kernel.f, (typeof(ctx), AT, AT),
-                          optimize=false, raw=true)
+                @test occursin("!alias.scope", IR)
+                @test occursin("!noalias", IR)
+            elseif backend_str == "CUDA"
+                @test occursin("@llvm.nvvm.ldg", IR)
+            elseif backend_str == "ROCM"
+                @test occursin("addrspace(4)", IR)
             else
-                backend_mod.code_llvm(io, kernel.f, (typeof(ctx), AT, AT),
-                               kernel=true, optimize=true)
+                @test_skip false
             end
         end
-        if backend_str == "CPU"
-            @test occursin("!alias.scope", IR)
-            @test occursin("!noalias", IR)
-        elseif backend_str == "CUDA"
-            @test occursin("@llvm.nvvm.ldg", IR)
-        elseif backend_str == "ROCM"
-            @test occursin("addrspace(4)", IR)
-        else
-            @test_skip false
-        end
     end
-end
 
-@kernel function kernel_val!(a, ::Val{m}) where {m}
-    I = @index(Global)
-    @inbounds a[I] = m
-end
+    @kernel function kernel_val!(a, ::Val{m}) where {m}
+        I = @index(Global)
+        @inbounds a[I] = m
+    end
 
-A = KernelAbstractions.zeros(Backend(), Int64, 1024)
-kernel_val!(Backend())(A, Val(3), ndrange=size(A))
-synchronize(Backend())
-@test all((a)->a==3, A)
+    A = KernelAbstractions.zeros(Backend(), Int64, 1024)
+    kernel_val!(Backend())(A, Val(3), ndrange = size(A))
+    synchronize(Backend())
+    @test all((a) -> a == 3, A)
 
-@kernel function kernel_empty()
-    nothing
-end
+    @kernel function kernel_empty()
+        nothing
+    end
 
-@conditional_testset "CPU synchronization" skip_tests begin
-    kernel_empty(CPU(), 1)(ndrange=1)
-    synchronize(CPU())
-end
+    @conditional_testset "CPU synchronization" skip_tests begin
+        kernel_empty(CPU(), 1)(ndrange = 1)
+        synchronize(CPU())
+    end
 
-@conditional_testset "Zero iteration space $Backend" skip_tests begin
-    kernel_empty(Backend(), 1)(ndrange=1)
-    kernel_empty(Backend(), 1)(ndrange=0)
-    synchronize(Backend())
+    @conditional_testset "Zero iteration space $Backend" skip_tests begin
+        kernel_empty(Backend(), 1)(ndrange = 1)
+        kernel_empty(Backend(), 1)(ndrange = 0)
+        synchronize(Backend())
 
-    kernel_empty(Backend(), 1)(ndrange=0)
-    synchronize(Backend())
-end
+        kernel_empty(Backend(), 1)(ndrange = 0)
+        synchronize(Backend())
+    end
 
 
-@conditional_testset "return statement" skip_tests begin
-    try
-        @eval @kernel function kernel_return()
-            return
+    @conditional_testset "return statement" skip_tests begin
+        try
+            @eval @kernel function kernel_return()
+                return
+            end
+            @test false
+        catch e
+            @test e.error ==
+                ErrorException("Return statement not permitted in a kernel function kernel_return")
         end
-        @test false
-    catch e
-        @test e.error ==
-            ErrorException("Return statement not permitted in a kernel function kernel_return")
     end
-end
 
-@conditional_testset "fallback test: callable types" skip_tests begin
-    @eval begin
-        function f end
-        @kernel function (a::typeof(f))(x, ::Val{m}) where m
-            I = @index(Global)
-            @inbounds x[I] = m
-        end
-        @kernel function (a::typeof(f))(x, ::Val{1})
-            I = @index(Global)
-            @inbounds x[I] = 1
+    @conditional_testset "fallback test: callable types" skip_tests begin
+        @eval begin
+            function f end
+            @kernel function (a::typeof(f))(x, ::Val{m}) where {m}
+                I = @index(Global)
+                @inbounds x[I] = m
+            end
+            @kernel function (a::typeof(f))(x, ::Val{1})
+                I = @index(Global)
+                @inbounds x[I] = 1
+            end
+            x = [1, 2, 3]
+            env = f(CPU())(x, Val(4); ndrange = length(x))
+            synchronize(CPU())
+            @test x == [4, 4, 4]
+
+            x = [1, 2, 3]
+            env = f(CPU())(x, Val(1); ndrange = length(x))
+            synchronize(CPU())
+            @test x == [1, 1, 1]
         end
-        x = [1,2,3]
-        env = f(CPU())(x, Val(4); ndrange=length(x))
-        synchronize(CPU())
-        @test x == [4,4,4]
-
-        x = [1,2,3]
-        env = f(CPU())(x, Val(1); ndrange=length(x))
-        synchronize(CPU())
-        @test x == [1,1,1]
     end
-end
 
-@conditional_testset "priority" skip_tests begin
-    KernelAbstractions.priority!(Backend(), :normal)
-    KernelAbstractions.priority!(Backend(), :high)
-    KernelAbstractions.priority!(Backend(), :low)
+    @conditional_testset "priority" skip_tests begin
+        KernelAbstractions.priority!(Backend(), :normal)
+        KernelAbstractions.priority!(Backend(), :high)
+        KernelAbstractions.priority!(Backend(), :low)
 
-    @test_throws ErrorException KernelAbstractions.priority!(Backend(), :default)
-end
+        @test_throws ErrorException KernelAbstractions.priority!(Backend(), :default)
+    end
 
-function f(KernelAbstractions.@context, a)
-    I = @index(Global, Linear)
-    a[I] = 1
-end
-@kernel cpu=false function context_kernel(a)
-    f(KernelAbstractions.@context, a)
-end
+    function f(KernelAbstractions.@context, a)
+        I = @index(Global, Linear)
+        a[I] = 1
+    end
+    @kernel cpu = false function context_kernel(a)
+        f(KernelAbstractions.@context, a)
+    end
 
-@testset "No CPU kernel" begin
-    if !(Backend() isa CPU)
-        A = KernelAbstractions.zeros(Backend(), Int64, 1024)
-        context_kernel(Backend())(A, ndrange=size(A))
-        synchronize(Backend())
-        @test all((a)->a==1, A)
-    else
-        @test_throws ErrorException("This kernel is unavailable for backend CPU") context_kernel(Backend())
+    @testset "No CPU kernel" begin
+        if !(Backend() isa CPU)
+            A = KernelAbstractions.zeros(Backend(), Int64, 1024)
+            context_kernel(Backend())(A, ndrange = size(A))
+            synchronize(Backend())
+            @test all((a) -> a == 1, A)
+        else
+            @test_throws ErrorException("This kernel is unavailable for backend CPU") context_kernel(Backend())
+        end
     end
-end
 
-@testset "functional" begin
-    @test KernelAbstractions.functional(Backend()) isa Union{Missing, Bool}
-end
+    @testset "functional" begin
+        @test KernelAbstractions.functional(Backend()) isa Union{Missing, Bool}
+    end
 
-@testset "CPU default workgroupsize" begin
-    @test KernelAbstractions.default_cpu_workgroupsize((64,)) == (64,)
-    @test KernelAbstractions.default_cpu_workgroupsize((1024,)) == (1024,)
-    @test KernelAbstractions.default_cpu_workgroupsize((2056,)) == (1024,)
-    @test KernelAbstractions.default_cpu_workgroupsize((64, 64,)) == (64,16)
-    @test KernelAbstractions.default_cpu_workgroupsize((64, 64, 64,4)) == (64,16,1,1)
-    @test KernelAbstractions.default_cpu_workgroupsize((64,15)) == (64,15)
-    @test KernelAbstractions.default_cpu_workgroupsize((5,7,13,17)) == (5,7,13,2)
-end
+    @testset "CPU default workgroupsize" begin
+        @test KernelAbstractions.default_cpu_workgroupsize((64,)) == (64,)
+        @test KernelAbstractions.default_cpu_workgroupsize((1024,)) == (1024,)
+        @test KernelAbstractions.default_cpu_workgroupsize((2056,)) == (1024,)
+        @test KernelAbstractions.default_cpu_workgroupsize((64, 64)) == (64, 16)
+        @test KernelAbstractions.default_cpu_workgroupsize((64, 64, 64, 4)) == (64, 16, 1, 1)
+        @test KernelAbstractions.default_cpu_workgroupsize((64, 15)) == (64, 15)
+        @test KernelAbstractions.default_cpu_workgroupsize((5, 7, 13, 17)) == (5, 7, 13, 2)
+    end
 
 end
diff --git a/test/testsuite.jl b/test/testsuite.jl
index cd78e76d4..242d2e850 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -4,19 +4,24 @@ using ..KernelAbstractions
 using ..Test
 
 # We can't add test-dependencies withouth breaking backend packages
-const Pkg = Base.require(Base.PkgId(
-                Base.UUID("44cfe95a-1eb2-52ea-b672-e2afdf69b78f"), "Pkg"))
+const Pkg = Base.require(
+    Base.PkgId(
+        Base.UUID("44cfe95a-1eb2-52ea-b672-e2afdf69b78f"), "Pkg",
+    ),
+)
 
 macro conditional_testset(name, skip_tests, expr)
-    esc(quote
-        @testset $name begin
-            if $name ∉ $skip_tests
-                $expr
-            else
-                @test_skip false
+    esc(
+        quote
+            @testset $name begin
+                if $name ∉ $skip_tests
+                    $expr
+                else
+                    @test_skip false
+                end
             end
-        end
-    end)
+        end,
+    )
 end
 
 
diff --git a/test/unroll.jl b/test/unroll.jl
index 6b33fa761..7bbbab4df 100644
--- a/test/unroll.jl
+++ b/test/unroll.jl
@@ -3,37 +3,37 @@ using KernelAbstractions.Extras
 using StaticArrays
 
 @kernel function kernel_unroll!(a)
-  @unroll for i in 1:5
-    @inbounds a[i] = i
-  end
+    @unroll for i in 1:5
+        @inbounds a[i] = i
+    end
 end
 
-@kernel function kernel_unroll!(a, ::Val{N}) where N
-  let M = N+5
-    @unroll for i in 6:M
-      @inbounds a[i-5] = i
+@kernel function kernel_unroll!(a, ::Val{N}) where {N}
+    let M = N + 5
+        @unroll for i in 6:M
+            @inbounds a[i - 5] = i
+        end
+        @synchronize
     end
-    @synchronize
-  end
 end
 
 # Check that nested `@unroll` doesn't throw a syntax error
 @kernel function kernel_unroll2!(A)
-  @uniform begin
-    a = MVector{3, Float32}(1, 2, 3)
-    b = MVector{3, Float32}(3, 2, 1)
-    c = MMatrix{3, 3, Float32}(undef)
-  end
-  I = @index(Global)
-  @inbounds for m in 1:3
-    @unroll for j = 1:3
-      @unroll for i = 1:3
-        c[1, j] = m * a[1] * b[j]
-      end
+    @uniform begin
+        a = MVector{3, Float32}(1, 2, 3)
+        b = MVector{3, Float32}(3, 2, 1)
+        c = MMatrix{3, 3, Float32}(undef)
+    end
+    I = @index(Global)
+    @inbounds for m in 1:3
+        @unroll for j in 1:3
+            @unroll for i in 1:3
+                c[1, j] = m * a[1] * b[j]
+            end
+        end
+        A[I] = c[1, 1]
+        @synchronize(m % 2 == 0)
     end
-    A[I] = c[1, 1]
-    @synchronize(m % 2 == 0)
-  end
 end
 
 function unroll_testsuite(backend, ArrayT)