diff --git a/Project.toml b/Project.toml
index 6b08e53..7f39d11 100644
--- a/Project.toml
+++ b/Project.toml
@@ -19,4 +19,5 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 [compat]
 LLVM = "9.1"
 OpenCL_jll = "2024.5.8"
+GPUArrays = "11"
 julia = "1.10"
diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
index 23f31cf..1390eed 100644
--- a/src/gpuarrays.jl
+++ b/src/gpuarrays.jl
@@ -1,70 +1,5 @@
 # GPUArrays.jl interface
 
-
-#
-# Device functionality
-#
-
-
-## execution
-
-struct CLArrayBackend <: AbstractGPUBackend end
-
-struct CLKernelContext <: AbstractKernelContext end
-
-@inline function GPUArrays.launch_heuristic(::CLArrayBackend, f::F, args::Vararg{Any,N};
-                                             elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @opencl launch=false f(CLKernelContext(), args...)
-    wg_info = cl.work_group_info(kernel.fun, cl.device())
-
-    # XXX: how many groups is a good number? the API doesn't tell us.
-    #      measured on a low-end IGP, 32 blocks seems like a good sweet spot.
-    #      note that this only matters for grid-stride kernels, like broadcast.
-    return (threads=wg_info.size, blocks=32)
-end
-
-function GPUArrays.gpu_call(::CLArrayBackend, f, args, threads::Int, blocks::Int;
-                            name::Union{String,Nothing})
-    @opencl global_size=blocks*threads local_size=threads name=name f(CLKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::CLKernelContext) = get_group_id(1)
-GPUArrays.blockdim(ctx::CLKernelContext) = get_local_size(1)
-GPUArrays.threadidx(ctx::CLKernelContext) = get_local_id(1)
-GPUArrays.griddim(ctx::CLKernelContext) = get_num_groups(1)
-
-# math
-
-@inline GPUArrays.cos(ctx::CLKernelContext, x) = cos(x)
-@inline GPUArrays.sin(ctx::CLKernelContext, x) = sin(x)
-@inline GPUArrays.sqrt(ctx::CLKernelContext, x) = sqrt(x)
-@inline GPUArrays.log(ctx::CLKernelContext, x) = log(x)
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::CLKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                      ) where {T, dims, id}
-    ptr = SPIRVIntrinsics.emit_localmemory(Val(id), T, Val(prod(dims)))
-    oneDeviceArray(dims, LLVMPtr{T, onePI.AS.Local}(ptr))
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::CLKernelContext) = barrier()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:CLArray}) = CLArrayBackend()
-
 function GPUArrays.derive(::Type{T}, a::CLArray, dims::Dims{N}, offset::Int) where {T,N}
     ref = copy(a.data)
     offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset