diff --git a/Project.toml b/Project.toml index 6b08e53..7f39d11 100644 --- a/Project.toml +++ b/Project.toml @@ -19,4 +19,5 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] LLVM = "9.1" OpenCL_jll = "2024.5.8" +GPUArrays = "11" julia = "1.10" diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 23f31cf..1390eed 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -1,70 +1,5 @@ # GPUArrays.jl interface - -# -# Device functionality -# - - -## execution - -struct CLArrayBackend <: AbstractGPUBackend end - -struct CLKernelContext <: AbstractKernelContext end - -@inline function GPUArrays.launch_heuristic(::CLArrayBackend, f::F, args::Vararg{Any,N}; - elements::Int, elements_per_thread::Int) where {F,N} - kernel = @opencl launch=false f(CLKernelContext(), args...) - wg_info = cl.work_group_info(kernel.fun, cl.device()) - - # XXX: how many groups is a good number? the API doesn't tell us. - # measured on a low-end IGP, 32 blocks seems like a good sweet spot. - # note that this only matters for grid-stride kernels, like broadcast. - return (threads=wg_info.size, blocks=32) -end - -function GPUArrays.gpu_call(::CLArrayBackend, f, args, threads::Int, blocks::Int; - name::Union{String,Nothing}) - @opencl global_size=blocks*threads local_size=threads name=name f(CLKernelContext(), args...) -end - - -## on-device - -# indexing - -GPUArrays.blockidx(ctx::CLKernelContext) = get_group_id(1) -GPUArrays.blockdim(ctx::CLKernelContext) = get_local_size(1) -GPUArrays.threadidx(ctx::CLKernelContext) = get_local_id(1) -GPUArrays.griddim(ctx::CLKernelContext) = get_num_groups(1) - -# math - -@inline GPUArrays.cos(ctx::CLKernelContext, x) = cos(x) -@inline GPUArrays.sin(ctx::CLKernelContext, x) = sin(x) -@inline GPUArrays.sqrt(ctx::CLKernelContext, x) = sqrt(x) -@inline GPUArrays.log(ctx::CLKernelContext, x) = log(x) - -# memory - -@inline function GPUArrays.LocalMemory(::CLKernelContext, ::Type{T}, ::Val{dims}, ::Val{id} - ) where {T, dims, id} - ptr = SPIRVIntrinsics.emit_localmemory(Val(id), T, Val(prod(dims))) - oneDeviceArray(dims, LLVMPtr{T, onePI.AS.Local}(ptr)) -end - -# synchronization - -@inline GPUArrays.synchronize_threads(::CLKernelContext) = barrier() - - - -# -# Host abstractions -# - -GPUArrays.backend(::Type{<:CLArray}) = CLArrayBackend() - function GPUArrays.derive(::Type{T}, a::CLArray, dims::Dims{N}, offset::Int) where {T,N} ref = copy(a.data) offset = (a.offset * Base.elsize(a)) รท sizeof(T) + offset