diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 9af2d9d4..138c63c5 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,10 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +include("intrinsics.jl") +import .KernelIntrinsics +export KernelIntrinsics + ### # Kernel language # - @localmem @@ -460,13 +464,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +function __index_Local_Linear(ctx) + return KernelIntrinsics.get_local_id().x +end + +function __index_Group_Linear(ctx) + return KernelIntrinsics.get_group_id().x +end + +function __index_Global_Linear(ctx) + return KernelIntrinsics.get_global_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] +end +function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] +end +function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) @@ -796,11 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -function __synchronize() - error("@synchronize used outside kernel or not captured") -end +__synchronize() = KernelIntrinsics.barrier() @generated function __print(items...) str = "" diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 00000000..06011620 --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,60 @@ +module KernelIntrinsics + +""" + get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} + +Return the number of global work-items specified. +""" +function get_global_size end + +""" + get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} + +Returns the unique global work-item ID. + +!!! note + 1-based. +""" +function get_global_id end + +""" + get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} + +Return the number of local work-items specified. +""" +function get_local_size end + +""" + get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} + +Returns the unique local work-item ID. + +!!! note + 1-based. +""" +function get_local_id end + +""" + get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} + +Returns the number of groups. +""" +function get_num_groups end + +""" + get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} + +Returns the unique group ID. + +!!! note + 1-based. +""" +function get_group_id end + +function localmemory end +function barrier() + error("Group barrier used outside kernel or not captured") +end +function print end + +end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87fccdb9..ef0aba6b 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -140,29 +140,30 @@ end ## Indexing Functions +const KI = KA.KernelIntrinsics -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end -@device_override @inline function KA.__index_Global_Linear(ctx) - return get_global_id(1) +@device_override @inline function KI.get_global_id() + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] +@device_override @inline function KI.get_local_size() + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] +@device_override @inline function KI.get_num_groups() + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_size() + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) @@ -177,7 +178,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end @@ -189,7 +190,7 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end diff --git a/test/intrinsics.jl b/test/intrinsics.jl new file mode 100644 index 00000000..e58b58a6 --- /dev/null +++ b/test/intrinsics.jl @@ -0,0 +1,62 @@ + +@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) + # Test all intrinsics return NamedTuples with x, y, z fields + global_size = KernelIntrinsics.get_global_size() + global_id = KernelIntrinsics.get_global_id() + local_size = KernelIntrinsics.get_local_size() + local_id = KernelIntrinsics.get_local_id() + num_groups = KernelIntrinsics.get_num_groups() + group_id = KernelIntrinsics.get_group_id() + + if UInt32(global_id.x) <= UInt32(global_size.x) + results[1, global_id.x] = global_id.x + results[2, global_id.x] = local_id.x + results[3, global_id.x] = group_id.x + results[4, global_id.x] = global_size.x + results[5, global_id.x] = local_size.x + results[6, global_id.x] = num_groups.x + end +end + + +function intrinsics_testsuite(backend, AT) + @testset "KernelIntrinsics Tests" begin + @testset "Basic intrinsics functionality" begin + + # Test with small kernel + N = 16 + results = AT(zeros(Int, 6, N)) + + kernel = test_intrinsics_kernel(backend(), 4, (N,)) + kernel(results, ndrange = N) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for i in 1:N + global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + + # Global IDs should be 1-based and sequential + @test global_id_x == i + + # Global size should match our ndrange + @test global_size_x == N + + # Local size should be 4 (our workgroupsize) + @test local_size_x == 4 + + # Number of groups should be ceil(N/4) = 4 + @test num_groups_x == 4 + + # Group ID should be 1-based + expected_group = div(i - 1, 4) + 1 + @test group_id_x == expected_group + + # Local ID should be 1-based within group + expected_local = ((i - 1) % 4) + 1 + @test local_id_x == expected_local + end + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 2418db99..3426e542 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -26,6 +26,7 @@ end include("test.jl") +include("intrinsics.jl") include("localmem.jl") include("private.jl") include("unroll.jl") @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ specialfunctions_testsuite(backend) end + @conditional_testset "Intrinsics" skip_tests begin + intrinsics_testsuite(backend, AT) + end + @conditional_testset "Localmem" skip_tests begin localmem_testsuite(backend, AT) end