From feee33862fe3cafb909da33ed8b8e5c5fc6735ce Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 4 Feb 2025 15:50:59 +0100 Subject: [PATCH 1/8] define basic intrinsics --- src/KernelAbstractions.jl | 30 +++++++++++++++++----- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++ src/pocl/backend.jl | 25 ++++++------------- 3 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 src/intrinsics.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 9af2d9d4c..8c2844fd7 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,10 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +include("intrinsics.jl") +import .KernelIntrinsics +export KernelIntrinsics + ### # Kernel language # - @localmem @@ -460,13 +464,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +function __index_Local_Linear(ctx) + return KernelIntrinsics.get_local_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +function __index_Group_Linear(ctx) + return KernelIntrinsics.get_group_id().x +end + +function __index_Global_Linear(ctx) + return KernelIntrinsics.get_global_id().x +end + +function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] +end +function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] +end +function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 000000000..33ed56fed --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,52 @@ +module KernelIntrinsics + +""" + get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of global work-items specified. + +!!! note + 1-based. +""" +function get_global_size end + +""" + get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique global work-item ID. +""" +function get_global_id end + +""" + get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of local work-items specified. +""" +function get_local_size end + +""" + get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique local work-item ID. +""" +function get_local_id end + +""" + get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the number of groups. +""" +function get_num_groups end + +""" + get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique group ID. +""" +function get_group_id end + +function localmemory end +function barrier end +function print end + +end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87fccdb93..32c2951a6 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -140,29 +140,18 @@ end ## Indexing Functions +const KI = KA.KernelIntrinsics -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - return get_global_id(1) -end - -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] -end - -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] -end - -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_id() + return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end @device_override @inline function KA.__validindex(ctx) From ca815e26482d90da9577597be0a29b961354bbff Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:37:32 -0300 Subject: [PATCH 2/8] Fix docstrings --- src/intrinsics.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 33ed56fed..15202fab7 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -4,9 +4,6 @@ module KernelIntrinsics get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} Return the number of global work-items specified. - -!!! note - 1-based. """ function get_global_size end @@ -14,6 +11,9 @@ function get_global_size end get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique global work-item ID. + +!!! note + 1-based. """ function get_global_id end @@ -28,6 +28,9 @@ function get_local_size end get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique local work-item ID. + +!!! note + 1-based. """ function get_local_id end @@ -42,6 +45,9 @@ function get_num_groups end get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique group ID. + +!!! note + 1-based. """ function get_group_id end From df8cbbadaced7da4a48050986d13ec4be929fddb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:51:00 -0300 Subject: [PATCH 3/8] localmemory --- src/KernelAbstractions.jl | 4 +++- src/pocl/backend.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 8c2844fd7..f5f76306f 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,7 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end +function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} + KernelIntrinsics.localmemory(t, dims, id) +end function __synchronize() error("@synchronize used outside kernel or not captured") diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 32c2951a6..c06d014ce 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -166,7 +166,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 11b6a8ee9b48a3b2a70f09e3da36e92aabd2da79 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:30:33 -0300 Subject: [PATCH 4/8] Move default barrier implementation to KernelIntrinsics --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index f5f76306f..cd3d1fc99 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,7 +819,7 @@ function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, end function __synchronize() - error("@synchronize used outside kernel or not captured") + KernelIntrinsics.barrier() end @generated function __print(items...) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 15202fab7..32cc125d1 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -52,7 +52,9 @@ Returns the unique group ID. function get_group_id end function localmemory end -function barrier end +function barrier() + error("Group barrier used outside kernel or not captured") +end function print end end From f1ad4125d457a1314e636e9d648e8c8f61515f20 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:36:22 -0300 Subject: [PATCH 5/8] Implement remaining intrinsics for POCL --- src/pocl/backend.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c06d014ce..873458976 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -154,6 +154,18 @@ end return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end +@device_override @inline function KI.get_local_size() + return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) +end + +@device_override @inline function KI.get_num_groups() + return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) +end + +@device_override @inline function KI.get_global_size() + return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) +end + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -178,7 +190,7 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end From d60f6f22a76dd98e9a28a3a93837a4464867376b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:23:11 -0300 Subject: [PATCH 6/8] Add basic tests Modified from initial Claude code --- test/intrinsics.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++++ test/testsuite.jl | 5 ++++ 2 files changed, 67 insertions(+) create mode 100644 test/intrinsics.jl diff --git a/test/intrinsics.jl b/test/intrinsics.jl new file mode 100644 index 000000000..693b2bd9e --- /dev/null +++ b/test/intrinsics.jl @@ -0,0 +1,62 @@ + +@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) + # Test all intrinsics return NamedTuples with x, y, z fields + global_size = KernelIntrinsics.get_global_size() + global_id = KernelIntrinsics.get_global_id() + local_size = KernelIntrinsics.get_local_size() + local_id = KernelIntrinsics.get_local_id() + num_groups = KernelIntrinsics.get_num_groups() + group_id = KernelIntrinsics.get_group_id() + + if UInt32(global_id.x) <= UInt32(global_size.x) + results[1, global_id.x] = global_id.x + results[2, global_id.x] = local_id.x + results[3, global_id.x] = group_id.x + results[4, global_id.x] = global_size.x + results[5, global_id.x] = local_size.x + results[6, global_id.x] = num_groups.x + end +end + + +function intrinsics_testsuite(backend, AT) + @testset "KernelIntrinsics Tests" begin + @testset "Basic intrinsics functionality" begin + + # Test with small kernel + N = 16 + results = AT(zeros(UInt32, 6, N)) + + kernel = test_intrinsics_kernel(backend(), 4, (N,)) + kernel(results, ndrange = N) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for i in 1:N + global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + + # Global IDs should be 1-based and sequential + @test global_id_x == i + + # Global size should match our ndrange + @test global_size_x == N + + # Local size should be 4 (our workgroupsize) + @test local_size_x == 4 + + # Number of groups should be ceil(N/4) = 4 + @test num_groups_x == 4 + + # Group ID should be 1-based + expected_group = div(i - 1, 4) + 1 + @test group_id_x == expected_group + + # Local ID should be 1-based within group + expected_local = ((i - 1) % 4) + 1 + @test local_id_x == expected_local + end + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 2418db998..3426e5423 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -26,6 +26,7 @@ end include("test.jl") +include("intrinsics.jl") include("localmem.jl") include("private.jl") include("unroll.jl") @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ specialfunctions_testsuite(backend) end + @conditional_testset "Intrinsics" skip_tests begin + intrinsics_testsuite(backend, AT) + end + @conditional_testset "Localmem" skip_tests begin localmem_testsuite(backend, AT) end From 3fda18fee8097e07cd1b68f748782b6741f2238d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:24:43 -0300 Subject: [PATCH 7/8] Int32 -> Int https://github.com/JuliaGPU/KernelAbstractions.jl/pull/562/files#r1958255435 --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 12 ++++++------ test/intrinsics.jl | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 32cc125d1..060116200 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,14 +1,14 @@ module KernelIntrinsics """ - get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. """ function get_global_size end """ - get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique global work-item ID. @@ -18,14 +18,14 @@ Returns the unique global work-item ID. function get_global_id end """ - get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. """ function get_local_size end """ - get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique local work-item ID. @@ -35,14 +35,14 @@ Returns the unique local work-item ID. function get_local_id end """ - get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. """ function get_num_groups end """ - get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique group ID. diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 873458976..ef0aba6bd 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,27 +143,27 @@ end const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() - return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end @device_override @inline function KI.get_group_id() - return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end @device_override @inline function KI.get_global_id() - return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end @device_override @inline function KI.get_local_size() - return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end @device_override @inline function KI.get_num_groups() - return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end @device_override @inline function KI.get_global_size() - return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 693b2bd9e..e58b58a63 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -25,7 +25,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 - results = AT(zeros(UInt32, 6, N)) + results = AT(zeros(Int, 6, N)) kernel = test_intrinsics_kernel(backend(), 4, (N,)) kernel(results, ndrange = N) From 84d0c68f61a33fdf621bf67419fd3ffb70ca4841 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:58:13 -0300 Subject: [PATCH 8/8] Format --- src/KernelAbstractions.jl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cd3d1fc99..138c63c52 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,13 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} - KernelIntrinsics.localmemory(t, dims, id) -end +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -function __synchronize() - KernelIntrinsics.barrier() -end +__synchronize() = KernelIntrinsics.barrier() @generated function __print(items...) str = ""