Skip to content

Commit 49a39e4

Browse files
authored
Merge pull request #2264 from Bensuo/ben/cmdbuf-local-arg-fix
[CMDBUF] Fix incorrect handling of shared local mem args in CUDA/HIP
2 parents eb45729 + 164314c commit 49a39e4

File tree

9 files changed

+581
-4
lines changed

9 files changed

+581
-4
lines changed

include/ur_api.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8329,7 +8329,7 @@ typedef struct ur_exp_command_buffer_update_value_arg_desc_t {
83298329
///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC
83308330
const void *pNext; ///< [in][optional] pointer to extension-specific structure
83318331
uint32_t argIndex; ///< [in] Argument index.
8332-
uint32_t argSize; ///< [in] Argument size.
8332+
size_t argSize; ///< [in] Argument size.
83338333
const ur_kernel_arg_value_properties_t *pProperties; ///< [in][optional] Pointer to value properties.
83348334
const void *pNewValueArg; ///< [in][optional] Argument value representing matching kernel arg type to
83358335
///< set at argument index.

scripts/core/exp-command-buffer.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ members:
188188
- type: uint32_t
189189
name: argIndex
190190
desc: "[in] Argument index."
191-
- type: uint32_t
191+
- type: size_t
192192
name: argSize
193193
desc: "[in] Argument size."
194194
- type: "const ur_kernel_arg_value_properties_t *"

source/adapters/cuda/command_buffer.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1304,7 +1304,12 @@ updateKernelArguments(kernel_command_handle *Command,
13041304

13051305
ur_result_t Result = UR_RESULT_SUCCESS;
13061306
try {
1307-
Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
1307+
// Local memory args are passed as value args with nullptr value
1308+
if (ArgValue) {
1309+
Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
1310+
} else {
1311+
Kernel->setKernelLocalArg(ArgIndex, ArgSize);
1312+
}
13081313
} catch (ur_result_t Err) {
13091314
Result = Err;
13101315
return Result;

source/adapters/hip/command_buffer.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1013,7 +1013,12 @@ updateKernelArguments(ur_exp_command_buffer_command_handle_t Command,
10131013
const void *ArgValue = ValueArgDesc.pNewValueArg;
10141014

10151015
try {
1016-
Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
1016+
// Local memory args are passed as value args with nullptr value
1017+
if (ArgValue) {
1018+
Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue);
1019+
} else {
1020+
Kernel->setKernelLocalArg(ArgIndex, ArgSize);
1021+
}
10171022
} catch (ur_result_t Err) {
10181023
return Err;
10191024
}

test/conformance/device_code/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/sequence.cpp)
162162
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/standard_types.cpp)
163163
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/subgroup.cpp)
164164
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/linker_error.cpp)
165+
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy_usm_local_mem.cpp)
165166

166167
set(KERNEL_HEADER ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/kernel_entry_points.h)
167168
add_custom_command(OUTPUT ${KERNEL_HEADER}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
3+
// See LICENSE.TXT
4+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
6+
#include <sycl/sycl.hpp>
7+
8+
int main() {
9+
size_t array_size = 16;
10+
size_t local_size = 4;
11+
sycl::queue sycl_queue;
12+
uint32_t *X = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
13+
uint32_t *Y = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
14+
uint32_t *Z = sycl::malloc_shared<uint32_t>(array_size, sycl_queue);
15+
uint32_t A = 42;
16+
17+
sycl_queue.submit([&](sycl::handler &cgh) {
18+
sycl::local_accessor<uint32_t, 1> local_mem(local_size, cgh);
19+
cgh.parallel_for<class saxpy_usm_local_mem>(
20+
sycl::nd_range<1>{{array_size}, {local_size}},
21+
[=](sycl::nd_item<1> itemId) {
22+
auto i = itemId.get_global_linear_id();
23+
auto local_id = itemId.get_local_linear_id();
24+
local_mem[local_id] = i;
25+
Z[i] = A * X[i] + Y[i] + local_mem[local_id] +
26+
itemId.get_local_range(0);
27+
});
28+
});
29+
return 0;
30+
}

test/conformance/exp_command_buffer/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,5 @@ add_conformance_test_with_kernels_environment(exp_command_buffer
1919
update/usm_saxpy_kernel_update.cpp
2020
update/event_sync.cpp
2121
update/kernel_event_sync.cpp
22+
update/local_memory_update.cpp
2223
)

test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,7 @@
3636
{{OPT}}KernelCommandEventSyncUpdateTest.TwoWaitEvents/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
3737
{{OPT}}KernelCommandEventSyncUpdateTest.InvalidWaitUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
3838
{{OPT}}KernelCommandEventSyncUpdateTest.InvalidSignalUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
39+
{{OPT}}LocalMemoryUpdateTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
40+
{{OPT}}LocalMemoryUpdateTest.UpdateParametersAndLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
41+
{{OPT}}LocalMemoryMultiUpdateTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
42+
{{OPT}}LocalMemoryMultiUpdateTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}

0 commit comments

Comments
 (0)