Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
cebad02
[UR][SYCL] Introduce UR api to set kernel args + launch in one call.
aarongreig Jun 3, 2025
eff5f5e
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 10, 2025
82176ff
Fix tsan launchinfo
aarongreig Jun 10, 2025
9ccdfcd
Fix unit tests.
aarongreig Jun 10, 2025
2762c70
Fix native cpu + some cuda/hip fails.
aarongreig Jun 12, 2025
e25f390
Mechanically replace urEnqueueKernelLaunch in e2e tests.
aarongreig Jun 12, 2025
ffa9a11
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 12, 2025
3c87170
Fix a couple of tests and an oversight in the sanitizer layer.
aarongreig Jun 13, 2025
3c38b26
Fix fallthrough.
aarongreig Jun 13, 2025
d118cec
Remove missed SetArg calls.
aarongreig Jun 13, 2025
28d539e
Fix a test and move asan kernel arg handling to helpers.
aarongreig Jun 16, 2025
1fda654
Add missing locks to sanitizer launch with args.
aarongreig Jun 17, 2025
26976b6
Set kernel args in sanitizer layers rather than passing them through.
aarongreig Jun 18, 2025
70aa909
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 18, 2025
cda9d00
Fix printing by adding separate value union member.
aarongreig Jun 18, 2025
29cbd08
Fix global size validation and add some negative tests.
aarongreig Jun 18, 2025
9153d56
Expand testing + validation.
aarongreig Jun 20, 2025
74e19e1
Add kernel arg storage to queue_impl rather than re-allocate for ever…
aarongreig Jun 20, 2025
2879552
Spec cleanup, add rst file
aarongreig Jun 20, 2025
4744151
Fix unittest build.
aarongreig Jun 20, 2025
bafcebd
Minor l0 fix: handle sampler args with SetArgValueHelper.
aarongreig Jun 20, 2025
aa2367f
Move native cpu set arg logic entirely to common code.
aarongreig Jun 24, 2025
a5d4b8c
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 24, 2025
4e16f64
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 26, 2025
a6d4bed
Add offload implementation.
aarongreig Jun 26, 2025
b712d44
Update offload interface loader.
aarongreig Jun 26, 2025
d03b90d
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 27, 2025
8c6ebf2
Add out of order definition for l0 v2 adapter.
aarongreig Jun 27, 2025
8e52a62
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jun 30, 2025
ed4dded
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 1, 2025
2735bde
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 7, 2025
2f81949
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 8, 2025
a5cad22
Move minimum global size logic to adjustNDRangePerKernel.
aarongreig Jul 14, 2025
793376a
Correct work dimension normalization.
aarongreig Jul 16, 2025
81b0816
Fix msvc.
aarongreig Jul 17, 2025
7c5c8bc
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 18, 2025
09eb83a
Allow arbitrary work dims
aarongreig Jul 24, 2025
71094ce
Don't rely on queue based shared kernel arg storage.
aarongreig Jul 24, 2025
327c6c8
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 24, 2025
0f8c250
Fix incorrect vector initialization.
aarongreig Jul 24, 2025
9362484
Remove unused kernel arg storage.
aarongreig Jul 25, 2025
f3916b5
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 25, 2025
22f7783
Merge branch 'sycl' into aaron/enqueueKernelWithArgs
aarongreig Jul 25, 2025
0cfad2a
Fix image_selection e2e test.
aarongreig Jul 25, 2025
71512e6
Fix bad find/replace in comment.
aarongreig Jul 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 157 additions & 50 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2315,14 +2315,14 @@ ur_mem_flags_t AccessModeToUr(access::mode AccessorMode) {
}
}

// Sets arguments for a given kernel and device based on the argument type.
// Refactored from SetKernelParamsAndLaunch to allow it to be used in the graphs
// extension.
static void SetArgBasedOnType(
adapter_impl &Adapter, ur_kernel_handle_t Kernel,
// Gets UR argument struct for a given kernel and device based on the argument
// type. Refactored from SetKernelParamsAndLaunch to allow it to be used in
// the graphs extension (LaunchWithArgs for graphs is planned future work).
static void GetUrArgsBasedOnType(
device_image_impl *DeviceImageImpl,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) {
context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex,
std::vector<ur_exp_kernel_arg_properties_t> &UrArgs) {
switch (Arg.MType) {
case kernel_param_kind_t::kind_dynamic_work_group_memory:
break;
Expand All @@ -2342,52 +2342,61 @@ static void SetArgBasedOnType(
getMemAllocationFunc
? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
: nullptr;
ur_kernel_arg_mem_obj_properties_t MemObjData{};
MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
Adapter.call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
&MemObjData, MemArg);
ur_exp_kernel_arg_value_t Value = {};
Value.memObjTuple = {MemArg, AccessModeToUr(Req->MAccessMode)};
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ,
static_cast<uint32_t>(NextTrueIndex), sizeof(MemArg),
Value});
break;
}
case kernel_param_kind_t::kind_std_layout: {
ur_exp_kernel_arg_type_t Type;
if (Arg.MPtr) {
Adapter.call<UrApiKind::urKernelSetArgValue>(
Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
Type = UR_EXP_KERNEL_ARG_TYPE_VALUE;
} else {
Adapter.call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
Arg.MSize, nullptr);
Type = UR_EXP_KERNEL_ARG_TYPE_LOCAL;
}
ur_exp_kernel_arg_value_t Value = {};
Value.value = {Arg.MPtr};
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
Type, static_cast<uint32_t>(NextTrueIndex),
static_cast<size_t>(Arg.MSize), Value});

break;
}
case kernel_param_kind_t::kind_sampler: {
sampler *SamplerPtr = (sampler *)Arg.MPtr;
ur_sampler_handle_t Sampler =
(ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
->getOrCreateSampler(ContextImpl);
Adapter.call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
nullptr, Sampler);
ur_exp_kernel_arg_value_t Value = {};
Value.sampler = (ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
->getOrCreateSampler(ContextImpl);
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_SAMPLER,
static_cast<uint32_t>(NextTrueIndex),
sizeof(ur_sampler_handle_t), Value});
break;
}
case kernel_param_kind_t::kind_pointer: {
// We need to de-rerence this to get the actual USM allocation - that's the
ur_exp_kernel_arg_value_t Value = {};
// We need to de-rerence to get the actual USM allocation - that's the
// pointer UR is expecting.
const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
nullptr, Ptr);
Value.pointer = *static_cast<void *const *>(Arg.MPtr);
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_POINTER,
static_cast<uint32_t>(NextTrueIndex), sizeof(Arg.MPtr),
Value});
break;
}
case kernel_param_kind_t::kind_specialization_constants_buffer: {
assert(DeviceImageImpl != nullptr);
ur_mem_handle_t SpecConstsBuffer =
DeviceImageImpl->get_spec_const_buffer_ref();

ur_kernel_arg_mem_obj_properties_t MemObjProps{};
MemObjProps.pNext = nullptr;
MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
Adapter.call<UrApiKind::urKernelSetArgMemObj>(
Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
ur_exp_kernel_arg_value_t Value = {};
Value.memObjTuple = {SpecConstsBuffer, UR_MEM_FLAG_READ_ONLY};
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_MEM_OBJ,
static_cast<uint32_t>(NextTrueIndex),
sizeof(SpecConstsBuffer), Value});
break;
}
case kernel_param_kind_t::kind_invalid:
Expand Down Expand Up @@ -2420,22 +2429,32 @@ static ur_result_t SetKernelParamsAndLaunch(
DeviceImageImpl ? DeviceImageImpl->get_spec_const_blob_ref() : Empty);
}

std::vector<ur_exp_kernel_arg_properties_t> UrArgs;
UrArgs.reserve(Args.size());

if (KernelFuncPtr && !KernelHasSpecialCaptures) {
auto setFunc = [&Adapter, Kernel,
auto setFunc = [&UrArgs,
KernelFuncPtr](const detail::kernel_param_desc_t &ParamDesc,
size_t NextTrueIndex) {
const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset;
switch (ParamDesc.kind) {
case kernel_param_kind_t::kind_std_layout: {
int Size = ParamDesc.info;
Adapter.call<UrApiKind::urKernelSetArgValue>(Kernel, NextTrueIndex,
Size, nullptr, ArgPtr);
ur_exp_kernel_arg_value_t Value = {};
Value.value = ArgPtr;
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_VALUE,
static_cast<uint32_t>(NextTrueIndex),
static_cast<size_t>(Size), Value});
break;
}
case kernel_param_kind_t::kind_pointer: {
const void *Ptr = *static_cast<const void *const *>(ArgPtr);
Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
nullptr, Ptr);
ur_exp_kernel_arg_value_t Value = {};
Value.pointer = *static_cast<const void *const *>(ArgPtr);
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES, nullptr,
UR_EXP_KERNEL_ARG_TYPE_POINTER,
static_cast<uint32_t>(NextTrueIndex),
sizeof(Value.pointer), Value});
break;
}
default:
Expand All @@ -2445,10 +2464,10 @@ static ur_result_t SetKernelParamsAndLaunch(
applyFuncOnFilteredArgs(EliminatedArgMask, KernelNumArgs,
KernelParamDescGetter, setFunc);
} else {
auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc,
&Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) {
SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc,
Queue.getContextImpl(), Arg, NextTrueIndex);
auto setFunc = [&DeviceImageImpl, &getMemAllocationFunc, &Queue,
&UrArgs](detail::ArgDesc &Arg, size_t NextTrueIndex) {
GetUrArgsBasedOnType(DeviceImageImpl, getMemAllocationFunc,
Queue.getContextImpl(), Arg, NextTrueIndex, UrArgs);
};
applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc);
}
Expand All @@ -2461,8 +2480,12 @@ static ur_result_t SetKernelParamsAndLaunch(
// CUDA-style local memory setting. Note that we may have -1 as a position,
// this indicates the buffer is actually unused and was elided.
if (ImplicitLocalArg.has_value() && ImplicitLocalArg.value() != -1) {
Adapter.call<UrApiKind::urKernelSetArgLocal>(
Kernel, ImplicitLocalArg.value(), WorkGroupMemorySize, nullptr);
UrArgs.push_back({UR_STRUCTURE_TYPE_EXP_KERNEL_ARG_PROPERTIES,
nullptr,
UR_EXP_KERNEL_ARG_TYPE_LOCAL,
static_cast<uint32_t>(ImplicitLocalArg.value()),
WorkGroupMemorySize,
{nullptr}});
}

adjustNDRangePerKernel(NDRDesc, Kernel, Queue.getDeviceImpl());
Expand Down Expand Up @@ -2520,20 +2543,104 @@ static ur_result_t SetKernelParamsAndLaunch(
{{WorkGroupMemorySize}}});
}
ur_event_handle_t UREvent = nullptr;
ur_result_t Error = Adapter.call_nocheck<UrApiKind::urEnqueueKernelLaunch>(
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
LocalSize, property_list.size(),
property_list.empty() ? nullptr : property_list.data(), RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0],
OutEventImpl ? &UREvent : nullptr);
ur_result_t Error =
Adapter.call_nocheck<UrApiKind::urEnqueueKernelLaunchWithArgsExp>(
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
&NDRDesc.GlobalSize[0], LocalSize, UrArgs.size(), UrArgs.data(),
property_list.size(),
property_list.empty() ? nullptr : property_list.data(),
RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0],
OutEventImpl ? &UREvent : nullptr);
if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
OutEventImpl->setHandle(UREvent);
}

return Error;
}

// Sets arguments for a given kernel and device based on the argument type.
// This is a legacy path which the graphs extension still uses.
static void SetArgBasedOnType(
adapter_impl &Adapter, ur_kernel_handle_t Kernel,
device_image_impl *DeviceImageImpl,
const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
context_impl &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) {
switch (Arg.MType) {
case kernel_param_kind_t::kind_dynamic_work_group_memory:
break;
case kernel_param_kind_t::kind_work_group_memory:
break;
case kernel_param_kind_t::kind_stream:
break;
case kernel_param_kind_t::kind_dynamic_accessor:
case kernel_param_kind_t::kind_accessor: {
Requirement *Req = (Requirement *)(Arg.MPtr);

// getMemAllocationFunc is nullptr when there are no requirements. However,
// we may pass default constructed accessors to a command, which don't add
// requirements. In such case, getMemAllocationFunc is nullptr, but it's a
// valid case, so we need to properly handle it.
ur_mem_handle_t MemArg =
getMemAllocationFunc
? reinterpret_cast<ur_mem_handle_t>(getMemAllocationFunc(Req))
: nullptr;
ur_kernel_arg_mem_obj_properties_t MemObjData{};
MemObjData.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjData.memoryAccess = AccessModeToUr(Req->MAccessMode);
Adapter.call<UrApiKind::urKernelSetArgMemObj>(Kernel, NextTrueIndex,
&MemObjData, MemArg);
break;
}
case kernel_param_kind_t::kind_std_layout: {
if (Arg.MPtr) {
Adapter.call<UrApiKind::urKernelSetArgValue>(
Kernel, NextTrueIndex, Arg.MSize, nullptr, Arg.MPtr);
} else {
Adapter.call<UrApiKind::urKernelSetArgLocal>(Kernel, NextTrueIndex,
Arg.MSize, nullptr);
}

break;
}
case kernel_param_kind_t::kind_sampler: {
sampler *SamplerPtr = (sampler *)Arg.MPtr;
ur_sampler_handle_t Sampler =
(ur_sampler_handle_t)detail::getSyclObjImpl(*SamplerPtr)
->getOrCreateSampler(ContextImpl);
Adapter.call<UrApiKind::urKernelSetArgSampler>(Kernel, NextTrueIndex,
nullptr, Sampler);
break;
}
case kernel_param_kind_t::kind_pointer: {
// We need to de-rerence this to get the actual USM allocation - that's the
// pointer UR is expecting.
const void *Ptr = *static_cast<const void *const *>(Arg.MPtr);
Adapter.call<UrApiKind::urKernelSetArgPointer>(Kernel, NextTrueIndex,
nullptr, Ptr);
break;
}
case kernel_param_kind_t::kind_specialization_constants_buffer: {
assert(DeviceImageImpl != nullptr);
ur_mem_handle_t SpecConstsBuffer =
DeviceImageImpl->get_spec_const_buffer_ref();

ur_kernel_arg_mem_obj_properties_t MemObjProps{};
MemObjProps.pNext = nullptr;
MemObjProps.stype = UR_STRUCTURE_TYPE_KERNEL_ARG_MEM_OBJ_PROPERTIES;
MemObjProps.memoryAccess = UR_MEM_FLAG_READ_ONLY;
Adapter.call<UrApiKind::urKernelSetArgMemObj>(
Kernel, NextTrueIndex, &MemObjProps, SpecConstsBuffer);
break;
}
case kernel_param_kind_t::kind_invalid:
throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
"Invalid kernel param kind " +
codeToString(UR_RESULT_ERROR_INVALID_VALUE));
break;
}
}

static std::tuple<ur_kernel_handle_t, device_image_impl *,
const KernelArgMask *>
getCGKernelInfo(const CGExecKernel &CommandGroup, context_impl &ContextImpl,
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/Adapters/level_zero/batch_barrier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ int main(int argc, char *argv[]) {
queue q;

submit_kernel(q); // starts a batch
// CHECK: ---> urEnqueueKernelLaunch
// CHECK: ---> urEnqueueKernelLaunchWithArgsExp
// CHECK-NOT: zeCommandQueueExecuteCommandLists

// Initialize Level Zero driver is required if this test is linked
Expand All @@ -41,7 +41,7 @@ int main(int argc, char *argv[]) {
// CHECK-NOT: zeCommandQueueExecuteCommandLists

submit_kernel(q);
// CHECK: ---> urEnqueueKernelLaunch
// CHECK: ---> urEnqueueKernelLaunchWithArgsExp
// CHECK-NOT: zeCommandQueueExecuteCommandLists

// interop should close the batch
Expand Down
Loading
Loading