diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 0713a66ef3ef5..36551f70b10a0 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -223,3 +223,6 @@ llvm/test/Instrumentation/ThreadSanitizer/ @intel/dpcpp-sanitizers-review
sycl/test-e2e/AddressSanitizer/ @intel/dpcpp-sanitizers-review
sycl/test-e2e/MemorySanitizer/ @intel/dpcpp-sanitizers-review
sycl/test-e2e/ThreadSanitizer/ @intel/dpcpp-sanitizers-review
+
+# ABI compatibility
+devops/compat_ci_exclude.sycl-rel-** @gmlueck @xtian-github
diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
index ee913fc8d0eeb..75a4baea8bb0c 100644
--- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
+++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp
@@ -1293,6 +1293,49 @@ class BinaryWrapper {
appendToGlobalDtors(M, Func, /*Priority*/ 1);
}
+ void createSyclRegisterWithAtexitUnregister(GlobalVariable *BinDesc) {
+ auto *UnregFuncTy =
+ FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
+ auto *UnregFunc =
+ Function::Create(UnregFuncTy, GlobalValue::InternalLinkage,
+ "sycl.descriptor_unreg.atexit", &M);
+ UnregFunc->setSection(".text.startup");
+
+ // Declaration for __sycl_unregister_lib(void*).
+ auto *UnregTargetTy =
+ FunctionType::get(Type::getVoidTy(C), getPtrTy(), false);
+ FunctionCallee UnregTargetC =
+ M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy);
+
+ IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc));
+ UnregBuilder.CreateCall(UnregTargetC, BinDesc);
+ UnregBuilder.CreateRetVoid();
+
+ auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
+ auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage,
+ "sycl.descriptor_reg", &M);
+ RegFunc->setSection(".text.startup");
+
+ auto *RegTargetTy =
+ FunctionType::get(Type::getVoidTy(C), getPtrTy(), false);
+ FunctionCallee RegTargetC =
+ M.getOrInsertFunction("__sycl_register_lib", RegTargetTy);
+
+    // `atexit` takes a `void (*)()` function pointer and returns `int`, so
+    // its type is modeled in LLVM IR as `i32 (ptr)`.
+ FunctionType *AtExitTy =
+ FunctionType::get(Type::getInt32Ty(C), getPtrTy(), false);
+ FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy);
+
+ IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc));
+ RegBuilder.CreateCall(RegTargetC, BinDesc);
+ RegBuilder.CreateCall(AtExitC, UnregFunc);
+ RegBuilder.CreateRetVoid();
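+
+    // For illustration, the constructor built above is expected to lower to
+    // IR roughly like this (global names are illustrative, not verbatim
+    // output; the descriptor symbol depends on the wrapped image):
+    //
+    //   define internal void @sycl.descriptor_reg() section ".text.startup" {
+    //   entry:
+    //     call void @__sycl_register_lib(ptr @descriptor)
+    //     %0 = call i32 @atexit(ptr @sycl.descriptor_unreg.atexit)
+    //     ret void
+    //   }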
+
+    // Add this function to global constructors.
+ appendToGlobalCtors(M, RegFunc, /*Priority*/ 1);
+ }
+
public:
BinaryWrapper(StringRef Target, StringRef ToolName,
StringRef SymPropBCFiles = "")
@@ -1370,8 +1413,13 @@ class BinaryWrapper {
if (EmitRegFuncs) {
GlobalVariable *Desc = *DescOrErr;
- createRegisterFunction(Kind, Desc);
- createUnregisterFunction(Kind, Desc);
+ if (Kind == OffloadKind::SYCL &&
+ Triple(M.getTargetTriple()).isOSWindows()) {
+ createSyclRegisterWithAtexitUnregister(Desc);
+ } else {
+ createRegisterFunction(Kind, Desc);
+ createUnregisterFunction(Kind, Desc);
+ }
}
}
return &M;
diff --git a/devops/scripts/benchmarks/html/scripts.js b/devops/scripts/benchmarks/html/scripts.js
index 558021a13ab4a..28e3a708c2956 100644
--- a/devops/scripts/benchmarks/html/scripts.js
+++ b/devops/scripts/benchmarks/html/scripts.js
@@ -17,6 +17,125 @@ let annotationsOptions = new Map(); // Global options map for annotations
let archivedDataLoaded = false;
let loadedBenchmarkRuns = []; // Loaded results from the js/json files
+// Toggle configuration and abstraction
+//
+// HOW TO ADD A NEW TOGGLE:
+// 1. Add HTML checkbox to index.html:
+//    <input type="checkbox" id="my-toggle">
+//
+// 2. Add configuration below:
+// 'my-toggle': {
+// defaultValue: false, // true = enabled by default, false = disabled by default
+// urlParam: 'myParam', // Name shown in URL (?myParam=true)
+// invertUrlParam: false, // false = normal behavior, true = legacy inverted logic
+// onChange: function(isEnabled) { // Function called when toggle state changes
+// // Your logic here
+// updateURL(); // Always call this to update the browser URL
+// }
+// }
+//
+// 3. (Optional) Add helper function for cleaner, more readable code:
+// function isMyToggleEnabled() { return isToggleEnabled('my-toggle'); }
+//
+// This lets you write: if (isMyToggleEnabled()) { ... }
+// Instead of: if (isToggleEnabled('my-toggle')) { ... }
+//
+
+const toggleConfigs = {
+ 'show-notes': {
+ defaultValue: true,
+ urlParam: 'notes',
+    invertUrlParam: true, // Param is written to the URL only when disabled (legacy behavior)
+ onChange: function(isEnabled) {
+ document.querySelectorAll('.benchmark-note').forEach(note => {
+ note.style.display = isEnabled ? 'block' : 'none';
+ });
+ updateURL();
+ }
+ },
+ 'show-unstable': {
+ defaultValue: false,
+ urlParam: 'unstable',
+ invertUrlParam: false,
+ onChange: function(isEnabled) {
+ document.querySelectorAll('.benchmark-unstable').forEach(warning => {
+ warning.style.display = isEnabled ? 'block' : 'none';
+ });
+ filterCharts();
+ }
+ },
+ 'custom-range': {
+ defaultValue: false,
+ urlParam: 'customRange',
+ invertUrlParam: false,
+ onChange: function(isEnabled) {
+ updateCharts();
+ }
+ },
+ 'show-archived-data': {
+ defaultValue: false,
+ urlParam: 'archived',
+ invertUrlParam: false,
+ onChange: function(isEnabled) {
+ if (isEnabled) {
+ loadArchivedData();
+ } else {
+ if (archivedDataLoaded) {
+ location.reload();
+ }
+ }
+ updateURL();
+ }
+ }
+};
+
+// Generic toggle helper functions
+function isToggleEnabled(toggleId) {
+ const toggle = document.getElementById(toggleId);
+ return toggle ? toggle.checked : toggleConfigs[toggleId]?.defaultValue || false;
+}
+
+function setupToggle(toggleId, config) {
+ const toggle = document.getElementById(toggleId);
+ if (!toggle) return;
+
+ // Set up event listener
+ toggle.addEventListener('change', function() {
+ config.onChange(toggle.checked);
+ });
+
+ // Initialize from URL params if present
+ const urlParam = getQueryParam(config.urlParam);
+  if (urlParam !== null) {
+    // The URL param always stores the literal state; `invertUrlParam` only
+    // controls which state is omitted from the URL (see updateToggleURL).
+    toggle.checked = urlParam === 'true';
+ } else {
+ // Use default value
+ toggle.checked = config.defaultValue;
+ }
+}
+
+function updateToggleURL(toggleId, config, url) {
+ const isEnabled = isToggleEnabled(toggleId);
+
+ if (config.invertUrlParam) {
+ // For inverted params, store in URL when disabled
+ if (isEnabled) {
+ url.searchParams.delete(config.urlParam);
+ } else {
+ url.searchParams.set(config.urlParam, 'false');
+ }
+ } else {
+ // For normal params, store in URL when enabled
+ if (!isEnabled) {
+ url.searchParams.delete(config.urlParam);
+ } else {
+ url.searchParams.set(config.urlParam, 'true');
+ }
+ }
+}
+
// DOM Elements
let runSelect, selectedRunsDiv, suiteFiltersContainer, tagFiltersContainer;
@@ -627,30 +746,10 @@ function updateURL() {
url.searchParams.delete('runs');
}
- // Add toggle states to URL
- if (isNotesEnabled()) {
- url.searchParams.delete('notes');
- } else {
- url.searchParams.set('notes', 'false');
- }
-
- if (!isUnstableEnabled()) {
- url.searchParams.delete('unstable');
- } else {
- url.searchParams.set('unstable', 'true');
- }
-
- if (!isCustomRangesEnabled()) {
- url.searchParams.delete('customRange');
- } else {
- url.searchParams.set('customRange', 'true');
- }
-
- if (!isArchivedDataEnabled()) {
- url.searchParams.delete('archived');
- } else {
- url.searchParams.set('archived', 'true');
- }
+ // Update toggle states in URL using the generic helper
+ Object.entries(toggleConfigs).forEach(([toggleId, config]) => {
+ updateToggleURL(toggleId, config, url);
+ });
history.replaceState(null, '', url);
}
@@ -949,94 +1048,26 @@ function setupSuiteFilters() {
}
function isNotesEnabled() {
- const notesToggle = document.getElementById('show-notes');
- return notesToggle.checked;
+ return isToggleEnabled('show-notes');
}
function isUnstableEnabled() {
- const unstableToggle = document.getElementById('show-unstable');
- return unstableToggle.checked;
+ return isToggleEnabled('show-unstable');
}
function isCustomRangesEnabled() {
- const rangesToggle = document.getElementById('custom-range');
- return rangesToggle.checked;
+ return isToggleEnabled('custom-range');
}
function isArchivedDataEnabled() {
- const archivedDataToggle = document.getElementById('show-archived-data');
- return archivedDataToggle.checked;
+ return isToggleEnabled('show-archived-data');
}
function setupToggles() {
- const notesToggle = document.getElementById('show-notes');
- const unstableToggle = document.getElementById('show-unstable');
- const customRangeToggle = document.getElementById('custom-range');
- const archivedDataToggle = document.getElementById('show-archived-data');
-
- notesToggle.addEventListener('change', function () {
- // Update all note elements visibility
- document.querySelectorAll('.benchmark-note').forEach(note => {
- note.style.display = isNotesEnabled() ? 'block' : 'none';
- });
- updateURL();
- });
-
- unstableToggle.addEventListener('change', function () {
- // Update all unstable warning elements visibility
- document.querySelectorAll('.benchmark-unstable').forEach(warning => {
- warning.style.display = isUnstableEnabled() ? 'block' : 'none';
- });
- filterCharts();
- });
-
- customRangeToggle.addEventListener('change', function () {
- // redraw all charts
- updateCharts();
+ // Set up all toggles using the configuration
+ Object.entries(toggleConfigs).forEach(([toggleId, config]) => {
+ setupToggle(toggleId, config);
});
-
- // Add event listener for archived data toggle
- if (archivedDataToggle) {
- archivedDataToggle.addEventListener('change', function() {
- if (archivedDataToggle.checked) {
- loadArchivedData();
- } else {
- if (archivedDataLoaded) {
- // Reload the page to reset
- location.reload();
- }
- }
- updateURL();
- });
- }
-
- // Initialize from URL params if present
- const notesParam = getQueryParam('notes');
- const unstableParam = getQueryParam('unstable');
- const archivedParam = getQueryParam('archived');
-
- if (notesParam !== null) {
- let showNotes = notesParam === 'true';
- notesToggle.checked = showNotes;
- }
-
- if (unstableParam !== null) {
- let showUnstable = unstableParam === 'true';
- unstableToggle.checked = showUnstable;
- }
-
- const customRangesParam = getQueryParam('customRange');
- if (customRangesParam !== null) {
- customRangeToggle.checked = customRangesParam === 'true';
- }
-
- if (archivedDataToggle && archivedParam !== null) {
- archivedDataToggle.checked = archivedParam === 'true';
-
- if (archivedDataToggle.checked) {
- loadArchivedData();
- }
- }
}
function setupTagFilters() {
@@ -1154,9 +1185,10 @@ function initializeCharts() {
// Setup UI components
setupRunSelector();
setupSuiteFilters();
- setupTagFilters();
setupToggles();
initializePlatformTab();
+ // Setup tag filters after everything else is ready
+ setupTagFilters();
// Apply URL parameters
const regexParam = getQueryParam('regex');
diff --git a/libdevice/sanitizer/msan_rtl.cpp b/libdevice/sanitizer/msan_rtl.cpp
index 68c0db6000497..87d57fc6950c5 100644
--- a/libdevice/sanitizer/msan_rtl.cpp
+++ b/libdevice/sanitizer/msan_rtl.cpp
@@ -671,7 +671,7 @@ __msan_unpoison_shadow_dynamic_local(uptr ptr, uint32_t num_args) {
"__msan_unpoison_shadow_dynamic_local"));
}
-static __SYCL_CONSTANT__ const char __msan_print_set_shadow_private[] =
+static __SYCL_CONSTANT__ const char __msan_print_set_shadow[] =
"[kernel] __msan_set_value(beg=%p, end=%p, val=%02X)\n";
// We outline the function of setting shadow memory of private memory, because
@@ -684,8 +684,7 @@ DEVICE_EXTERN_C_NOINLINE void __msan_poison_stack(__SYCL_PRIVATE__ void *ptr,
MSAN_DEBUG(__spirv_ocl_printf(__msan_print_func_beg, "__msan_poison_stack"));
auto shadow_address = MemToShadow((uptr)ptr, ADDRESS_SPACE_PRIVATE);
- MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow_private,
- (void *)shadow_address,
+ MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address,
(void *)(shadow_address + size), 0xff));
if (shadow_address != GetMsanLaunchInfo->CleanShadow) {
@@ -704,8 +703,7 @@ DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_stack(__SYCL_PRIVATE__ void *ptr,
__spirv_ocl_printf(__msan_print_func_beg, "__msan_unpoison_stack"));
auto shadow_address = MemToShadow((uptr)ptr, ADDRESS_SPACE_PRIVATE);
- MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow_private,
- (void *)shadow_address,
+ MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address,
(void *)(shadow_address + size), 0x0));
if (shadow_address != GetMsanLaunchInfo->CleanShadow) {
@@ -716,6 +714,26 @@ DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_stack(__SYCL_PRIVATE__ void *ptr,
__spirv_ocl_printf(__msan_print_func_end, "__msan_unpoison_stack"));
}
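+
+// Clear the shadow of [ptr, ptr + size) in address space `as`. Called by the
+// MSan SPIR-V instrumentation for destinations known to be initialized, e.g.
+// the sret result of a supported SYCL builtin.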
+DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_shadow(uptr ptr, uint32_t as,
+ uptr size) {
+ if (!GetMsanLaunchInfo)
+ return;
+
+ MSAN_DEBUG(
+ __spirv_ocl_printf(__msan_print_func_beg, "__msan_unpoison_shadow"));
+
+ auto shadow_address = MemToShadow(ptr, as);
+ MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address,
+ (void *)(shadow_address + size), 0x0));
+
+ if (shadow_address != GetMsanLaunchInfo->CleanShadow) {
+ Memset((__SYCL_GLOBAL__ char *)shadow_address, 0, size);
+ }
+
+ MSAN_DEBUG(
+ __spirv_ocl_printf(__msan_print_func_end, "__msan_unpoison_shadow"));
+}
+
static __SYCL_CONSTANT__ const char __msan_print_private_base[] =
"[kernel] __msan_set_private_base(sid=%llu): %p\n";
diff --git a/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp
index 3d227d0c2e050..f54d552f10629 100644
--- a/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp
@@ -34,6 +34,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/PropertySetIO.h"
+#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include
#include
@@ -734,6 +735,51 @@ struct Wrapper {
// Add this function to global destructors.
appendToGlobalDtors(M, Func, /*Priority*/ 1);
}
+
+ void createSyclRegisterWithAtexitUnregister(GlobalVariable *FatbinDesc) {
+ auto *UnregFuncTy =
+ FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
+ auto *UnregFunc =
+ Function::Create(UnregFuncTy, GlobalValue::InternalLinkage,
+ "sycl.descriptor_unreg.atexit", &M);
+ UnregFunc->setSection(".text.startup");
+
+ // Declaration for __sycl_unregister_lib(void*).
+ auto *UnregTargetTy =
+ FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false);
+ FunctionCallee UnregTargetC =
+ M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy);
+
+ // Body of the unregister wrapper.
+ IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc));
+ UnregBuilder.CreateCall(UnregTargetC, FatbinDesc);
+ UnregBuilder.CreateRetVoid();
+
+ auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
+ auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage,
+ "sycl.descriptor_reg", &M);
+ RegFunc->setSection(".text.startup");
+
+ auto *RegTargetTy =
+ FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false);
+ FunctionCallee RegTargetC =
+ M.getOrInsertFunction("__sycl_register_lib", RegTargetTy);
+
+    // `atexit` takes a `void (*)()` function pointer and returns `int`, so
+    // its type is modeled in LLVM IR as `i32 (ptr)`.
+ FunctionType *AtExitTy = FunctionType::get(
+ Type::getInt32Ty(C), PointerType::getUnqual(C), false);
+ FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy);
+
+ IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc));
+ RegBuilder.CreateCall(RegTargetC, FatbinDesc);
+ RegBuilder.CreateCall(AtExitC, UnregFunc);
+ RegBuilder.CreateRetVoid();
+
+ // Finally, add to global constructors.
+ appendToGlobalCtors(M, RegFunc, /*Priority*/ 1);
+ }
+
}; // end of Wrapper
} // anonymous namespace
@@ -747,7 +793,11 @@ Error llvm::offloading::wrapSYCLBinaries(llvm::Module &M,
return createStringError(inconvertibleErrorCode(),
"No binary descriptors created.");
- W.createRegisterFatbinFunction(Desc);
- W.createUnregisterFunction(Desc);
+ if (Triple(M.getTargetTriple()).isOSWindows()) {
+ W.createSyclRegisterWithAtexitUnregister(Desc);
+ } else {
+ W.createRegisterFatbinFunction(Desc);
+ W.createUnregisterFunction(Desc);
+ }
return Error::success();
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index bb9c7611059bb..f14c538ec8637 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -813,6 +813,8 @@ class MemorySanitizerOnSpirv {
Constant *getOrCreateGlobalString(StringRef Name, StringRef Value,
unsigned AddressSpace);
+ static bool isSupportedBuiltIn(StringRef Name);
+
operator bool() const { return IsSPIRV; }
private:
@@ -823,7 +825,6 @@ class MemorySanitizerOnSpirv {
void instrumentKernelsMetadata(int TrackOrigins);
void instrumentPrivateArguments(Function &F, Instruction *FnPrologueEnd);
void instrumentPrivateBase(Function &F);
-
void initializeRetVecMap(Function *F);
void initializeKernelCallerMap(Function *F);
@@ -856,6 +857,7 @@ class MemorySanitizerOnSpirv {
FunctionCallee MsanUnpoisonShadowDynamicLocalFunc;
FunctionCallee MsanBarrierFunc;
FunctionCallee MsanUnpoisonStackFunc;
+ FunctionCallee MsanUnpoisonShadowFunc;
FunctionCallee MsanSetPrivateBaseFunc;
FunctionCallee MsanUnpoisonStridedCopyFunc;
};
@@ -949,6 +951,14 @@ void MemorySanitizerOnSpirv::initializeCallbacks() {
MsanUnpoisonStackFunc = M.getOrInsertFunction(
"__msan_unpoison_stack", IRB.getVoidTy(), PtrTy, IntptrTy);
+  // __msan_unpoison_shadow(
+ // uptr ptr,
+ // uint32_t as,
+ // size_t size
+ // )
+ MsanUnpoisonShadowFunc = M.getOrInsertFunction(
+ "__msan_unpoison_shadow", IRB.getVoidTy(), IntptrTy, Int32Ty, IntptrTy);
+
// __msan_set_private_base(
// as(0) void * ptr
// )
@@ -987,9 +997,16 @@ void MemorySanitizerOnSpirv::instrumentGlobalVariables() {
G.setName("nameless_global");
if (isUnsupportedDeviceGlobal(&G)) {
- for (auto *User : G.users())
-      if (auto *Inst = dyn_cast<Instruction>(User))
- Inst->setNoSanitizeMetadata();
+ for (auto *User : G.users()) {
+      if (!isa<Instruction>(User))
+ continue;
+      if (auto *CI = dyn_cast<CallInst>(User)) {
+ Function *Callee = CI->getCalledFunction();
+ if (Callee && isSupportedBuiltIn(Callee->getName()))
+ continue;
+ }
+      cast<Instruction>(User)->setNoSanitizeMetadata();
+ }
continue;
}
@@ -1150,6 +1167,10 @@ void MemorySanitizerOnSpirv::instrumentPrivateBase(Function &F) {
IRB.CreateCall(MsanSetPrivateBaseFunc, {PrivateBase});
}
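+
+// Builtins that the SPIR-V MemorySanitizer instrumentation handles explicitly;
+// calls to them keep their sanitize metadata even when an operand is an
+// otherwise-unsupported device global (see instrumentGlobalVariables).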
+bool MemorySanitizerOnSpirv::isSupportedBuiltIn(StringRef Name) {
+ return Name.contains("__sycl_getComposite2020SpecConstantValue");
+}
+
void MemorySanitizerOnSpirv::instrumentPrivateArguments(
Function &F, Instruction *FnPrologueEnd) {
if (!ClSpirOffloadPrivates)
@@ -6994,6 +7015,25 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.CreatePointerCast(Src, MS.Spirv.IntptrTy),
IRB.getInt32(Src->getType()->getPointerAddressSpace()),
IRB.getInt32(ElementSize), NumElements, Stride});
+ } else if (FuncName.contains(
+ "__sycl_getComposite2020SpecConstantValue")) {
+ // clang-format off
+        // Handle builtin functions like "_Z40__sycl_getComposite2020SpecConstantValue".
+        // Structs larger than 64 bits are returned via an sret argument and
+        // initialized inside the callee, so the sret argument needs to be
+        // unpoisoned after the call.
+ // clang-format on
+ if (Func->hasStructRetAttr()) {
+ Type *SCTy = Func->getParamStructRetType(0);
+ unsigned Size = Func->getDataLayout().getTypeStoreSize(SCTy);
+ auto *Addr = CB.getArgOperand(0);
+ IRB.CreateCall(
+ MS.Spirv.MsanUnpoisonShadowFunc,
+ {IRB.CreatePointerCast(Addr, MS.Spirv.IntptrTy),
+ ConstantInt::get(MS.Spirv.Int32Ty,
+ Addr->getType()->getPointerAddressSpace()),
+ ConstantInt::get(MS.Spirv.IntptrTy, Size)});
+ }
}
}
}
diff --git a/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll
new file mode 100644
index 0000000000000..f910af0cf92bc
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -passes=msan -msan-instrumentation-with-call-threshold=0 -msan-eager-checks=1 -msan-poison-stack-with-call=1 -S | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
+target triple = "spir64-unknown-unknown"
+
+%"class.sycl::_V1::specialization_id" = type { %"struct.user_def_types::no_cnstr" }
+%"struct.user_def_types::no_cnstr" = type { float, i32, i8 }
+
+@__usid_str = external addrspace(4) constant [57 x i8]
+@_Z19spec_const_externalIN14user_def_types8no_cnstrELi1EE = external addrspace(1) constant %"class.sycl::_V1::specialization_id"
+
+define spir_func i1 @_Z50check_kernel_handler_by_reference_external_handlerRN4sycl3_V114kernel_handlerEN14user_def_types8no_cnstrE() {
+entry:
+ %ref.tmp.i = alloca %"struct.user_def_types::no_cnstr", align 4
+ %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4)
+; CHECK: [[REG1:%[0-9]+]] = ptrtoint ptr addrspace(4) %ref.tmp.ascast.i to i64
+; CHECK: call void @__msan_unpoison_shadow(i64 [[REG1]], i32 4, i64 12)
+ call spir_func void @_Z40__sycl_getComposite2020SpecConstantValueIN14user_def_types8no_cnstrEET_PKcPKvS6_(ptr addrspace(4) dead_on_unwind writable sret(%"struct.user_def_types::no_cnstr") align 4 %ref.tmp.ascast.i, ptr addrspace(4) noundef @__usid_str, ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) @_Z19spec_const_externalIN14user_def_types8no_cnstrELi1EE to ptr addrspace(4)), ptr addrspace(4) noundef null)
+ ret i1 false
+}
+
+declare spir_func void @_Z40__sycl_getComposite2020SpecConstantValueIN14user_def_types8no_cnstrEET_PKcPKvS6_(ptr addrspace(4) sret(%"struct.user_def_types::no_cnstr"), ptr addrspace(4), ptr addrspace(4), ptr addrspace(4))
diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 7e995509da350..d587d0e4ed111 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -8,6 +8,7 @@ compiler and runtime.
| Environment variable | Values | Description |
| -------------------- | ------ | ----------- |
| `ONEAPI_DEVICE_SELECTOR` | [See below.](#oneapi_device_selector) | This device selection environment variable can be used to limit the choice of devices available when the SYCL-using application is run. Useful for limiting devices to a certain type (like GPUs or accelerators) or backends (like Level Zero or OpenCL). This device selection mechanism is replacing `SYCL_DEVICE_FILTER` . The `ONEAPI_DEVICE_SELECTOR` syntax is shared with OpenMP and also allows sub-devices to be chosen. [See below.](#oneapi_device_selector) for a full description. |
+| `ONEAPI_PVC_SEND_WAR_WA` | `1` or `0` | Controls the workaround for the "FP64 register ordering violation" erratum on Intel Ponte Vecchio GPUs. Setting `ONEAPI_PVC_SEND_WAR_WA=0` disables the workaround, which is only safe if the secondary FP64 pipeline is disabled. The default is enabled (`1`), and the workaround is applied throughout the oneAPI software stack, including oneDNN, oneMKL, the OpenCL and Level Zero runtimes, and the Intel Graphics Compiler. |
| `SYCL_DEVICE_ALLOWLIST` | See [below](#sycl_device_allowlist) | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero`, `native_cpu` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu`, `fpga`, or `acc`. `fpga` and `acc` are handled in the same manner. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".|
| `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. |
| `SYCL_CACHE_DIR` | Path | Path to persistent cache root directory. Default values are `%AppData%\libsycl_cache` for Windows and `$XDG_CACHE_HOME/libsycl_cache` on Linux, if `XDG_CACHE_HOME` is not set then `$HOME/.cache/libsycl_cache`. When none of the environment variables are set SYCL persistent cache is disabled. |
diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp
index 5e027466d7949..6fb2dd375fe37 100644
--- a/sycl/source/detail/context_impl.cpp
+++ b/sycl/source/detail/context_impl.cpp
@@ -125,7 +125,8 @@ context_impl::~context_impl() {
DeviceGlobalMapEntry *DGEntry =
detail::ProgramManager::getInstance().getDeviceGlobalEntry(
DeviceGlobal);
- DGEntry->removeAssociatedResources(this);
+ if (DGEntry != nullptr)
+ DGEntry->removeAssociatedResources(this);
}
MCachedLibPrograms.clear();
// TODO catch an exception and put it to list of asynchronous exceptions
diff --git a/sycl/source/detail/device_binary_image.hpp b/sycl/source/detail/device_binary_image.hpp
index ee96ccc998d27..ac4fb92d3f9a8 100644
--- a/sycl/source/detail/device_binary_image.hpp
+++ b/sycl/source/detail/device_binary_image.hpp
@@ -250,7 +250,7 @@ class RTDeviceBinaryImage {
protected:
sycl_device_binary get() const { return Bin; }
- sycl_device_binary Bin;
+ sycl_device_binary Bin = nullptr;
ur::DeviceBinaryType Format = SYCL_DEVICE_BINARY_TYPE_NONE;
RTDeviceBinaryImage::PropertyRange SpecConstIDMap;
diff --git a/sycl/source/detail/device_global_map.hpp b/sycl/source/detail/device_global_map.hpp
index 256c48066ec87..8ff48b3f29c52 100644
--- a/sycl/source/detail/device_global_map.hpp
+++ b/sycl/source/detail/device_global_map.hpp
@@ -76,6 +76,7 @@ class DeviceGlobalMap {
void eraseEntries(const RTDeviceBinaryImage *Img) {
const auto &DeviceGlobals = Img->getDeviceGlobals();
std::lock_guard DeviceGlobalsGuard(MDeviceGlobalsMutex);
for (const sycl_device_binary_property &DeviceGlobal : DeviceGlobals) {
if (auto DevGlobalIt = MDeviceGlobals.find(DeviceGlobal->Name);
DevGlobalIt != MDeviceGlobals.end()) {
@@ -87,6 +88,7 @@ class DeviceGlobalMap {
});
if (findDevGlobalByValue != MPtr2DeviceGlobal.end())
MPtr2DeviceGlobal.erase(findDevGlobalByValue);
+
MDeviceGlobals.erase(DevGlobalIt);
}
}
@@ -112,8 +114,7 @@ class DeviceGlobalMap {
DeviceGlobalMapEntry *getEntry(const void *DeviceGlobalPtr) {
std::lock_guard DeviceGlobalsGuard(MDeviceGlobalsMutex);
auto Entry = MPtr2DeviceGlobal.find(DeviceGlobalPtr);
- assert(Entry != MPtr2DeviceGlobal.end() && "Device global entry not found");
- return Entry->second;
+ return (Entry != MPtr2DeviceGlobal.end()) ? Entry->second : nullptr;
}
DeviceGlobalMapEntry *
diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp
index 1f82a605056dc..7b5ecd0a62133 100644
--- a/sycl/source/detail/device_global_map_entry.cpp
+++ b/sycl/source/detail/device_global_map_entry.cpp
@@ -17,14 +17,6 @@ namespace sycl {
inline namespace _V1 {
namespace detail {
-DeviceGlobalUSMMem::~DeviceGlobalUSMMem() {
- // removeAssociatedResources is expected to have cleaned up both the pointer
- // and the event. When asserts are enabled the values are set, so we check
- // these here.
- assert(MPtr == nullptr && "MPtr has not been cleaned up.");
- assert(MInitEvent == nullptr && "MInitEvent has not been cleaned up.");
-}
-
OwnedUrEvent DeviceGlobalUSMMem::getInitEvent(adapter_impl &Adapter) {
std::lock_guard Lock(MInitEventMutex);
if (MInitEvent == nullptr)
diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp
index 1796e8d179db1..19d37f3210298 100644
--- a/sycl/source/detail/device_global_map_entry.hpp
+++ b/sycl/source/detail/device_global_map_entry.hpp
using EventImplPtr = std::shared_ptr<event_impl>;
struct DeviceGlobalUSMMem {
DeviceGlobalUSMMem(void *Ptr) : MPtr(Ptr) {}
- ~DeviceGlobalUSMMem();
+ ~DeviceGlobalUSMMem() = default;
void *const &getPtr() const noexcept { return MPtr; }
diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp
index ff89080ad19eb..66b181a9bc0eb 100644
--- a/sycl/source/detail/global_handler.cpp
+++ b/sycl/source/detail/global_handler.cpp
@@ -272,7 +272,9 @@ void GlobalHandler::releaseDefaultContexts() {
// For Linux, early shutdown is here, and late shutdown is called from
// a low priority destructor.
struct StaticVarShutdownHandler {
-
+ StaticVarShutdownHandler(const StaticVarShutdownHandler &) = delete;
+ StaticVarShutdownHandler &
+ operator=(const StaticVarShutdownHandler &) = delete;
~StaticVarShutdownHandler() {
try {
#ifdef _WIN32
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 77f28a5131f8a..dd8a3dd72d6b3 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -3886,10 +3886,5 @@ extern "C" void __sycl_register_lib(sycl_device_binaries desc) {
// Executed as a part of current module's (.exe, .dll) static initialization
extern "C" void __sycl_unregister_lib(sycl_device_binaries desc) {
- // Partial cleanup is not necessary at shutdown
-#ifndef _WIN32
- if (!sycl::detail::GlobalHandler::instance().isOkToDefer())
- return;
sycl::detail::ProgramManager::getInstance().removeImages(desc);
-#endif
}
diff --git a/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp b/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp
index e30c3b8f1f949..951ae6ce8c474 100644
--- a/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp
+++ b/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp
@@ -4,7 +4,10 @@
// XFAIL: gpu-intel-pvc-1T
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15699
-// XFAIL: linux && run-mode && (arch-intel_gpu_bmg_g21 || gpu-intel-dg2) && !igc-dev
+// XFAIL: gpu-intel-dg2
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18576
+
+// XFAIL: linux && run-mode && arch-intel_gpu_bmg_g21 && !igc-dev
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18576
// XFAIL: windows && arch-intel_gpu_bmg_g21
diff --git a/sycl/test-e2e/AddressSanitizer/lit.local.cfg b/sycl/test-e2e/AddressSanitizer/lit.local.cfg
index 8a4709eb254bb..c2bc429f1bb3f 100644
--- a/sycl/test-e2e/AddressSanitizer/lit.local.cfg
+++ b/sycl/test-e2e/AddressSanitizer/lit.local.cfg
@@ -28,3 +28,5 @@ unsupported_san_flags = [
]
if any(flag in config.cxx_flags for flag in unsupported_san_flags):
config.unsupported=True
+
+config.environment["ZE_AFFINITY_MASK"] = "0"
diff --git a/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp b/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp
new file mode 100644
index 0000000000000..eae3329599deb
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp
@@ -0,0 +1,38 @@
+#include <sycl/sycl.hpp>
+
+#if defined(_WIN32)
+#define API_EXPORT __declspec(dllexport)
+#else
+#define API_EXPORT
+#endif
+
+#ifndef INC
+#define INC 1
+#endif
+
+#ifndef CLASSNAME
+#define CLASSNAME same
+#endif
+
+#ifdef WITH_DEVICE_GLOBALS
+// Using device globals within the shared libraries only
+// works if the names do not collide. Note that we cannot
+// load a library multiple times if it has a device global.
+#define CONCAT_HELPER(a, b) a##b
+#define CONCAT(a, b) CONCAT_HELPER(a, b)
+
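+// For example (flag value hypothetical): building with -DCLASSNAME=foo makes
+// the declaration below define a device global named DGVarfoo, avoiding the
+// collision.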
+using SomeProperties = decltype(sycl::ext::oneapi::experimental::properties{});
+sycl::ext::oneapi::experimental::device_global<int, SomeProperties>
+    CONCAT(DGVar, CLASSNAME) __attribute__((visibility("default")));
+
+#endif // WITH_DEVICE_GLOBALS
+
+extern "C" API_EXPORT void performIncrementation(sycl::queue &q,
+                                                 sycl::buffer<int, 1> &buf) {
+ sycl::range<1> r = buf.get_range();
+ q.submit([&](sycl::handler &cgh) {
+ auto acc = buf.get_access(cgh);
+ cgh.parallel_for(
+ r, [=](sycl::id<1> idx) { acc[idx] += INC; });
+ });
+}
\ No newline at end of file
diff --git a/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp b/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp
new file mode 100644
index 0000000000000..e161ae0f6557f
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp
@@ -0,0 +1,154 @@
+// UNSUPPORTED: cuda || hip
+// UNSUPPORTED-TRACKER: CMPLRLLVM-69415
+
+// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%}
+// DEFINE: %{shared_lib_ext} = %if windows %{dll%} %else %{so%}
+
+// clang-format off
+// IMPORTANT: -DSO_PATH='R"(%T)"'
+// We need to capture %T, the build directory, in a string,
+// and the usual STRINGIFY() macro hack won't work:
+// on Windows the path separators are backslashes, which an ordinary
+// string literal would treat as the start of escape sequences,
+// which becomes a nightmare.
+// So the hack here is to put a raw string literal in the definition
+// and use single quotes, which Python forgivingly accepts.
+// clang-format on
+
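+// For illustration (hypothetical path, not the real %T substitution): if the
+// build directory were C:\foo\bar, the RUN line below would pass
+// -DSO_PATH='R"(C:\foo\bar)"', so the source sees
+//   std::string base_path = R"(C:\foo\bar)";
+// and the backslashes survive verbatim instead of starting escape sequences.
+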
+// RUN: %{build} %{fPIC_flag} -DSO_PATH='R"(%T)"' -o %t.out
+
+// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=1 -o %T/lib_a.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp
+// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=2 -o %T/lib_b.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp
+// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=4 -o %T/lib_c.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp
+
+// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out
+
+// This test uses a kernel of the same name in three different shared libraries.
+// It loads each library, calls the kernel, and checks that the incrementation
+// is done correctly, and then unloads the library.
+// It also reloads the first library after unloading it.
+// This test ensures that __sycl_register_lib() and __sycl_unregister_lib()
+// are called correctly, and that the device images are cleaned up properly.
+
+
+#include <iostream>
+#include <string>
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl::ext::oneapi::experimental;
+
+
+#ifdef _WIN32
+#include <windows.h>
+
+void *loadOsLibrary(const std::string &LibraryPath) {
+ HMODULE h =
+ LoadLibraryExA(LibraryPath.c_str(), NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
+ return (void *)h;
+}
+int unloadOsLibrary(void *Library) {
+ return FreeLibrary((HMODULE)Library) ? 0 : 1;
+}
+void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) {
+ return (void *)GetProcAddress((HMODULE)Library, FunctionName.c_str());
+}
+
+#else
+#include <dlfcn.h>
+
+void *loadOsLibrary(const std::string &LibraryPath) {
+ void *so = dlopen(LibraryPath.c_str(), RTLD_NOW);
+ if (!so) {
+ char *Error = dlerror();
+ std::cerr << "dlopen(" << LibraryPath << ") failed with <"
+ << (Error ? Error : "unknown error") << ">" << std::endl;
+ }
+ return so;
+}
+
+int unloadOsLibrary(void *Library) { return dlclose(Library); }
+
+void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) {
+ return dlsym(Library, FunctionName.c_str());
+}
+#endif
+
+// Define the function pointer type for performIncrementation
+using IncFuncT = void(sycl::queue &, sycl::buffer<int, 1> &);
+
+void initializeBuffer(sycl::buffer<int, 1> &buf) {
+ auto acc = sycl::host_accessor(buf);
+ for (size_t i = 0; i < buf.size(); ++i)
+ acc[i] = 0;
+}
+
+void checkIncrementation(sycl::buffer<int, 1> &buf, int val) {
+ auto acc = sycl::host_accessor(buf);
+ for (size_t i = 0; i < buf.size(); ++i) {
+ std::cout << acc[i] << " ";
+ assert(acc[i] == val);
+ }
+ std::cout << std::endl;
+}
+
+int main() {
+ sycl::queue q;
+
+ sycl::range<1> r(8);
+  sycl::buffer<int, 1> buf(r);
+ initializeBuffer(buf);
+
+ std::string base_path = SO_PATH;
+
+#ifdef _WIN32
+ std::string path_to_lib_a = base_path + "\\lib_a.dll";
+ std::string path_to_lib_b = base_path + "\\lib_b.dll";
+ std::string path_to_lib_c = base_path + "\\lib_c.dll";
+#else
+ std::string path_to_lib_a = base_path + "/lib_a.so";
+ std::string path_to_lib_b = base_path + "/lib_b.so";
+ std::string path_to_lib_c = base_path + "/lib_c.so";
+#endif
+
+ std::cout << "paths: " << path_to_lib_a << std::endl;
+ std::cout << "SO_PATH: " << SO_PATH << std::endl;
+
+ void *lib_a = loadOsLibrary(path_to_lib_a);
+ void *f = getOsLibraryFuncAddress(lib_a, "performIncrementation");
+  auto performIncrementationFuncA = reinterpret_cast<IncFuncT *>(f);
+ performIncrementationFuncA(q, buf); // call the function from lib_a
+ q.wait();
+ checkIncrementation(buf, 1);
+ unloadOsLibrary(lib_a);
+ std::cout << "lib_a done" << std::endl;
+
+
+ // Now RELOAD lib_a and try it again.
+ lib_a = loadOsLibrary(path_to_lib_a);
+ f = getOsLibraryFuncAddress(lib_a, "performIncrementation");
+  performIncrementationFuncA = reinterpret_cast<IncFuncT *>(f);
+ performIncrementationFuncA(q, buf); // call the function from lib_a
+ q.wait();
+ checkIncrementation(buf, 1 + 1);
+ unloadOsLibrary(lib_a);
+ std::cout << "reload of lib_a done" << std::endl;
+
+
+ void *lib_b = loadOsLibrary(path_to_lib_b);
+ f = getOsLibraryFuncAddress(lib_b, "performIncrementation");
+  auto performIncrementationFuncB = reinterpret_cast<IncFuncT *>(f);
+ performIncrementationFuncB(q, buf); // call the function from lib_b
+ q.wait();
+ checkIncrementation(buf, 1 + 1 + 2);
+ unloadOsLibrary(lib_b);
+ std::cout << "lib_b done" << std::endl;
+
+ void *lib_c = loadOsLibrary(path_to_lib_c);
+ f = getOsLibraryFuncAddress(lib_c, "performIncrementation");
+  auto performIncrementationFuncC = reinterpret_cast<IncFuncT *>(f);
+  performIncrementationFuncC(q, buf); // call the function from lib_c
+  q.wait();
+ checkIncrementation(buf, 1 + 1 + 2 + 4);
+ unloadOsLibrary(lib_c);
+ std::cout << "lib_c done" << std::endl;
+
+ return 0;
+}
\ No newline at end of file
diff --git a/sycl/test-e2e/MemorySanitizer/lit.local.cfg b/sycl/test-e2e/MemorySanitizer/lit.local.cfg
index d6da6eb7bf3bf..617db32b60624 100644
--- a/sycl/test-e2e/MemorySanitizer/lit.local.cfg
+++ b/sycl/test-e2e/MemorySanitizer/lit.local.cfg
@@ -35,3 +35,5 @@ unsupported_san_flags = [
]
if any(flag in config.cxx_flags for flag in unsupported_san_flags):
config.unsupported=True
+
+config.environment["ZE_AFFINITY_MASK"] = "0"
diff --git a/sycl/test-e2e/ThreadSanitizer/lit.local.cfg b/sycl/test-e2e/ThreadSanitizer/lit.local.cfg
index fe03e06b8d89a..aee25f0a5ba0f 100644
--- a/sycl/test-e2e/ThreadSanitizer/lit.local.cfg
+++ b/sycl/test-e2e/ThreadSanitizer/lit.local.cfg
@@ -33,3 +33,5 @@ unsupported_san_flags = [
]
if any(flag in config.cxx_flags for flag in unsupported_san_flags):
config.unsupported=True
+
+config.environment["ZE_AFFINITY_MASK"] = "0"
diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp
index 5fecdc5b8f67d..86da10bbffef7 100644
--- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp
+++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp
@@ -50,8 +50,42 @@ struct NDRDescT {
<< GlobalOffset[2] << "\n";
}
};
+
+namespace {
+class WaitInfo {
+  std::vector<ur_event_handle_t> *const events;
+  static_assert(std::is_pointer_v<decltype(events)>);
+
+public:
+ WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList)
+      : events(numEvents ? new std::vector<ur_event_handle_t>(
+ WaitList, WaitList + numEvents)
+ : nullptr) {}
+ void wait() const {
+ if (events)
+ urEventWait(events->size(), events->data());
+ }
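+  // Transfers ownership of the heap-allocated event list to the caller; it is
+  // typically stashed in the event's completion callback so the list stays
+  // alive until the scheduled task has finished waiting on it.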
+  std::unique_ptr<std::vector<ur_event_handle_t>> getUniquePtr() {
+    return std::unique_ptr<std::vector<ur_event_handle_t>>(events);
+ }
+};
+
+inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList) {
+ return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList);
+}
+
+} // namespace
} // namespace native_cpu
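+
+// Build a fresh iteration state from the captured ND-range descriptor; each
+// scheduled task constructs its own mutable copy.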
+static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) {
+ native_cpu::state resized_state(
+ ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], ndr.LocalSize[0],
+ ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0],
+ ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
+ return resized_state;
+}
+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -67,7 +101,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
}
- urEventWait(numEventsInWaitList, phEventWaitList);
UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
@@ -119,14 +152,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
auto &tp = hQueue->getDevice()->tp;
const size_t numParallelThreads = tp.num_threads();
  std::vector<std::future<void>> futures;
-  std::vector<std::function<void(size_t, ur_kernel_handle_t_)>> groups;
auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0];
auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1];
auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2];
- native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1],
- ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1],
- ndr.LocalSize[2], ndr.GlobalOffset[0],
- ndr.GlobalOffset[1], ndr.GlobalOffset[2]);
auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH);
event->tick_start();
@@ -134,6 +162,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
  auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel);
kernel->updateMemPool(numParallelThreads);
+ auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
+
const size_t numWG = numWG0 * numWG1 * numWG2;
const size_t numWGPerThread = numWG / numParallelThreads;
const size_t remainderWG = numWG - numWGPerThread * numParallelThreads;
@@ -147,13 +177,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
rangeEnd[0] = rangeEnd[3] % numWG0;
rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1;
rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1);
- futures.emplace_back(
- tp.schedule_task([state, &kernel = *kernel, rangeStart,
- rangeEnd = rangeEnd[3], numWG0, numWG1,
-#ifndef NATIVECPU_USE_OCK
- localSize = ndr.LocalSize,
-#endif
- numParallelThreads](size_t threadId) mutable {
+ futures.emplace_back(tp.schedule_task(
+ [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3],
+ numWG0, numWG1, numParallelThreads](size_t threadId) {
+ auto state = getState(ndr);
+ InEvents.wait();
for (size_t g0 = rangeStart[0], g1 = rangeStart[1],
g2 = rangeStart[2], g3 = rangeStart[3];
g3 < rangeEnd; ++g3) {
@@ -162,9 +190,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(), &state);
#else
- for (size_t local2 = 0; local2 < localSize[2]; ++local2) {
- for (size_t local1 = 0; local1 < localSize[1]; ++local1) {
- for (size_t local0 = 0; local0 < localSize[0]; ++local0) {
+ for (size_t local2 = 0; local2 < ndr.LocalSize[2]; ++local2) {
+ for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) {
+ for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) {
state.update(g0, g1, g2, local0, local1, local2);
kernel._subhandler(
kernel.getArgs(numParallelThreads, threadId).data(),
@@ -189,7 +217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
if (phEvent) {
*phEvent = event;
}
- event->set_callback([kernel = std::move(kernel), hKernel, event]() {
+ event->set_callback([kernel = std::move(kernel), hKernel, event,
+ InEvents = InEvents.getUniquePtr()]() {
event->tick_end();
// TODO: avoid calling clear() here.
hKernel->_localArgInfo.clear();
@@ -207,20 +236,32 @@ static inline ur_result_t
withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
- ur_event_handle_t *phEvent, T &&f) {
- urEventWait(numEventsInWaitList, phEventWaitList);
- ur_event_handle_t event = nullptr;
+ ur_event_handle_t *phEvent, T &&f, bool blocking = true) {
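+  // If an event is requested, the operation is non-blocking, and the queue is
+  // out of order, run `f` asynchronously on the thread pool after its wait
+  // list completes; in all other cases wait and run `f` inline.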
if (phEvent) {
- event = new ur_event_handle_t_(hQueue, command_type);
+ ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type);
+ *phEvent = event;
event->tick_start();
+ if (blocking || hQueue->isInOrder()) {
+ urEventWait(numEventsInWaitList, phEventWaitList);
+ ur_result_t result = f();
+ event->tick_end();
+ return result;
+ }
+ auto &tp = hQueue->getDevice()->tp;
+    std::vector<std::future<void>> futures;
+ auto InEvents =
+ native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList);
+ futures.emplace_back(tp.schedule_task([f, InEvents](size_t) {
+ InEvents.wait();
+ f();
+ }));
+ event->set_futures(futures);
+ event->set_callback(
+ [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); });
+ return UR_RESULT_SUCCESS;
}
-
+ urEventWait(numEventsInWaitList, phEventWaitList);
ur_result_t result = f();
-
- if (phEvent) {
- event->tick_end();
- *phEvent = event;
- }
return result;
}
@@ -231,7 +272,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
// TODO: the wait here should be async
return withTimingEvent(UR_COMMAND_EVENTS_WAIT, hQueue, numEventsInWaitList,
phEventWaitList, phEvent,
- [&]() { return UR_RESULT_SUCCESS; });
+ []() { return UR_RESULT_SUCCESS; });
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
@@ -239,7 +280,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
return withTimingEvent(UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue,
numEventsInWaitList, phEventWaitList, phEvent,
- [&]() { return UR_RESULT_SUCCESS; });
+ []() { return UR_RESULT_SUCCESS; });
}
UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt(
@@ -250,9 +291,43 @@ UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt(
phEventWaitList, phEvent);
}
+template <bool IsRead>
+static inline void MemBufferReadWriteRect_impl(
+ ur_mem_handle_t Buff, ur_rect_offset_t BufferOffset,
+ ur_rect_offset_t HostOffset, ur_rect_region_t region, size_t BufferRowPitch,
+ size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch,
+    typename std::conditional<IsRead, void *, const void *>::type DstMem) {
+ // TODO: check other constraints, performance optimizations
+ // More sharing with level_zero where possible
+
+ if (BufferRowPitch == 0)
+ BufferRowPitch = region.width;
+ if (BufferSlicePitch == 0)
+ BufferSlicePitch = BufferRowPitch * region.height;
+ if (HostRowPitch == 0)
+ HostRowPitch = region.width;
+ if (HostSlicePitch == 0)
+ HostSlicePitch = HostRowPitch * region.height;
+ for (size_t w = 0; w < region.width; w++)
+ for (size_t h = 0; h < region.height; h++)
+ for (size_t d = 0; d < region.depth; d++) {
+ size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch +
+ (h + BufferOffset.y) * BufferRowPitch + w +
+ BufferOffset.x;
+ size_t host_origin = (d + HostOffset.z) * HostSlicePitch +
+ (h + HostOffset.y) * HostRowPitch + w +
+ HostOffset.x;
+        int8_t &buff_mem = ur_cast<int8_t *>(Buff->_mem)[buff_orign];
+        if constexpr (IsRead)
+          ur_cast<int8_t *>(DstMem)[host_origin] = buff_mem;
+        else
+          buff_mem = ur_cast<const int8_t *>(DstMem)[host_origin];
+ }
+}
+
template <bool IsRead>
static inline ur_result_t enqueueMemBufferReadWriteRect_impl(
- ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool,
+ ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool blocking,
ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset,
ur_rect_region_t region, size_t BufferRowPitch, size_t BufferSlicePitch,
size_t HostRowPitch, size_t HostSlicePitch,
@@ -265,71 +340,63 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl(
else
command_t = UR_COMMAND_MEM_BUFFER_WRITE_RECT;
return withTimingEvent(
- command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, [&]() {
- // TODO: blocking, check other constraints, performance optimizations
- // More sharing with level_zero where possible
-
- if (BufferRowPitch == 0)
- BufferRowPitch = region.width;
- if (BufferSlicePitch == 0)
- BufferSlicePitch = BufferRowPitch * region.height;
- if (HostRowPitch == 0)
- HostRowPitch = region.width;
- if (HostSlicePitch == 0)
- HostSlicePitch = HostRowPitch * region.height;
- for (size_t w = 0; w < region.width; w++)
- for (size_t h = 0; h < region.height; h++)
- for (size_t d = 0; d < region.depth; d++) {
- size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch +
- (h + BufferOffset.y) * BufferRowPitch + w +
- BufferOffset.x;
- size_t host_origin = (d + HostOffset.z) * HostSlicePitch +
- (h + HostOffset.y) * HostRowPitch + w +
- HostOffset.x;
-              int8_t &buff_mem = ur_cast<int8_t *>(Buff->_mem)[buff_orign];
-              if constexpr (IsRead)
-                ur_cast<int8_t *>(DstMem)[host_origin] = buff_mem;
-              else
-                buff_mem = ur_cast<const int8_t *>(DstMem)[host_origin];
- }
-
+ command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent,
+ [BufferRowPitch, region, BufferSlicePitch, HostRowPitch, HostSlicePitch,
+ BufferOffset, HostOffset, Buff, DstMem]() {
+        MemBufferReadWriteRect_impl<IsRead>(
+ Buff, BufferOffset, HostOffset, region, BufferRowPitch,
+ BufferSlicePitch, HostRowPitch, HostSlicePitch, DstMem);
return UR_RESULT_SUCCESS;
- });
+ },
+ blocking);
}
-static inline ur_result_t doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr,
- const void *SrcPtr, size_t Size,
- uint32_t numEventsInWaitList,
- const ur_event_handle_t *phEventWaitList,
- ur_event_handle_t *phEvent,
- ur_command_t command_type) {
- return withTimingEvent(command_type, hQueue, numEventsInWaitList,
- phEventWaitList, phEvent, [&]() {
- if (SrcPtr != DstPtr && Size)
- memmove(DstPtr, SrcPtr, Size);
- return UR_RESULT_SUCCESS;
- });
+template <bool AllowPartialOverlap = true>
+static inline ur_result_t doCopy_impl(
+ ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size,
+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+ ur_event_handle_t *phEvent, ur_command_t command_type, bool blocking) {
+ if (SrcPtr == DstPtr || Size == 0) {
+ bool hasInEvents = numEventsInWaitList && phEventWaitList;
+ return withTimingEvent(
+ command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent,
+ []() { return UR_RESULT_SUCCESS; }, blocking || !hasInEvents);
+ }
+
+ return withTimingEvent(
+ command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent,
+ [DstPtr, SrcPtr, Size]() {
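+        // memmove is required when source and destination may partially
+        // overlap (e.g. a copy within a single buffer); otherwise memcpy
+        // suffices.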
+ if constexpr (AllowPartialOverlap) {
+ memmove(DstPtr, SrcPtr, Size);
+ } else {
+ memcpy(DstPtr, SrcPtr, Size);
+ }
+ return UR_RESULT_SUCCESS;
+ },
+ blocking);
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
- ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingRead*/,
+ ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead,
size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
void *FromPtr = /*Src*/ hBuffer->_mem + offset;
auto res = doCopy_impl(hQueue, pDst, FromPtr, size, numEventsInWaitList,
- phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ);
+ phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ,
+ blockingRead);
return res;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
- ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingWrite*/,
+ ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite,
size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
void *ToPtr = hBuffer->_mem + offset;
auto res = doCopy_impl(hQueue, ToPtr, pSrc, size, numEventsInWaitList,
- phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE);
+ phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE,
+ blockingWrite);
return res;
}
@@ -368,7 +435,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
const void *SrcPtr = hBufferSrc->_mem + srcOffset;
void *DstPtr = hBufferDst->_mem + dstOffset;
return doCopy_impl(hQueue, DstPtr, SrcPtr, size, numEventsInWaitList,
- phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY);
+ phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY,
+ true /*TODO: check false for non-blocking*/);
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
@@ -379,7 +447,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
return enqueueMemBufferReadWriteRect_impl(
- hQueue, hBufferSrc, false /*todo: check blocking*/, srcOrigin,
+ hQueue, hBufferSrc, true /*todo: check false for non-blocking*/,
+ srcOrigin,
/*HostOffset*/ dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch,
dstSlicePitch, hBufferDst->_mem, numEventsInWaitList, phEventWaitList,
phEvent);
@@ -390,12 +459,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
size_t patternSize, size_t offset, size_t size,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
-
+ UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
return withTimingEvent(
UR_COMMAND_MEM_BUFFER_FILL, hQueue, numEventsInWaitList, phEventWaitList,
- phEvent, [&]() {
- UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
-
+ phEvent, [hBuffer, offset, size, patternSize, pPattern]() {
// TODO: error checking
// TODO: handle async
void *startingPtr = hBuffer->_mem + offset;
@@ -449,7 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
ur_event_handle_t *phEvent, void **ppRetMap) {
return withTimingEvent(UR_COMMAND_MEM_BUFFER_MAP, hQueue, numEventsInWaitList,
- phEventWaitList, phEvent, [&]() {
+ phEventWaitList, phEvent,
+ [ppRetMap, hBuffer, offset]() {
*ppRetMap = hBuffer->_mem + offset;
return UR_RESULT_SUCCESS;
});
@@ -461,7 +529,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_event_handle_t *phEvent) {
return withTimingEvent(UR_COMMAND_MEM_UNMAP, hQueue, numEventsInWaitList,
phEventWaitList, phEvent,
- [&]() { return UR_RESULT_SUCCESS; });
+ []() { return UR_RESULT_SUCCESS; });
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
@@ -470,7 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
return withTimingEvent(
UR_COMMAND_USM_FILL, hQueue, numEventsInWaitList, phEventWaitList,
- phEvent, [&]() {
+ phEvent, [ptr, pPattern, patternSize, size]() {
UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER);
UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER);
UR_ASSERT(patternSize != 0, UR_RESULT_ERROR_INVALID_SIZE)
@@ -520,20 +588,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
- ur_queue_handle_t hQueue, bool /*blocking*/, void *pDst, const void *pSrc,
+ ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc,
size_t size, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
- return withTimingEvent(
- UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList,
- phEvent, [&]() {
- UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE);
- UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER);
- UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER);
-
- memcpy(pDst, pSrc, size);
+ UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE);
+ UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+ UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER);
- return UR_RESULT_SUCCESS;
- });
+ return doCopy_impl(
+ hQueue, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent,
+ UR_COMMAND_USM_MEMCPY, blocking);
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(