diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0713a66ef3ef5..36551f70b10a0 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -223,3 +223,6 @@ llvm/test/Instrumentation/ThreadSanitizer/ @intel/dpcpp-sanitizers-review sycl/test-e2e/AddressSanitizer/ @intel/dpcpp-sanitizers-review sycl/test-e2e/MemorySanitizer/ @intel/dpcpp-sanitizers-review sycl/test-e2e/ThreadSanitizer/ @intel/dpcpp-sanitizers-review + +# ABI compatibility +devops/compat_ci_exclude.sycl-rel-** @gmlueck @xtian-github diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp index ee913fc8d0eeb..75a4baea8bb0c 100644 --- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp +++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp @@ -1293,6 +1293,49 @@ class BinaryWrapper { appendToGlobalDtors(M, Func, /*Priority*/ 1); } + void createSyclRegisterWithAtexitUnregister(GlobalVariable *BinDesc) { + auto *UnregFuncTy = + FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *UnregFunc = + Function::Create(UnregFuncTy, GlobalValue::InternalLinkage, + "sycl.descriptor_unreg.atexit", &M); + UnregFunc->setSection(".text.startup"); + + // Declaration for __sycl_unregister_lib(void*). + auto *UnregTargetTy = + FunctionType::get(Type::getVoidTy(C), getPtrTy(), false); + FunctionCallee UnregTargetC = + M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy); + + IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc)); + UnregBuilder.CreateCall(UnregTargetC, BinDesc); + UnregBuilder.CreateRetVoid(); + + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage, + "sycl.descriptor_reg", &M); + RegFunc->setSection(".text.startup"); + + auto *RegTargetTy = + FunctionType::get(Type::getVoidTy(C), getPtrTy(), false); + FunctionCallee RegTargetC = + M.getOrInsertFunction("__sycl_register_lib", RegTargetTy); + + // `atexit` takes a `void(*)()` function pointer. In LLVM IR, this is + // typically represented as `i32 (ptr)`. + FunctionType *AtExitTy = + FunctionType::get(Type::getInt32Ty(C), getPtrTy(), false); + FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy); + + IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc)); + RegBuilder.CreateCall(RegTargetC, BinDesc); + RegBuilder.CreateCall(AtExitC, UnregFunc); + RegBuilder.CreateRetVoid(); + + // Add this function to global constructors.
+ appendToGlobalCtors(M, RegFunc, /*Priority*/ 1); + } + public: BinaryWrapper(StringRef Target, StringRef ToolName, StringRef SymPropBCFiles = "") @@ -1370,8 +1413,13 @@ class BinaryWrapper { if (EmitRegFuncs) { GlobalVariable *Desc = *DescOrErr; - createRegisterFunction(Kind, Desc); - createUnregisterFunction(Kind, Desc); + if (Kind == OffloadKind::SYCL && + Triple(M.getTargetTriple()).isOSWindows()) { + createSyclRegisterWithAtexitUnregister(Desc); + } else { + createRegisterFunction(Kind, Desc); + createUnregisterFunction(Kind, Desc); + } } } return &M; diff --git a/devops/scripts/benchmarks/html/scripts.js b/devops/scripts/benchmarks/html/scripts.js index 558021a13ab4a..28e3a708c2956 100644 --- a/devops/scripts/benchmarks/html/scripts.js +++ b/devops/scripts/benchmarks/html/scripts.js @@ -17,6 +17,125 @@ let annotationsOptions = new Map(); // Global options map for annotations let archivedDataLoaded = false; let loadedBenchmarkRuns = []; // Loaded results from the js/json files +// Toggle configuration and abstraction +// +// HOW TO ADD A NEW TOGGLE: +// 1. Add HTML checkbox to index.html: +// <input type="checkbox" id="my-toggle"> +// +// 2. Add configuration below: +// 'my-toggle': { +// defaultValue: false, // true = enabled by default, false = disabled by default +// urlParam: 'myParam', // Name shown in URL (?myParam=true) +// invertUrlParam: false, // false = normal behavior, true = legacy inverted logic +// onChange: function(isEnabled) { // Function called when toggle state changes +// // Your logic here +// updateURL(); // Always call this to update the browser URL +// } +// } +// +// 3. (Optional) Add helper function for cleaner, more readable code: +// function isMyToggleEnabled() { return isToggleEnabled('my-toggle'); } +// +// This lets you write: if (isMyToggleEnabled()) { ... } +// Instead of: if (isToggleEnabled('my-toggle')) { ... } +// + +const toggleConfigs = { + 'show-notes': { + defaultValue: true, + urlParam: 'notes', + invertUrlParam: true, // Store false in URL when enabled (legacy behavior) + onChange: function(isEnabled) { + document.querySelectorAll('.benchmark-note').forEach(note => { + note.style.display = isEnabled ? 'block' : 'none'; + }); + updateURL(); + } + }, + 'show-unstable': { + defaultValue: false, + urlParam: 'unstable', + invertUrlParam: false, + onChange: function(isEnabled) { + document.querySelectorAll('.benchmark-unstable').forEach(warning => { + warning.style.display = isEnabled ? 'block' : 'none'; + }); + filterCharts(); + } + }, + 'custom-range': { + defaultValue: false, + urlParam: 'customRange', + invertUrlParam: false, + onChange: function(isEnabled) { + updateCharts(); + } + }, + 'show-archived-data': { + defaultValue: false, + urlParam: 'archived', + invertUrlParam: false, + onChange: function(isEnabled) { + if (isEnabled) { + loadArchivedData(); + } else { + if (archivedDataLoaded) { + location.reload(); + } + } + updateURL(); + } + } +}; + +// Generic toggle helper functions +function isToggleEnabled(toggleId) { + const toggle = document.getElementById(toggleId); + return toggle ?
toggle.checked : toggleConfigs[toggleId]?.defaultValue || false; +} + +function setupToggle(toggleId, config) { + const toggle = document.getElementById(toggleId); + if (!toggle) return; + + // Set up event listener + toggle.addEventListener('change', function() { + config.onChange(toggle.checked); + }); + + // Initialize from URL params if present + const urlParam = getQueryParam(config.urlParam); + if (urlParam !== null) { + const urlValue = urlParam === 'true'; + // Handle inverted URL params (like notes where false means enabled) + toggle.checked = config.invertUrlParam ? !urlValue : urlValue; + } else { + // Use default value + toggle.checked = config.defaultValue; + } +} + +function updateToggleURL(toggleId, config, url) { + const isEnabled = isToggleEnabled(toggleId); + + if (config.invertUrlParam) { + // For inverted params, store in URL when disabled + if (isEnabled) { + url.searchParams.delete(config.urlParam); + } else { + url.searchParams.set(config.urlParam, 'false'); + } + } else { + // For normal params, store in URL when enabled + if (!isEnabled) { + url.searchParams.delete(config.urlParam); + } else { + url.searchParams.set(config.urlParam, 'true'); + } + } +} + // DOM Elements let runSelect, selectedRunsDiv, suiteFiltersContainer, tagFiltersContainer; @@ -627,30 +746,10 @@ function updateURL() { url.searchParams.delete('runs'); } - // Add toggle states to URL - if (isNotesEnabled()) { - url.searchParams.delete('notes'); - } else { - url.searchParams.set('notes', 'false'); - } - - if (!isUnstableEnabled()) { - url.searchParams.delete('unstable'); - } else { - url.searchParams.set('unstable', 'true'); - } - - if (!isCustomRangesEnabled()) { - url.searchParams.delete('customRange'); - } else { - url.searchParams.set('customRange', 'true'); - } - - if (!isArchivedDataEnabled()) { - url.searchParams.delete('archived'); - } else { - url.searchParams.set('archived', 'true'); - } + // Update toggle states in URL using the generic helper + Object.entries(toggleConfigs).forEach(([toggleId, config]) => { + updateToggleURL(toggleId, config, url); + }); history.replaceState(null, '', url); } @@ -949,94 +1048,26 @@ function setupSuiteFilters() { } function isNotesEnabled() { - const notesToggle = document.getElementById('show-notes'); - return notesToggle.checked; + return isToggleEnabled('show-notes'); } function isUnstableEnabled() { - const unstableToggle = document.getElementById('show-unstable'); - return unstableToggle.checked; + return isToggleEnabled('show-unstable'); } function isCustomRangesEnabled() { - const rangesToggle = document.getElementById('custom-range'); - return rangesToggle.checked; + return isToggleEnabled('custom-range'); } function isArchivedDataEnabled() { - const archivedDataToggle = document.getElementById('show-archived-data'); - return archivedDataToggle.checked; + return isToggleEnabled('show-archived-data'); } function setupToggles() { - const notesToggle = document.getElementById('show-notes'); - const unstableToggle = document.getElementById('show-unstable'); - const customRangeToggle = document.getElementById('custom-range'); - const archivedDataToggle = document.getElementById('show-archived-data'); - - notesToggle.addEventListener('change', function () { - // Update all note elements visibility - document.querySelectorAll('.benchmark-note').forEach(note => { - note.style.display = isNotesEnabled() ? 
'block' : 'none'; - }); - updateURL(); - }); - - unstableToggle.addEventListener('change', function () { - // Update all unstable warning elements visibility - document.querySelectorAll('.benchmark-unstable').forEach(warning => { - warning.style.display = isUnstableEnabled() ? 'block' : 'none'; - }); - filterCharts(); - }); - - customRangeToggle.addEventListener('change', function () { - // redraw all charts - updateCharts(); + // Set up all toggles using the configuration + Object.entries(toggleConfigs).forEach(([toggleId, config]) => { + setupToggle(toggleId, config); }); - - // Add event listener for archived data toggle - if (archivedDataToggle) { - archivedDataToggle.addEventListener('change', function() { - if (archivedDataToggle.checked) { - loadArchivedData(); - } else { - if (archivedDataLoaded) { - // Reload the page to reset - location.reload(); - } - } - updateURL(); - }); - } - - // Initialize from URL params if present - const notesParam = getQueryParam('notes'); - const unstableParam = getQueryParam('unstable'); - const archivedParam = getQueryParam('archived'); - - if (notesParam !== null) { - let showNotes = notesParam === 'true'; - notesToggle.checked = showNotes; - } - - if (unstableParam !== null) { - let showUnstable = unstableParam === 'true'; - unstableToggle.checked = showUnstable; - } - - const customRangesParam = getQueryParam('customRange'); - if (customRangesParam !== null) { - customRangeToggle.checked = customRangesParam === 'true'; - } - - if (archivedDataToggle && archivedParam !== null) { - archivedDataToggle.checked = archivedParam === 'true'; - - if (archivedDataToggle.checked) { - loadArchivedData(); - } - } } function setupTagFilters() { @@ -1154,9 +1185,10 @@ function initializeCharts() { // Setup UI components setupRunSelector(); setupSuiteFilters(); - setupTagFilters(); setupToggles(); initializePlatformTab(); + // Setup tag filters after everything else is ready + setupTagFilters(); // Apply URL parameters const regexParam = getQueryParam('regex'); diff --git a/libdevice/sanitizer/msan_rtl.cpp b/libdevice/sanitizer/msan_rtl.cpp index 68c0db6000497..87d57fc6950c5 100644 --- a/libdevice/sanitizer/msan_rtl.cpp +++ b/libdevice/sanitizer/msan_rtl.cpp @@ -671,7 +671,7 @@ __msan_unpoison_shadow_dynamic_local(uptr ptr, uint32_t num_args) { "__msan_unpoison_shadow_dynamic_local")); } -static __SYCL_CONSTANT__ const char __msan_print_set_shadow_private[] = +static __SYCL_CONSTANT__ const char __msan_print_set_shadow[] = "[kernel] __msan_set_value(beg=%p, end=%p, val=%02X)\n"; // We outline the function of setting shadow memory of private memory, because @@ -684,8 +684,7 @@ DEVICE_EXTERN_C_NOINLINE void __msan_poison_stack(__SYCL_PRIVATE__ void *ptr, MSAN_DEBUG(__spirv_ocl_printf(__msan_print_func_beg, "__msan_poison_stack")); auto shadow_address = MemToShadow((uptr)ptr, ADDRESS_SPACE_PRIVATE); - MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow_private, - (void *)shadow_address, + MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address, (void *)(shadow_address + size), 0xff)); if (shadow_address != GetMsanLaunchInfo->CleanShadow) { @@ -704,8 +703,7 @@ DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_stack(__SYCL_PRIVATE__ void *ptr, __spirv_ocl_printf(__msan_print_func_beg, "__msan_unpoison_stack")); auto shadow_address = MemToShadow((uptr)ptr, ADDRESS_SPACE_PRIVATE); - MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow_private, - (void *)shadow_address, + MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address, 
(void *)(shadow_address + size), 0x0)); if (shadow_address != GetMsanLaunchInfo->CleanShadow) { @@ -716,6 +714,26 @@ DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_stack(__SYCL_PRIVATE__ void *ptr, __spirv_ocl_printf(__msan_print_func_end, "__msan_unpoison_stack")); } +DEVICE_EXTERN_C_NOINLINE void __msan_unpoison_shadow(uptr ptr, uint32_t as, + uptr size) { + if (!GetMsanLaunchInfo) + return; + + MSAN_DEBUG( + __spirv_ocl_printf(__msan_print_func_beg, "__msan_unpoison_shadow")); + + auto shadow_address = MemToShadow(ptr, as); + MSAN_DEBUG(__spirv_ocl_printf(__msan_print_set_shadow, (void *)shadow_address, + (void *)(shadow_address + size), 0x0)); + + if (shadow_address != GetMsanLaunchInfo->CleanShadow) { + Memset((__SYCL_GLOBAL__ char *)shadow_address, 0, size); + } + + MSAN_DEBUG( + __spirv_ocl_printf(__msan_print_func_end, "__msan_unpoison_shadow")); +} + static __SYCL_CONSTANT__ const char __msan_print_private_base[] = "[kernel] __msan_set_private_base(sid=%llu): %p\n"; diff --git a/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp index 3d227d0c2e050..f54d552f10629 100644 --- a/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/SYCLOffloadWrapper.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/PropertySetIO.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -734,6 +735,51 @@ struct Wrapper { // Add this function to global destructors. appendToGlobalDtors(M, Func, /*Priority*/ 1); } + + void createSyclRegisterWithAtexitUnregister(GlobalVariable *FatbinDesc) { + auto *UnregFuncTy = + FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *UnregFunc = + Function::Create(UnregFuncTy, GlobalValue::InternalLinkage, + "sycl.descriptor_unreg.atexit", &M); + UnregFunc->setSection(".text.startup"); + + // Declaration for __sycl_unregister_lib(void*). + auto *UnregTargetTy = + FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false); + FunctionCallee UnregTargetC = + M.getOrInsertFunction("__sycl_unregister_lib", UnregTargetTy); + + // Body of the unregister wrapper. + IRBuilder<> UnregBuilder(BasicBlock::Create(C, "entry", UnregFunc)); + UnregBuilder.CreateCall(UnregTargetC, FatbinDesc); + UnregBuilder.CreateRetVoid(); + + auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *RegFunc = Function::Create(RegFuncTy, GlobalValue::InternalLinkage, + "sycl.descriptor_reg", &M); + RegFunc->setSection(".text.startup"); + + auto *RegTargetTy = + FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), false); + FunctionCallee RegTargetC = + M.getOrInsertFunction("__sycl_register_lib", RegTargetTy); + + // `atexit` takes a `void(*)()` function pointer. In LLVM IR, this is + // typically represented as `i32 (ptr)`. + FunctionType *AtExitTy = FunctionType::get( + Type::getInt32Ty(C), PointerType::getUnqual(C), false); + FunctionCallee AtExitC = M.getOrInsertFunction("atexit", AtExitTy); + + IRBuilder<> RegBuilder(BasicBlock::Create(C, "entry", RegFunc)); + RegBuilder.CreateCall(RegTargetC, FatbinDesc); + RegBuilder.CreateCall(AtExitC, UnregFunc); + RegBuilder.CreateRetVoid(); + + // Finally, add to global constructors. 
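[Editor's aside, not part of the patch: the IR built by this helper corresponds roughly to the host-side C++ below. `BinDesc`, `sycl_reg`, and `sycl_unreg` are illustrative stand-ins for the generated internal symbols; only `__sycl_register_lib`/`__sycl_unregister_lib` are the real runtime entry points named in the code.

    #include <cstdlib>
    extern "C" void __sycl_register_lib(void *);
    extern "C" void __sycl_unregister_lib(void *);
    static void *BinDesc; // placeholder for the wrapped binary descriptor
    static void sycl_unreg() { __sycl_unregister_lib(BinDesc); }
    static void sycl_reg() { // runs as a global constructor
      __sycl_register_lib(BinDesc);
      std::atexit(sycl_unreg); // unregister via the CRT exit path, not a global dtor
    }

On Windows this ties unregistration into the CRT's per-module teardown sequencing instead of relying on global destructor order, which is what the multi_lib_app test further down exercises with repeated load/unload cycles. The appendToGlobalCtors call that follows is what makes the registration function run at image load.]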
+ appendToGlobalCtors(M, RegFunc, /*Priority*/ 1); + } + }; // end of Wrapper } // anonymous namespace @@ -747,7 +793,11 @@ Error llvm::offloading::wrapSYCLBinaries(llvm::Module &M, return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); - W.createRegisterFatbinFunction(Desc); - W.createUnregisterFunction(Desc); + if (Triple(M.getTargetTriple()).isOSWindows()) { + W.createSyclRegisterWithAtexitUnregister(Desc); + } else { + W.createRegisterFatbinFunction(Desc); + W.createUnregisterFunction(Desc); + } return Error::success(); } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index bb9c7611059bb..f14c538ec8637 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -813,6 +813,8 @@ class MemorySanitizerOnSpirv { Constant *getOrCreateGlobalString(StringRef Name, StringRef Value, unsigned AddressSpace); + static bool isSupportedBuiltIn(StringRef Name); + operator bool() const { return IsSPIRV; } private: @@ -823,7 +825,6 @@ class MemorySanitizerOnSpirv { void instrumentKernelsMetadata(int TrackOrigins); void instrumentPrivateArguments(Function &F, Instruction *FnPrologueEnd); void instrumentPrivateBase(Function &F); - void initializeRetVecMap(Function *F); void initializeKernelCallerMap(Function *F); @@ -856,6 +857,7 @@ class MemorySanitizerOnSpirv { FunctionCallee MsanUnpoisonShadowDynamicLocalFunc; FunctionCallee MsanBarrierFunc; FunctionCallee MsanUnpoisonStackFunc; + FunctionCallee MsanUnpoisonShadowFunc; FunctionCallee MsanSetPrivateBaseFunc; FunctionCallee MsanUnpoisonStridedCopyFunc; }; @@ -949,6 +951,14 @@ void MemorySanitizerOnSpirv::initializeCallbacks() { MsanUnpoisonStackFunc = M.getOrInsertFunction( "__msan_unpoison_stack", IRB.getVoidTy(), PtrTy, IntptrTy); + // __msan_unpoison_shadow( + // uptr ptr, + // uint32_t as, + // size_t size + // ) + MsanUnpoisonShadowFunc = M.getOrInsertFunction( + "__msan_unpoison_shadow", IRB.getVoidTy(), IntptrTy, Int32Ty, IntptrTy); + // __msan_set_private_base( // as(0) void * ptr // ) @@ -987,9 +997,16 @@ void MemorySanitizerOnSpirv::instrumentGlobalVariables() { G.setName("nameless_global"); if (isUnsupportedDeviceGlobal(&G)) { - for (auto *User : G.users()) - if (auto *Inst = dyn_cast<Instruction>(User)) - Inst->setNoSanitizeMetadata(); + for (auto *User : G.users()) { + if (!isa<Instruction>(User)) + continue; + if (auto *CI = dyn_cast<CallInst>(User)) { + Function *Callee = CI->getCalledFunction(); + if (Callee && isSupportedBuiltIn(Callee->getName())) + continue; + } + cast<Instruction>(User)->setNoSanitizeMetadata(); + } continue; } @@ -1150,6 +1167,10 @@ void MemorySanitizerOnSpirv::instrumentPrivateBase(Function &F) { IRB.CreateCall(MsanSetPrivateBaseFunc, {PrivateBase}); } +bool MemorySanitizerOnSpirv::isSupportedBuiltIn(StringRef Name) { + return Name.contains("__sycl_getComposite2020SpecConstantValue"); +} + void MemorySanitizerOnSpirv::instrumentPrivateArguments( Function &F, Instruction *FnPrologueEnd) { if (!ClSpirOffloadPrivates) @@ -6994,6 +7015,25 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRB.CreatePointerCast(Src, MS.Spirv.IntptrTy), IRB.getInt32(Src->getType()->getPointerAddressSpace()), IRB.getInt32(ElementSize), NumElements, Stride}); + } else if (FuncName.contains( + "__sycl_getComposite2020SpecConstantValue")) { + // clang-format off + // Handle builtin functions like "_Z40__sycl_getComposite2020SpecConstantValue" + // Structs which are larger than 64b will be returned via sret
arguments + // and will be initialized inside the function. So we need to unpoison + // the sret arguments. + // clang-format on + if (Func->hasStructRetAttr()) { + Type *SCTy = Func->getParamStructRetType(0); + unsigned Size = Func->getDataLayout().getTypeStoreSize(SCTy); + auto *Addr = CB.getArgOperand(0); + IRB.CreateCall( + MS.Spirv.MsanUnpoisonShadowFunc, + {IRB.CreatePointerCast(Addr, MS.Spirv.IntptrTy), + ConstantInt::get(MS.Spirv.Int32Ty, + Addr->getType()->getPointerAddressSpace()), + ConstantInt::get(MS.Spirv.IntptrTy, Size)}); + } } } } diff --git a/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll new file mode 100644 index 0000000000000..f910af0cf92bc --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/SPIRV/spec_constants.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -passes=msan -msan-instrumentation-with-call-threshold=0 -msan-eager-checks=1 -msan-poison-stack-with-call=1 -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::specialization_id" = type { %"struct.user_def_types::no_cnstr" } +%"struct.user_def_types::no_cnstr" = type { float, i32, i8 } + +@__usid_str = external addrspace(4) constant [57 x i8] +@_Z19spec_const_externalIN14user_def_types8no_cnstrELi1EE = external addrspace(1) constant %"class.sycl::_V1::specialization_id" + +define spir_func i1 @_Z50check_kernel_handler_by_reference_external_handlerRN4sycl3_V114kernel_handlerEN14user_def_types8no_cnstrE() { +entry: + %ref.tmp.i = alloca %"struct.user_def_types::no_cnstr", align 4 + %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4) +; CHECK: [[REG1:%[0-9]+]] = ptrtoint ptr addrspace(4) %ref.tmp.ascast.i to i64 +; CHECK: call void @__msan_unpoison_shadow(i64 [[REG1]], i32 4, i64 12) + call spir_func void @_Z40__sycl_getComposite2020SpecConstantValueIN14user_def_types8no_cnstrEET_PKcPKvS6_(ptr addrspace(4) dead_on_unwind writable sret(%"struct.user_def_types::no_cnstr") align 4 %ref.tmp.ascast.i, ptr addrspace(4) noundef @__usid_str, ptr addrspace(4) noundef addrspacecast (ptr addrspace(1) @_Z19spec_const_externalIN14user_def_types8no_cnstrELi1EE to ptr addrspace(4)), ptr addrspace(4) noundef null) + ret i1 false +} + +declare spir_func void @_Z40__sycl_getComposite2020SpecConstantValueIN14user_def_types8no_cnstrEET_PKcPKvS6_(ptr addrspace(4) sret(%"struct.user_def_types::no_cnstr"), ptr addrspace(4), ptr addrspace(4), ptr addrspace(4)) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 7e995509da350..d587d0e4ed111 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -8,6 +8,7 @@ compiler and runtime. | Environment variable | Values | Description | | -------------------- | ------ | ----------- | | `ONEAPI_DEVICE_SELECTOR` | [See below.](#oneapi_device_selector) | This device selection environment variable can be used to limit the choice of devices available when the SYCL-using application is run. Useful for limiting devices to a certain type (like GPUs or accelerators) or backends (like Level Zero or OpenCL). This device selection mechanism is replacing `SYCL_DEVICE_FILTER` . The `ONEAPI_DEVICE_SELECTOR` syntax is shared with OpenMP and also allows sub-devices to be chosen. [See below.](#oneapi_device_selector) for a full description. 
| +| `ONEAPI_PVC_SEND_WAR_WA` | '1' or '0' | Controls the workaround for Erratum "FP64 register ordering violation" on Intel Ponte Vecchio GPUs. Setting `ONEAPI_PVC_SEND_WAR_WA=0` disables the workaround and is only safe if the secondary FP64 pipeline is disabled. The workaround is enabled by default ('1') and applied throughout the oneAPI software stack, including oneDNN, oneMKL, the OpenCL and Level Zero runtimes, and the Intel Graphics Compiler. | | `SYCL_DEVICE_ALLOWLIST` | See [below](#sycl_device_allowlist) | Filter out devices that do not match the pattern specified. `BackendName` accepts `host`, `opencl`, `level_zero`, `native_cpu` or `cuda`. `DeviceType` accepts `host`, `cpu`, `gpu`, `fpga`, or `acc`. `fpga` and `acc` are handled in the same manner. `DeviceVendorId` accepts uint32_t in hex form (`0xXYZW`). `DriverVersion`, `PlatformVersion`, `DeviceName` and `PlatformName` accept regular expression. Special characters, such as parenthesis, must be escaped. DPC++ runtime will select only those devices which satisfy provided values above and regex. More than one device can be specified using the piping symbol "\|".| | `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. | | `SYCL_CACHE_DIR` | Path | Path to persistent cache root directory. Default values are `%AppData%\libsycl_cache` for Windows and `$XDG_CACHE_HOME/libsycl_cache` on Linux, if `XDG_CACHE_HOME` is not set then `$HOME/.cache/libsycl_cache`. When none of the environment variables are set SYCL persistent cache is disabled. | diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 5e027466d7949..6fb2dd375fe37 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -125,7 +125,8 @@ context_impl::~context_impl() { DeviceGlobalMapEntry *DGEntry = detail::ProgramManager::getInstance().getDeviceGlobalEntry( DeviceGlobal); - DGEntry->removeAssociatedResources(this); + if (DGEntry != nullptr) + DGEntry->removeAssociatedResources(this); } MCachedLibPrograms.clear(); // TODO catch an exception and put it to list of asynchronous exceptions diff --git a/sycl/source/detail/device_binary_image.hpp b/sycl/source/detail/device_binary_image.hpp index ee96ccc998d27..ac4fb92d3f9a8 100644 --- a/sycl/source/detail/device_binary_image.hpp +++ b/sycl/source/detail/device_binary_image.hpp @@ -250,7 +250,7 @@ class RTDeviceBinaryImage { protected: sycl_device_binary get() const { return Bin; } - sycl_device_binary Bin; + sycl_device_binary Bin = nullptr; ur::DeviceBinaryType Format = SYCL_DEVICE_BINARY_TYPE_NONE; RTDeviceBinaryImage::PropertyRange SpecConstIDMap; diff --git a/sycl/source/detail/device_global_map.hpp b/sycl/source/detail/device_global_map.hpp index 256c48066ec87..8ff48b3f29c52 100644 --- a/sycl/source/detail/device_global_map.hpp +++ b/sycl/source/detail/device_global_map.hpp @@ -87,6 +87,7 @@ }); if (findDevGlobalByValue != MPtr2DeviceGlobal.end()) MPtr2DeviceGlobal.erase(findDevGlobalByValue); + MDeviceGlobals.erase(DevGlobalIt); } } @@ -112,8 +114,7 @@ class DeviceGlobalMap { DeviceGlobalMapEntry *getEntry(const void *DeviceGlobalPtr) { std::lock_guard DeviceGlobalsGuard(MDeviceGlobalsMutex); auto Entry = MPtr2DeviceGlobal.find(DeviceGlobalPtr); - assert(Entry != MPtr2DeviceGlobal.end() && "Device global entry not found"); - return Entry->second; + return (Entry != MPtr2DeviceGlobal.end()) ? Entry->second : nullptr; } DeviceGlobalMapEntry * diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp index 1f82a605056dc..7b5ecd0a62133 100644 --- a/sycl/source/detail/device_global_map_entry.cpp +++ b/sycl/source/detail/device_global_map_entry.cpp @@ -17,14 +17,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -DeviceGlobalUSMMem::~DeviceGlobalUSMMem() { - // removeAssociatedResources is expected to have cleaned up both the pointer - // and the event. When asserts are enabled the values are set, so we check - // these here. - assert(MPtr == nullptr && "MPtr has not been cleaned up."); - assert(MInitEvent == nullptr && "MInitEvent has not been cleaned up."); -} - OwnedUrEvent DeviceGlobalUSMMem::getInitEvent(adapter_impl &Adapter) { std::lock_guard Lock(MInitEventMutex); if (MInitEvent == nullptr) diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp index 1796e8d179db1..19d37f3210298 100644 --- a/sycl/source/detail/device_global_map_entry.hpp +++ b/sycl/source/detail/device_global_map_entry.hpp @@ -33,7 +33,7 @@ using EventImplPtr = std::shared_ptr<event_impl>; struct DeviceGlobalUSMMem { DeviceGlobalUSMMem(void *Ptr) : MPtr(Ptr) {} - ~DeviceGlobalUSMMem(); + ~DeviceGlobalUSMMem() = default; void *const &getPtr() const noexcept { return MPtr; } diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index ff89080ad19eb..66b181a9bc0eb 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -272,7 +272,9 @@ void GlobalHandler::releaseDefaultContexts() { // For Linux, early shutdown is here, and late shutdown is called from // a low priority destructor.
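[Editor's note: the hunk below also deletes the shutdown handler's copy operations. A minimal sketch of the idea, with names simplified, not the runtime's actual code:

    struct ShutdownHandler {
      ShutdownHandler() = default;
      ShutdownHandler(const ShutdownHandler &) = delete;
      ShutdownHandler &operator=(const ShutdownHandler &) = delete;
      ~ShutdownHandler() { /* run late teardown exactly once */ }
    };

A single static instance is destroyed during static deinitialization; with copies deleted, no second instance can exist whose destructor would repeat the teardown.]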
struct StaticVarShutdownHandler { - + StaticVarShutdownHandler(const StaticVarShutdownHandler &) = delete; + StaticVarShutdownHandler & + operator=(const StaticVarShutdownHandler &) = delete; ~StaticVarShutdownHandler() { try { #ifdef _WIN32 diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 77f28a5131f8a..dd8a3dd72d6b3 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -3886,10 +3886,5 @@ extern "C" void __sycl_register_lib(sycl_device_binaries desc) { // Executed as a part of current module's (.exe, .dll) static initialization extern "C" void __sycl_unregister_lib(sycl_device_binaries desc) { - // Partial cleanup is not necessary at shutdown -#ifndef _WIN32 - if (!sycl::detail::GlobalHandler::instance().isOkToDefer()) - return; sycl::detail::ProgramManager::getInstance().removeImages(desc); -#endif } diff --git a/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp b/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp index e30c3b8f1f949..951ae6ce8c474 100644 --- a/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp +++ b/sycl/test-e2e/Adapters/level_zero/ext_intel_cslice.cpp @@ -4,7 +4,10 @@ // XFAIL: gpu-intel-pvc-1T // XFAIL-TRACKER: https://github.com/intel/llvm/issues/15699 -// XFAIL: linux && run-mode && (arch-intel_gpu_bmg_g21 || gpu-intel-dg2) && !igc-dev +// XFAIL: gpu-intel-dg2 +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18576 + +// XFAIL: linux && run-mode && arch-intel_gpu_bmg_g21 && !igc-dev // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18576 // XFAIL: windows && arch-intel_gpu_bmg_g21 diff --git a/sycl/test-e2e/AddressSanitizer/lit.local.cfg b/sycl/test-e2e/AddressSanitizer/lit.local.cfg index 8a4709eb254bb..c2bc429f1bb3f 100644 --- a/sycl/test-e2e/AddressSanitizer/lit.local.cfg +++ b/sycl/test-e2e/AddressSanitizer/lit.local.cfg @@ -28,3 +28,5 @@ unsupported_san_flags = [ ] if any(flag in config.cxx_flags for flag in unsupported_san_flags): config.unsupported=True + +config.environment["ZE_AFFINITY_MASK"] = "0" diff --git a/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp b/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp new file mode 100644 index 0000000000000..eae3329599deb --- /dev/null +++ b/sycl/test-e2e/IntermediateLib/Inputs/incrementing_lib.cpp @@ -0,0 +1,38 @@ +#include <sycl/sycl.hpp> + +#if defined(_WIN32) +#define API_EXPORT __declspec(dllexport) +#else +#define API_EXPORT +#endif + +#ifndef INC +#define INC 1 +#endif + +#ifndef CLASSNAME +#define CLASSNAME same +#endif + +#ifdef WITH_DEVICE_GLOBALS +// Using device globals within the shared libraries only +// works if the names do not collide. Note that we cannot +// load a library multiple times if it has a device global.
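[Editor's note: the token pasting below is what keeps those names unique per library. For a hypothetical build with -DWITH_DEVICE_GLOBALS -DCLASSNAME=classA, the expansion is:

    CONCAT(DGVar, CLASSNAME) // -> CONCAT_HELPER(DGVar, classA) -> DGVarclassA

so lib_a, lib_b, and lib_c can each carry a distinctly named device global.]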
+#define CONCAT_HELPER(a, b) a##b +#define CONCAT(a, b) CONCAT_HELPER(a, b) + +using SomeProperties = decltype(sycl::ext::oneapi::experimental::properties{}); +sycl::ext::oneapi::experimental::device_global<int, SomeProperties> + CONCAT(DGVar, CLASSNAME) __attribute__((visibility("default"))); + +#endif // WITH_DEVICE_GLOBALS + +extern "C" API_EXPORT void performIncrementation(sycl::queue &q, + sycl::buffer<int, 1> &buf) { + sycl::range<1> r = buf.get_range(); + q.submit([&](sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.parallel_for( + r, [=](sycl::id<1> idx) { acc[idx] += INC; }); + }); +} \ No newline at end of file diff --git a/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp b/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp new file mode 100644 index 0000000000000..e161ae0f6557f --- /dev/null +++ b/sycl/test-e2e/IntermediateLib/multi_lib_app.cpp @@ -0,0 +1,154 @@ +// UNSUPPORTED: cuda || hip +// UNSUPPORTED-TRACKER: CMPLRLLVM-69415 + +// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%} +// DEFINE: %{shared_lib_ext} = %if windows %{dll%} %else %{so%} + +// clang-format off +// IMPORTANT -DSO_PATH='R"(%T)"' +// We need to capture %T, the build directory, in a string +// and the normal STRINGIFY() macros hack won't work. +// Because on Windows, the path delimiters are \, +// which C++ preprocessor converts to escape sequences, +// which becomes a nightmare. +// So the hack here is to put heredoc in the definition +// and use single quotes, which Python forgivingly accepts. +// clang-format on + +// RUN: %{build} %{fPIC_flag} -DSO_PATH='R"(%T)"' -o %t.out + +// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=1 -o %T/lib_a.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp +// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=2 -o %T/lib_b.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp +// RUN: %clangxx -fsycl %{fPIC_flag} -shared -DINC=4 -o %T/lib_c.%{shared_lib_ext} %S/Inputs/incrementing_lib.cpp + +// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out + +// This test uses a kernel of the same name in three different shared libraries. +// It loads each library, calls the kernel, and checks that the incrementation +// is done correctly, and then unloads the library. +// It also reloads the first library after unloading it. +// This test ensures that __sycl_register_lib() and __sycl_unregister_lib() +// are called correctly, and that the device images are cleaned up properly. + + +#include <sycl/sycl.hpp> + +using namespace sycl::ext::oneapi::experimental; + + +#ifdef _WIN32 +#include <windows.h> + +void *loadOsLibrary(const std::string &LibraryPath) { + HMODULE h = + LoadLibraryExA(LibraryPath.c_str(), NULL, LOAD_WITH_ALTERED_SEARCH_PATH); + return (void *)h; +} +int unloadOsLibrary(void *Library) { + return FreeLibrary((HMODULE)Library) ? 0 : 1; +} +void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) { + return (void *)GetProcAddress((HMODULE)Library, FunctionName.c_str()); +} + +#else +#include <dlfcn.h> + +void *loadOsLibrary(const std::string &LibraryPath) { + void *so = dlopen(LibraryPath.c_str(), RTLD_NOW); + if (!so) { + char *Error = dlerror(); + std::cerr << "dlopen(" << LibraryPath << ") failed with <" + << (Error ?
Error : "unknown error") << ">" << std::endl; + } + return so; +} + +int unloadOsLibrary(void *Library) { return dlclose(Library); } + +void *getOsLibraryFuncAddress(void *Library, const std::string &FunctionName) { + return dlsym(Library, FunctionName.c_str()); +} +#endif + +// Define the function pointer type for performIncrementation +using IncFuncT = void(sycl::queue &, sycl::buffer &); + +void initializeBuffer(sycl::buffer &buf) { + auto acc = sycl::host_accessor(buf); + for (size_t i = 0; i < buf.size(); ++i) + acc[i] = 0; +} + +void checkIncrementation(sycl::buffer &buf, int val) { + auto acc = sycl::host_accessor(buf); + for (size_t i = 0; i < buf.size(); ++i) { + std::cout << acc[i] << " "; + assert(acc[i] == val); + } + std::cout << std::endl; +} + +int main() { + sycl::queue q; + + sycl::range<1> r(8); + sycl::buffer buf(r); + initializeBuffer(buf); + + std::string base_path = SO_PATH; + +#ifdef _WIN32 + std::string path_to_lib_a = base_path + "\\lib_a.dll"; + std::string path_to_lib_b = base_path + "\\lib_b.dll"; + std::string path_to_lib_c = base_path + "\\lib_c.dll"; +#else + std::string path_to_lib_a = base_path + "/lib_a.so"; + std::string path_to_lib_b = base_path + "/lib_b.so"; + std::string path_to_lib_c = base_path + "/lib_c.so"; +#endif + + std::cout << "paths: " << path_to_lib_a << std::endl; + std::cout << "SO_PATH: " << SO_PATH << std::endl; + + void *lib_a = loadOsLibrary(path_to_lib_a); + void *f = getOsLibraryFuncAddress(lib_a, "performIncrementation"); + auto performIncrementationFuncA = reinterpret_cast(f); + performIncrementationFuncA(q, buf); // call the function from lib_a + q.wait(); + checkIncrementation(buf, 1); + unloadOsLibrary(lib_a); + std::cout << "lib_a done" << std::endl; + + + // Now RELOAD lib_a and try it again. 
+ lib_a = loadOsLibrary(path_to_lib_a); + f = getOsLibraryFuncAddress(lib_a, "performIncrementation"); + performIncrementationFuncA = reinterpret_cast<IncFuncT *>(f); + performIncrementationFuncA(q, buf); // call the function from lib_a + q.wait(); + checkIncrementation(buf, 1 + 1); + unloadOsLibrary(lib_a); + std::cout << "reload of lib_a done" << std::endl; + + + void *lib_b = loadOsLibrary(path_to_lib_b); + f = getOsLibraryFuncAddress(lib_b, "performIncrementation"); + auto performIncrementationFuncB = reinterpret_cast<IncFuncT *>(f); + performIncrementationFuncB(q, buf); // call the function from lib_b + q.wait(); + checkIncrementation(buf, 1 + 1 + 2); + unloadOsLibrary(lib_b); + std::cout << "lib_b done" << std::endl; + + void *lib_c = loadOsLibrary(path_to_lib_c); + f = getOsLibraryFuncAddress(lib_c, "performIncrementation"); + auto performIncrementationFuncC = reinterpret_cast<IncFuncT *>(f); + performIncrementationFuncC(q, buf); // call the function from lib_c + q.wait(); + checkIncrementation(buf, 1 + 1 + 2 + 4); + unloadOsLibrary(lib_c); + std::cout << "lib_c done" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/sycl/test-e2e/MemorySanitizer/lit.local.cfg b/sycl/test-e2e/MemorySanitizer/lit.local.cfg index d6da6eb7bf3bf..617db32b60624 100644 --- a/sycl/test-e2e/MemorySanitizer/lit.local.cfg +++ b/sycl/test-e2e/MemorySanitizer/lit.local.cfg @@ -35,3 +35,5 @@ unsupported_san_flags = [ ] if any(flag in config.cxx_flags for flag in unsupported_san_flags): config.unsupported=True + +config.environment["ZE_AFFINITY_MASK"] = "0" diff --git a/sycl/test-e2e/ThreadSanitizer/lit.local.cfg b/sycl/test-e2e/ThreadSanitizer/lit.local.cfg index fe03e06b8d89a..aee25f0a5ba0f 100644 --- a/sycl/test-e2e/ThreadSanitizer/lit.local.cfg +++ b/sycl/test-e2e/ThreadSanitizer/lit.local.cfg @@ -33,3 +33,5 @@ unsupported_san_flags = [ ] if any(flag in config.cxx_flags for flag in unsupported_san_flags): config.unsupported=True + +config.environment["ZE_AFFINITY_MASK"] = "0" diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index 5fecdc5b8f67d..86da10bbffef7 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -50,8 +50,42 @@ struct NDRDescT { << GlobalOffset[2] << "\n"; } }; + +namespace { +class WaitInfo { + std::vector<ur_event_handle_t> *const events; + static_assert(std::is_pointer_v<decltype(events)>); + +public: + WaitInfo(uint32_t numEvents, const ur_event_handle_t *WaitList) + : events(numEvents ?
new std::vector<ur_event_handle_t>( + WaitList, WaitList + numEvents) + : nullptr) {} + void wait() const { + if (events) + urEventWait(events->size(), events->data()); + } + std::unique_ptr<std::vector<ur_event_handle_t>> getUniquePtr() { + return std::unique_ptr<std::vector<ur_event_handle_t>>(events); + } +}; + +inline static WaitInfo getWaitInfo(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList) { + return native_cpu::WaitInfo(numEventsInWaitList, phEventWaitList); +} + +} // namespace } // namespace native_cpu +static inline native_cpu::state getState(const native_cpu::NDRDescT &ndr) { + native_cpu::state resized_state( + ndr.GlobalSize[0], ndr.GlobalSize[1], ndr.GlobalSize[2], ndr.LocalSize[0], + ndr.LocalSize[1], ndr.LocalSize[2], ndr.GlobalOffset[0], + ndr.GlobalOffset[1], ndr.GlobalOffset[2]); + return resized_state; +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -67,7 +101,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } - urEventWait(numEventsInWaitList, phEventWaitList); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); @@ -119,14 +152,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); std::vector<std::future<void>> futures; - std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; - native_cpu::state state(ndr.GlobalSize[0], ndr.GlobalSize[1], - ndr.GlobalSize[2], ndr.LocalSize[0], ndr.LocalSize[1], - ndr.LocalSize[2], ndr.GlobalOffset[0], - ndr.GlobalOffset[1], ndr.GlobalOffset[2]); auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); @@ -134,6 +162,8 @@ auto kernel = std::make_unique<ur_kernel_handle_t_>(*hKernel); kernel->updateMemPool(numParallelThreads); + auto InEvents = native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); + const size_t numWG = numWG0 * numWG1 * numWG2; const size_t numWGPerThread = numWG / numParallelThreads; const size_t remainderWG = numWG - numWGPerThread * numParallelThreads; @@ -147,13 +177,11 @@ rangeEnd[0] = rangeEnd[3] % numWG0; rangeEnd[1] = (rangeEnd[3] / numWG0) % numWG1; rangeEnd[2] = rangeEnd[3] / (numWG0 * numWG1); - futures.emplace_back( - tp.schedule_task([state, &kernel = *kernel, rangeStart, - rangeEnd = rangeEnd[3], numWG0, numWG1, -#ifndef NATIVECPU_USE_OCK - localSize = ndr.LocalSize, -#endif - numParallelThreads](size_t threadId) mutable { + futures.emplace_back(tp.schedule_task( + [ndr, InEvents, &kernel = *kernel, rangeStart, rangeEnd = rangeEnd[3], + numWG0, numWG1, numParallelThreads](size_t threadId) { + auto state = getState(ndr); + InEvents.wait(); for (size_t g0 = rangeStart[0], g1 = rangeStart[1], g2 = rangeStart[2], g3 = rangeStart[3]; g3 < rangeEnd; ++g3) { @@ -162,9 +190,9 @@ kernel._subhandler( kernel.getArgs(numParallelThreads, threadId).data(), &state); #else - for (size_t local2 = 0; local2 < localSize[2]; ++local2) { - for (size_t local1 = 0; local1 < localSize[1]; ++local1) { - for (size_t local0 = 0; local0 < localSize[0]; ++local0) { + for (size_t local2 = 0; local2 <
ndr.LocalSize[2]; ++local2) { + for (size_t local1 = 0; local1 < ndr.LocalSize[1]; ++local1) { + for (size_t local0 = 0; local0 < ndr.LocalSize[0]; ++local0) { state.update(g0, g1, g2, local0, local1, local2); kernel._subhandler( kernel.getArgs(numParallelThreads, threadId).data(), @@ -189,7 +217,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([kernel = std::move(kernel), hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event, + InEvents = InEvents.getUniquePtr()]() { event->tick_end(); // TODO: avoid calling clear() here. hKernel->_localArgInfo.clear(); @@ -207,20 +236,32 @@ static inline ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, T &&f) { - urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event = nullptr; + ur_event_handle_t *phEvent, T &&f, bool blocking = true) { if (phEvent) { - event = new ur_event_handle_t_(hQueue, command_type); + ur_event_handle_t event = new ur_event_handle_t_(hQueue, command_type); + *phEvent = event; event->tick_start(); + if (blocking || hQueue->isInOrder()) { + urEventWait(numEventsInWaitList, phEventWaitList); + ur_result_t result = f(); + event->tick_end(); + return result; + } + auto &tp = hQueue->getDevice()->tp; + std::vector<std::future<void>> futures; + auto InEvents = + native_cpu::getWaitInfo(numEventsInWaitList, phEventWaitList); + futures.emplace_back(tp.schedule_task([f, InEvents](size_t) { + InEvents.wait(); + f(); + })); + event->set_futures(futures); + event->set_callback( + [event, InEvents = InEvents.getUniquePtr()]() { event->tick_end(); }); + return UR_RESULT_SUCCESS; } - + urEventWait(numEventsInWaitList, phEventWaitList); ur_result_t result = f(); - - if (phEvent) { - event->tick_end(); - *phEvent = event; - } return result; } @@ -231,7 +272,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( // TODO: the wait here should be async return withTimingEvent(UR_COMMAND_EVENTS_WAIT, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return withTimingEvent(UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( @@ -250,9 +291,43 @@ UR_APIEXPORT ur_result_t urEnqueueEventsWaitWithBarrierExt( phEventWaitList, phEvent); } +template <bool IsRead> +static inline void MemBufferReadWriteRect_impl( + ur_mem_handle_t Buff, ur_rect_offset_t BufferOffset, + ur_rect_offset_t HostOffset, ur_rect_region_t region, size_t BufferRowPitch, + size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, + typename std::conditional<IsRead, void *, const void *>::type DstMem) { + // TODO: check other constraints, performance optimizations + // More sharing with level_zero where possible + + if (BufferRowPitch == 0) + BufferRowPitch = region.width; + if (BufferSlicePitch == 0) + BufferSlicePitch = BufferRowPitch * region.height; + if (HostRowPitch == 0) + HostRowPitch = region.width; + if (HostSlicePitch == 0) + HostSlicePitch = HostRowPitch * region.height; + for
(size_t w = 0; w < region.width; w++) + for (size_t h = 0; h < region.height; h++) + for (size_t d = 0; d < region.depth; d++) { + size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch + + (h + BufferOffset.y) * BufferRowPitch + w + + BufferOffset.x; + size_t host_origin = (d + HostOffset.z) * HostSlicePitch + + (h + HostOffset.y) * HostRowPitch + w + + HostOffset.x; + int8_t &buff_mem = ur_cast<int8_t *>(Buff->_mem)[buff_orign]; + if constexpr (IsRead) + ur_cast<int8_t *>(DstMem)[host_origin] = buff_mem; + else + buff_mem = ur_cast<const int8_t *>(DstMem)[host_origin]; + } +} + template <bool IsRead> static inline ur_result_t enqueueMemBufferReadWriteRect_impl( - ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool, + ur_queue_handle_t hQueue, ur_mem_handle_t Buff, bool blocking, ur_rect_offset_t BufferOffset, ur_rect_offset_t HostOffset, ur_rect_region_t region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, @@ -265,71 +340,63 @@ static inline ur_result_t enqueueMemBufferReadWriteRect_impl( else command_t = UR_COMMAND_MEM_BUFFER_WRITE_RECT; return withTimingEvent( - command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, [&]() { - // TODO: blocking, check other constraints, performance optimizations - // More sharing with level_zero where possible - - if (BufferRowPitch == 0) - BufferRowPitch = region.width; - if (BufferSlicePitch == 0) - BufferSlicePitch = BufferRowPitch * region.height; - if (HostRowPitch == 0) - HostRowPitch = region.width; - if (HostSlicePitch == 0) - HostSlicePitch = HostRowPitch * region.height; - for (size_t w = 0; w < region.width; w++) - for (size_t h = 0; h < region.height; h++) - for (size_t d = 0; d < region.depth; d++) { - size_t buff_orign = (d + BufferOffset.z) * BufferSlicePitch + - (h + BufferOffset.y) * BufferRowPitch + w + - BufferOffset.x; - size_t host_origin = (d + HostOffset.z) * HostSlicePitch + - (h + HostOffset.y) * HostRowPitch + w + - HostOffset.x; - int8_t &buff_mem = ur_cast<int8_t *>(Buff->_mem)[buff_orign]; - if constexpr (IsRead) - ur_cast<int8_t *>(DstMem)[host_origin] = buff_mem; - else - buff_mem = ur_cast<const int8_t *>(DstMem)[host_origin]; - } - + command_t, hQueue, NumEventsInWaitList, phEventWaitList, phEvent, + [BufferRowPitch, region, BufferSlicePitch, HostRowPitch, HostSlicePitch, + BufferOffset, HostOffset, Buff, DstMem]() { + MemBufferReadWriteRect_impl<IsRead>( + Buff, BufferOffset, HostOffset, region, BufferRowPitch, + BufferSlicePitch, HostRowPitch, HostSlicePitch, DstMem); return UR_RESULT_SUCCESS; - }); + }, + blocking); } -static inline ur_result_t doCopy_impl(ur_queue_handle_t hQueue, void *DstPtr, - const void *SrcPtr, size_t Size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, - ur_command_t command_type) { - return withTimingEvent(command_type, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, [&]() { - if (SrcPtr != DstPtr && Size) - memmove(DstPtr, SrcPtr, Size); - return UR_RESULT_SUCCESS; - }); +template <bool AllowPartialOverlap = true> +static inline ur_result_t doCopy_impl( + ur_queue_handle_t hQueue, void *DstPtr, const void *SrcPtr, size_t Size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, ur_command_t command_type, bool blocking) { + if (SrcPtr == DstPtr || Size == 0) { + bool hasInEvents = numEventsInWaitList && phEventWaitList; + return withTimingEvent( + command_type, hQueue, numEventsInWaitList, phEventWaitList, phEvent, + []() { return UR_RESULT_SUCCESS; }, blocking || !hasInEvents); + } + + return withTimingEvent( + command_type, hQueue,
numEventsInWaitList, phEventWaitList, phEvent, + [DstPtr, SrcPtr, Size]() { + if constexpr (AllowPartialOverlap) { + memmove(DstPtr, SrcPtr, Size); + } else { + memcpy(DstPtr, SrcPtr, Size); + } + return UR_RESULT_SUCCESS; + }, + blocking); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingRead*/, + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { void *FromPtr = /*Src*/ hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, pDst, FromPtr, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_READ, + blockingRead); return res; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( - ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool /*blockingWrite*/, + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { void *ToPtr = hBuffer->_mem + offset; auto res = doCopy_impl(hQueue, ToPtr, pSrc, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE, + blockingWrite); return res; } @@ -368,7 +435,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( const void *SrcPtr = hBufferSrc->_mem + srcOffset; void *DstPtr = hBufferDst->_mem + dstOffset; return doCopy_impl(hQueue, DstPtr, SrcPtr, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY); + phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY, + true /*TODO: check false for non-blocking*/); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( @@ -379,7 +447,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return enqueueMemBufferReadWriteRect_impl<true>( - hQueue, hBufferSrc, false /*todo: check blocking*/, srcOrigin, + hQueue, hBufferSrc, true /*todo: check false for non-blocking*/, + srcOrigin, /*HostOffset*/ dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, hBufferDst->_mem, numEventsInWaitList, phEventWaitList, phEvent); @@ -390,12 +459,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return withTimingEvent( UR_COMMAND_MEM_BUFFER_FILL, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - + phEvent, [hBuffer, offset, size, patternSize, pPattern]() { // TODO: error checking // TODO: handle async void *startingPtr = hBuffer->_mem + offset; @@ -449,7 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_event_handle_t *phEvent, void **ppRetMap) { return withTimingEvent(UR_COMMAND_MEM_BUFFER_MAP, hQueue, numEventsInWaitList, - phEventWaitList, phEvent, [&]() { + phEventWaitList, phEvent, + [ppRetMap, hBuffer, offset]() { *ppRetMap = hBuffer->_mem + offset; return UR_RESULT_SUCCESS; }); @@ -461,7 +529,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_event_handle_t *phEvent) { return withTimingEvent(UR_COMMAND_MEM_UNMAP, hQueue, numEventsInWaitList, phEventWaitList, phEvent, - [&]() { return UR_RESULT_SUCCESS; }); + []() { return UR_RESULT_SUCCESS; }); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -470,7 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return withTimingEvent( UR_COMMAND_USM_FILL, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { + phEvent, [ptr, pPattern, patternSize, size]() { UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pPattern, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(patternSize != 0, UR_RESULT_ERROR_INVALID_SIZE) @@ -520,20 +588,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( - ur_queue_handle_t hQueue, bool /*blocking*/, void *pDst, const void *pSrc, + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return withTimingEvent( - UR_COMMAND_USM_MEMCPY, hQueue, numEventsInWaitList, phEventWaitList, - phEvent, [&]() { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - - memcpy(pDst, pSrc, size); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - return UR_RESULT_SUCCESS; - }); + return doCopy_impl( + hQueue, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent, + UR_COMMAND_USM_MEMCPY, blocking); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
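[Editor's sketch: the NativeCPU changes above replace the unconditional urEventWait at enqueue time with a WaitInfo object whose wait() runs inside the scheduled task, so non-blocking enqueues return immediately. A self-contained analogue of the pattern, using std::async in place of the adapter's thread pool (names here are illustrative, not UR API):

    #include <future>
    #include <memory>
    #include <vector>

    int main() {
      // Dependencies of the command, analogous to phEventWaitList.
      auto deps = std::make_shared<std::vector<std::future<void>>>();
      deps->emplace_back(std::async(std::launch::async, [] { /* producer */ }));

      // "Enqueue" returns immediately; the worker waits on its dependencies
      // before running the command, mirroring WaitInfo::wait() followed by f().
      auto command = std::async(std::launch::async, [deps] {
        for (auto &d : *deps)
          d.wait();
        /* execute the command */
      });
      command.wait(); // host-side synchronization point, like urEventWait
    }

Blocking entry points and in-order queues still take the synchronous path, which is why withTimingEvent keeps its immediate urEventWait branch.]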