diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index d50c665ed7f92..c6838ba2c0999 100755 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -259,7 +259,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | | `SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of command lists in a queue exceeds this threshold, an attempt is made to cleanup completed command lists for their subsequent reuse. The default is 20. | | `SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of events associated with an immediate command list exceeds this threshold, a check is made for signaled events and these events are recycled. Setting this threshold low causes events to be checked more often, which could result in unneeded events being recycled sooner. However, more frequent event status checks may cost time. The default is 1000. | -| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Controls if/where to make USM allocations resident at the time of allocation. If set to 0 (default) then no special residency is forced. If set to 1 then allocation (device or shared) is made resident at the device of allocation. If set to 2 then allocation (device or shared) is made resident on all devices in the context of allocation that have P2P access to the device of allocation. For host allocation, any non-0 setting forces the allocation resident on all devices in the context. | +| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Controls if/where to make USM allocations resident at the time of allocation. If set to 0 then no special residency is forced. If set to 1 then allocation (device or shared) is made resident at the device of allocation. If set to 2 then allocation (device or shared) is made resident on all devices in the context of allocation that have P2P access to the device of allocation. For host allocation, any non-0 setting forces the allocation resident on all devices in the context. Default is 2. | | `SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D` | Integer | When set to a positive value enables the use of Level Zero USM 2D memory copy operations. Default is 0. | ## Debugging variables for CUDA Plugin diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ae392ee933841..b13bed5224c18 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -7190,12 +7190,12 @@ static bool UseUSMAllocator = [] { }(); enum class USMAllocationForceResidencyType { - // [Default] Do not force memory residency at allocation time. + // Do not force memory residency at allocation time. None = 0, // Force memory resident on the device of allocation at allocation time. // For host allocation force residency on all devices in a context. Device = 1, - // Force memory resident on all devices in the context with P2P + // [Default] Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 @@ -7205,7 +7205,7 @@ enum class USMAllocationForceResidencyType { static USMAllocationForceResidencyType USMAllocationForceResidency = [] { const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); if (!Str) - return USMAllocationForceResidencyType::None; + return USMAllocationForceResidencyType::P2PDevices; switch (std::atoi(Str)) { case 1: return USMAllocationForceResidencyType::Device; diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp old mode 100755 new mode 100644 index 45b7142e832b8..8113c1794e15c --- a/sycl/test-e2e/USM/usm_pooling.cpp +++ b/sycl/test-e2e/USM/usm_pooling.cpp @@ -4,29 +4,29 @@ // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB. // With no pooling: 1,2,3,4,5 allocs lead to ZE call. -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL // With pooling enabled and MaxPooolable=1MB: 1,2,3,4,5 allocs lead to ZE call. -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;1M,4,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345 // With pooling enabled and capacity=1: 1,2,4,5 allocs lead to ZE call. -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,1,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 // With pooling enabled and MaxPoolSize=2MB: 1,2,4,5 allocs lead to ZE call. -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";2M;2M,4,64K" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245 // With pooling enabled and SlabMinSize of 4 MB: 1,5 allocs lead to ZE call. -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=";;2M,4,4M" %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-15 #include "CL/sycl.hpp" #include using namespace sycl;