Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 96 additions & 38 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,26 +47,37 @@ namespace {
// This is an experimental option to test performance of device to device copy
// operations on copy engines (versus compute engine).
//
// The setting is read once, when this variable is initialized:
//   * UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY is consulted first;
//   * SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY is the fallback.
// The option is on only when the selected variable parses to a non-zero
// integer, and off when neither variable is set.
static const bool UseCopyEngineForD2DCopy = [] {
  const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY");
  const char *PiRet =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *CopyEngineForD2DCopy = UrRet ? UrRet : PiRet;

  // NOTE: std::stoi throws if the text is not an integer.
  return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0));
}();

// This is an experimental option that allows the use of copy engine, if
// available in the device, in Level Zero plugin for copy operations submitted
// to an in-order queue. The default is 1.
//
// UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE is consulted first and
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE is the fallback. The
// option is switched off only when the selected variable parses to 0.
static const bool UseCopyEngineForInOrderQueue = [] {
  const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
  const char *PiRet =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *CopyEngineForInOrderQueue = UrRet ? UrRet : PiRet;

  // Unset means enabled. NOTE: std::stoi throws if the text is not an integer.
  return (!CopyEngineForInOrderQueue ||
          (std::stoi(CopyEngineForInOrderQueue) != 0));
}();

// This is an experimental option that allows the use of multiple command lists
// when submitting barriers. The default is 1.
static const bool UseMultipleCmdlistBarriers = [] {
const char *UseMultipleCmdlistBarriersFlag =
const char *UrRet = std::getenv("UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS");
const char *PiRet =
std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS");
const char *UseMultipleCmdlistBarriersFlag =
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (!UseMultipleCmdlistBarriersFlag)
return true;
return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
Expand All @@ -75,8 +86,11 @@ static const bool UseMultipleCmdlistBarriers = [] {
// This is an experimental option that allows to disable caching of events in
// the context.
static const bool DisableEventsCaching = [] {
const char *UrRet = std::getenv("UR_L0_DISABLE_EVENTS_CACHING");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
const char *DisableEventsCachingFlag =
std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (!DisableEventsCachingFlag)
return false;
return std::stoi(DisableEventsCachingFlag) != 0;
Expand All @@ -85,8 +99,11 @@ static const bool DisableEventsCaching = [] {
// This is an experimental option that allows reset and reuse of uncompleted
// events in the in-order queue with discard_events property.
static const bool ReuseDiscardedEvents = [] {
const char *UrRet = std::getenv("UR_L0_REUSE_DISCARDED_EVENTS");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS");
const char *ReuseDiscardedEventsFlag =
std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS");
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (!ReuseDiscardedEventsFlag)
return true;
return std::stoi(ReuseDiscardedEventsFlag) > 0;
Expand All @@ -95,8 +112,11 @@ static const bool ReuseDiscardedEvents = [] {
// Due to a bug with 2D memory copy to and from non-USM pointers, this option is
// disabled by default.
static const bool UseMemcpy2DOperations = [] {
const char *UrRet = std::getenv("UR_L0_USE_NATIVE_USM_MEMCPY2D");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D");
const char *UseMemcpy2DOperationsFlag =
std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D");
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (!UseMemcpy2DOperationsFlag)
return false;
return std::stoi(UseMemcpy2DOperationsFlag) > 0;
Expand Down Expand Up @@ -130,16 +150,21 @@ static inline pi_result mapError(ze_result_t Result) {
// paths be less likely affected.
//
static bool doEagerInit = [] {
  // UR_L0_EAGER_INIT takes precedence; SYCL_EAGER_INIT is the fallback.
  const char *UrRet = std::getenv("UR_L0_EAGER_INIT");
  const char *PiRet = std::getenv("SYCL_EAGER_INIT");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *EagerInit = UrRet ? UrRet : PiRet;
  // Off unless the selected variable parses (via atoi) to a non-zero integer.
  return EagerInit ? std::atoi(EagerInit) != 0 : false;
}();

// Maximum number of events that can be present in an event ZePool is captured
// here. Setting it to 256 gave best possible performance for several
// benchmarks.
static const pi_uint32 MaxNumEventsPerPool = [] {
const auto MaxNumEventsPerPoolEnv =
std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
const char *MaxNumEventsPerPoolEnv =
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

pi_uint32 Result =
MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256;
if (Result <= 0)
Expand Down Expand Up @@ -177,16 +202,18 @@ template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) {

} // anonymous namespace

// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
// UR_L0_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
// which case all compute commands will be submitted to the command-queue
// with the given index in the compute command group. If it is instead set
// to negative then all available compute engines may be used.
//
// The default value is "0".
//
static const std::pair<int, int> getRangeOfAllowedComputeEngines() {
static const char *EnvVar =
std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
const char *UrRet = std::getenv("UR_L0_USE_COMPUTE_ENGINE");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE");
const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr);

// If the environment variable is not set only use "0" CCS for now.
// TODO: allow all CCSs when HW support is complete.
if (!EnvVar)
Expand Down Expand Up @@ -466,8 +493,13 @@ pi_result _pi_queue::addEventToQueueCache(pi_event Event) {
// If number of events in the immediate command list exceeds this threshold then
// cleanup process for those events is executed.
static const size_t ImmCmdListsEventCleanupThreshold = [] {
const char *ImmCmdListsEventCleanupThresholdStr = std::getenv(
const char *UrRet =
std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
const char *PiRet = std::getenv(
"SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
const char *ImmCmdListsEventCleanupThresholdStr =
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

static constexpr int Default = 1000;
if (!ImmCmdListsEventCleanupThresholdStr)
return Default;
Expand All @@ -484,8 +516,12 @@ static const size_t ImmCmdListsEventCleanupThreshold = [] {
// Get value of the threshold for number of active command lists allowed before
// we start heuristically cleaning them up.
static const size_t CmdListsCleanupThreshold = [] {
const char *CmdListsCleanupThresholdStr =
const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD");
const char *PiRet =
std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD");
const char *CmdListsCleanupThresholdStr =
UrRet ? UrRet : (PiRet ? PiRet : nullptr);

static constexpr int Default = 20;
if (!CmdListsCleanupThresholdStr)
return Default;
Expand Down Expand Up @@ -826,9 +862,17 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
zeCommandListBatchConfig Config{}; // default initialize

// Default value of 0. This specifies to use dynamic batch size adjustment.
const auto BatchSizeStr =
(IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
: std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
const char *UrRet = nullptr;
const char *PiRet = nullptr;
if (IsCopy) {
UrRet = std::getenv("UR_L0_COPY_BATCH_SIZE");
PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE");
} else {
UrRet = std::getenv("UR_L0_BATCH_SIZE");
PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
}
const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (BatchSizeStr) {
pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
// Level Zero may only support a limited number of commands per command
Expand Down Expand Up @@ -861,10 +905,9 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
Val = std::stoi(BatchConfig.substr(Pos));
} catch (...) {
if (IsCopy)
urPrint(
"SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
urPrint("UR_L0_COPY_BATCH_SIZE: failed to parse value\n");
else
urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
urPrint("UR_L0_BATCH_SIZE: failed to parse value\n");
break;
}
switch (Ord) {
Expand All @@ -887,21 +930,20 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
die("Unexpected batch config");
}
if (IsCopy)
urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
urPrint("UR_L0_COPY_BATCH_SIZE: dynamic batch param "
"#%d: %d\n",
(int)Ord, (int)Val);
else
urPrint(
"SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
(int)Ord, (int)Val);
urPrint("UR_L0_BATCH_SIZE: dynamic batch param #%d: %d\n", (int)Ord,
(int)Val);
};

} else {
// Negative batch sizes are silently ignored.
if (IsCopy)
urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
urPrint("UR_L0_COPY_BATCH_SIZE: ignored negative value\n");
else
urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
urPrint("UR_L0_BATCH_SIZE: ignored negative value\n");
}
}
return Config;
Expand All @@ -922,7 +964,10 @@ static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] {
// Temporarily check whether the immediate command list env var has been set
// to a non-zero integer. This affects default behavior of make_queue API.
//
// UR_L0_USE_IMMEDIATE_COMMANDLISTS is consulted first and
// SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS is the fallback. The result is
// false when neither variable is present or when the selected one parses to 0.
static const bool ImmediateCommandlistEnvVarIsSet = [] {
  const char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS");
  const char *PiRet =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *Setting = UrRet ? UrRet : PiRet;
  // NOTE: std::stoi throws if the text is not an integer.
  return Setting ? std::stoi(Setting) != 0 : false;
}();

_pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
Expand Down Expand Up @@ -1893,9 +1938,9 @@ pi_result _pi_queue::executeOpenCommandList(bool IsCopy) {
}

// Controlled by UR_L0_FILTER_EVENT_WAIT_LIST, with
// SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST as the fallback. The flag is true
// only when the selected variable parses to a non-zero integer, and false when
// neither variable is set.
static const bool FilterEventWaitList = [] {
  const char *UrRet = std::getenv("UR_L0_FILTER_EVENT_WAIT_LIST");
  const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *Setting = UrRet ? UrRet : PiRet;
  // NOTE: std::stoi throws if the text is not an integer.
  return Setting ? std::stoi(Setting) != 0 : false;
}();

pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
Expand Down Expand Up @@ -2398,7 +2443,7 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,
return ReturnValue(pi_uint32{Context->RefCount.load()});
case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT:
// 2D USM memcpy is supported unless disabled through
// SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D.
// UR_L0_USE_NATIVE_USM_MEMCPY2D.
return ReturnValue(pi_bool{UseMemcpy2DOperations});
case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT:
case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT:
Expand Down Expand Up @@ -2901,8 +2946,12 @@ pi_result piQueueFinish(pi_queue Queue) {
// TODO: this currently exhibits some issues in the driver, so
// we control this with an env var. Remove this control when
// we settle one way or the other.
static bool HoldLock =
std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr;
const char *UrRet = std::getenv("UR_L0_QUEUE_FINISH_HOLD_LOCK");
const char *PiRet =
std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK");
const bool HoldLock =
UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0);

if (!HoldLock) {
Lock.unlock();
}
Expand Down Expand Up @@ -5793,7 +5842,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
// If we have a list of events to make the barrier from, then we can create a
// barrier on these and use the resulting event as our future barrier.
// We use the same approach if
// SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
// UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
// positive value.
// We use the same approach if we have in-order queue because every command
// depends on previous one, so we don't need to insert barrier to multiple
Expand Down Expand Up @@ -6346,8 +6395,10 @@ pi_result piEnqueueMemBufferCopyRect(
// Default to using compute engine for fill operation, but allow to
// override this with an environment variable.
//
// UR_L0_USE_COPY_ENGINE_FOR_FILL is consulted first and
// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL is the fallback. The copy engine
// is preferred only when the selected variable parses to a non-zero integer.
static bool PreferCopyEngine = [] {
  const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL");
  const char *PiRet =
      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL");
  // PiRet is already nullptr when unset, so no extra null check is needed.
  const char *Env = UrRet ? UrRet : PiRet;
  // NOTE: std::stoi throws if the text is not an integer.
  return Env ? std::stoi(Env) != 0 : false;
}();

// PI interfaces must have queue's and buffer's mutexes locked on entry.
Expand Down Expand Up @@ -7188,7 +7239,10 @@ enum class USMAllocationForceResidencyType {

// Returns the desired USM residency setting
static USMAllocationForceResidencyType USMAllocationForceResidency = [] {
const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
const char *UrRet = std::getenv("UR_L0_USM_RESIDENT");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (!Str)
return USMAllocationForceResidencyType::P2PDevices;
switch (std::atoi(Str)) {
Expand Down Expand Up @@ -8599,8 +8653,12 @@ pi_result _pi_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
// cross-tile traffic.
//
static const bool SingleRootDeviceBufferMigration = [] {
const char *EnvStr =
const char *UrRet =
std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
const char *PiRet =
std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr);

if (EnvStr)
return (std::stoi(EnvStr) != 0);
// The default is to migrate normally, which may not always be the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,7 @@ ur_result_t urDeviceGetInfo(
return UR_RESULT_SUCCESS;
}

// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or
// UR_L0_USE_COPY_ENGINE can be set to an integer value, or
// a pair of integer values of the form "lower_index:upper_index".
// Here, the indices point to copy engines in a list of all available copy
// engines.
Expand All @@ -1038,7 +1038,10 @@ ur_result_t urDeviceGetInfo(
// available copy engines can be used.
const std::pair<int, int>
getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) {
static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE");
const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr);

// If the environment variable is not set, no copy engines are used when
// immediate commandlists are being used. For standard commandlists all are
// used.
Expand All @@ -1061,7 +1064,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) {
int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1));
if ((LowerCopyEngineIndex > UpperCopyEngineIndex) ||
(LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) {
urPrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
urPrint("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
"default set.\n");
LowerCopyEngineIndex = 0;
UpperCopyEngineIndex = INT_MAX;
Expand All @@ -1085,8 +1088,10 @@ _ur_device_handle_t::useImmediateCommandLists() {
// If immediate commandlist setting is not explicitly set, then use the device
// default.
static const int ImmediateCommandlistsSetting = [] {
char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS");
char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
const char *ImmediateCommandlistsSettingStr =
std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
UrRet ? UrRet : (PiRet ? PiRet : nullptr);
if (!ImmediateCommandlistsSettingStr)
return -1;
return std::stoi(ImmediateCommandlistsSettingStr);
Expand Down Expand Up @@ -1114,8 +1119,10 @@ _ur_device_handle_t::useImmediateCommandLists() {

// Get value of device scope events env var setting or default setting
static const EventsScope DeviceEventsSetting = [] {
char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS");
char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
const char *DeviceEventsSettingStr =
std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
UrRet ? UrRet : (PiRet ? PiRet : nullptr);
if (DeviceEventsSettingStr) {
// Override the default if user has explicitly chosen the events scope.
switch (std::stoi(DeviceEventsSettingStr)) {
Expand Down Expand Up @@ -1532,7 +1539,7 @@ ur_result_t urDevicePartition(

// Sub-Sub-Devices are partitioned by CSlices, not by affinity domain.
// However, if
// SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that
// UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that, we
// still expose CSlices in partitioning by affinity domain for compatibility
// reasons.
if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ struct _ur_device_handle_t : _ur_object {
ImmCmdlistMode ImmCommandListUsed{};

// Scope of events used for events on the device
// Can be adjusted with SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS
// Can be adjusted with UR_L0_DEVICE_SCOPE_EVENTS
// for non-immediate command lists
EventsScope ZeEventsScope = AllHostVisible;

Expand Down
Loading