Skip to content

Commit 5342ec1

Browse files
[SYCL][L0] Add experimental options for fine-tune of dynamic batching (#4492)
Signed-off-by: Sergey V Maslov <sergey.v.maslov@intel.com>
1 parent 9dd1ea3 commit 5342ec1

File tree

2 files changed

+110
-39
lines changed

2 files changed

+110
-39
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 107 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,10 @@ enum DebugLevel {
170170
};
171171

172172
// Controls Level Zero calls tracing.
173-
static int ZeDebug = ZE_DEBUG_NONE;
173+
static const int ZeDebug = [] {
174+
const char *DebugMode = std::getenv("ZE_DEBUG");
175+
return DebugMode ? std::atoi(DebugMode) : ZE_DEBUG_NONE;
176+
}();
174177

175178
static void zePrint(const char *Format, ...) {
176179
if (ZeDebug & ZE_DEBUG_BASIC) {
@@ -843,20 +846,100 @@ static const int ZeMaxCommandListCacheSize = [] {
843846
return CommandListCacheSizeValue;
844847
}();
845848

846-
static const pi_uint32 ZeCommandListBatchSize = [] {
849+
// Configuration of the command-list batching.
850+
typedef struct {
851+
// Default value of 0. This specifies to use dynamic batch size adjustment.
852+
// Other values will try to collect specified amount of commands.
853+
pi_uint32 Size{0};
854+
855+
// If doing dynamic batching, specifies start batch size.
856+
pi_uint32 DynamicSizeStart{4};
857+
858+
// The maximum size for dynamic batch.
859+
pi_uint32 DynamicSizeMax{16};
860+
861+
// The step size for dynamic batch increases.
862+
pi_uint32 DynamicSizeStep{1};
863+
864+
// Thresholds for when to increase the batch size (number of closed early is small
865+
// and number of closed full is high).
866+
pi_uint32 NumTimesClosedEarlyThreshold{2};
867+
pi_uint32 NumTimesClosedFullThreshold{10};
868+
869+
// Tells the starting size of a batch.
870+
pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
871+
// Tells if we are doing dynamic batch size adjustment.
872+
bool dynamic() const { return Size == 0; }
873+
} zeCommandListBatchConfig;
874+
875+
static const zeCommandListBatchConfig ZeCommandListBatch = [] {
876+
zeCommandListBatchConfig Config{}; // default initialize
877+
847878
// Default value of 0. This specifies to use dynamic batch size adjustment.
848-
pi_uint32 BatchSizeVal = 0;
849879
const auto BatchSizeStr = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
850880
if (BatchSizeStr) {
851881
pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
852882
// Level Zero may only support a limited number of commands per command
853883
// list. The actual upper limit is not specified by the Level Zero
854884
// Specification. For now we allow an arbitrary upper limit.
855-
// Negative numbers will be silently ignored.
856-
if (BatchSizeStrVal >= 0)
857-
BatchSizeVal = BatchSizeStrVal;
885+
if (BatchSizeStrVal > 0) {
886+
Config.Size = BatchSizeStrVal;
887+
} else if (BatchSizeStrVal == 0) {
888+
Config.Size = 0;
889+
// We are requested to do dynamic batching. Collect specifics, if any.
890+
// The extended format supported is ":" separated values.
891+
//
892+
// NOTE: these extra settings are experimental and are intended to
893+
// be used only for finding a better default heuristic.
894+
//
895+
std::string BatchConfig(BatchSizeStr);
896+
size_t Ord = 0;
897+
size_t Pos = 0;
898+
while (true) {
899+
if (++Ord > 5)
900+
break;
901+
902+
Pos = BatchConfig.find(":", Pos);
903+
if (Pos == std::string::npos)
904+
break;
905+
++Pos; // past the ":"
906+
907+
pi_uint32 Val;
908+
try {
909+
Val = std::stoi(BatchConfig.substr(Pos));
910+
} catch (...) {
911+
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
912+
break;
913+
}
914+
switch (Ord) {
915+
case 1:
916+
Config.DynamicSizeStart = Val;
917+
break;
918+
case 2:
919+
Config.DynamicSizeMax = Val;
920+
break;
921+
case 3:
922+
Config.DynamicSizeStep = Val;
923+
break;
924+
case 4:
925+
Config.NumTimesClosedEarlyThreshold = Val;
926+
break;
927+
case 5:
928+
Config.NumTimesClosedFullThreshold = Val;
929+
break;
930+
default:
931+
die("Unexpected batch config");
932+
}
933+
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
934+
(int)Ord, (int)Val);
935+
};
936+
937+
} else {
938+
// Negative batch sizes are silently ignored.
939+
zePrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
940+
}
858941
}
859-
return BatchSizeVal;
942+
return Config;
860943
}();
861944

862945
// Retrieve an available command list to be used in a PI call
@@ -1000,7 +1083,7 @@ pi_result _pi_context::getAvailableCommandList(
10001083

10011084
void _pi_queue::adjustBatchSizeForFullBatch() {
10021085
// QueueBatchSize of 0 means never allow batching.
1003-
if (QueueBatchSize == 0 || !UseDynamicBatching)
1086+
if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic())
10041087
return;
10051088

10061089
NumTimesClosedFull += 1;
@@ -1009,19 +1092,20 @@ void _pi_queue::adjustBatchSizeForFullBatch() {
10091092
// the number of times it has been closed full is high, then raise
10101093
// the batching size slowly. Don't raise it if it is already pretty
10111094
// high.
1012-
if (NumTimesClosedEarly <= 2 && NumTimesClosedFull > 10) {
1013-
if (QueueBatchSize < 16) {
1014-
QueueBatchSize = QueueBatchSize + 1;
1095+
if (NumTimesClosedEarly <= ZeCommandListBatch.NumTimesClosedEarlyThreshold &&
1096+
NumTimesClosedFull > ZeCommandListBatch.NumTimesClosedFullThreshold) {
1097+
if (QueueBatchSize < ZeCommandListBatch.DynamicSizeMax) {
1098+
QueueBatchSize += ZeCommandListBatch.DynamicSizeStep;
10151099
zePrint("Raising QueueBatchSize to %d\n", QueueBatchSize);
10161100
}
10171101
NumTimesClosedEarly = 0;
10181102
NumTimesClosedFull = 0;
10191103
}
10201104
}
10211105

1022-
void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
1106+
void _pi_queue::adjustBatchSizeForPartialBatch() {
10231107
// QueueBatchSize of 0 means never allow batching.
1024-
if (QueueBatchSize == 0 || !UseDynamicBatching)
1108+
if (QueueBatchSize == 0 || !ZeCommandListBatch.dynamic())
10251109
return;
10261110

10271111
NumTimesClosedEarly += 1;
@@ -1032,7 +1116,7 @@ void _pi_queue::adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize) {
10321116
// batch size that will be able to be closed full at least once
10331117
// in a while.
10341118
if (NumTimesClosedEarly > (NumTimesClosedFull + 1) * 3) {
1035-
QueueBatchSize = PartialBatchSize - 1;
1119+
QueueBatchSize = OpenCommandList->second.size() - 1;
10361120
if (QueueBatchSize < 1)
10371121
QueueBatchSize = 1;
10381122
zePrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
@@ -1057,10 +1141,11 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
10571141
// kernels started as soon as possible when there are no kernels from this
10581142
// queue awaiting execution, while allowing batching to occur when there
10591143
// are kernels already executing. Also, if we are using fixed size batching,
1060-
// as indicated by !UseDynamicBatching, then just ignore CurrentlyEmpty
1061-
// as we want to strictly follow the batching the user specified.
1144+
// as indicated by !ZeCommandListBatch.dynamic(), then just ignore
1145+
// CurrentlyEmpty as we want to strictly follow the batching the user
1146+
// specified.
10621147
if (OKToBatchCommand && this->isBatchingAllowed() &&
1063-
(!UseDynamicBatching || !CurrentlyEmpty)) {
1148+
(!ZeCommandListBatch.dynamic() || !CurrentlyEmpty)) {
10641149

10651150
if (hasOpenCommandList() && OpenCommandList != CommandList)
10661151
die("executeCommandList: OpenCommandList should be equal to"
@@ -1207,7 +1292,7 @@ pi_result _pi_queue::executeOpenCommandList() {
12071292
// If there are any commands still in the open command list for this
12081293
// queue, then close and execute that command list now.
12091294
if (hasOpenCommandList()) {
1210-
adjustBatchSizeForPartialBatch(OpenCommandList->second.size());
1295+
adjustBatchSizeForPartialBatch();
12111296
auto Res = executeCommandList(OpenCommandList, false, false);
12121297
OpenCommandList = CommandListMap.end();
12131298
return Res;
@@ -1444,10 +1529,6 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
14441529
PrintPiTrace = true;
14451530
}
14461531

1447-
static const char *DebugMode = std::getenv("ZE_DEBUG");
1448-
static const int DebugModeValue = DebugMode ? std::stoi(DebugMode) : 0;
1449-
ZeDebug = DebugModeValue;
1450-
14511532
if (ZeDebug & ZE_DEBUG_CALL_COUNT) {
14521533
ZeCallCount = new std::map<const char *, int>;
14531534
}
@@ -2694,8 +2775,9 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
26942775
PI_ASSERT(Queue, PI_INVALID_QUEUE);
26952776

26962777
try {
2697-
*Queue = new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context,
2698-
Device, ZeCommandListBatchSize, true, Properties);
2778+
*Queue =
2779+
new _pi_queue(ZeComputeCommandQueue, ZeCopyCommandQueues, Context,
2780+
Device, ZeCommandListBatch.startSize(), true, Properties);
26992781
} catch (const std::bad_alloc &) {
27002782
return PI_OUT_OF_HOST_MEMORY;
27012783
} catch (...) {
@@ -2879,7 +2961,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
28792961
// compute vs. copy Level-Zero queue.
28802962
std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
28812963
*Queue = new _pi_queue(ZeQueue, ZeroCopyQueues, Context, Device,
2882-
ZeCommandListBatchSize, OwnNativeHandle);
2964+
ZeCommandListBatch.startSize(), OwnNativeHandle);
28832965
return PI_SUCCESS;
28842966
}
28852967

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -587,19 +587,14 @@ struct _pi_context : _pi_object {
587587
std::mutex NumEventsUnreleasedInEventPoolMutex;
588588
};
589589

590-
// If doing dynamic batching, start batch size at 4.
591-
const pi_uint32 DynamicBatchStartSize = 4;
592-
593590
struct _pi_queue : _pi_object {
594591
_pi_queue(ze_command_queue_handle_t Queue,
595592
std::vector<ze_command_queue_handle_t> &CopyQueues,
596593
pi_context Context, pi_device Device, pi_uint32 BatchSize,
597594
bool OwnZeCommandQueue, pi_queue_properties PiQueueProperties = 0)
598595
: ZeComputeCommandQueue{Queue},
599596
ZeCopyCommandQueues{CopyQueues}, Context{Context}, Device{Device},
600-
QueueBatchSize{BatchSize > 0 ? BatchSize : DynamicBatchStartSize},
601-
OwnZeCommandQueue{OwnZeCommandQueue}, UseDynamicBatching{BatchSize ==
602-
0},
597+
QueueBatchSize{BatchSize}, OwnZeCommandQueue{OwnZeCommandQueue},
603598
PiQueueProperties(PiQueueProperties) {
604599
OpenCommandList = CommandListMap.end();
605600
}
@@ -668,11 +663,6 @@ struct _pi_queue : _pi_object {
668663
// asked to not transfer the ownership to SYCL RT.
669664
bool OwnZeCommandQueue;
670665

671-
// specifies whether this queue will be using dynamic batch size adjustment
672-
// or not. This is set only at queue creation time, and is therefore
673-
// const for the life of the queue.
674-
const bool UseDynamicBatching;
675-
676666
// These two members are used to keep track of how often the
677667
// batching closes and executes a command list before reaching the
678668
// QueueBatchSize limit, versus how often we reach the limit.
@@ -704,9 +694,8 @@ struct _pi_queue : _pi_object {
704694
void adjustBatchSizeForFullBatch();
705695

706696
// adjust the queue's batch size, knowing that the current command list
707-
// is being closed with only a partial batch of commands. How many commands
708-
// are in this partial closure is passed as the parameter.
709-
void adjustBatchSizeForPartialBatch(pi_uint32 PartialBatchSize);
697+
// is being closed with only a partial batch of commands.
698+
void adjustBatchSizeForPartialBatch();
710699

711700
// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
712701
// If the reset command list should be made available, then MakeAvailable

0 commit comments

Comments
 (0)