
Commit 0e3f77a

[GPU] Choose byxf for dynamic-batch case of OneDNN (#32874)
### Description of the issue (symptom, root cause, how it was resolved)

FP16 performance is lower than FP32 on GPU when the batch size increases. OneDNN currently selects a format such as b_fs_yx_fsv16 or bs_fs_yx_bsv16_fsv16 based on the batch size, but with dynamic batches the batch is not known in advance, so the format cannot be chosen effectively per batch size.

- How it was resolved: use the byxf format instead for dynamic OneDNN convolutions, gated by a whitelist of primitive types.

#### Reproduction step and snapshot (if applicable; do not attach for customer models)

$ python openvino_bench_test.py --data raw_events.csv --checkpoint best_model_lse.pth --device gpu --batch_size 1
$ python openvino_bench_test.py --data raw_events.csv --checkpoint best_model_lse.pth --device gpu --batch_size 32
...
$ python openvino_bench_test.py --data raw_events.csv --checkpoint best_model_lse.pth --device gpu --batch_size 1024
$ python openvino_bench_test.py --data raw_events.csv --checkpoint best_model_lse.pth --device gpu --batch_size 2048

The model is in the ticket.

#### Checklist

- [x] Is it a proper fix?
- [x] Did you include a test case for this fix, if necessary?
- [x] Did you review existing tests that could be extended to cover this scenario? Which tests did you review?

### Tickets:

- *CVS-176149*

---------

Signed-off-by: hyunback <hyunback.kim@intel.com>
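For orientation (this sketch is not part of the change), the difference comes down to element ordering: the bfyx family keeps whole feature planes together, and the blocked variants named above additionally tile the feature/batch axes in chunks of 16, while byxf stores all features of one pixel contiguously, so its addressing does not depend on any batch-blocking factor. A minimal standalone illustration of the two plain 4D orderings, using the hypothetical sizes from the test cases in this commit (batch 32, 32 features, 8x8 spatial):

```cpp
#include <cstddef>
#include <iostream>

constexpr size_t B = 32, F = 32, Y = 8, X = 8;

// bfyx: batch-major, then feature, then spatial; a feature plane is contiguous.
size_t offset_bfyx(size_t b, size_t f, size_t y, size_t x) {
    return ((b * F + f) * Y + y) * X + x;
}

// byxf: spatial-major with features innermost; one pixel's features are contiguous.
size_t offset_byxf(size_t b, size_t f, size_t y, size_t x) {
    return ((b * Y + y) * X + x) * F + f;
}

int main() {
    std::cout << offset_bfyx(1, 3, 2, 5) << " vs " << offset_byxf(1, 3, 2, 5) << "\n";
    return 0;
}
```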

5 files changed (+173, −26 lines)

src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp

Lines changed: 6 additions & 1 deletion
@@ -356,7 +356,12 @@ dnnl::memory::desc layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_t
         // In cldnn::layer, when it is a 3D shape, the values of the XY axes can sometimes be flipped,
         // so the larger value of the two is used.
         dims.push_back(std::max(l.spatial(0), l.spatial(1)));
-        target_fmt = dnnl::memory::format_tag::abc;
+        if (l.get_format() == format::bfyx)
+            target_fmt = dnnl::memory::format_tag::abc;
+        else if (l.get_format() == format::byxf)
+            target_fmt = dnnl::memory::format_tag::acb;
+        else
+            OPENVINO_THROW("[GPU] Unexpected layout format " + l.to_short_string());
     } else {
         auto rank = cldnn::format::dimension(l.format);
         dims = convert_tensor(l.get_tensor(), rank, cldnn::format::is_grouped(l.format));
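A minimal sketch (assuming only the public oneDNN C++ API, dnnl.hpp) of why the mapping above is abc for bfyx and acb for byxf: for a 3D cldnn tensor the logical dims are (b, f, x); bfyx stores them densely in declaration order, while byxf stores features innermost, i.e. in (b, x, f) order:

```cpp
#include <dnnl.hpp>

int main() {
    // Logical shape (b, f, x) as in the spatial_1d tests below: 1 x 16 x 6.
    dnnl::memory::dims dims = {1, 16, 6};

    // bfyx -> dense in declaration order: format_tag::abc, strides {96, 6, 1}.
    auto desc_bfyx = dnnl::memory::desc(dims, dnnl::memory::data_type::f16,
                                        dnnl::memory::format_tag::abc);

    // byxf -> features innermost: format_tag::acb, strides {96, 1, 16}.
    auto desc_byxf = dnnl::memory::desc(dims, dnnl::memory::data_type::f16,
                                        dnnl::memory::format_tag::acb);

    (void)desc_bfyx;
    (void)desc_byxf;
    return 0;
}
```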

src/plugins/intel_gpu/src/graph/include/layout_optimizer.h

Lines changed: 2 additions & 0 deletions
@@ -88,6 +88,7 @@ class layout_optimizer {
 public:
     enum class optimization_attributes_type {
         group_convolution,
+        byxf_onednn_convolution,
         bfyx_only_layer,
         fs_b_yx_fsv32_network,
         b_fs_zyx_fsv32_network,
@@ -98,6 +99,7 @@

     struct optimization_attributes {
         int32_t group_convolution = 0;
+        int32_t byxf_onednn_convolution = 0;
         int32_t bfyx_only_layer = 0;
         int32_t fs_b_yx_fsv32_network = 0;
         int32_t b_fs_zyx_fsv32_network = 0;

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp

Lines changed: 14 additions & 0 deletions
@@ -971,6 +971,17 @@ void layout_optimizer::set_onednn_dyn_conv_preferred_format(convolution_node& no
     OPENVINO_ASSERT(rank == output_layout.get_partial_shape().size(), "Input and output ranks must match");
     OPENVINO_ASSERT(rank <= 5, "Not supported rank");

+    if (_optimization_attributes.byxf_onednn_convolution) {
+        if (rank <= 4) {
+            node.set_preferred_input_fmt(0, cldnn::format::byxf);
+            node.set_preferred_output_fmt(0, cldnn::format::byxf);
+        } else {
+            node.set_preferred_input_fmt(0, cldnn::format::bzyxf);
+            node.set_preferred_output_fmt(0, cldnn::format::bzyxf);
+        }
+        return;
+    }
+
     // Data type classification
     bool i8_u8_input = (input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8);
     bool i8_u8_output = (output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8);
@@ -1475,6 +1486,9 @@ void layout_optimizer::set_optimization_attribute(optimization_attributes_type a
         case optimization_attributes_type::group_convolution:
             _optimization_attributes.group_convolution = val;
             break;
+        case optimization_attributes_type::byxf_onednn_convolution:
+            _optimization_attributes.byxf_onednn_convolution = val;
+            break;
         case optimization_attributes_type::bfyx_only_layer:
             _optimization_attributes.bfyx_only_layer = val;
             break;
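Condensed into a toy model (hypothetical names, not OpenVINO code): once the byxf_onednn_convolution attribute is set, the early return above makes the preferred format a function of rank only, instead of batch size:

```cpp
#include <cassert>
#include <cstddef>

enum class fmt { byxf, bzyxf };

// Mirrors the rank check in set_onednn_dyn_conv_preferred_format:
// rank <= 4 -> byxf, rank 5 -> bzyxf (rank > 5 is rejected earlier by an assert).
fmt preferred_dyn_conv_fmt(size_t rank) {
    return rank <= 4 ? fmt::byxf : fmt::bzyxf;
}

int main() {
    assert(preferred_dyn_conv_fmt(4) == fmt::byxf);
    assert(preferred_dyn_conv_fmt(5) == fmt::bzyxf);
    return 0;
}
```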

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 36 additions & 10 deletions
@@ -1456,6 +1456,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
     bool can_use_fsv16 = true;
     bool can_use_bs_fs_yx_bsv16_fsv16 = true;
     bool is_quantized_int8_model = false;
+    bool is_dynamic_batch_onednn_conv = false;
     size_t total_asym_quantized_conv_layers = 0;
     size_t total_dw_conv_layers = 0;
     size_t total_dw_splitted_conv_layers = 0;
@@ -1464,16 +1465,34 @@
     size_t opt_deconv_layers_b_fs_zyx_fsv16 = 0;
     size_t opt_deconv_layers_b_fs_yx_fsv16 = 0;
     size_t total_crop_layers = 0;
-
+    size_t total_non_byxf_onednn_conv_whitelist_layers = 0;
+
+    // OneDNN previously selects formats like b_fs_yx_fsv16 or bs_fs_yx_bsv16_fsv16 based on batch size.
+    // For dynamic batches, this approach is inefficient.
+    // We plan to switch to byxf for better flexibility across varying batch sizes.
+    // The whitelist below defines the initial target scope (CVS-176149).
+    const std::unordered_set<primitive_type_id> byxf_onednn_conv_whitelist = {cldnn::input_layout::type_id(),
+                                                                              cldnn::permute::type_id(),
+                                                                              cldnn::convolution::type_id(),
+                                                                              cldnn::fully_connected::type_id(),
+                                                                              cldnn::activation::type_id(),
+                                                                              cldnn::softmax::type_id(),
+                                                                              cldnn::reduce::type_id(),
+                                                                              cldnn::reorder::type_id(),
+                                                                              cldnn::eltwise::type_id()};
     for (auto& node : get_processing_order()) {
         auto &prim = *node;
         if (prim.type() == cldnn::convolution::type_id()) {
             auto &conv = prim.as<convolution>();
             if (conv.get_primitive()->groups > 1)
                 lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::group_convolution, 1);

-            if (!conv.is_dynamic()) {
-                // In dynamic shape, conv is fixed as a predefined format b_fs_yx_fsv16
+            if (conv.is_dynamic()) {
+                bool is_dynamic_batch = !node->get_output_layout().get_partial_shape()[0].is_static();
+                bool is_fp32_conv = (node->get_input_layout().data_type == data_types::f32) &&
+                                    (node->get_output_layout().data_type == data_types::f32);
+                is_dynamic_batch_onednn_conv = is_dynamic_batch && !is_fp32_conv;
+            } else {
                 auto input_size = node->get_input_layout(0).get_tensor();
                 auto ifm = static_cast<uint32_t>(input_size.feature[0]);
                 if (conv.get_primitive()->groups == ifm && conv.get_primitive()->groups >= 16)
@@ -1628,6 +1647,10 @@
             prim.type() != cldnn::experimental_detectron_generate_proposals_single_image::type_id()) {
             can_use_bs_fs_yx_bsv16_fsv16 = false;
         }
+
+        if (prim.is_in_data_flow() && (byxf_onednn_conv_whitelist.count(prim.type()) == 0)) {
+            total_non_byxf_onednn_conv_whitelist_layers++;
+        }
     }

     size_t total_conv_layers = lo.get_total_conv_count();
@@ -1681,15 +1704,18 @@
     if (engine.get_device_info().vendor_id == INTEL_VENDOR_ID &&
         get_config().get_queue_type() == QueueTypes::in_order &&
         enable_onednn_for_tests) {
-        if (engine.get_device_info().supports_immad) {
-            lo.add_all_onednn_impls_optimization_attribute();
-        } else {
-            if (get_config().get_use_onednn()) {
-                lo.enable_onednn_for<lstm_seq>();
-                lo.enable_onednn_for<gru_seq>();
-            }
+        if (engine.get_device_info().supports_immad) {
+            lo.add_all_onednn_impls_optimization_attribute();
+        } else {
+            if (get_config().get_use_onednn()) {
+                lo.enable_onednn_for<lstm_seq>();
+                lo.enable_onednn_for<gru_seq>();
+            }
         }
     }
+    bool should_use_byxf_onednn_conv = is_dynamic_batch_onednn_conv && (total_non_byxf_onednn_conv_whitelist_layers == 0);
+    if (should_use_byxf_onednn_conv)
+        lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::byxf_onednn_convolution, 1);
 #endif
 }
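As a standalone model (hypothetical helper, not the actual program.cpp code), the gating above reduces to: the batch must be dynamic, the convolution must not be pure FP32, and every data-flow primitive must be of a whitelisted type; a single layer outside the whitelist keeps the old behavior:

```cpp
#include <cassert>
#include <string>
#include <unordered_set>
#include <vector>

bool should_use_byxf(bool dynamic_batch,
                     bool fp32_conv,
                     const std::unordered_set<std::string>& whitelist,
                     const std::vector<std::string>& data_flow_prims) {
    if (!dynamic_batch || fp32_conv)
        return false;
    for (const auto& p : data_flow_prims)
        if (whitelist.count(p) == 0)
            return false;  // one non-whitelisted layer disables byxf for the whole program
    return true;
}

int main() {
    const std::unordered_set<std::string> wl = {
        "input_layout", "permute", "convolution", "fully_connected",
        "activation", "softmax", "reduce", "reorder", "eltwise"};
    assert(should_use_byxf(true, false, wl, {"input_layout", "convolution", "reorder"}));
    assert(!should_use_byxf(true, false, wl, {"input_layout", "pooling"}));  // pooling not whitelisted
    assert(!should_use_byxf(false, false, wl, {"convolution"}));             // static batch
    return 0;
}
```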

src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp

Lines changed: 115 additions & 15 deletions
@@ -9809,7 +9809,9 @@ using TestParamType_convolution_gpu_onednn = ::testing::tuple< int, // 0 - I
         int,          // 10 - Batch
         format,       // 11 - Input data format
         std::string,  // 12 - Implementation name
-        bool>;        // 13 - With bias
+        bool,         // 13 - With bias
+        format,       // 14 - Expected convolution format
+        bool>;        // 15 - Is dynamic

 struct convolution_gpu_onednn : public ::testing::TestWithParam<TestParamType_convolution_gpu_onednn> {
     static std::string PrintToStringParamName(
@@ -9827,7 +9829,9 @@ struct convolution_gpu_onednn : public ::testing::TestWithParam<TestParamType_co
             std::to_string(testing::get<9>(param_info.param)) + "_batch" +
             std::to_string(testing::get<10>(param_info.param)) + "_format" +
             std::to_string(testing::get<11>(param_info.param)) + "_with_bias_" +
-            std::to_string(testing::get<13>(param_info.param));
+            std::to_string(testing::get<13>(param_info.param)) + "_conv_format_" +
+            std::to_string(testing::get<14>(param_info.param)) + "_is_dynamic_" +
+            std::to_string(testing::get<15>(param_info.param));

         if (testing::get<12>(param_info.param) != "") {
             res += "_kernel_" + testing::get<12>(param_info.param);
@@ -9842,11 +9846,11 @@ INSTANTIATE_TEST_SUITE_P(conv_onednn_cases,
     ::testing::Values(
         // Input X size, Input Y size, Input Z size, Input features, Output features,
         // Kernel size X, Kernel size Y, Kernel size Z, Groups number, Stride, Batch,
-        // Input data format, Implementation name, WithBias
-        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", true),
-        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", false)
-        // TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", true),
-        // TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", false)
+        // Input data format, Implementation name, WithBias, Expected Conv format, Is-dynamic
+        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", true, format::bs_fs_yx_bsv32_fsv16, false),
+        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", true, format::byxf, true),
+        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", false, format::bs_fs_yx_bsv32_fsv16, false),
+        TestParamType_convolution_gpu_onednn(8, 8, 1, 32, 32, 3, 3, 1, 1, 1, 32, format::bfyx, "", false, format::byxf, true)
     ),
     convolution_gpu_onednn::PrintToStringParamName);
@@ -9876,11 +9880,25 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
     auto input_data_format = testing::get<11>(GetParam());
     auto impl_name = testing::get<12>(GetParam());
     auto with_bias = testing::get<13>(GetParam());
+    auto expected_conv_format = testing::get<14>(GetParam());
+    auto is_dynamic = testing::get<15>(GetParam());
+
+    ov::PartialShape target_pshape = {batch_num, input_f, input_x, input_y};
+    ov::PartialShape input_pshape;
+
+    if (is_dynamic) {
+        for (size_t i = 0; i < target_pshape.size(); ++i) {
+            input_pshape.emplace_back(ov::Dimension());
+        }
+        input_pshape[1] = target_pshape[1];
+    } else {
+        input_pshape = target_pshape;
+    }
+    layout in_layout{input_pshape, data_types::f16, format::bfyx};

-    auto input_size = tensor(batch_num, input_f, input_x, input_y);
     auto input_data = rg.generate_random_4d<ov::float16>(batch_num, input_f, input_y, input_x, -1, 1);
     auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
-    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
+    auto input_mem = engine.allocate_memory({ target_pshape, data_types::f16, format::bfyx });
     set_values(input_mem, input_data_bfyx);

     auto weights_size = tensor(output_f, input_f, filter_y, filter_x, 1);
@@ -9912,10 +9930,10 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
         }
     }

-    topology.add(input_layout("input", input_mem->get_layout()),
+    topology.add(input_layout("input", in_layout),
                  data("weights_fsv", weights_mem),
                  data("bias", biases_mem),
-                 reorder("input_fsv", input_info("input"), { data_types::f16, input_data_format, input_size }));
+                 reorder("input_fsv", input_info("input"), input_data_format, data_types::f16));

     auto conv_fsv = convolution("conv_fsv",
                                 input_info("input_fsv"),
@@ -9943,9 +9961,9 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
         }
     }

-    topology.add(input_layout("input", input_mem->get_layout()),
+    topology.add(input_layout("input", in_layout),
                  data("weights_fsv", weights_mem),
-                 reorder("input_fsv", input_info("input"), { data_types::f16, input_data_format, input_size }));
+                 reorder("input_fsv", input_info("input"), input_data_format, data_types::f16));

     auto conv_fsv = convolution("conv_fsv",
                                 input_info("input_fsv"),
@@ -9964,6 +9982,8 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
     ExecutionConfig config = get_test_default_config(engine);
     config.set_property(ov::intel_gpu::optimize_data(true));
     config.set_property(ov::intel_gpu::custom_outputs(std::vector<std::string>{"conv_fsv","reorder_bfyx"}));
+    if (is_dynamic)
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
     network network(engine, topology, config);

     network.set_input_data("input", input_mem);
@@ -9974,8 +9994,10 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
     for (auto& p : network.get_primitives_info())
         std::cerr << p.original_id << " " << p.kernel_id << std::endl;

-    auto out_ptr = get_output_values_to_float<ov::float16>(network, outputs.find("conv_fsv")->second);
-    auto out_lay = network.get_primitive("conv_fsv")->get_node_output_layout();
+    auto out_ptr = get_output_values_to_float<float>(network, outputs.find("reorder_bfyx")->second);
+    auto output_memory = outputs.at("reorder_bfyx").get_memory();
+    auto out_lay = output_memory->get_layout();
+
     ASSERT_EQ(out_lay.batch(), expected_result.size());
     ASSERT_EQ(out_lay.feature(), expected_result[0].size());
     ASSERT_EQ(out_lay.spatial(1), expected_result[0][0].size());
@@ -9998,6 +10020,9 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) {
         }
         ASSERT_TRUE(equal);
     }
+
+    out_lay = network.get_primitive("conv_fsv")->get_node_output_layout();
+    ASSERT_EQ(out_lay.get_format(), expected_conv_format);
 }

 TEST(convolution_gpu_onednn, padding_for_cldnn_kernel_after_onednn) {
@@ -10174,6 +10199,81 @@ TEST(convolution_gpu_onednn, spatial_1d) {
     }
 }

+TEST(convolution_gpu_onednn, spatial_1d_dynamic) {
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    tests::random_generator rg(GET_SUITE_NAME);
+    ov::PartialShape target_pshape = {1, 16, 6};
+    ov::PartialShape input_pshape = {ov::Dimension(), 16, 6};
+    ov::PartialShape weights_pshape = {16, 16, 3};
+    layout in_layout{ input_pshape, data_types::f16, format::bfyx };
+    layout in_ref_layout{ target_pshape, data_types::f16, format::bfyx };
+    layout weights_layout{ weights_pshape, data_types::f16, format::bfyx };
+    auto input_data = rg.generate_random_1d<ov::float16>(ov::shape_size(target_pshape.get_shape()), -1, 1);
+    auto input_mem = engine.allocate_memory({target_pshape, data_types::f16, format::bfyx});
+    set_values(input_mem, input_data);
+
+    auto weights_data = rg.generate_random_1d<ov::float16>(weights_layout.count(), -1, 1);
+    auto weights_mem = engine.allocate_memory(weights_layout);
+    set_values(weights_mem, weights_data);
+
+    auto input = input_layout("input", in_layout);
+    auto input_ref = input_layout("input", in_ref_layout);
+    auto weights = data("weights", weights_mem);
+    auto conv = convolution("conv",
+                            input_info("input"),
+                            "weights",
+                            no_bias,
+                            1,
+                            ov::Strides{1},
+                            ov::Strides{1},
+                            ov::CoordinateDiff{0},
+                            ov::CoordinateDiff{0},
+                            false);
+    auto output_reorder = reorder("reorder", input_info("conv"), format::bfyx, data_types::f32);
+
+    topology t(input, weights, conv, output_reorder);
+    topology t_ref(input_ref, weights, conv, output_reorder);
+
+    ExecutionConfig config_test_dynamic = get_test_default_config(engine);
+    config_test_dynamic.set_property(ov::intel_gpu::optimize_data(true));
+    config_test_dynamic.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    ExecutionConfig config_ref = get_test_default_config(engine);
+    ov::intel_gpu::ImplementationDesc conv_impl_ref = { format::bfyx, "", impl_types::ocl };
+    config_ref.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl_ref } }));
+    config_ref.set_property(ov::intel_gpu::optimize_data(true));
+    config_ref.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    network network_ref(engine, t_ref, config_ref);
+    network_ref.set_input_data("input", input_mem);
+    auto outputs_ref = network_ref.execute();
+    ASSERT_EQ(outputs_ref.size(), size_t(1));
+    ASSERT_EQ(outputs_ref.begin()->first, "reorder");
+    auto output_memory_ref = outputs_ref.at("reorder").get_memory();
+    auto output_layout_ref = output_memory_ref->get_layout();
+    cldnn::mem_lock<float> output_ptr_ref(output_memory_ref, get_test_stream());
+
+    network network_test_dynamic(engine, t, config_test_dynamic);
+    network_test_dynamic.set_input_data("input", input_mem);
+    auto outputs_test_dynamic = network_test_dynamic.execute();
+    ASSERT_EQ(outputs_test_dynamic.size(), size_t(1));
+    ASSERT_EQ(outputs_test_dynamic.begin()->first, "reorder");
+    auto output_memory_test_dynamic = outputs_test_dynamic.at("reorder").get_memory();
+    auto output_layout_test_dynamic = output_memory_test_dynamic->get_layout();
+    cldnn::mem_lock<float> output_ptr_test_dynamic(output_memory_test_dynamic, get_test_stream());
+
+    ov::PartialShape expected_shape = {1, 16, 4};
+    ASSERT_EQ(output_layout_test_dynamic.get_partial_shape(), expected_shape);
+    ASSERT_EQ(output_layout_ref.get_partial_shape(), expected_shape);
+
+    for (size_t i = 0; i < output_memory_ref->count(); i++) {
+        ASSERT_EQ(output_ptr_ref.data()[i], output_ptr_test_dynamic.data()[i]);
+    }
+}
+
 TEST(convolution_gpu_onednn, spatial_1d_quantize_post_ops_blocked_format) {
     auto& engine = get_test_engine();
     if (!engine.get_device_info().supports_immad)
