From aef8348cbb79c7c17514abec152e0894b5fcc47e Mon Sep 17 00:00:00 2001
From: Kelvin Choi
Date: Tue, 19 Mar 2024 02:55:00 +0900
Subject: [PATCH] [GPU] Fix dynamic loop's sliced-layout mismatch when
 multiple shapes are inferred (#22806)

### Details:
 - *Fix the issue where a second inference with an updated shape in a dynamic loop does not update the sliced layout.*
 - *Fix the issue where an optimized-out reshape does not reinterpret its output memory in update_output_memory().*

### Tickets:
 - *122739*
 - *131544*
---
 .../graph_optimizer/prepare_buffer_fusing.cpp |   2 +
 src/plugins/intel_gpu/src/graph/loop.cpp      |  13 +-
 src/plugins/intel_gpu/src/graph/reshape.cpp   |   3 +-
 .../tests/unit/test_cases/loop_gpu_test.cpp   | 186 ++++++++++++++++++
 4 files changed, 199 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
index d616a55dc456ed..21a84781f178a9 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp
@@ -503,6 +503,8 @@ void prepare_buffer_fusing::run(program& p) {
                 return;
             if (user->is_type() || user->is_type())
                 return;
+        }
+        for (auto user : node.get_users()) {
             if (user->is_type<reshape>()) {
                 auto& reshape_node = user->as<reshape>();
                 if (can_reshape_be_optimized(reshape_node))
diff --git a/src/plugins/intel_gpu/src/graph/loop.cpp b/src/plugins/intel_gpu/src/graph/loop.cpp
index dd80737f908f56..50c1a02eee0016 100644
--- a/src/plugins/intel_gpu/src/graph/loop.cpp
+++ b/src/plugins/intel_gpu/src/graph/loop.cpp
@@ -375,17 +375,22 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
     if (extern_mem_ptr != nullptr) {
         layout sliced_layout = intern_prim->get_output_layout(internal_id.idx);
         auto inter_mem_ptr = intern_prim->output_memory_ptr(internal_id.idx);
-        if (inter_mem_ptr == nullptr) {
+        if (inter_mem_ptr == nullptr || shape_changed()) {
             // if inner body intern_prim has no output memory because it has dynamic shape,
             // calculate inner body intern_prim layout using concat_mem's layout.
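             // When shape_changed() is true, the sliced layout cached from the
             // previous execution may no longer match the new input shape, so it
             // is recalculated here as well.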
             auto updated_sliced_layout = sliced_layout.get_partial_shape();
             OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
                             "Not allowed dynamic dimension for axis when num_iterations is negative");
+
+            auto origin_input_layout = body_network->get_primitive(internal_id.pid)->get_node_output_layout();
             auto concat_pshape = extern_prim->get_output_layout().get_partial_shape();
             const auto shape_size = concat_pshape.size();
-            for (size_t i = 0; i < shape_size; i++) {
-                if (updated_sliced_layout[i].is_dynamic()) {
-                    updated_sliced_layout[i] = concat_pshape[i];
+            if (origin_input_layout.is_dynamic()) {
+                auto origin_input_pshape = origin_input_layout.get_partial_shape();
+                for (size_t i = 0; i < shape_size; i++) {
+                    if (origin_input_pshape[i].is_dynamic()) {
+                        updated_sliced_layout[i] = concat_pshape[i];
+                    }
                 }
             }
             GPU_DEBUG_LOG << "output pshape for [" << intern_prim->id() << "] is changed from "
diff --git a/src/plugins/intel_gpu/src/graph/reshape.cpp b/src/plugins/intel_gpu/src/graph/reshape.cpp
index 46f1b560fad027..b5268a7c056b16 100644
--- a/src/plugins/intel_gpu/src/graph/reshape.cpp
+++ b/src/plugins/intel_gpu/src/graph/reshape.cpp
@@ -203,7 +203,8 @@ void reshape_inst::update_output_memory() {
     if (!can_be_optimized())
         return;
 
-    if (_outputs[0] && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
+    if (_outputs[0] && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()) &&
+        output_memory().get_layout() == _impl_params->get_output_layout())
         return;
 
     build_deps();  // reshape need deps
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp
index fb4e54b1980c58..26ca489c5a8115 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp
@@ -11,6 +11,9 @@
 #include "intel_gpu/primitives/eltwise.hpp"
 #include 
 #include 
+#include <intel_gpu/primitives/shape_of.hpp>
+#include <intel_gpu/primitives/reduce.hpp>
+#include <intel_gpu/primitives/reshape.hpp>
 #include 
 #include 
 #include 
@@ -601,3 +604,186 @@ TEST(loop_gpu, support_dynamic_tensoriterator_outer_axis) {
     test_loop_gpu_wo_trip_count({ 2, 1, 1, 2}, { 2, 5, 1, 2}, input_data_5_4, output_data_5_4, 1, 4);
 }
+
+static void test_loop_gpu_wo_trip_count_w_multiple_shapes(ov::PartialShape body_input_layout,
+                                                          std::vector<ov::PartialShape> whole_layouts,
+                                                          std::vector<std::vector<float>> input_data_list,
+                                                          std::vector<float> expected_output_data,
+                                                          size_t axis,
+                                                          size_t exit_value,
+                                                          bool is_caching_test = false) {
+    auto& engine = get_test_engine();
+
+    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };
+
+    ov::PartialShape sliced_input_shape = body_input_layout;
+    sliced_input_shape[axis] = 1;
+    auto sliced_input_layout = cldnn::layout{ sliced_input_shape, data_types::f32, format::bfyx };
+
+    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };
+
+    auto e_initial_condition_mem = engine.allocate_memory(const_layout);
+    auto e_num_iteration_mem = engine.allocate_memory(const_layout);
+    auto b_exit_value_mem = engine.allocate_memory(const_layout);
+    auto b_index_inc_mem = engine.allocate_memory(const_layout);
+
+    // initialize input buffers
+    set_values(e_initial_condition_mem, {1});
+    set_values(b_exit_value_mem, {exit_value});
+    set_values(b_index_inc_mem, {1});
+    set_values(e_num_iteration_mem, {0});
+
+    primitive_id body_current_iteration_id = "b_index";
+    primitive_id body_execution_condition_id = "b_cond_exit_value";
+
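+    // The body network increments b_index through the back edge
+    // (b_index_update = b_index + b_index_inc), forms the execution condition
+    // b_index < b_exit_value, and applies eltwise add/mul between each sliced
+    // input and the f32-casted updated index.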
input_layout("b_add_data", sliced_input_layout), + input_layout("b_mul_data", sliced_input_layout), + data("b_exit_value", b_exit_value_mem), + data("b_index_inc", b_index_inc_mem), + eltwise("b_index_update", input_info(body_current_iteration_id), input_info("b_index_inc"), eltwise_mode::sum), + reorder("b_index_cast", input_info("b_index_update"), + cldnn::format::any, data_types::f32, {}, cldnn::reorder_mean_mode::subtract, cldnn::padding(), true), + eltwise(body_execution_condition_id, input_info("b_index"), input_info("b_exit_value"), eltwise_mode::lt), + eltwise("b_add", input_info("b_add_data"), input_info("b_index_cast"), eltwise_mode::sum), + eltwise("b_mul", input_info("b_mul_data"), input_info("b_index_cast"), eltwise_mode::prod)); + + primitive_id trip_count_id = ""; + primitive_id actual_iteration_count_id = "actual_iteration_count"; + primitive_id initial_condition_id = "initial_condition"; + int64_t num_iterations = -1; + + std::vector input_primitive_maps { + loop::io_primitive_map("input", "b_add_data", axis), + loop::io_primitive_map("input", "b_mul_data", axis), + loop::io_primitive_map(actual_iteration_count_id, body_current_iteration_id) }; + std::vector output_primitive_maps { + loop::io_primitive_map(cldnn::input_info("loop", 0), cldnn::input_info("b_add", 0), axis), + loop::io_primitive_map(cldnn::input_info("loop", 1), cldnn::input_info("b_mul", 0), axis) }; + std::vector back_edges { + loop::backedge_mapping("b_index_update", body_current_iteration_id) }; + + auto body_program = build_program(engine, body, body_execution_condition_id, output_primitive_maps, back_edges, true); + + auto const_shape = engine.allocate_memory({ov::PartialShape{4}, data_types::i32, format::bfyx}); + std::vector body_input_layouts; + for (size_t i = 0; i < body_input_layout.size(); i++) { + if (body_input_layout[i].is_dynamic()) + body_input_layouts.push_back(-1); + else + body_input_layouts.push_back(body_input_layout[i].get_length()); + } + set_values(const_shape, body_input_layouts); + + cldnn::topology topology( + input_layout("input_origin", b_input_layout), + input_layout(initial_condition_id, e_initial_condition_mem->get_layout()), + mutable_data(actual_iteration_count_id, e_num_iteration_mem), + + shape_of("shape_of_input", input_info("input_origin"), data_types::i32), + reduce("reduced_shape", input_info("shape_of_input"), reduce_mode::prod, {0}, true), + reshape("reshape1", input_info("input_origin"), input_info("reduced_shape"), false, ov::PartialShape::dynamic(1)), + data("const", const_shape), + reshape("input", input_info("reshape1"), input_info("const"), false, ov::PartialShape::dynamic(4)), + + loop("loop", { input_info(actual_iteration_count_id), input_info(initial_condition_id), input_info("input") }, body_program, + trip_count_id, initial_condition_id, actual_iteration_count_id, + input_primitive_maps, output_primitive_maps, back_edges, + num_iterations, body_current_iteration_id, body_execution_condition_id, 2), + eltwise("out_sum", input_info("loop", 0), input_info("loop", 1), eltwise_mode::sum)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + for (size_t i = 0 ; i < whole_layouts.size(); i++) { + auto whole_layout = whole_layouts[i]; + auto input_data = input_data_list[i]; + + // initialize input buffers + set_values(e_initial_condition_mem, {1}); + 
+    for (size_t i = 0; i < whole_layouts.size(); i++) {
+        auto whole_layout = whole_layouts[i];
+        auto input_data = input_data_list[i];
+
+        // initialize input buffers
+        set_values(e_initial_condition_mem, {1});
+        set_values(b_exit_value_mem, {exit_value});
+        set_values(b_index_inc_mem, {1});
+        set_values(e_num_iteration_mem, {0});
+
+        auto e_input_layout = cldnn::layout{ whole_layout, data_types::f32, format::bfyx };
+        auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
+        auto expected_output_layout = whole_layout;
+        set_values(e_input_mem, input_data);
+        network->set_input_data("input_origin", e_input_mem);
+
+        network->set_input_data(initial_condition_id, e_initial_condition_mem);
+
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), 1);
+
+        auto expected_num_iterations = (exit_value + 1);
+        expected_output_layout[axis] = expected_num_iterations;
+        auto e_output_layout = cldnn::layout{ expected_output_layout, data_types::f32, format::bfyx };
+
+        auto num_iter_mem = network->get_output_memory(actual_iteration_count_id);
+        if (num_iter_mem != nullptr) {
+            mem_lock<int64_t> num_iter_ptr{ num_iter_mem, get_test_stream() };
+            ASSERT_EQ(num_iter_ptr.data()[0], expected_num_iterations);
+        }
+
+        std::vector<float> expected(input_data.size());
+        if (expected_output_data.size() == 0) {
+            size_t unit = 1;
+            for (size_t k = axis; k < whole_layout.size(); k++) {
+                unit *= whole_layout[k].get_length();
+            }
+
+            for (size_t j = 0; j < input_data.size(); j++) {
+                auto val = static_cast<float>((j % unit) / 4) + 1;
+                expected[j] = static_cast<float>(input_data[j] + val) + static_cast<float>(input_data[j] * val);
+            }
+        } else {
+            expected = expected_output_data;
+        }
+
+        auto output_mem = outputs.begin()->second.get_memory();
+        auto output_layout = output_mem->get_layout();
+        ASSERT_EQ(output_layout.batch(), e_output_layout.batch());
+        ASSERT_EQ(output_layout.feature(), e_output_layout.feature());
+        ASSERT_EQ(output_layout.spatial(0), e_output_layout.spatial(0));
+        ASSERT_EQ(output_layout.spatial(1), e_output_layout.spatial(1));
+        // value check
+        {
+            mem_lock<float> output_ptr{ output_mem, get_test_stream() };
+            for (size_t i = 0, iend = output_layout.count(); i < iend; ++i) {
+                ASSERT_FLOAT_EQ(output_ptr[i], expected.at(i));
+            }
+        }
+    }
+}
+
+std::vector<float> input_data_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+std::vector<float> input_data_2_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes) {
+    test_loop_gpu_wo_trip_count_w_multiple_shapes(
+        { 1, -1, 4, 4 },
+        {{ 1, 1, 4, 4 }, { 1, 2, 4, 4 }},   // the dimension at 'axis' must equal iter_num = (exit_value + 1)
+        {input_data_4_4, input_data_2_4_4},
+        std::vector<float>(),
+        2, 3);
+}
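+// Note: expected_num_iterations is (exit_value + 1) = 4 here, so the dimension
+// at axis 2 of the output layout is checked against 4 for both input shapes.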