Skip to content

Commit b4bac1c

Browse files
authored
[GPU] Fix access dimension error in dynamic shape (#32626)
### Description of the issue (symptom, root cause, how it was resolved) The regression was observed in the recent memory-reset bugfix PR. The issue is a get_dims() failure that occurs when calling the convolution feature in dynamic shape. Solution: The fix integrates the logic from the existing get_conv_channel_count() utility to correctly and safely determine the channel dimension of the convolution's input/output under dynamic conditions, preventing the dimension-access failure. #### Reproduction steps and snapshot (if applicable; do not attach for customer models) ##### E2E python tools/llm_bench/benchmark.py -m ov-share-05.sclab.intel.com/cv_bench_cache/latest_models_llm/qwen2-vl-7b-instruct/pytorch/ov/OV_FP16-4BIT_DEFAULT -d GPU.1 -mc 1 -ic 256 -n 3 -pf frameworks.ai.openvino.llm.prompts/32_1024/qwen2-vl-7b-instruct.jsonl ###### Benchmark_app benchmark_app -d GPU.1 --hint none -nireq 1 -niter 1 -m ov-share-05.sclab.intel.com/cv_bench_cache/latest_models_llm/qwen2-vl-7b-instruct/pytorch/ov/OV_FP16-4BIT_DEFAULT/openvino_vision_embeddings_model.xml -data_shape hidden_states[1,1176] #### Checklist - [x] Is it a proper fix? Yes - [x] Did you include a test case for this fix, if necessary? - [x] Did you review existing tests that can be extended to cover this scenario? Which test did you review? mem_reset_test.cpp ### Tickets: - *CVS-175613* --------- Signed-off-by: hyunback <hyunback.kim@intel.com>
1 parent cb1ec75 commit b4bac1c

File tree

4 files changed

+56
-42
lines changed

4 files changed

+56
-42
lines changed

src/plugins/intel_gpu/src/graph/include/layout_optimizer.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ class reorder_factory {
8282
std::map<reorder_cache_key, std::shared_ptr<reorder>> _cached_reorders;
8383
};
8484

85+
int64_t get_convolution_channel_count(const convolution_node& conv_node, const layout& layout, bool is_input);
86+
8587
class layout_optimizer {
8688
public:
8789
enum class optimization_attributes_type {

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp

Lines changed: 20 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,21 @@ std::pair<std::shared_ptr<primitive>, bool> reorder_factory::get_weights_reorder
112112
}
113113
}
114114

115+
int64_t cldnn::get_convolution_channel_count(const convolution_node& conv_node, const layout& layout, bool is_input) {
116+
auto channel_count = layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1;
117+
if (channel_count == -1) {
118+
auto weights_layout = conv_node.weights().get_output_layout();
119+
if (weights_layout.is_static()) {
120+
const auto& shape = weights_layout.get_partial_shape();
121+
if (is_input)
122+
channel_count = shape[conv_node.get_groups() > 1 ? 2 : 1].get_length();
123+
else
124+
channel_count = shape[conv_node.get_groups() > 1 ? 1 : 0].get_length();
125+
}
126+
}
127+
return channel_count;
128+
}
129+
115130
bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) {
116131
if (node.is_type<fully_connected>() && fmt == format::byxf)
117132
return false;
@@ -250,24 +265,9 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
250265
return false;
251266
};
252267

253-
auto get_conv_channel_count = [](const convolution_node& conv_node, const layout& layout, bool is_input) -> int64_t {
254-
auto channel_count = layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1;
255-
if (channel_count == -1) {
256-
auto weights_layout = conv_node.weights().get_output_layout();
257-
if (weights_layout.is_static()) {
258-
const auto& shape = weights_layout.get_partial_shape();
259-
if (is_input)
260-
channel_count = shape[conv_node.get_groups() > 1 ? 2 : 1].get_length();
261-
else
262-
channel_count = shape[conv_node.get_groups() > 1 ? 1 : 0].get_length();
263-
}
264-
}
265-
return channel_count;
266-
};
267-
268268
auto& conv_node = next.as<convolution>();
269-
auto in_channel_count = get_conv_channel_count(conv_node, prev_output_layout, true);
270-
auto out_channel_count = get_conv_channel_count(conv_node, next_output_layout, false);
269+
auto in_channel_count = get_convolution_channel_count(conv_node, prev_output_layout, true);
270+
auto out_channel_count = get_convolution_channel_count(conv_node, next_output_layout, false);
271271

272272
if ((prev.is_dynamic() || next.is_dynamic()) && (in_channel_count == -1 || out_channel_count == -1))
273273
return false;
@@ -276,7 +276,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
276276
if (next.get_preferred_impl_type() == impl_types::onednn &&
277277
((fmt_prev == format::byxf && fmt_next == format::byxf) ||
278278
(fmt_prev == format::bfyx && fmt_next == format::byxf &&
279-
(prev_dt == data_types::f16 && get_conv_channel_count(conv_node, next.get_input_layout(0), false) <= 8))) &&
279+
(prev_dt == data_types::f16 && get_convolution_channel_count(conv_node, next.get_input_layout(0), false) <= 8))) &&
280280
is_input_reorder(prev, next))
281281
return true;
282282

@@ -989,22 +989,9 @@ void layout_optimizer::set_onednn_dyn_conv_preferred_format(convolution_node& no
989989
return (rank <= 4) ? cldnn::format::byxf : cldnn::format::bzyxf;
990990
};
991991

992-
// Helper function to get channel count safely
993-
auto get_channel_count = [](const layout& layout) -> int64_t {
994-
return layout.get_partial_shape()[1].is_static() ? layout.get_partial_shape()[1].get_length() : -1;
995-
};
996-
997992
// Get channel counts once
998-
int64_t input_channels = get_channel_count(input_layout);
999-
int64_t output_channels = get_channel_count(output_layout);
1000-
auto weights_layout = node.weights().get_output_layout();
1001-
// Try to get channel counts from weight layout
1002-
if (input_channels == -1 && weights_layout.is_static()) {
1003-
input_channels = weights_layout.get_partial_shape()[node.get_groups() > 1 ? 2 : 1].get_length();
1004-
}
1005-
if (output_channels == -1 && weights_layout.is_static()) {
1006-
output_channels = weights_layout.get_partial_shape()[node.get_groups() > 1 ? 1 : 0].get_length();
1007-
}
993+
auto input_channels = get_convolution_channel_count(node, input_layout, true);
994+
auto output_channels = get_convolution_channel_count(node, output_layout, false);
1008995

1009996
if (i8_u8_input) {
1010997
// Set default input format for i8/u8 input

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,13 @@ bool primitive_inst::need_reset_output_memory() const {
606606
const bool is_user_onednn_impl = user_inst->get_node().get_preferred_impl_type() == impl_types::onednn;
607607
const bool is_user_conv = user_inst->get_node().is_type<convolution>();
608608
if (is_user_conv && is_user_onednn_impl) {
609+
auto& conv_node = user_inst->get_node().as<convolution>();
609610
auto& output_layout = _impl_params->get_output_layout(0);
611+
auto in_channel_count = get_convolution_channel_count(conv_node, output_layout, true);
612+
// If the channel count is dynamic, we cannot verify feature alignment,
613+
// so we conservatively do the reset and return true for this condition.
614+
if (in_channel_count == -1)
615+
return true;
610616

611617
auto get_feature_block_size = [](format fmt) {
612618
int feature_block_size = 1;
@@ -623,7 +629,7 @@ bool primitive_inst::need_reset_output_memory() const {
623629
auto feature_block_size = get_feature_block_size(fmt);
624630
// if layout is single blocked and feature size is not aligned with the blocking size, need to reset output so that we can guarantee zero-filling
625631
// NOTE: We may improve this logic to avoid reset if we are sure that it is not "corrupted" by other layers.
626-
if (output_layout.feature() % feature_block_size != 0) {
632+
if (in_channel_count % feature_block_size != 0) {
627633
return true;
628634
}
629635
}

src/plugins/intel_gpu/tests/unit/module_tests/mem_reset_test.cpp

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ const std::string no_bias = "";
3131

3232
struct mem_reset_params {
3333
ov::Dimension::value_type in_channel;
34+
bool is_dynamic;
3435
bool need_reset;
3536
};
3637

@@ -43,20 +44,32 @@ TEST_P(mem_reset_test, need_reset_output_memory_test) {
4344
return;
4445

4546
tests::random_generator rg(GET_SUITE_NAME);
46-
ov::PartialShape input_pshape = {1, p.in_channel, 64, 64};
47+
48+
ov::PartialShape target_pshape = {1, p.in_channel, 64, 64};
49+
ov::PartialShape input_pshape;
50+
51+
if (p.is_dynamic) {
52+
for (size_t i = 0; i < target_pshape.size(); ++i) {
53+
input_pshape.emplace_back(ov::Dimension());
54+
}
55+
input_pshape[1] = target_pshape[1];
56+
} else {
57+
input_pshape = target_pshape;
58+
}
59+
4760
ov::PartialShape weights_pshape = {16, p.in_channel, 3, 3};
4861
layout in_layout{ input_pshape, data_types::f16, format::bfyx };
4962
layout weights_layout{ weights_pshape, data_types::f16, format::bfyx };
50-
auto input_data = rg.generate_random_1d<ov::float16>(in_layout.count(), -1, 1);
51-
auto input_mem = engine.allocate_memory(in_layout);
63+
auto input_data = rg.generate_random_1d<ov::float16>(ov::shape_size(target_pshape.get_shape()), -1, 1);
64+
auto input_mem = engine.allocate_memory({ target_pshape, data_types::f16, format::bfyx });
5265
set_values(input_mem, input_data);
5366

5467
auto weights_data = rg.generate_random_1d<ov::float16>(weights_layout.count(), -1, 1);
5568
auto weights_mem = engine.allocate_memory(weights_layout);
5669
set_values(weights_mem, weights_data);
5770

58-
auto input1 = input_layout("input1", input_mem->get_layout());
59-
auto input2 = input_layout("input2", input_mem->get_layout());
71+
auto input1 = input_layout("input1", in_layout);
72+
auto input2 = input_layout("input2", in_layout);
6073
auto weights = data("weights", weights_mem);
6174
auto eltw = eltwise("eltwise", {input_info("input1"), input_info("input2")}, eltwise_mode::sum);
6275
auto eltw_reorder = reorder("reorder1", input_info("eltwise"), format::b_fs_yx_fsv16, data_types::f16 );
@@ -87,13 +100,19 @@ TEST_P(mem_reset_test, need_reset_output_memory_test) {
87100

88101
auto outputs_test_blocked = network_test_blocked.execute();
89102

90-
auto reorder_inst = network_test_blocked.get_primitive("reorder1");
103+
// An additional reorder is added and fused when force_implementations is enabled in dynamic shape
104+
auto target_primitive_id = p.is_dynamic ? "reorder1_0_reorder_2" : "reorder1";
105+
auto reorder_inst = network_test_blocked.get_primitive(target_primitive_id);
91106
ASSERT_TRUE(PrimitiveInstTestHelper::need_reset_output_memory(reorder_inst) == p.need_reset);
92107
}
93108

94109
INSTANTIATE_TEST_SUITE_P(smoke, mem_reset_test,
95110
testing::Values(
96-
mem_reset_params{ 9, true }, // If tensor is not packed(not aligned to 16), need_reset_output_memory == true
97-
mem_reset_params{ 16, false }
111+
// static
112+
mem_reset_params{ 9, false, true }, // If tensor is not packed(not aligned to 16), need_reset_output_memory == true
113+
mem_reset_params{ 16, false, false },
114+
// dynamic
115+
mem_reset_params{ 9, true, true },
116+
mem_reset_params{ 16, true, false }
98117
)
99118
);

0 commit comments

Comments
 (0)