
Commit 80daa9b

Reduce the peak memory even with CPU fallback by moving the fallback within the basic_backend.cc scope (#723)
1 parent 13e7792 commit 80daa9b

File tree: 3 files changed (+111, -110 lines)

- onnxruntime/core/providers/openvino/backend_manager.cc
- onnxruntime/core/providers/openvino/backends/basic_backend.cc
- onnxruntime/core/providers/openvino/backends/basic_backend.h

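Why this lowers the peak: before this commit, the NPU-to-CPU fallback lived in backend_manager.cc and retried through BackendFactory::MakeBackend, which forced basic_backend.cc to keep model_proto alive whenever a fallback was still possible, so the ONNX proto and its serialized string could coexist at full size. With the fallback moved inside basic_backend.cc, the constructor serializes once, releases the proto right away (for static-shape graphs), and reuses the retained string for the CPU retry. A minimal sketch of that ordering; FakeModelProto and CompileModel below are made-up stand-ins, not the real ONNX or OpenVINO APIs:

```cpp
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical stand-ins for ModelProto and the OV compile call, used only
// to illustrate the serialize / release / compile / fallback ordering.
struct FakeModelProto {
  std::string SerializeAsString() const { return std::string(1 << 20, 'x'); }
};

std::string CompileModel(const std::string& model, const std::string& device) {
  if (device == "NPU") throw std::runtime_error("intel_npu: compile failed");
  return "compiled-on-" + device;
}

int main() {
  auto model_proto = std::make_unique<FakeModelProto>();

  // Serialize once, then drop the proto immediately: the serialized string
  // alone is enough to retry on CPU, so proto and string never coexist long.
  std::string model = model_proto->SerializeAsString();
  model_proto.reset();

  std::string exe;
  try {
    exe = CompileModel(model, "NPU");
  } catch (const std::exception& ex) {
    std::cerr << ex.what() << " -> falling back to CPU\n";
    exe = CompileModel(model, "CPU");  // reuses the retained string
  }
  std::cout << exe << '\n';
}
```

The point is simply that model_proto.reset() happens before compilation is attempted, so a later CPU retry never needs the proto back.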

onnxruntime/core/providers/openvino/backend_manager.cc

Lines changed: 19 additions & 43 deletions
```diff
@@ -153,52 +153,28 @@ BackendManager::BackendManager(SessionContext& session_context,
                                                       model_stream);
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
-      bool eligible_for_cpu_fallback = device_type.find("NPU") != std::string::npos &&
-                                       !session_context_.so_disable_cpu_ep_fallback &&
-                                       !subgraph_context_.is_ep_ctx_graph;
-#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      eligible_for_cpu_fallback = false;
-#else
-      if (eligible_for_cpu_fallback) {
-        LOGS_DEFAULT(VERBOSE) << exception_str;
-        LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
-                              << "Falling back to OV CPU for execution";
-        session_context_.device_type = "CPU";
-        session_context_.precision = "FP32";
-        try {
-          concrete_backend_ = BackendFactory::MakeBackend(model_proto,
-                                                          session_context_,
-                                                          subgraph_context_,
-                                                          shared_context_,
-                                                          model_stream);
-        } catch (std::string const& msg) {
-          ORT_THROW(msg);
-        }
-      }
-#endif
-      if (!eligible_for_cpu_fallback) {
-        if (device_type.find("NPU") != std::string::npos &&
-            exception_str.find("intel_npu") != std::string::npos) {
-          // Handle NPU device related errors
+
+      if (session_context_.device_type.find("NPU") != std::string::npos &&
+          exception_str.find("intel_npu") != std::string::npos) {
+        // Handle NPU device related errors
 #ifndef NDEBUG
-          ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
+        ORT_THROW(exception_str + "\nModel needs to be recompiled\n");
 #else
-          std::string error_message = "UNKNOWN NPU ERROR";
-          std::string error_code = "code 0x0";
-          std::regex error_message_pattern(R"(\bZE_\w*\b)");
-          std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
-          std::smatch matches;
-          if (std::regex_search(exception_str, matches, error_message_pattern)) {
-            error_message = matches[0];
-          }
-          if (std::regex_search(exception_str, matches, error_code_pattern)) {
-            error_code = matches[0];
-          }
-          throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
-#endif
-        } else {
-          ORT_THROW(exception_str);
+        std::string error_message = "UNKNOWN NPU ERROR";
+        std::string error_code = "code 0x0";
+        std::regex error_message_pattern(R"(\bZE_\w*\b)");
+        std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
+        std::smatch matches;
+        if (std::regex_search(exception_str, matches, error_message_pattern)) {
+          error_message = matches[0];
         }
+        if (std::regex_search(exception_str, matches, error_code_pattern)) {
+          error_code = matches[0];
+        }
+        throw std::runtime_error(error_message + ", " + error_code + "\nModel needs to be recompiled\n");
+#endif
+      } else {
+        ORT_THROW(exception_str);
       }
     }
   }
```
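With the fallback gone from this file, what remains is the NPU error distillation: in release builds (the #else branch), a driver exception is reduced to its Level Zero error name (ZE_*) and hex code before rethrowing. A standalone sketch of just that regex logic, exercised against an invented sample exception string:

```cpp
#include <iostream>
#include <regex>
#include <string>

// Standalone demo of the error-distillation logic kept in backend_manager.cc:
// pull a Level Zero error name (ZE_*) and a hex error code out of an
// exception string. The sample input below is made up for illustration.
int main() {
  std::string exception_str =
      "Exception from intel_npu: ZE_RESULT_ERROR_DEVICE_LOST, code 0x70000001";

  std::string error_message = "UNKNOWN NPU ERROR";  // defaults if no match
  std::string error_code = "code 0x0";
  std::regex error_message_pattern(R"(\bZE_\w*\b)");
  std::regex error_code_pattern("code 0x[0-9a-fA-F]+");
  std::smatch matches;
  if (std::regex_search(exception_str, matches, error_message_pattern)) {
    error_message = matches[0];
  }
  if (std::regex_search(exception_str, matches, error_code_pattern)) {
    error_code = matches[0];
  }
  std::cout << error_message << ", " << error_code << '\n';
  // prints: ZE_RESULT_ERROR_DEVICE_LOST, code 0x70000001
}
```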

onnxruntime/core/providers/openvino/backends/basic_backend.cc

Lines changed: 91 additions & 67 deletions
```diff
@@ -36,42 +36,14 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
   if (ValidateSubgraph(const_outputs_map_))
     return;
 
-  // OV Config
+  // Pre-requisite is provider_option "context" must be set
+  auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
+                               (session_context_.OpenVINO_Version.at(0) >= 2024 &&
+                                session_context_.OpenVINO_Version.at(1) > 2));
   ov::AnyMap device_config;
-  PopulateConfigValue(device_config);
-
-  // Enable caching
-  EnableCaching();
-
-  // Setting OpenCL queue throttling for GPU
-  EnableGPUThrottling(device_config);
-
-  // Enable streams; default=1 unless overridden by user configuration
-  EnableStreams();
-
-  // Set the inference_num_threads property of the CPU
-  SetNumThreads(device_config);
-
-  auto npuw_status =
-      std::any_of(device_config.begin(), device_config.end(), [&](const std::pair<std::string, ov::Any>& pair) {
-        return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is<std::string>()) &&
-               (pair.second.as<std::string>() == "YES");
-      });
-
-  if (npuw_status) {
-    LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation";
-  }
-
-  try {
-    // IO_BUFFER is enabled on GPU HW.
-    // Pre-requisite is provider_option "context" must be set
-    auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
-                                 (session_context_.OpenVINO_Version.at(0) >= 2024 &&
-                                  session_context_.OpenVINO_Version.at(1) > 2));
-    bool disable_cpu_fallback = !(hw_target.find("NPU") != std::string::npos &&
-                                  !session_context_.so_disable_cpu_ep_fallback &&
-                                  !subgraph_context_.is_ep_ctx_graph);
-    if (subgraph_context_.is_ep_ctx_graph) {
+  SetOVDeviceConfiguration(device_config);
+  if (subgraph_context_.is_ep_ctx_graph) {
+    try {
       if (subgraph_context_.is_ep_ctx_ovir_encapsulated) {
         // model_file_path will use so_context_file_path if the onnx_model_path_name is not available,
         // especially in case of CreateSessionFormArray() where user must explicitly
@@ -104,41 +76,67 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
                                                   device_config,
                                                   subgraph_context_.subgraph_name);
       }
-      model_stream.reset();  // Delete stream after it is no longer needed
-    } else if (!session_context_.has_external_weights &&
-               !subgraph_context_.has_dynamic_input_shape &&
-               !session_context_.so_context_enable &&
-               session_context_.reshape.empty() &&
-               !enable_causallm &&
-               auto_unified_compile) {
-      // Unified OV compile_model is efficient when ov model caching is enabled
-      // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
-      // Inputs with static dimensions
-      // Not enabled for models with external weights and when ep context is set.
-      const std::string model = model_proto->SerializeAsString();
-      // we have the serialized string, so we can release model proto to lower the peak memory consumption
-      if (disable_cpu_fallback) model_proto.reset();
-      exe_network_ = OVCore::Get()->CompileModel(model,
-                                                 hw_target,
-                                                 device_config,
-                                                 subgraph_context_.subgraph_name);
-    } else {  // For all other types use ov::ov_core read_model() to generate OV IR
-              // followed by ov::ov_core compile_model()
-      std::string model = model_proto->SerializeAsString();
-      // Reset model proto only when cpu fallback is disabled or when the model has dynamic input shapes.
-      // This is to avoid memory peak usage when the model is large.
-      if (!subgraph_context.has_dynamic_input_shape && disable_cpu_fallback) {
-        model_proto.reset();
+      model_stream.reset();
+    } catch (const char* msg) {
+      ORT_THROW(msg);
+    }  // Delete stream after it is no longer needed
+  } else {
+    std::string model = model_proto->SerializeAsString();
+    if (!subgraph_context.has_dynamic_input_shape) {
+      model_proto.reset();
+    }
+    try {
+      // SetOVDeviceConfiguration(device_config);
+      if (!session_context_.has_external_weights &&
+          !subgraph_context_.has_dynamic_input_shape &&
+          !session_context_.so_context_enable &&
+          session_context_.reshape.empty() &&
+          !enable_causallm &&
+          auto_unified_compile) {
+        // Unified OV compile_model is efficient when ov model caching is enabled
+        // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
+        // Inputs with static dimensions
+        // Not enabled for models with external weights and when ep context is set.
+
+        exe_network_ = OVCore::Get()->CompileModel(model,
+                                                   hw_target,
+                                                   device_config,
+                                                   subgraph_context_.subgraph_name);
+      } else {  // For all other types use ov::ov_core read_model() to generate OV IR
+                // followed by ov::ov_core compile_model()
+        auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
+        exe_network_ = OVCore::Get()->CompileModel(
+            ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
+      }
+      LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
+    } catch (const OnnxRuntimeException& ex) {
+      std::string exception_str = ex.what();
+      bool eligible_for_cpu_fallback = session_context_.device_type.find("NPU") != std::string::npos &&
+                                       !session_context_.so_disable_cpu_ep_fallback &&
+                                       !subgraph_context_.is_ep_ctx_graph;
+#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
+      eligible_for_cpu_fallback = false;
+#endif
+      if (eligible_for_cpu_fallback) {
+        LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
+                              << "Falling back to OV CPU for execution";
+        session_context_.device_type = "CPU";
+        session_context_.precision = "FP32";
+        device_config.clear();
+        SetOVDeviceConfiguration(device_config);
+        try {
+          // Recreate the model with CPU device type
+          auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
+          exe_network_ = OVCore::Get()->CompileModel(
+              ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
+        } catch (std::string const& msg) {
+          ORT_THROW(msg);
+        }
+      } else {
+        ORT_THROW(ex.what());
       }
-      auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
-      exe_network_ = OVCore::Get()->CompileModel(
-          ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
     }
-    LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
-  } catch (const char* msg) {
-    ORT_THROW(msg);
   }
-
   int num_infer_req = (session_context_.num_of_threads > 0) ? session_context_.num_of_threads : 1;
   std::function<void(OVInferRequestPtr)> initializer = [](OVInferRequestPtr) {};
   auto metadata = shared_context_.shared_weights.metadata;
@@ -385,6 +383,32 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
   device_config.emplace(ov::inference_num_threads(session_context_.num_of_threads));
 }
 
+void BasicBackend::SetOVDeviceConfiguration(ov::AnyMap& device_config) {
+  PopulateConfigValue(device_config);
+
+  // Enable caching
+  EnableCaching();
+
+  // Setting OpenCL queue throttling for GPU
+  EnableGPUThrottling(device_config);
+
+  // Enable streams; default=1 unless overridden by user configuration
+  EnableStreams();
+
+  // Set the inference_num_threads property of the CPU
+  SetNumThreads(device_config);
+
+  auto npuw_status =
+      std::any_of(device_config.begin(), device_config.end(), [&](const std::pair<std::string, ov::Any>& pair) {
+        return (pair.first.find("NPU_USE_NPUW") != std::string::npos) && (pair.second.is<std::string>()) &&
+               (pair.second.as<std::string>() == "YES");
+      });
+
+  if (npuw_status) {
+    LOGS_DEFAULT(INFO) << log_tag << "NPUW Enabled during compilation";
+  }
+}
+
 void BasicBackend::ValidateOrtDimsAgainstPartialShape(const std::vector<int64_t>& ort_dims,
                                                       const ov::PartialShape& partial_shape) const {
   // Check if the number of dimensions matches
```
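The new SetOVDeviceConfiguration helper collects the configuration steps that previously ran inline in the constructor, ending with an std::any_of scan that reports whether NPU_USE_NPUW=YES made it into the device config. The sketch below isolates that scan; std::map<std::string, std::string> stands in for ov::AnyMap so it builds without OpenVINO, which collapses ov::Any's is<std::string>()/as<std::string>() checks into a plain string comparison:

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <string>

// The NPUW detection from SetOVDeviceConfiguration, with a plain string map
// standing in for ov::AnyMap so the sketch compiles standalone.
int main() {
  std::map<std::string, std::string> device_config{
      {"NPU_USE_NPUW", "YES"},  // sample entry; real values come from provider options
      {"PERFORMANCE_HINT", "LATENCY"},
  };

  // True if any config key mentions NPU_USE_NPUW with the value "YES".
  bool npuw_status = std::any_of(
      device_config.begin(), device_config.end(),
      [](const std::pair<const std::string, std::string>& pair) {
        return pair.first.find("NPU_USE_NPUW") != std::string::npos &&
               pair.second == "YES";
      });

  if (npuw_status) {
    std::cout << "NPUW Enabled during compilation\n";
  }
}
```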

onnxruntime/core/providers/openvino/backends/basic_backend.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -146,6 +146,7 @@ class BasicBackend : public IBackend {
   void EnableGPUThrottling(ov::AnyMap& device_config);
   void EnableStreams();
   void SetNumThreads(ov::AnyMap& device_config);
+  void SetOVDeviceConfiguration(ov::AnyMap& device_config);
   void ValidateOrtDimsAgainstPartialShape(const std::vector<int64_t>& ort_dims,
                                           const ov::PartialShape& partial_shape) const;
 
```
