Merge pull request #821 from Kotomi-Du/make_stateful_phisilica

Kotomi-Du · web-flow · commit b9a73f32dfd0 · 2025-11-03T18:47:21.000Z
CVS-175736-[OVEP] Enable stateful mode for Phi-silica models
diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc
@@ -361,7 +361,11 @@ void OVInferRequest::Infer() {
 StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)
     : OVInferRequest(std::move(infer_request)), target_device(device) {
   bool gpu_or_npu = ((device.find("NPU") != std::string::npos) || (device.find("GPU") != std::string::npos));
-  if (gpu_or_npu) {
+
+  // Enable prefill_use_full_chat_history only on GPU/NPU when an "input_ids" tensor exists
+  // and its element type is i64; that logic applies only to this input name and data type.
+  auto input_ids_opt = FindTensor("input_ids");
+  if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) {
     prefill_use_full_chat_history = true;
   }
 }
diff --git a/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc b/onnxruntime/core/providers/openvino/ov_stateful_patch_utils.cc
@@ -59,6 +59,17 @@ bool ModelHasInputOutputNames(std::shared_ptr<ov::Model> model, const std::strin
   return false;
 }
 
+std::string GetInputOutputName(std::shared_ptr<ov::Model> ov_model,  // model to query
+                            const std::vector<std::string>& candidate_names) {  // names tried in order
+  for (const auto& name : candidate_names) {
+    if (ModelHasInputOutputNames(ov_model, name)) {  // first candidate that passes the check wins
+      return name;
+    }
+  }
+  // No candidate passed the check: fall back to the first candidate, or "" when the list is empty.
+  return candidate_names.empty() ? "" : candidate_names[0];  // NOTE(review): this name failed the check above
+}
+
 void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
                       std::vector<std::string>& not_kv_inputs,
                       const std::vector<std::string>& key_value_input_names,
@@ -67,10 +78,15 @@ void FuseCacheReorder(std::shared_ptr<ov::Model> ov_model,
     throw std::runtime_error("Model already has fused cache");
   }
 
-  std::string main_input_name = "inputs_embeds";
-  if (ModelHasInputOutputNames(ov_model, "input_ids")) {
-    main_input_name = "input_ids";
-  }
+  // Input name candidates, tried in order; the first one found in the model is used
+  const std::vector<std::string> input_name_candidates = {
+    "inputs_embeds",                           // Checked first; also the default when none match
+    "input_ids",                               // Most common
+    "input_hidden_states",                     // Alternative
+    "/model/embed_tokens/Gather_output_0"      // Specific model type
+  };
+
+  std::string main_input_name = GetInputOutputName(ov_model, input_name_candidates);
 
   auto input_batch = ov_model->input(main_input_name).get_partial_shape()[0];
 
@@ -130,6 +146,14 @@ void PatchStatefulDecoder(std::shared_ptr<ov::Model> model) {
         key_value_input_names.push_back(name);
         found = true;
         break;
+      } else if (name.find("keys") != std::string::npos) {
+        key_value_input_names.push_back(name);
+        found = true;
+        break;
+      } else if (name.find("values") != std::string::npos) {
+        key_value_input_names.push_back(name);
+        found = true;
+        break;
       }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -361,7 +361,11 @@ void OVInferRequest::Infer() {`
`361`	`361`	`StatefulOVInferRequest::StatefulOVInferRequest(ov::InferRequest infer_request, std::string device)`
`362`	`362`	`: OVInferRequest(std::move(infer_request)), target_device(device) {`
`363`	`363`	`bool gpu_or_npu = ((device.find("NPU") != std::string::npos) \|\| (device.find("GPU") != std::string::npos));`
`364`		`- if (gpu_or_npu) {`
	`364`	`+`
	`365`	`+ // Enable prefill_use_full_chat_history only on GPU/NPU when an "input_ids" tensor exists`
	`366`	`+ // and its element type is i64; that logic applies only to this input name and data type.`
	`367`	`+ auto input_ids_opt = FindTensor("input_ids");`
	`368`	`+ if (gpu_or_npu && input_ids_opt.has_value() && input_ids_opt->get_element_type() == ov::element::i64) {`
`365`	`369`	`prefill_use_full_chat_history = true;`
`366`	`370`	`}`
`367`	`371`	`}`