diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index c4de7e676c1b..04a87a120bf0 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1018,6 +1018,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // Note that nodes with only inputs from initializer would not be place on CUDA
       // Ideally, those nodes should be eliminated in constant folding
       bool should_force_outside = true;
+      bool all_input_are_initializer = true;
       node.ForEachWithIndex(
           node.InputDefs(),
           [&](const NodeArg& def, size_t index) {
@@ -1025,12 +1026,17 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
             // The input is not a initializer and the input is from CPU
             // or the input declared as CPU memory and is from CPU
             // in that case we should still keep the node on CUDA
-            if ((!graph.GetInitializedTensor(def.Name(), initializer) && !defs_outside_cuda.count(&def)) ||
+            bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
+            if ((!initializer_input && !defs_outside_cuda.count(&def)) ||
                 (defs_outside_cuda.count(&def) && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
               should_force_outside = false;
+            if (!initializer_input) {
+              all_input_are_initializer = false;
+            }
             return Status::OK();
           });
-      if (should_force_outside) {
+      // If all the inputs are initializers, we shouldn't force it to CPU
+      if (should_force_outside && !all_input_are_initializer) {
         force_outside = true;
       }
     }
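
To make the placement decision above easier to follow, here is a minimal standalone C++ sketch of the logic this patch produces. `InputInfo` and `ShouldForceOutsideCuda` are hypothetical names invented for illustration, not part of the ONNX Runtime API; the three per-input flags stand in for the graph/kernel queries (`GetInitializedTensor`, `defs_outside_cuda.count`, `IsInputOnCpu`) in the real code.

```cpp
#include <iostream>
#include <vector>

struct InputInfo {
  bool is_initializer;   // stands in for graph.GetInitializedTensor(...)
  bool produced_on_cpu;  // stands in for defs_outside_cuda.count(&def)
  bool declared_cpu;     // stands in for kernel_def->IsInputOnCpu(index)
};

// should_force_outside stays true only when every input is either an
// initializer or comes from a CPU-assigned producer that the kernel does
// not consume as CPU memory. The fix adds the all-initializer check so
// that initializer-only nodes are no longer forced off CUDA (ideally
// constant folding eliminates them instead).
bool ShouldForceOutsideCuda(const std::vector<InputInfo>& inputs) {
  bool should_force_outside = true;
  bool all_inputs_are_initializers = true;
  for (const InputInfo& in : inputs) {
    // Keep the node on CUDA if an input is neither an initializer nor
    // CPU-produced, or if a CPU-produced input is declared as CPU memory.
    if ((!in.is_initializer && !in.produced_on_cpu) ||
        (in.produced_on_cpu && in.declared_cpu))
      should_force_outside = false;
    if (!in.is_initializer)
      all_inputs_are_initializers = false;
  }
  return should_force_outside && !all_inputs_are_initializers;
}

int main() {
  // A node fed only by initializers is no longer forced outside CUDA.
  std::vector<InputInfo> init_only = {{true, false, false}, {true, false, false}};
  std::cout << ShouldForceOutsideCuda(init_only) << "\n";  // prints 0

  // A node fed only by CPU-assigned producers (not declared as CPU
  // inputs) is still forced outside CUDA, as before the patch.
  std::vector<InputInfo> cpu_fed = {{false, true, false}};
  std::cout << ShouldForceOutsideCuda(cpu_fed) << "\n";  // prints 1
  return 0;
}
```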