diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index c4de7e676c1b..04a87a120bf0 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1018,6 +1018,7 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
       // Note that nodes with only inputs from initializer would not be place on CUDA
       // Ideally, those nodes should be eliminated in constant folding
       bool should_force_outside = true;
+      bool all_input_are_initializer = true;
       node.ForEachWithIndex(
           node.InputDefs(),
           [&](const NodeArg& def, size_t index) {
@@ -1025,12 +1026,17 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
             // The input is not a initializer and the input is from CPU
             // or the input declared as CPU memory and is from CPU
             // in that case we should still keep the node on CUDA
-            if ((!graph.GetInitializedTensor(def.Name(), initializer) && !defs_outside_cuda.count(&def)) ||
+            bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
+            if ((!initializer_input && !defs_outside_cuda.count(&def)) ||
                 (defs_outside_cuda.count(&def) && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
               should_force_outside = false;
+            if (!initializer_input) {
+              all_input_are_initializer = false;
+            }
             return Status::OK();
           });
-      if (should_force_outside) {
+      // If all the inputs are initializers, we shouldn't force it to CPU
+      if (should_force_outside && !all_input_are_initializer) {
         force_outside = true;
       }
     }
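
To make the placement decision above easier to follow, here is a minimal standalone C++ sketch of the logic this patch produces. `InputInfo` and `ShouldForceOutsideCuda` are hypothetical names invented for illustration, not part of the ONNX Runtime API; the three per-input flags stand in for the graph/kernel queries (`GetInitializedTensor`, `defs_outside_cuda.count`, `IsInputOnCpu`) in the real code.

```cpp
#include <iostream>
#include <vector>

struct InputInfo {
  bool is_initializer;   // stands in for graph.GetInitializedTensor(...)
  bool produced_on_cpu;  // stands in for defs_outside_cuda.count(&def)
  bool declared_cpu;     // stands in for kernel_def->IsInputOnCpu(index)
};

// should_force_outside stays true only when every input is either an
// initializer or comes from a CPU-assigned producer that the kernel does
// not consume as CPU memory. The fix adds the all-initializer check so
// that initializer-only nodes are no longer forced off CUDA (ideally
// constant folding eliminates them instead).
bool ShouldForceOutsideCuda(const std::vector<InputInfo>& inputs) {
  bool should_force_outside = true;
  bool all_inputs_are_initializers = true;
  for (const InputInfo& in : inputs) {
    // Keep the node on CUDA if an input is neither an initializer nor
    // CPU-produced, or if a CPU-produced input is declared as CPU memory.
    if ((!in.is_initializer && !in.produced_on_cpu) ||
        (in.produced_on_cpu && in.declared_cpu))
      should_force_outside = false;
    if (!in.is_initializer)
      all_inputs_are_initializers = false;
  }
  return should_force_outside && !all_inputs_are_initializers;
}

int main() {
  // A node fed only by initializers is no longer forced outside CUDA.
  std::vector<InputInfo> init_only = {{true, false, false}, {true, false, false}};
  std::cout << ShouldForceOutsideCuda(init_only) << "\n";  // prints 0

  // A node fed only by CPU-assigned producers (not declared as CPU
  // inputs) is still forced outside CUDA, as before the patch.
  std::vector<InputInfo> cpu_fed = {{false, true, false}};
  std::cout << ShouldForceOutsideCuda(cpu_fed) << "\n";  // prints 1
  return 0;
}
```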