From 401a79c6d49afa0fd1a6b4797eb7e67ab7e75ab3 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Thu, 15 Aug 2019 17:08:00 -0700
Subject: [PATCH 1/7] Pass ThreadPool to CPU GEMM calls; remove USE_MLAS build option

---
 cmake/CMakeLists.txt                          |   5 -
 .../onnxruntime/core/framework/op_kernel.h    |   2 +-
 include/onnxruntime/core/framework/tensor.h   |   4 +-
 .../onnxruntime/core/session/environment.h    |  12 +-
 .../core/session/onnxruntime_c_api.h          |  19 +--
 .../core/session/onnxruntime_cxx_api.h        |   2 +
 .../cpu/attnlstm/attention_wrapper.cc         |  25 ++--
 .../cpu/attnlstm/attention_wrapper.h          |   4 +-
 .../cpu/attnlstm/bahdanau_attention.cc        |  34 ++---
 .../cpu/attnlstm/bahdanau_attention.h         |   3 +-
 .../cpu/attnlstm/deep_cpu_attn_lstm.cc        |  23 +--
 .../cpu/attnlstm/uni_dir_attn_lstm.cc         |   8 +-
 .../cpu/attnlstm/uni_dir_attn_lstm.h          |   4 +-
 onnxruntime/contrib_ops/cpu/nchwc_ops.cc      |   3 -
 onnxruntime/contrib_ops/cpu/nchwc_ops.h       |   2 +
 .../contrib_ops/cpu/word_conv_embedding.cc    |  12 +-
 .../contrib_ops/cpu/word_conv_embedding.h     |   5 +-
 onnxruntime/core/framework/bfc_arena.h        |   2 +-
 onnxruntime/core/framework/error_code.cc      |   5 +-
 onnxruntime/core/framework/mem_pattern.h      |   4 +-
 .../framework/op_kernel_context_internal.h    |   4 +-
 .../core/framework/parallel_executor.h        |   1 -
 onnxruntime/core/framework/session_state.h    |  10 +-
 onnxruntime/core/framework/tensor.cc          |   4 +-
 onnxruntime/core/mlas/lib/mlasi.h             |   6 -
 onnxruntime/core/mlas/lib/platform.cpp        |  19 ---
 onnxruntime/core/mlas/lib/threading.cpp       |  92 +-----------
 onnxruntime/core/providers/cpu/math/gemm.h    |   8 +-
 .../core/providers/cpu/math/logsoftmax.cc     |   7 +-
 onnxruntime/core/providers/cpu/math/matmul.cc |   7 +-
 .../core/providers/cpu/math/matmul_helper.h   |   5 +-
 .../core/providers/cpu/math/softmax.cc        |   6 +-
 .../core/providers/cpu/math/softmax_shared.cc |   5 +-
 .../core/providers/cpu/math/softmax_shared.h  |   5 +-
 onnxruntime/core/providers/cpu/nn/conv.cc     |  23 +--
 .../core/providers/cpu/nn/conv_transpose.cc   |   9 +-
 onnxruntime/core/providers/cpu/nn/pool.cc     |   2 +-
 onnxruntime/core/providers/cpu/nn/pool_base.h |  13 +-
 .../core/providers/cpu/rnn/deep_cpu_gru.cc    |  29 ++--
 .../core/providers/cpu/rnn/deep_cpu_lstm.cc   |  32 +++--
 .../core/providers/cpu/rnn/deep_cpu_lstm.h    |   4 +-
 onnxruntime/core/providers/cpu/rnn/rnn.cc     |  11 +-
 .../core/providers/cpu/rnn/rnn_helpers.h      |   6 +-
 .../core/providers/cpu/tensor/cast_op.cc      |   4 +-
 onnxruntime/core/session/environment.cc       |   1 +
 onnxruntime/core/session/inference_session.cc |  33 ++---
 onnxruntime/core/session/onnxruntime_c_api.cc |  16 +--
 onnxruntime/core/util/math.h                  |   9 +-
 onnxruntime/core/util/math_cpu.cc             |  55 ++++----
 .../test/framework/allocation_planner_test.cc |   3 +-
 .../test/framework/execution_frame_test.cc    |  19 ++-
 onnxruntime/test/framework/math_test.cc       |  58 +++++---
 .../test/framework/session_state_test.cc      | 131 ++++++++++--------
 onnxruntime/test/mlas/unittest.cpp            |  64 +++++----
 .../test/onnx/microbenchmark/model_init.cc    |  13 +-
 .../test/onnx/microbenchmark/modeltest.cc     |   3 +-
 onnxruntime/test/onnx/tensorprotoutils.cc     |   2 +-
 .../test/providers/cpu/math/softmax_test.cc   |   8 +-
 onnxruntime/test/providers/memcpy_test.cc     |   4 +-
 59 files changed, 463 insertions(+), 446 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index d9c6068c0d1c4..9b084286b4c6d 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -50,7 +50,6 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
 option(onnxruntime_USE_NSYNC "Build with NSYNC support. This option only takes effect on Linux" OFF)
 option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
 option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
-option(onnxruntime_USE_MLAS "Use optimized blas library for GEMM and 2D Convolution" ON)
 option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF)
 option(onnxruntime_USE_MKLML "Build MKL-DNN with MKL-ML binary dependency" OFF)
 option(onnxruntime_USE_AUTOML "Build AutoML support" ON)
@@ -368,10 +367,6 @@ if (onnxruntime_RUN_ONNX_TESTS)
   add_definitions(-DORT_RUN_EXTERNAL_ONNX_TESTS)
 endif()
 
-if (onnxruntime_USE_MLAS)
-  add_definitions(-DUSE_MLAS)
-endif()
-
 #Adjust warning flags
 if (WIN32)
     add_definitions(-DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES)
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index d396551a1b407..6e98dbc20588b 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -210,7 +210,7 @@ struct KernelCreateInfo {
       : kernel_def(std::move(definition)),
         kernel_create_func(create_func) {}
 
-  KernelCreateInfo(KernelCreateInfo&& other)
+  KernelCreateInfo(KernelCreateInfo&& other) noexcept
       : kernel_def(std::move(other.kernel_def)),
         kernel_create_func(std::move(other.kernel_create_func)) {}
 };
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index 35eb359c714a3..31a43c7d905cb 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -78,9 +78,9 @@ class Tensor final {
   //Move is allowed
   ORT_DISALLOW_COPY_AND_ASSIGNMENT(Tensor);
 
-  Tensor(Tensor&& other);
+  Tensor(Tensor&& other) noexcept;
 
-  Tensor& operator=(Tensor&& other);
+  Tensor& operator=(Tensor&& other) noexcept;
 
   /**
      Returns the data type.
diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
index f36ebb60d0011..e11eb66072d1a 100644
--- a/include/onnxruntime/core/session/environment.h
+++ b/include/onnxruntime/core/session/environment.h
@@ -5,8 +5,11 @@
 
 #include <atomic>
 #include <memory>
+#include <mutex>
+#include <thread>
 #include "core/common/common.h"
 #include "core/common/status.h"
+#include "core/platform/threadpool.h"
 
 namespace onnxruntime {
 /**
@@ -29,13 +32,20 @@ class Environment {
      Returns whether any runtime environment instance has been initialized.
   */
   static bool IsInitialized() { return is_initialized_; }
+  concurrency::ThreadPool* GetThreadPool() {
+    std::call_once(tp_once_, [this]() {
+      tp_ = new concurrency::ThreadPool("default", std::max<int>(std::thread::hardware_concurrency() - 1, 1));
+    });
+    return tp_;
+  }
 
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment);
 
   Environment() = default;
   Status Initialize();
-
+  concurrency::ThreadPool* tp_ = nullptr;
+  std::once_flag tp_once_;
   static std::atomic<bool> is_initialized_;
 };
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 2ff53c9b50fe1..74cd44bd2ab39 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -23,6 +23,10 @@ extern "C" {
 #define _Inout_
 #define _Inout_opt_
 #define _Frees_ptr_opt_
+#define _Ret_maybenull_
+#define _Ret_notnull_
+#define _Check_return_
+#define _Success_(X)
 #define ORT_ALL_ARGS_NONNULL __attribute__((nonnull))
 #else
 #include <specstrings.h>
@@ -127,11 +131,11 @@ typedef enum OrtErrorCode {
   ORT_EXPORT RETURN_TYPE ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
 
 #define ORT_API_STATUS(NAME, ...) \
-  ORT_EXPORT OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION ORT_MUST_USE_RESULT
+  ORT_EXPORT _Check_return_ _Success_(return == 0) _Ret_maybenull_ OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION ORT_MUST_USE_RESULT
 
 // Used in *.cc files. Almost as same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT
 #define ORT_API_STATUS_IMPL(NAME, ...) \
-  ORT_EXPORT OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
+  ORT_EXPORT _Check_return_ _Success_(return == 0) _Ret_maybenull_ OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION
 
 #define ORT_RUNTIME_CLASS(X)    \
   struct Ort##X;                \
@@ -140,12 +144,11 @@ typedef enum OrtErrorCode {
 
 // The actual types defined have an Ort prefix
 ORT_RUNTIME_CLASS(Env);
-ORT_RUNTIME_CLASS(Status);  // nullptr for Status* indicates success
+ORT_RUNTIME_CLASS(Status);
 ORT_RUNTIME_CLASS(Provider);
 ORT_RUNTIME_CLASS(AllocatorInfo);
-ORT_RUNTIME_CLASS(Session);
+ORT_RUNTIME_CLASS(Session);  // Don't call OrtReleaseSession from DllMain (the session owns a thread pool).
 ORT_RUNTIME_CLASS(Value);
-ORT_RUNTIME_CLASS(ValueList);
 ORT_RUNTIME_CLASS(RunOptions);
 ORT_RUNTIME_CLASS(TypeInfo);
 ORT_RUNTIME_CLASS(TensorTypeAndShapeInfo);
@@ -339,7 +342,7 @@ ORT_API_STATUS(OrtGetStringTensorDataLength, _In_ const OrtValue* value, _Out_ s
  * \param value A tensor created from OrtCreateTensor... function.
  * \param s_len total data length, get it from OrtGetStringTensorDataLength
  */
-ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _Out_ void* s, size_t s_len,
+ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _In_ void* s, size_t s_len,
                _Out_ size_t* offsets, size_t offsets_len);
 
 /**
@@ -444,8 +447,8 @@ ORT_API(const char*, OrtGetVersionString);
 /**
  * \param msg A null-terminated string. Its content will be copied into the newly created OrtStatus
  */
-ORT_API(OrtStatus*, OrtCreateStatus, OrtErrorCode code, _In_ const char* msg)
-ORT_ALL_ARGS_NONNULL;
+ORT_EXPORT _Check_return_ _Ret_notnull_ OrtStatus* ORT_API_CALL OrtCreateStatus(OrtErrorCode code, _In_ const char* msg) NO_EXCEPTION
+    ORT_ALL_ARGS_NONNULL;
 
 ORT_API(OrtErrorCode, OrtGetErrorCode, _In_ const OrtStatus* status)
 ORT_ALL_ARGS_NONNULL;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index e1397105c3bef..992afe2c6fa89 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -99,6 +99,7 @@ struct Env;
 struct TypeInfo;
 struct Value;
 
+// Don't define an object of this type as a global (or thread-local) variable in a DLL.
 struct Env : Base<OrtEnv> {
   Env(nullptr_t) {}
   Env(OrtLoggingLevel default_logging_level, _In_ const char* logid);
@@ -156,6 +157,7 @@ struct SessionOptions : Base<OrtSessionOptions> {
   SessionOptions& Add(OrtCustomOpDomain* custom_op_domain);
 };
 
+// Don't define an object of this type as a global (or thread-local) variable in a DLL.
 struct Session : Base<OrtSession> {
   explicit Session(nullptr_t) {}
   Session(Env& env, const ORTCHAR_T* model_path, const SessionOptions& options);
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
index 4555713a59fe1..8757ccb35f771 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc
@@ -16,7 +16,7 @@ template <typename T>
 AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger& logger,
                                       int batch_size, int attn_context_depth, int attn_layer_depth,
                                       int inner_cell_hidden_size, bool has_attn_layer,
-                                      const IAttentionMechanism<T>& attention_mechanism)
+                                      const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool)
     : allocator_(alloc),
       logger_(logger),
       batch_size_(batch_size),
@@ -24,7 +24,8 @@ AttentionWrapper<T>::AttentionWrapper(AllocatorPtr alloc, const logging::Logger&
       attn_layer_depth_(attn_layer_depth),
       inner_cell_hidden_size_(inner_cell_hidden_size),
       has_attn_layer_(has_attn_layer),
-      attention_mechanism_(attention_mechanism) {
+      attention_mechanism_(attention_mechanism),
+      ttp_(threadpool) {
   auto mem_max_steps = attention_mechanism_.GetMaxMemorySteps();
   prev_alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, prev_alignments_ptr_, true);
   alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, alignments_ptr_, true);
@@ -37,11 +38,11 @@ template <typename T>
 void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_output) {
   if (has_attn_layer_) {
     // rnn_cell_output * cell_weights, (part of the attention layer above the attention mechanism).
-    math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
-                                 batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
-                                 rnn_cell_output.data(), inner_cell_hidden_size_,
-                                 attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
-                                 attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
+    math::GemmEx<T>(CblasNoTrans, CblasNoTrans,
+                    batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0},
+                    rnn_cell_output.data(), inner_cell_hidden_size_,
+                    attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0},
+                    attn_states_.data(), attn_layer_depth_, ttp_);
   }
 
   // Get the context which is calculated within attention mechanism.
@@ -54,11 +55,11 @@ void AttentionWrapper<T>::ProcessOutput(const gsl::span<const T>& rnn_cell_outpu
     //concat([p_cell_output, context]) * stack([attn_layer_cell_weights_, attn_layer_attn_weights_]) =
     //     p_cell_output * attn_layer_cell_weights_ + context * attn_layer_attn_weights_
     // The first part is calulated above. Here just add the later.
-    math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
-                                 batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
-                                 attn_context_.data(), attn_context_depth_,
-                                 attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
-                                 attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance());
+    math::GemmEx<T>(CblasNoTrans, CblasNoTrans,
+                    batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0},
+                    attn_context_.data(), attn_context_depth_,
+                    attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0},
+                    attn_states_.data(), attn_layer_depth_, ttp_);
   }
 }
 
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h
index 2469a7b99a3fb..b6cc06c040e3a 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h
@@ -8,6 +8,7 @@
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"
+#include "core/platform/threadpool.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -22,7 +23,7 @@ class AttentionWrapper {
                    int attn_layer_depth,
                    int inner_cell_hidden_size,
                    bool has_attn_layer,
-                   const IAttentionMechanism<T>& attention_mechanism);
+                   const IAttentionMechanism<T>& attention_mechanism, concurrency::ThreadPool* threadpool);
 
   virtual ~AttentionWrapper() = default;
 
@@ -69,6 +70,7 @@ class AttentionWrapper {
   bool has_attn_layer_;
 
   const IAttentionMechanism<T>& attention_mechanism_;
+  concurrency::ThreadPool* ttp_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
index 932ac263f8e22..74ad84b5af839 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc
@@ -15,8 +15,8 @@ namespace contrib {
 template <typename T>
 BahdanauAttention<T>::BahdanauAttention(AllocatorPtr allocator, const logging::Logger& logger,
                                         int batch_size, int max_memory_step, int memory_depth,
-                                        int query_depth, int attn_depth, bool normalize)
-    : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize) {
+                                        int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* threadpool)
+    : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), ttp_(threadpool) {
   values_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * memory_depth_, values_ptr_, true);
   keys_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * attn_depth_, keys_ptr_, true);
   processed_query_ = Allocate(allocator_, batch_size_ * attn_depth_, processed_query_ptr_, true);
@@ -72,11 +72,11 @@ void BahdanauAttention<T>::PrepareMemory(
                 "Real memory steps ", mem_steps, " is not in (0, ", max_memory_steps_, "]");
   }
 
-  math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
-                               batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
-                               memory.data(), memory_depth_,
-                               memory_layer_weights_.data(), attn_depth_, T{0.0},
-                               keys_.data(), attn_depth_, &CPUMathUtil::Instance());
+  math::GemmEx<T>(CblasNoTrans, CblasNoTrans,
+                  batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0},
+                  memory.data(), memory_depth_,
+                  memory_layer_weights_.data(), attn_depth_, T{0.0},
+                  keys_.data(), attn_depth_, ttp_);
 }
 
 template <typename T>
@@ -115,11 +115,11 @@ void BahdanauAttention<T>::Compute(
     const gsl::span<T>& output,
     const gsl::span<T>& aligns) const {
   //process query in dense query layer without bias
-  math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
-                               batch_size_, attn_depth_, query_depth_, T{1.0},
-                               queries.data(), query_depth_,
-                               query_layer_weights_.data(), attn_depth_, T{0.0},
-                               processed_query_.data(), attn_depth_, &CPUMathUtil::Instance());
+  math::GemmEx<T>(CblasNoTrans, CblasNoTrans,
+                  batch_size_, attn_depth_, query_depth_, T{1.0},
+                  queries.data(), query_depth_,
+                  query_layer_weights_.data(), attn_depth_, T{0.0},
+                  processed_query_.data(), attn_depth_, ttp_);
 
   std::fill(aligns.begin(), aligns.end(), T{});
 
@@ -146,11 +146,11 @@ void BahdanauAttention<T>::Compute(
     // Calculate the context
     auto outspan = output.subspan(b * memory_depth_);
     auto values = values_.subspan(b * max_memory_steps_ * memory_depth_);
-    math::GemmEx<T, CPUMathUtil>(CblasNoTrans, CblasNoTrans,
-                                 1, memory_depth_, max_memory_steps_, T{1.0},
-                                 alignments, max_memory_steps_,
-                                 values.data(), memory_depth_, T{0.0},
-                                 outspan.data(), memory_depth_, &CPUMathUtil::Instance());
+    math::GemmEx<T>(CblasNoTrans, CblasNoTrans,
+                    1, memory_depth_, max_memory_steps_, T{1.0},
+                    alignments, max_memory_steps_,
+                    values.data(), memory_depth_, T{0.0},
+                    outspan.data(), memory_depth_, ttp_);
   }
 }
 
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h
index 755af6ba6d5c3..c2bfee15c5bcc 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h
@@ -23,7 +23,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
       int memory_depth,
       int query_depth,
       int attn_depth,
-      bool normalize);
+      bool normalize, concurrency::ThreadPool* threadpool);
 
   void SetWeights(
       const gsl::span<const T>& attn_weights,
@@ -77,6 +77,7 @@ class BahdanauAttention : public IAttentionMechanism<T> {
   gsl::span<int> mem_seq_lengths_;
 
   bool normalize_;
+  concurrency::ThreadPool* ttp_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
index 7f7102475c620..50e98f834260b 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc
@@ -8,7 +8,9 @@
 
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
+#include "core/platform/threadpool.h"
 #include "core/framework/allocator.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -70,6 +72,9 @@ static gsl::span<const T> SecondHalfSpan(const gsl::span<const T>& dspan) {
 
 template <typename T>
 Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(&context);
+  concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool();
+
   auto& logger = context.Logger();
 
   // original lstm processing
@@ -236,7 +241,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false);
+        false, thread_pool);
 
     fam.SetWeights(
         FirstHalfSpan(am_v_weights.DataAsSpan<T>()),
@@ -252,7 +257,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        fam);
+        fam, thread_pool);
     faw.SetWeights(FirstHalfSpan(attn_layer_weights_span));
 
     UniDirectionalAttnLstm<T> fw(
@@ -263,7 +268,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[0],
         activation_funcs_.Entries()[1],
         activation_funcs_.Entries()[2],
-        clip_, ttp_);
+        clip_, thread_pool);
 
     BahdanauAttention<T> bam(
         alloc,
@@ -273,7 +278,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false);
+        false, thread_pool);
     bam.SetWeights(
         SecondHalfSpan(am_v_weights.DataAsSpan<T>()),
         SecondHalfSpan(am_query_layer_weights.DataAsSpan<T>()),
@@ -288,7 +293,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        bam);
+        bam, thread_pool);
     baw.SetWeights(SecondHalfSpan(attn_layer_weights_span));
 
     UniDirectionalAttnLstm<T> bw(
@@ -299,7 +304,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[3],
         activation_funcs_.Entries()[4],
         activation_funcs_.Entries()[5],
-        clip_, ttp_);
+        clip_, thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
     bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2);
@@ -313,7 +318,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         memory_depth,
         query_depth,
         am_attn_size,
-        false);
+        false, thread_pool);
 
     fam.SetWeights(
         am_v_weights.DataAsSpan<T>(),
@@ -329,7 +334,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         attn_layer_depth,
         hidden_size_,
         has_attention_layer,
-        fam);
+        fam, thread_pool);
 
     faw.SetWeights(attn_layer_weights_span);
 
@@ -341,7 +346,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const {
         activation_funcs_.Entries()[0],
         activation_funcs_.Entries()[1],
         activation_funcs_.Entries()[2],
-        clip_, ttp_);
+        clip_, thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1);
   }
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
index caa05f9d5ceff..4183b6e2d6de4 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc
@@ -45,7 +45,7 @@ UniDirectionalAttnLstm<T>::UniDirectionalAttnLstm(AllocatorPtr allocator,
                                                   const ActivationFuncs::Entry& activation_func_g,
                                                   const ActivationFuncs::Entry& activation_func_h,
                                                   const float clip,
-                                                  onnxruntime::concurrency::ThreadPool& ttp)
+                                                  onnxruntime::concurrency::ThreadPool* ttp)
     : allocator_(allocator),
       logger_(logger),
       seq_length_(seq_length),
@@ -254,7 +254,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
               input_weights.cbegin(), input_weights.cend(),  // W[iofc]^T
               input_size_ + attention_size_, T{0.0},
               output_iofc_.begin(), output_iofc_.end(),
-              hidden_size_x4);
+              hidden_size_x4, ttp_);
 
   DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4);
 
@@ -296,7 +296,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                   input_weights.cbegin() + input_size_, input_weights.cend(),  // WA[iofc]
                   input_size_ + attention_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                  hidden_size_x4);
+                  hidden_size_x4, ttp_);
 
       // calculate Xt*(W[iofc]^T) + Ht-1*R[iofc]
       ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0},
@@ -305,7 +305,7 @@ void UniDirectionalAttnLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                   recurrent_weights.cbegin(), recurrent_weights.cend(),  // R[iofc]
                   hidden_size_, T{1.0},
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                  hidden_size_x4);
+                  hidden_size_x4, ttp_);
 
       span_T_iter batched_output, batched_output_end;
       if (output_sequence) {
diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
index 5a8e4e3224a25..2d3a6f20fe1e9 100644
--- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
+++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h
@@ -51,7 +51,7 @@ class UniDirectionalAttnLstm {
                          const ActivationFuncs::Entry& activation_func_g,
                          const ActivationFuncs::Entry& activation_func_h,
                          const float clip,
-                         onnxruntime::concurrency::ThreadPool& ttp);
+                         onnxruntime::concurrency::ThreadPool* ttp);
 
   void Compute(const gsl::span<const T>& inputs,
                const gsl::span<const int>& sequence_lengths,
@@ -151,7 +151,7 @@ class UniDirectionalAttnLstm {
 
   AttentionWrapper<T>& attention_wrapper_;
 
-  onnxruntime::concurrency::ThreadPool& ttp_;
+  onnxruntime::concurrency::ThreadPool* ttp_;
 };
 
 }  // namespace detail
diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
index b5625551ad104..3b14b21a79533 100644
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc
@@ -170,9 +170,6 @@ Status NchwcPoolBase::NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind
   ORT_ENFORCE(X_shape.NumDimensions() == 4);
   ORT_ENFORCE((X_shape[1] % MlasNchwcGetBlockSize()) == 0);
 
-  if (!global_pooling_) {
-    ORT_RETURN_IF_NOT(kernel_shape_.size() == 2, "kernel_shape num_dims is not compatible with X num_dims.");
-  }
 
   std::vector<int64_t> pads = pads_;
   std::vector<int64_t> output_dims = PoolBase::SetOutputSize(X_shape, X_shape[1], &pads, dilations_, ceil_mode_);
diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.h b/onnxruntime/contrib_ops/cpu/nchwc_ops.h
index 65045cd0eeb85..b9f8993114094 100644
--- a/onnxruntime/contrib_ops/cpu/nchwc_ops.h
+++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.h
@@ -50,6 +50,8 @@ class NchwcConv : public OpKernel, public ConvBase {
 class NchwcPoolBase : public PoolBase {
  public:
   NchwcPoolBase(const OpKernelInfo& info) : PoolBase(info) {
+    if (!global_pooling_)
+      ORT_ENFORCE(kernel_shape_.size() == 2, "kernel_shape num_dims is not compatible with X num_dims.");
   }
 
   Status NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind) const;
diff --git a/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc b/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc
index 7d7f577d5e3a1..3213ff4fc1db3 100644
--- a/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc
+++ b/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc
@@ -6,6 +6,7 @@
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 #include "core/mlas/inc/mlas.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -45,7 +46,7 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
     int64_t char_embedding_size,
     int64_t filter_width,
     int64_t num_filters,
-    float* output) const {
+    float* output, concurrency::ThreadPool* tp) const {
   int64_t input_word_size = word_len * char_embedding_size;
   int64_t unfolded_width = word_len - filter_width + 1;
   int64_t unfolded_kernal_size = filter_width * char_embedding_size;
@@ -83,12 +84,12 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation(
       tmp_word_inx++;
     }
 
-    math::GemmEx<float, CPUMathUtil>(
+    math::GemmEx<float>(
         CblasNoTrans, CblasTrans,
         static_cast<int>(words_unfolded_width), static_cast<int>(num_filters), static_cast<int>(unfolded_kernal_size), 1.0f,
         unfolded_buffer_p.get(), static_cast<int>(unfolded_kernal_size),
         weights, static_cast<int>(unfolded_kernal_size), 0.0f,
-        conv_buf_p, static_cast<int>(num_filters), &CPUMathUtil::Instance());
+        conv_buf_p, static_cast<int>(num_filters), tp);
 
     for (int64_t unfolded_inx = 0; unfolded_inx < words_unfolded_width; unfolded_inx++)
       for (int64_t filter_inx = 0; filter_inx < num_filters; filter_inx++) {
@@ -160,6 +161,9 @@ Status WordConvEmbedding::ValidateInputShape(const TensorShape& w_conv_shape, co
 }
 
 Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
   // original lstm processing
   const Tensor& sequence = *(ctx->Input<Tensor>(0));          // sequence: [sequence_length, word_length]
   const Tensor& w_conv = *(ctx->Input<Tensor>(1));            // conv weight: [M, C/group, kH, kW]
@@ -216,7 +220,7 @@ Status WordConvEmbedding::Compute(OpKernelContext* ctx) const {
       char_embedding_size,
       filter_width,
       filter_size,
-      Y->MutableData<float>());
+      Y->MutableData<float>(), tp);
 
   return Status::OK();
 }
diff --git a/onnxruntime/contrib_ops/cpu/word_conv_embedding.h b/onnxruntime/contrib_ops/cpu/word_conv_embedding.h
index e74afab169fd8..5ee4127e3bfb9 100644
--- a/onnxruntime/contrib_ops/cpu/word_conv_embedding.h
+++ b/onnxruntime/contrib_ops/cpu/word_conv_embedding.h
@@ -8,6 +8,9 @@
 #include "core/framework/tensor.h"
 
 namespace onnxruntime {
+namespace concurrency {
+class ThreadPool;
+}
 namespace contrib {
 
 class WordConvEmbedding final : public OpKernel {
@@ -38,7 +41,7 @@ class WordConvEmbedding final : public OpKernel {
       int64_t char_embedding_size,
       int64_t filter_width,
       int64_t num_filters,
-      float* output) const;
+      float* output, onnxruntime::concurrency::ThreadPool* tp) const;
   void CalculateLengthOfEachWordInSequence(
       const int* seq_ptr,
       int* words_len_ptr,
diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h
index 664f6fa72a04b..bdc6496c63205 100644
--- a/onnxruntime/core/framework/bfc_arena.h
+++ b/onnxruntime/core/framework/bfc_arena.h
@@ -244,7 +244,7 @@ class BFCArena : public IArenaAllocator {
 
     ~AllocationRegion() { delete[] handles_; }
 
-    AllocationRegion(AllocationRegion&& other) { Swap(other); }
+    AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); }
 
     AllocationRegion& operator=(AllocationRegion&& other) {
       Swap(other);
diff --git a/onnxruntime/core/framework/error_code.cc b/onnxruntime/core/framework/error_code.cc
index 2cf11f4e1de8e..c727b7464f3ac 100644
--- a/onnxruntime/core/framework/error_code.cc
+++ b/onnxruntime/core/framework/error_code.cc
@@ -12,11 +12,12 @@ struct OrtStatus {
   char msg[1];  // a null-terminated string
 };
 
-ORT_API(OrtStatus*, OrtCreateStatus, OrtErrorCode code, _In_ const char* msg) {
+// Although annotated _Ret_notnull_, this can in fact return NULL when malloc fails.
+ORT_EXPORT _Check_return_ _Ret_notnull_ OrtStatus* ORT_API_CALL OrtCreateStatus(OrtErrorCode code, _In_ const char* msg) NO_EXCEPTION {
   assert(!(code == 0 && msg != nullptr));
   size_t clen = strlen(msg);
   OrtStatus* p = reinterpret_cast<OrtStatus*>(::malloc(sizeof(OrtStatus) + clen));
-  if (p == nullptr) return nullptr;  // OOM
+  if (p == nullptr) return nullptr;  // OOM. What can we do here? abort()?
   p->code = code;
   memcpy(p->msg, msg, clen);
   p->msg[clen] = '\0';
diff --git a/onnxruntime/core/framework/mem_pattern.h b/onnxruntime/core/framework/mem_pattern.h
index 57d9e99360b13..2aa1e3cad32eb 100644
--- a/onnxruntime/core/framework/mem_pattern.h
+++ b/onnxruntime/core/framework/mem_pattern.h
@@ -20,11 +20,11 @@ class MemoryPattern {
  public:
   MemoryPattern() = default;
 
-  MemoryPattern(MemoryPattern&& rhs)
+  MemoryPattern(MemoryPattern&& rhs) noexcept
       : patterns_{std::move(rhs.patterns_)},
         peak_size_{std::move(rhs.peak_size_)} {}
 
-  MemoryPattern& operator=(MemoryPattern&& rhs) {
+  MemoryPattern& operator=(MemoryPattern&& rhs) noexcept {
     patterns_ = std::move(rhs.patterns_);
     peak_size_ = std::move(rhs.peak_size_);
     return *this;
diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h
index 02515ba39a160..b837356504d36 100644
--- a/onnxruntime/core/framework/op_kernel_context_internal.h
+++ b/onnxruntime/core/framework/op_kernel_context_internal.h
@@ -5,6 +5,7 @@
 
 #include "core/framework/op_kernel.h"
 #include "core/framework/session_state.h"
+#include "core/session/onnxruntime_c_api.h"
 
 // onnxruntime internal OpKernelContext derived class to provide additional
 // APIs that aren't desirable to add to the public OpKernelContext API
@@ -57,7 +58,8 @@ class OpKernelContextInternal : public OpKernelContext {
 
   const bool& GetTerminateFlag() const noexcept { return terminate_flag_; }
 
-  const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); }
+  _Ret_maybenull_ const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); }
+  _Ret_maybenull_ onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() { return session_state_.GetThreadPool(); }
 
  private:
   const SessionState& session_state_;
diff --git a/onnxruntime/core/framework/parallel_executor.h b/onnxruntime/core/framework/parallel_executor.h
index 5f34309937bac..74d3fbce3d8d4 100644
--- a/onnxruntime/core/framework/parallel_executor.h
+++ b/onnxruntime/core/framework/parallel_executor.h
@@ -21,7 +21,6 @@ class ExecutionFrame;
 
 class ParallelExecutor : public IExecutor {
  public:
-  ParallelExecutor(const bool& terminate_flag = false) : terminate_flag_{terminate_flag} {}
   ParallelExecutor(const SessionState& session_state, const bool& terminate_flag = false);
 
   common::Status Execute(const SessionState& session_state, const std::vector<int>& feed_mlvalue_idxs,
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 92a6f107e5058..0f64b2b943c08 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -43,8 +43,8 @@ struct MemoryPatternGroup;
  */
 class SessionState {
  public:
-  SessionState(const ExecutionProviders& execution_providers, bool enable_mem_pattern)
-      : execution_providers_{execution_providers}, enable_mem_pattern_(enable_mem_pattern) {}
+  SessionState(const ExecutionProviders& execution_providers, bool enable_mem_pattern, concurrency::ThreadPool* thread_pool)
+      : execution_providers_{execution_providers}, enable_mem_pattern_(enable_mem_pattern), thread_pool_(thread_pool) {}
 
   ~SessionState() {
     for (auto& kvp : deleter_for_initialized_tensors_) {
@@ -175,8 +175,7 @@ class SessionState {
 
   SessionState* GetMutableSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name);
 
-  onnxruntime::concurrency::ThreadPool* GetThreadPool() const { return thread_pool_; }
-  void SetThreadPool(onnxruntime::concurrency::ThreadPool* p_pool) { thread_pool_ = p_pool; }
+  concurrency::ThreadPool* GetThreadPool() const { return thread_pool_; }
 
   bool ExportDll() const { return export_fused_dll_; }
   void SetExportDllFlag(bool flag) { export_fused_dll_ = flag; }
@@ -232,7 +231,8 @@ class SessionState {
       std::unordered_map<onnxruntime::NodeIndex, std::unordered_map<std::string, std::unique_ptr<SessionState>>>;
   SubgraphSessionStateMap subgraph_session_states_;
 
-  onnxruntime::concurrency::ThreadPool* thread_pool_ = nullptr;
+  // Set once at construction; may be NULL.
+  concurrency::ThreadPool* const thread_pool_;
 
   bool export_fused_dll_ = false;
   FuncManager fused_funcs_mgr_;
diff --git a/onnxruntime/core/framework/tensor.cc b/onnxruntime/core/framework/tensor.cc
index d0085c0fe6c1a..692232a6a8abc 100644
--- a/onnxruntime/core/framework/tensor.cc
+++ b/onnxruntime/core/framework/tensor.cc
@@ -47,7 +47,7 @@ void Tensor::Init(MLDataType p_type, const TensorShape& shape, void* p_raw_data,
   byte_offset_ = offset;
 }
 
-Tensor::Tensor(Tensor&& other)
+Tensor::Tensor(Tensor&& other) noexcept
     : p_data_(other.p_data_),
       buffer_deleter_(other.buffer_deleter_),
       shape_(other.shape_),
@@ -61,7 +61,7 @@ Tensor::Tensor(Tensor&& other)
   other.byte_offset_ = 0;
 }
 
-Tensor& Tensor::operator=(Tensor&& other) {
+Tensor& Tensor::operator=(Tensor&& other) noexcept {
   if (this != &other) {
     ReleaseBuffer();
 
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index b191c155928d9..1b9c6e505affb 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -92,8 +92,6 @@ Module Name:
 
 #if defined(_OPENMP)
 #include <omp.h>
-#elif defined(_WIN32)
-#define MLAS_USE_WIN32_THREADPOOL
 #endif
 
 //
@@ -423,10 +421,6 @@ struct MLAS_PLATFORM {
     uint32_t NchwcBlockSize;
     uint32_t PreferredBufferAlignment;
 #endif
-
-#if defined(MLAS_USE_WIN32_THREADPOOL)
-    int32_t MaximumThreadCount;
-#endif
 };
 
 extern MLAS_PLATFORM MlasPlatform;
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 4f99d50fb27b0..d4f3324f10ed6 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -192,25 +192,6 @@ Return Value:
     }
 
 #endif
-
-#if defined(MLAS_USE_WIN32_THREADPOOL)
-
-    //
-    // Retrieve the number of processors in the system.
-    //
-
-    SYSTEM_INFO SystemInfo;
-
-    GetSystemInfo(&SystemInfo);
-
-    if (SystemInfo.dwNumberOfProcessors <= MLAS_MAXIMUM_THREAD_COUNT) {
-        this->MaximumThreadCount = int32_t(SystemInfo.dwNumberOfProcessors);
-    } else {
-        this->MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT;
-    }
-
-#endif
-
 }
 
 size_t
diff --git a/onnxruntime/core/mlas/lib/threading.cpp b/onnxruntime/core/mlas/lib/threading.cpp
index 858b72722e8bc..ef30de9499bb2 100644
--- a/onnxruntime/core/mlas/lib/threading.cpp
+++ b/onnxruntime/core/mlas/lib/threading.cpp
@@ -16,59 +16,6 @@ Module Name:
 
 #include "mlasi.h"
 
-#if defined(MLAS_USE_WIN32_THREADPOOL)
-
-//
-// Define the parameters to execute threaded work using the Windows thread pool
-// library.
-//
-
-struct MLAS_THREADED_WORK_BLOCK {
-    volatile LONG Counter;
-    PMLAS_THREADED_ROUTINE ThreadedRoutine;
-    void* Context;
-};
-
-void
-CALLBACK
-MlasThreadedWorkCallback(
-    PTP_CALLBACK_INSTANCE Instance,
-    void* Context,
-    PTP_WORK WorkObject
-    )
-/*++
-
-Routine Description:
-
-    This routine is invoked from a worker thread to execute one iteration of a
-    batch of threaded work.
-
-Arguments:
-
-    Instance - Supplies the callback instance object.
-
-    Context - Supplies the pointer to the parameters for the operation.
-
-    WorkObject - Supplies the threadpool work object.
-
-Return Value:
-
-    None.
-
---*/
-{
-    MLAS_UNREFERENCED_PARAMETER(Instance);
-    MLAS_UNREFERENCED_PARAMETER(WorkObject);
-
-    MLAS_THREADED_WORK_BLOCK* WorkBlock = (MLAS_THREADED_WORK_BLOCK*)Context;
-
-    LONG Index = InterlockedIncrement(&WorkBlock->Counter) - 1;
-
-    WorkBlock->ThreadedRoutine(WorkBlock->Context, Index);
-}
-
-#endif
-
 void
 MlasExecuteThreaded(
     MLAS_THREADED_ROUTINE ThreadedRoutine,
@@ -99,48 +46,11 @@ MlasExecuteThreaded(
     }
 #endif
 
-#if defined(MLAS_USE_WIN32_THREADPOOL)
 
     //
-    // Schedule the threaded iterations using a work object.
+    // Fall back to OpenMP or a serialized implementation.
     //
 
-    MLAS_THREADED_WORK_BLOCK WorkBlock;
-
-    PTP_WORK WorkObject = CreateThreadpoolWork(MlasThreadedWorkCallback, &WorkBlock, nullptr);
-
-    if (WorkObject != nullptr) {
-
-        WorkBlock.Counter = 0;
-        WorkBlock.ThreadedRoutine = ThreadedRoutine;
-        WorkBlock.Context = Context;
-
-        for (int32_t tid = 1; tid < Iterations; tid++) {
-            SubmitThreadpoolWork(WorkObject);
-        }
-
-        //
-        // Execute the remaining iteration on this thread.
-        //
-
-        ThreadedRoutine(Context, Iterations - 1);
-
-        //
-        // Wait for the work object callbacks to complete.
-        //
-
-        WaitForThreadpoolWorkCallbacks(WorkObject, FALSE);
-        CloseThreadpoolWork(WorkObject);
-
-        return;
-    }
-
-    //
-    // Fallback to a serialized implementation.
-    //
-
-#endif
-
     //
     // Execute the routine for the specified number of iterations.
     //
diff --git a/onnxruntime/core/providers/cpu/math/gemm.h b/onnxruntime/core/providers/cpu/math/gemm.h
index a3aa724ab410d..225754141a6d7 100644
--- a/onnxruntime/core/providers/cpu/math/gemm.h
+++ b/onnxruntime/core/providers/cpu/math/gemm.h
@@ -8,6 +8,7 @@
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 #include "gemm_helper.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 namespace onnxruntime {
 
@@ -27,6 +28,9 @@ class Gemm : public OpKernel {
   }
 
   Status Compute(OpKernelContext* context) const override {
+    auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
+    concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
     const auto X = context->Input<Tensor>(0);
     const auto W = context->Input<Tensor>(1);
     const auto B = context->Input<Tensor>(2);
@@ -64,7 +68,7 @@ class Gemm : public OpKernel {
     }
 
     // W * x
-    math::Gemm<T, CPUMathUtil>(
+    math::Gemm<T>(
         trans_A_,
         trans_B_,
         M,
@@ -75,7 +79,7 @@ class Gemm : public OpKernel {
         W->template Data<T>(),
         beta_,
         y_data,
-        &CPUMathUtil::Instance());
+        tp);
 
     FuseActivation<T>(activation_, y_data, M * N, leaky_relu_alpha_);
 
diff --git a/onnxruntime/core/providers/cpu/math/logsoftmax.cc b/onnxruntime/core/providers/cpu/math/logsoftmax.cc
index 281031e71568e..19fbb9897c699 100644
--- a/onnxruntime/core/providers/cpu/math/logsoftmax.cc
+++ b/onnxruntime/core/providers/cpu/math/logsoftmax.cc
@@ -4,6 +4,8 @@
 #include "core/providers/cpu/math/logsoftmax.h"
 
 #include "core/framework/op_kernel.h"
+#include "core/framework/op_kernel_context_internal.h"
+
 #include "core/providers/common.h"
 #include "core/providers/cpu/math/softmax_shared.h"
 #include "core/util/math.h"
@@ -12,6 +14,9 @@ namespace onnxruntime {
 
 template <>
 Status LogSoftmax<float>::Compute(OpKernelContext* ctx) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
   const auto* tensor_pointer = ctx->Input<Tensor>(0);
   if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
   const Tensor& X = *tensor_pointer;
@@ -32,7 +37,7 @@ Status LogSoftmax<float>::Compute(OpKernelContext* ctx) const {
 
   const bool logarithmic = true;
   auto status = SoftmaxCPU(N, D, X.template Data<float>(), Ydata,
-                           scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data());
+                           scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data(), tp);
 
   return status;
 }
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
index 539157e92bd95..4f4bacc34baeb 100644
--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -1,6 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include "core/framework/op_kernel_context_internal.h"
 #include "core/providers/cpu/math/matmul.h"
 
 #include "core/util/math.h"
@@ -53,6 +53,9 @@ ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(
 
 template <typename T>
 Status MatMul<T>::Compute(OpKernelContext* ctx) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+  concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool();
+
   const auto* left_X = ctx->Input<Tensor>(0);
   const auto* right_X = ctx->Input<Tensor>(1);
 
@@ -69,7 +72,7 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
         static_cast<int>(helper.K()),
         left_X->template Data<T>() + helper.LeftOffsets()[i],
         right_X->template Data<T>() + helper.RightOffsets()[i],
-        Y->template MutableData<T>() + helper.OutputOffsets()[i]);
+        Y->template MutableData<T>() + helper.OutputOffsets()[i], thread_pool);
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/providers/cpu/math/matmul_helper.h b/onnxruntime/core/providers/cpu/math/matmul_helper.h
index af82037a7c465..e5095e0ea1382 100644
--- a/onnxruntime/core/providers/cpu/math/matmul_helper.h
+++ b/onnxruntime/core/providers/cpu/math/matmul_helper.h
@@ -29,9 +29,8 @@ class MatMulComputeHelper {
       M_ = left_shape.SizeToDimension(left_num_dims - 1);
       K_ = left_shape[left_num_dims - 1];
       N_ = right_shape[right_num_dims - 1];
-      std::vector<int64_t> output_dims = left_shape.GetDims();
-      output_dims[left_num_dims - 1] = N_;
-      output_shape_ = TensorShape(output_dims);
+      output_shape_ = left_shape;
+      output_shape_[left_num_dims - 1] = N_;
       output_offsets_ = {0};
       left_offsets_ = {0};
       right_offsets_ = {0};
diff --git a/onnxruntime/core/providers/cpu/math/softmax.cc b/onnxruntime/core/providers/cpu/math/softmax.cc
index 9242967901e46..542e20e79f79c 100644
--- a/onnxruntime/core/providers/cpu/math/softmax.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax.cc
@@ -4,6 +4,7 @@
 #include "core/providers/cpu/math/softmax.h"
 
 #include "core/framework/op_kernel.h"
+#include "core/framework/op_kernel_context_internal.h"
 #include "core/providers/common.h"
 #include "core/providers/cpu/math/softmax_shared.h"
 #include "core/util/math.h"
@@ -12,6 +13,9 @@ namespace onnxruntime {
 
 template <>
 Status Softmax<float>::Compute(OpKernelContext* ctx) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
   const auto* tensor_pointer = ctx->Input<Tensor>(0);
   if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
   const Tensor& X = *tensor_pointer;
@@ -34,7 +38,7 @@ Status Softmax<float>::Compute(OpKernelContext* ctx) const {
 
   const bool logarithmic = false;
   auto status = SoftmaxCPU(N, D, X.template Data<float>(), Ydata,
-                           scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data());
+                           scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data(), tp);
 
   return status;
 }
diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index 7dd3a10cfc598..18277f6b4137c 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -31,6 +31,7 @@
 #endif
 
 #include "core/providers/cpu/math/softmax_shared.h"
+
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 
@@ -46,7 +47,7 @@ common::Status SoftmaxCPU(const int64_t N,
                           float* scale,
                           const float* sum_multiplier,
                           bool logarithmic,
-                          float* rowmax) {
+                          float* rowmax, onnxruntime::concurrency::ThreadPool* tp) {
   // the Math functions SoftmaxCPU uses only support int32_t as input, so enforce that
   if (N * D > INT32_MAX || N > INT32_MAX || D > INT32_MAX) {
     std::ostringstream ss;
@@ -65,7 +66,7 @@ common::Status SoftmaxCPU(const int64_t N,
   // Put the intermediate result X - max(X) into Y by first copying X to Y, and then subtracting max from each entry
   gsl::copy(gsl::make_span(Xdata, nd), gsl::make_span(Ydata, nd));
 
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, nullptr);
+  math::Gemm<float>(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, tp);
 
   // Exponentiation
   math::Exp<float, CPUMathUtil>(nd, Ydata, Ydata, nullptr);
diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.h b/onnxruntime/core/providers/cpu/math/softmax_shared.h
index 3439b9717f051..26ffeb193fe4f 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.h
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.h
@@ -6,6 +6,9 @@
 #include "core/common/status.h"
 
 namespace onnxruntime {
+namespace concurrency {
+class ThreadPool;
+}
 /**
 Calculate Softmax using CPU memory.
 @param N Number of rows
@@ -18,5 +21,5 @@ Calculate Softmax using CPU memory.
 @param rowmax Storage for calculation of maximum in each row. Size must be >= N.
 */
 common::Status SoftmaxCPU(int64_t N, int64_t D, const float* Xdata, float* Ydata, float* scale,
-                          const float* sum_multiplier, bool logarithmic, float* rowmax);
+                          const float* sum_multiplier, bool logarithmic, float* rowmax, concurrency::ThreadPool* tp);
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc
index c3acbd02a62c5..c0091936704d8 100644
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
 /* Modifications Copyright (c) Microsoft. */
+#include "core/framework/op_kernel_context_internal.h"
 
 #include "core/providers/cpu/nn/conv.h"
 #include "core/framework/op_kernel_context_internal.h"
@@ -24,6 +25,8 @@ namespace onnxruntime {
 template <typename T>
 Status Conv<T>::Compute(OpKernelContext* context) const {
   size_t num_inputs = OpKernel::Node().InputDefs().size();
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
 
   const auto* X = context->Input<Tensor>(0);
   const auto* W = context->Input<Tensor>(1);
@@ -116,7 +119,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
             col_buffer_data,
             &CPUMathUtil::Instance());
       }
-      math::Gemm<T, CPUMathUtil>(
+      math::Gemm<T>(
           CblasNoTrans,
           CblasNoTrans,
           M / group_,
@@ -127,7 +130,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
           col_buffer_data,
           0,
           Ydata + group_id * Y_offset,
-          &CPUMathUtil::Instance());
+          tp);
     }
 
     if (B != nullptr) {
@@ -144,6 +147,9 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
 }
 
 Status Conv<float>::Compute(OpKernelContext* context) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
   size_t num_inputs = OpKernel::Node().InputDefs().size();
   const auto* X = context->Input<Tensor>(0);
   const auto* W = context->Input<Tensor>(1);
@@ -186,11 +192,6 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
   const size_t kernel_rank = kernel_shape.size();
 
   if (kernel_rank == 2 || kernel_rank == 3) {
-    // Get access to the internal threadpool
-    // Temporarily derive concurrency parameters without access to session state
-    auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
-    auto thread_pool = ctx_internal->GetOperatorThreadPool();
-
     MLAS_CONV_PARAMETERS Parameters;
     size_t WorkingBufferSize;
     MlasConvPrepare(&Parameters,
@@ -207,7 +208,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
                     static_cast<size_t>(M / group_),
                     &activation_,
                     &WorkingBufferSize,
-                    const_cast<concurrency::ThreadPool*>(thread_pool));
+                    tp);
 
     auto working_data = WorkingBufferSize > 0 ? alloc->Alloc(sizeof(float) * WorkingBufferSize) : nullptr;
     BufferUniquePtr working_buffer(working_data, BufferDeleter(alloc));
@@ -218,7 +219,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
              Bdata,
              static_cast<float*>(working_buffer.get()),
              Ydata,
-             const_cast<concurrency::ThreadPool*>(thread_pool));
+             tp);
   } else {
     const int64_t input_image_size = input_shape.Size();
     const int64_t output_image_size = output_shape.Size();
@@ -253,7 +254,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
             static_cast<int>(kernel_shape.size()),
             col_buffer_data,
             &CPUMathUtil::Instance());
-        math::Gemm<float, CPUMathUtil>(
+        math::Gemm<float>(
             CblasNoTrans,
             CblasNoTrans,
             M / group_,
@@ -264,7 +265,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
             col_buffer_data,
             0,
             Ydata + group_id * Y_offset,
-            &CPUMathUtil::Instance());
+            tp);
       }
 
       MlasActivation(&activation_, Ydata, Bdata, M, output_image_size, output_image_size);
diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
index 14f13ccd20198..9fd9cd1502147 100644
--- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
@@ -16,6 +16,8 @@
 /* Modifications Copyright (c) Microsoft. */
 
 #include "core/providers/cpu/nn/conv_transpose.h"
+#include "core/framework/op_kernel_context_internal.h"
+
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 
@@ -228,6 +230,9 @@ Status ConvTranspose<T>::Compute(OpKernelContext* context) const {
 
 template <typename T>
 Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
+
   size_t num_inputs = OpKernel::Node().InputDefs().size();
   Prepare p;
   bool has_bias = dynamic_padding ? num_inputs == 4 : num_inputs == 3;
@@ -254,7 +259,7 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
   for (auto image_id = 0; image_id < p.N; ++image_id) {
     for (int group_id = 0; group_id < group_; ++group_id) {
       // Weight term
-      math::Gemm<T, CPUMathUtil>(
+      math::Gemm<T>(
           CblasTrans,
           CblasNoTrans,
           kernel_dim,
@@ -265,7 +270,7 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
           Xdata + group_id * X_offset,
           0,
           col_buffer_data,
-          &CPUMathUtil::Instance());
+          tp);
 
       // Col2im
       math::Col2im<T, CPUMathUtil, StorageOrder::NCHW>(
diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc
index 367a9256a0c16..47bc8fc856bb3 100644
--- a/onnxruntime/core/providers/cpu/nn/pool.cc
+++ b/onnxruntime/core/providers/cpu/nn/pool.cc
@@ -190,7 +190,7 @@ Status PoolBase::Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const
   // Get access to the internal threadpool
   // Temporarily derive concurrency parameters without access to session state
   auto ctx_internal = static_cast<OpKernelContextInternal*>(context);
-  auto thread_pool = ctx_internal->GetOperatorThreadPool();
+  concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool();
 
   MlasPool(kind,
            pooling_dims,
diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h
index 43f81982dd3a9..606ac909f08f1 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_base.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_base.h
@@ -99,10 +99,13 @@ class LpPool {
 };
 
 class PoolBase {
+ private:
+  static bool IsGlobalPooling(const std::string& op_name) {
+    return op_name == "GlobalAveragePool" || op_name == "GlobalMaxPool" || op_name == "GlobalLpPool";
+  }
+
  protected:
-  PoolBase(const OpKernelInfo& info) {
-    op_name_ = info.GetKernelDef().OpName();
-    global_pooling_ = (op_name_ == "GlobalAveragePool" || op_name_ == "GlobalMaxPool" || op_name_ == "GlobalLpPool");
+  PoolBase(const OpKernelInfo& info) : op_name_(info.GetKernelDef().OpName()), global_pooling_(IsGlobalPooling(op_name_)) {
     int end;
     info.GetKernelDef().SinceVersion(&start_version_, &end);
 
@@ -256,8 +259,8 @@ class PoolBase {
   Status Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const;
 
  protected:
-  std::string op_name_;
-  bool global_pooling_{};
+  const std::string op_name_;
+  const bool global_pooling_;
   bool count_include_pad_{};
   int64_t storage_order_{0};  // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0.
   int64_t ceil_mode_{0};      // Introduced in MaxPool_10
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
index c5be268f59e2d..0dd13269bfacd 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc
@@ -1,5 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "core/platform/threadpool.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 // there's no way to use a raw pointer as the copy destination with std::copy_n
 // (which gsl::copy uses with span::data() which returns a raw pointer) with the 14.11 toolset
@@ -167,7 +169,8 @@ class UniDirectionalGru {
   UniDirectionalGru(AllocatorPtr allocator, int seq_length, int batch_size, int input_size, int hidden_size,
                     bool linear_before_reset, Direction direction, const gsl::span<const T>& bias,
                     const gsl::span<const T>& initial_hidden_state, const ActivationFuncs::Entry& activation_func_f,
-                    const ActivationFuncs::Entry& activation_func_g, float clip);
+                    const ActivationFuncs::Entry& activation_func_g, float clip,
+                    onnxruntime::concurrency::ThreadPool* ttp);
 
   void Compute(const gsl::span<const T>& inputs, const gsl::span<const int>& sequence_lengths, int num_directions,
                const gsl::span<const T>& input_weights, const gsl::span<const T>& recurrent_weights,
@@ -233,6 +236,8 @@ class UniDirectionalGru {
   deepcpu::GruOutputGateFuncPtr output_gate_{};
 
   void AllocateBuffers();
+
+  onnxruntime::concurrency::ThreadPool* ttp_;
 };
 }  // namespace detail
 
@@ -263,6 +268,9 @@ Status DeepCpuGruOp::Compute(OpKernelContext* context) const {
 
 template <typename T>
 Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(&context);
+  concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool();
+
   const Tensor& X = *context.Input<Tensor>(0);  // inputs. [seq_length, batch_size, input_size]
   const Tensor& W = *context.Input<Tensor>(1);  // weights. [num_directions, 3*hidden_size, input_size]
   const Tensor& R = *context.Input<Tensor>(2);  // recurrence weights. [num_directions, 3*hidden_size, hidden_size]
@@ -367,7 +375,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const {
                                     linear_before_reset_, Direction::kForward, bias_1, initial_hidden_1,
                                     activation_funcs_.Entries()[0],
                                     activation_funcs_.Entries()[1],
-                                    clip_);
+                                    clip_, thread_pool);
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
                output_1, hidden_output_1);
 
@@ -375,7 +383,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const {
                                     linear_before_reset_, Direction::kReverse, bias_2, initial_hidden_2,
                                     activation_funcs_.Entries()[2],
                                     activation_funcs_.Entries()[3],
-                                    clip_);
+                                    clip_, thread_pool);
     bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, recurrent_weights_2,
                output_2, hidden_output_2);
   } else {
@@ -383,7 +391,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const {
                                        linear_before_reset_, direction_, bias_1, initial_hidden_1,
                                        activation_funcs_.Entries()[0],
                                        activation_funcs_.Entries()[1],
-                                       clip_);
+                                       clip_, thread_pool);
     gru_p.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
                   output_1, hidden_output_1);
   }
@@ -412,7 +420,7 @@ UniDirectionalGru<T>::UniDirectionalGru(AllocatorPtr allocator,
                                         const gsl::span<const T>& initial_hidden_state,
                                         const ActivationFuncs::Entry& activation_func_f,
                                         const ActivationFuncs::Entry& activation_func_g,
-                                        const float clip)
+                                        const float clip, onnxruntime::concurrency::ThreadPool* ttp)
     : allocator_(allocator),
       seq_length_(seq_length),
       batch_size_(batch_size),
@@ -421,7 +429,8 @@ UniDirectionalGru<T>::UniDirectionalGru(AllocatorPtr allocator,
       linear_before_reset_(linear_before_reset),
       clip_(clip),
       direction_(direction),
-      use_bias_(!bias.empty()) {
+      use_bias_(!bias.empty()),
+      ttp_(ttp) {
   clip_with_bias_ptr_ = use_bias_ ? deepcpu::clip_add_bias : deepcpu::clip_ignore_bias;
 
   // setup activation function pointers and alpha/beta values to use with them
@@ -540,7 +549,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
               input_weights.cbegin(), input_weights.cend(),
               input_size_, beta,
               outputZRH_.begin(), outputZRH_.end(),
-              hidden_size_x3);
+              hidden_size_x3, ttp_);
 
   DumpMatrix("inputs with weights applied", outputZRH_.data(), seq_length_ * batch_size_ * 3, hidden_size_);
 
@@ -606,7 +615,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
                 recurrent_weightsZR.cbegin(), recurrent_weightsZR.cend(),
                 hidden_size_, beta,
                 outputZRH_.begin() + out_added_offset, outputZRH_.end(),
-                hidden_size_x3);
+                hidden_size_x3, ttp_);
 
     DumpMatrix("Ht-1 * R[zr] + Xt*(W[zr]^T)" + seqno_str,
                outputZRH_.data() + out_added_offset, batch_size_, hidden_size_x2, 0, hidden_size_x3);
@@ -622,7 +631,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
                   recurrent_weightsH.cbegin(), recurrent_weightsH.cend(),  // Rh^T
                   hidden_size_, beta,
                   linear_output_.begin(), linear_output_.end(),  // pre: Rbh, post:output
-                  hidden_size_);
+                  hidden_size_, ttp_);
 
       DumpMatrix("Ht-1 * (Rh^T) + Rbh " + seqno_str, linear_output_.data(), batch_size_, hidden_size_);
     }
@@ -693,7 +702,7 @@ void UniDirectionalGru<T>::Compute(const gsl::span<const T>& inputs_arg,
                   recurrent_weightsH.cbegin(), recurrent_weightsH.cend(),  // Rh^T
                   hidden_size_, beta,
                   out_H, outputZRH_.end(),
-                  hidden_size_x3);
+                  hidden_size_x3, ttp_);
     }
 
     DumpMatrix("Xt*(Wh^T) + (" + label + ")" + seqno_str, outputZRH_.data() + out_added_offset,
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
index 8f4e8236981f8..682dabd9262ca 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc
@@ -9,6 +9,9 @@
 #pragma warning(disable : 4996)
 #endif
 
+#include "core/platform/threadpool.h"
+#include "core/framework/op_kernel_context_internal.h"
+
 #include "core/providers/cpu/rnn/deep_cpu_lstm.h"
 
 #include "core/common/common.h"
@@ -193,7 +196,8 @@ class UniDirectionalLstm {
                      const gsl::span<const T>& initial_hidden_state, const gsl::span<const T>& initial_cell_state,
                      const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g,
                      const ActivationFuncs::Entry& activation_func_h, float clip,
-                     onnxruntime::concurrency::ThreadPool& ttp);
+                     concurrency::ThreadPool& lstm_tp_,
+                     concurrency::ThreadPool* mlas_tp_);
 
   void Compute(const gsl::span<const T>& inputs, const gsl::span<const int>& sequence_lengths, int num_directions,
                const gsl::span<const T>& input_weights, const gsl::span<const T>& recurrent_weights,
@@ -275,7 +279,8 @@ class UniDirectionalLstm {
   ActivationInfo<deepcpu::ActivationFuncPtr> activation_g_;
   ActivationInfo<deepcpu::LstmMergeGatesFuncPtr> activation_h_;
 
-  onnxruntime::concurrency::ThreadPool& ttp_;
+  concurrency::ThreadPool& lstm_tp_;
+  concurrency::ThreadPool* mlas_tp_;
 };
 
 }  // namespace detail
@@ -309,6 +314,9 @@ DeepCpuLstmOp::Compute(OpKernelContext* context) const {
 
 template <typename T>
 Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(&context);
+  concurrency::ThreadPool* mlas_thread_pool = ctx_internal->GetOperatorThreadPool();
+
   auto& logger = context.Logger();
 
   const Tensor& X = *context.Input<Tensor>(0);  // inputs. [seq_length, batch_size, input_size]
@@ -452,7 +460,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, ttp_);
+                                     clip_, lstm_tp_, mlas_thread_pool);
 
     detail::UniDirectionalLstm<T> bw(alloc, logger, seq_length, batch_size, input_size,
                                      hidden_size_, Direction::kReverse, input_forget_,
@@ -460,7 +468,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[3],
                                      activation_funcs_.Entries()[4],
                                      activation_funcs_.Entries()[5],
-                                     clip_, ttp_);
+                                     clip_, lstm_tp_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
                output_1, hidden_output_1, last_cell_1);
@@ -473,7 +481,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const {
                                      activation_funcs_.Entries()[0],
                                      activation_funcs_.Entries()[1],
                                      activation_funcs_.Entries()[2],
-                                     clip_, ttp_);
+                                     clip_, lstm_tp_, mlas_thread_pool);
 
     fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1,
                output_1, hidden_output_1, last_cell_1);
@@ -546,7 +554,8 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
                                           const ActivationFuncs::Entry& activation_func_g,
                                           const ActivationFuncs::Entry& activation_func_h,
                                           const float clip,
-                                          onnxruntime::concurrency::ThreadPool& ttp)
+                                          concurrency::ThreadPool& lstm_tp,
+                                          concurrency::ThreadPool* mlas_tp)
     : allocator_(allocator),
       logger_(logger),
       seq_length_(seq_length),
@@ -558,7 +567,8 @@ UniDirectionalLstm<T>::UniDirectionalLstm(AllocatorPtr allocator,
       clip_(clip),
       use_bias_(!bias.empty()),
       use_peepholes_(!peephole_weights.empty()),
-      ttp_(ttp) {
+      lstm_tp_(lstm_tp),
+      mlas_tp_(mlas_tp) {
   activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name),
                    activation_func_f.alpha,
                    activation_func_f.beta};
@@ -774,7 +784,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
               input_weights.cbegin(), input_weights.cend(),  // W[iofc]
               input_size_, beta,
               output_iofc_.begin(), output_iofc_.end(),
-              hidden_size_x4);
+              hidden_size_x4, mlas_tp_);
 
   DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4);
 
@@ -823,7 +833,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                     recurrent_weights.cbegin(), recurrent_weights.cend(),  // R[iofc]
                     hidden_size_, beta,
                     step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                    hidden_size_x4);
+                    hidden_size_x4, mlas_tp_);
 
         DumpMatrix("Xt*(W[iofc]^T) + Ht-t*R[iofc]" + row_str,
                    &*step_out_IOFC, local_fused_hidden_rows, hidden_size_x4);
@@ -874,7 +884,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
       }
     };
 
-    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, ttp_, logger_);
+    ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, lstm_tp_, logger_);
 
   } else {
     span_T_iter c_prev = batched_internal_state_prev_one_step.begin();
@@ -901,7 +911,7 @@ void UniDirectionalLstm<T>::Compute(const gsl::span<const T>& inputs_arg,
                   recurrent_weights.cbegin(), recurrent_weights.cend(),  // R[iofc]
                   hidden_size_, beta,
                   step_out_IOFC, output_iofc_.end(),  // input contains Xt*(W[iofc]^T)
-                  hidden_size_x4);
+                  hidden_size_x4, mlas_tp_);
 
       span_T_iter batched_output;
       span_T_iter batched_output_end;
diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
index 606dfbf5b190c..faf32e3a77a2f 100644
--- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
+++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h
@@ -82,8 +82,8 @@ class DeepCpuLstmOp final : public OpKernel {
   // across them. mutable due to this.
   // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation
   // cost on every call.
-  mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_LSTM",
-                                                    static_cast<int>(std::thread::hardware_concurrency())};
+  mutable onnxruntime::concurrency::ThreadPool lstm_tp_{"DEEPCPU_LSTM",
+                                                        static_cast<int>(std::thread::hardware_concurrency())};
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn.cc b/onnxruntime/core/providers/cpu/rnn/rnn.cc
index 4030d65a94d45..1576b5192da68 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn.cc
+++ b/onnxruntime/core/providers/cpu/rnn/rnn.cc
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "core/framework/op_kernel_context_internal.h"
 
 #include "core/providers/cpu/rnn/rnn.h"
 #include "core/providers/cpu/rnn/rnn_activation_functors.h"
@@ -99,6 +100,8 @@ using EigenMatrixMapRowMajor = Eigen::Map<
 template <>
 Status RNN<float>::Compute(OpKernelContext* ctx) const {
   using namespace rnn::detail;
+  auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+  concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool();
 
   // inputs
   const Tensor& X = *ctx->Input<Tensor>(0);
@@ -160,7 +163,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
     }
 
     // X * W[direction]^t + B
-    math::Gemm<float, CPUMathUtil>(
+    math::Gemm<float>(
         CblasNoTrans,
         CblasTrans,
         static_cast<int>(seq_length * batch_size),
@@ -171,7 +174,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
         W.template Data<float>() + direction * hidden_size_ * input_size,
         1,
         x_matmul_w_buffer_data,
-        &CPUMathUtil::Instance());
+        tp);
 
     for (int64_t t = 0; t < seq_length; t++) {
       int64_t time_step = isReverse ? (seq_length - t - 1) : t;
@@ -192,7 +195,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
 
       if (h_prev != nullptr) {
         // H_t_1 * R[direction]^t
-        math::Gemm<float, CPUMathUtil>(
+        math::Gemm<float>(
             CblasNoTrans,
             CblasTrans,
             static_cast<int>(batch_size),
@@ -203,7 +206,7 @@ Status RNN<float>::Compute(OpKernelContext* ctx) const {
             R.template Data<float>() + direction * hidden_size_ * hidden_size_,
             0,
             Y_buffer_data_current_frame,
-            &CPUMathUtil::Instance());
+            tp);
       } else {
         math::Set<float, CPUMathUtil>(batch_size * hidden_size_, 0, Y_buffer_data_current_frame, &CPUMathUtil::Instance());
       }
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
index 2e3e5f88d72ec..f1038e63a350e 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -159,7 +159,7 @@ void ComputeGemm(const int M,
                  const float beta,
                  TSpanCIter C,
                  TSpanCIter C_end,
-                 const int ldc) {
+                 const int ldc, concurrency::ThreadPool* tp) {
   // validate all the inputs
   // need to use the lda/ldb/ldc strides which should be >= the columns for the span
   ORT_ENFORCE(lda >= K && ldb >= K && ldc >= N);
@@ -167,12 +167,12 @@ void ComputeGemm(const int M,
   ORT_ENFORCE(B + (N * ldb - (ldb - K)) <= B_end);
   ORT_ENFORCE(C + (M * ldc - (ldc - N)) <= C_end);
 
-  ::onnxruntime::math::GemmEx<float, CPUMathUtil>(
+  ::onnxruntime::math::GemmEx<float>(
       CblasNoTrans, CblasTrans,
       M, N, K, alpha,
       &*A, lda,
       &*B, ldb, beta,
-      &*C, ldc, &CPUMathUtil::Instance());
+      &*C, ldc, tp);
 }
 
 // helper to convert a span to a raw pointer
diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
index c326d25ef17a0..0f8da8eaff2a6 100644
--- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc
@@ -10,7 +10,7 @@
 #include "Eigen/src/Core/arch/GPU/Half.h"
 #include "core/common/common.h"
 
-#if defined(USE_MLAS) && defined(_M_AMD64)
+#if defined(_M_AMD64)
 #include "core/mlas/inc/mlas.h"
 #endif
 
@@ -40,7 +40,7 @@ inline void CastData<MLFloat16, float>(const Tensor* in, Tensor* out, const Tens
   auto out_data = out->template MutableData<float>();
   auto in_data = in->template Data<MLFloat16>();
   auto shape_size = shape.Size();
-#if defined(USE_MLAS) && defined(_M_AMD64)
+#if defined(_M_AMD64)
   MlasConvertHalfToFloatBuffer(&in_data[0].val, out_data, shape_size);
 #else
   auto in_vector = ConstEigenVectorMap<Eigen::half>(static_cast<const Eigen::half*>(static_cast<const void*>(in_data)), shape_size);
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index d1f9041c9253f..539d92bd4323a 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -87,6 +87,7 @@ Internal copy node
 }
 
 Environment::~Environment() {
+  delete tp_;  // NOTE(review): presumably the environment's thread pool; confirm tp_ is owned here
   ::google::protobuf::ShutdownProtobufLibrary();
 }
 
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 678163f92c1fa..1d7623d0165a6 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -90,15 +90,26 @@ inline std::basic_string<T> GetCurrentTimeString() {
   OrtStrftime<T>(time_str, sizeof(time_str), GetDateFormatString<T>(), &local_tm);
   return std::basic_string<T>(time_str);
 }
+
+// Returns a caller-owned "SESSION" pool, or nullptr when size == 1. For size <= 0 it uses
+// hardware_concurrency() - 1 threads (min 1), computed as int so a reported 0 cannot wrap.
+concurrency::ThreadPool* CreateThreadPool(int size) {
+  if (size == 1) return nullptr;
+  if (size <= 0) size = std::max<int>(static_cast<int>(std::thread::hardware_concurrency()) - 1, 1);
+  return new concurrency::ThreadPool("SESSION", size);
+}
+
 }  // namespace
 
 InferenceSession::InferenceSession(const SessionOptions& session_options,
                                    logging::LoggingManager* logging_manager)
     : session_options_{session_options},
-      graph_transformation_mgr_{session_options_.max_num_graph_transformation_steps},
+      graph_transformation_mgr_{session_options.max_num_graph_transformation_steps},
       logging_manager_{logging_manager},
+      thread_pool_(CreateThreadPool(session_options.session_thread_pool_size)),
       session_state_(execution_providers_,
-                     session_options.enable_mem_pattern && session_options.enable_sequential_execution),
+                     session_options.enable_mem_pattern && session_options.enable_sequential_execution,
+                     thread_pool_.get()),
       insert_cast_transformer_{"CastFloat16Transformer"} {
   ORT_ENFORCE(Environment::IsInitialized(),
               "Environment must be initialized before creating an InferenceSession.");
@@ -106,18 +117,6 @@ InferenceSession::InferenceSession(const SessionOptions& session_options,
   InitLogger(logging_manager);
 
   session_state_.SetDataTransferMgr(&data_transfer_mgr_);
-
-  // The threadpool is currently evolving.  We will always create a per session threadpool.
-  // Beyond this, we will create a global thread pool to share across sessions.
-  {
-    int pool_size = session_options_.session_thread_pool_size <= 0
-                        ? std::thread::hardware_concurrency() / 2
-                        : session_options_.session_thread_pool_size;
-
-    thread_pool_ = std::make_unique<onnxruntime::concurrency::ThreadPool>("SESSION", pool_size);
-  }
-
-  session_state_.SetThreadPool(thread_pool_.get());
   session_profiler_.Initialize(session_logger_);
   session_state_.SetProfiler(session_profiler_);
   if (session_options.enable_profiling) {
@@ -398,11 +397,9 @@ common::Status InferenceSession::CreateSubgraphSessionState(Graph& graph, Sessio
       ORT_ENFORCE(subgraph, "Main Graph instance should have populated all subgraphs when being resolved.");
 
       auto subgraph_session_state =
-          std::make_unique<SessionState>(execution_providers_, session_state.GetEnableMemoryPattern());
+          std::make_unique<SessionState>(execution_providers_, session_state.GetEnableMemoryPattern(), session_state.GetThreadPool());
       subgraph_session_state->SetProfiler(session_profiler_);
       subgraph_session_state->SetLogger(*session_logger_);
-      // Pass threadpool to subgraph
-      subgraph_session_state->SetThreadPool(session_state.GetThreadPool());
       // Pass data transfer manager to subgraph.
       subgraph_session_state->SetDataTransferMgr(&session_state.GetDataTransferMgr());
       // Pass fused function manager to subgraph
@@ -534,7 +531,7 @@ common::Status InferenceSession::Initialize() {
         ORT_RETURN_IF_ERROR(Model::Save(*model_, session_options_.optimized_model_filepath));
       } else {
         LOGS(*session_logger_, WARNING) << "Serializing Optimized ONNX model with Graph Optimization"
-                                        " level greater than 2 is not supported.";
+                                           " level greater than 2 is not supported.";
       }
     }
 
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 219002507b8ec..80f550cfdccb2 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -67,9 +67,9 @@ struct OrtEnv {
   auto v = reinterpret_cast<const ::OrtValue*>(value); \
   auto& tensor = v->Get<onnxruntime::Tensor>();
 
-#define TENSOR_READWRITE_API_BEGIN               \
-  API_IMPL_BEGIN                                 \
-  auto v = reinterpret_cast<::OrtValue*>(value); \
+#define TENSOR_READWRITE_API_BEGIN \
+  API_IMPL_BEGIN                   \
+  auto v = (value);                \
   auto tensor = v->GetMutable<onnxruntime::Tensor>();
 
 class LoggingWrapper : public ISink {
@@ -416,7 +416,7 @@ ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ cons
 }
 
 ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess,
-                    _In_ const OrtRunOptions* run_options,
+                    _In_opt_ const OrtRunOptions* run_options,
                     _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len,
                     _In_ const char* const* output_names1, size_t output_names_len, _Outptr_ OrtValue** output) {
   API_IMPL_BEGIN
@@ -449,7 +449,7 @@ ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess,
   std::vector<OrtValue> fetches(output_names_len);
   for (size_t i = 0; i != output_names_len; ++i) {
     if (output[i] != nullptr) {
-      ::OrtValue& value = *reinterpret_cast<::OrtValue*>(output[i]);
+      ::OrtValue& value = *(output[i]);
       if (value.Fence())
         value.Fence()->BeforeUsingAsOutput(onnxruntime::kCpuExecutionProvider, queue_id);
       fetches[i] = value;
@@ -520,9 +520,9 @@ ORT_API_STATUS_IMPL(OrtGetStringTensorContent, _In_ const OrtValue* value,
     if ((!_status.IsOK())) return ToOrtStatus(_status); \
   } while (0)
 
-#define DEFINE_RELEASE_ORT_OBJECT_FUNCTION(INPUT_TYPE, REAL_TYPE) \
-  ORT_API(void, OrtRelease##INPUT_TYPE, Ort##INPUT_TYPE* value) { \
-    delete reinterpret_cast<REAL_TYPE*>(value);                   \
+#define DEFINE_RELEASE_ORT_OBJECT_FUNCTION(INPUT_TYPE, REAL_TYPE)                 \
+  ORT_API(void, OrtRelease##INPUT_TYPE, _Frees_ptr_opt_ Ort##INPUT_TYPE* value) { \
+    delete reinterpret_cast<REAL_TYPE*>(value);                                   \
   }
 
 ORT_API_STATUS_IMPL(OrtSessionGetInputCount, _In_ const OrtSession* sess, _Out_ size_t* out) {
diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h
index 21593c69326ee..7e200b7b6a7ad 100644
--- a/onnxruntime/core/util/math.h
+++ b/onnxruntime/core/util/math.h
@@ -35,6 +35,9 @@ extern "C" {
 #include "core/framework/tensor.h"
 
 namespace onnxruntime {
+namespace concurrency {
+class ThreadPool;
+}
 
 enum StorageOrder {
   UNKNOWN = 0,
@@ -74,7 +77,7 @@ void MatMul(
     int K,
     const T* A,
     const T* B,
-    T* C);
+    T* C, concurrency::ThreadPool* threadpool);
 
 // Decaf gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
@@ -90,7 +93,7 @@ void Gemm(
     const T* B,
     float beta,
     T* C,
-    Provider* provider);
+    Provider*);
 
 // We also provide a gemm that has explicit lda, ldb and ldc specified.
 // In most cases you probably want to use the function above, though.
@@ -109,7 +112,7 @@ void GemmEx(
     T beta,
     T* C,
     int ldc,
-    Provider* provider);
+    Provider*);
 
 // Gemv always takes in a M*N matrix A, and depending on whether we set TransA
 // to Trans, the output is:
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index 822d58ec63140..9cf66e0037852 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -20,16 +20,17 @@
 #include "core/util/math_cpuonly.h"
 #include "core/mlas/inc/mlas.h"
 #include "Eigen/src/Core/arch/GPU/Half.h"
+using onnxruntime::concurrency::ThreadPool;
 
 namespace onnxruntime {
 namespace math {
 
 // MatMul implementation purely based on Eigen.
-#define EIGEN_MATMUL_FUNCTION(T)                                                         \
-  template <>                                                                            \
-  void MatMul<T>(int M, int N, int K, const T* A, const T* B, T* C) {                    \
-    auto C_mat = EigenMatrixMap<T>(C, N, M);                                             \
-    C_mat.noalias() = ConstEigenMatrixMap<T>(B, N, K) * ConstEigenMatrixMap<T>(A, K, M); \
+#define EIGEN_MATMUL_FUNCTION(T)                                                                \
+  template <>                                                                                   \
+  void MatMul<T>(int M, int N, int K, const T* A, const T* B, T* C, concurrency::ThreadPool*) { \
+    auto C_mat = EigenMatrixMap<T>(C, N, M);                                                    \
+    C_mat.noalias() = ConstEigenMatrixMap<T>(B, N, K) * ConstEigenMatrixMap<T>(A, K, M);        \
   }
 
 EIGEN_MATMUL_FUNCTION(int32_t)
@@ -44,7 +45,7 @@ EIGEN_MATMUL_FUNCTION(uint64_t)
 // CBLAS call or the Eigen implementation.
 ////////////////////////////////////////////////////////////////////////////////
 // when USE_MKLML is defined, use cblas APIs for MKLML
-#if defined(USE_EIGEN_FOR_BLAS) && !defined(USE_MKLML_FOR_BLAS)
+#if !defined(USE_MKLML_FOR_BLAS)
 
 // Caffe2 gemm provides a simpler interface to the gemm functions, with the
 // limitation that the data has to be contiguous in memory.
@@ -62,28 +63,26 @@ EIGEN_MATMUL_FUNCTION(uint64_t)
 // (transpose) if the argument TransA or TransB is set to CblasNoTrans or
 // CblasTrans, respectively, for each of A and B.
 template <>
-void Gemm<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M,
-                              const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta,
-                              float* C, CPUMathUtil* /*provider*/) {
+void Gemm<float, ThreadPool>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M,
+                             const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta,
+                             float* C, ThreadPool* threadpool) {  // lda/ldb are derived below; ldc is N
   int lda = static_cast<int>((TransA == CblasNoTrans) ? K : M);
   int ldb = static_cast<int>((TransB == CblasNoTrans) ? N : K);
-  // TODO: Make this use the operator threadpool
-  MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, nullptr);
+  MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, threadpool);  // pool forwarded to MLAS
 }
 
 template <>
-void MatMul<float>(int M, int N, int K, const float* A, const float* B, float* C) {
-  // TODO: Make this use the operator threadpool
-  MlasSgemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, nullptr);
+void MatMul<float>(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool* threadpool) {
+  MlasSgemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, threadpool);  // alpha 1, beta 0
 }
 
 EIGEN_MATMUL_FUNCTION(double)
 
 template <>
-void GemmEx<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K,
-                                float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C,
-                                int ldc, CPUMathUtil*) {
-  MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nullptr);
+void GemmEx<float, ThreadPool>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K,
+                               float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C,
+                               int ldc, ThreadPool* threadpool) {  // GEMM with caller-supplied lda/ldb/ldc
+  MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool);  // all args forwarded as-is
 }
 
 template <>
@@ -125,12 +124,12 @@ void Gemv<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, int M, int N, float
 SPECIALIZED_AXPY(float)
 #undef SPECIALIZED_AXPY
 
-#else  // USE_EIGEN_FOR_BLAS
+#else
 
 template <>
-void Gemm<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M,
-                              const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta,
-                              float* C, CPUMathUtil* /*context*/) {
+void Gemm<float, ThreadPool>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M,
+                             const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta,
+                             float* C, ThreadPool* /*context*/) {
   int lda = gsl::narrow_cast<int>((TransA == CblasNoTrans) ? K : M);
   int ldb = gsl::narrow_cast<int>((TransB == CblasNoTrans) ? N : K);
   cblas_sgemm(CblasRowMajor, TransA, TransB,
@@ -142,19 +141,19 @@ void Gemm<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOS
 }
 
 template <>
-void MatMul<float>(int M, int N, int K, const float* A, const float* B, float* C) {
+void MatMul<float>(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool*) {  // pool unused
   cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N);
 }
 
 template <>
-void MatMul<double>(int M, int N, int K, const double* A, const double* B, double* C) {
+void MatMul<double>(int M, int N, int K, const double* A, const double* B, double* C, ThreadPool*) {  // pool unused
   cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N);
 }
 
 template <>
-void GemmEx<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K,
-                                float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C,
-                                int ldc, CPUMathUtil* /*context*/) {
+void GemmEx<float, ThreadPool>(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K,
+                               float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C,
+                               int ldc, ThreadPool* /*context*/) {  // pool unused: cblas_sgemm takes no pool argument
   cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
               beta, C, ldc);
 }
@@ -177,7 +176,7 @@ void Gemv<float, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, int M, int N, float
 CAFFE2_SPECIALIZED_AXPY(float, s)
 #undef CAFFE2_SPECIALIZED_AXPY
 
-#endif  // USE_EIGEN_FOR_BLAS
+#endif
 
 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr)                  \
   template <>                                                              \
diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc
index c555c9ba21900..e51cf53a8235a 100644
--- a/onnxruntime/test/framework/allocation_planner_test.cc
+++ b/onnxruntime/test/framework/allocation_planner_test.cc
@@ -155,12 +155,13 @@ class PlannerTest : public ::testing::Test {
   std::vector<std::unique_ptr<OpKernelInfo>> op_kernel_infos_;
   std::vector<std::pair<onnxruntime::Node*, KernelDef&>> kernel_bindings_;
   ExecutionProviders execution_providers_;
+  concurrency::ThreadPool tp_;
   SessionState state_;
   ShapeMap shape_map_;
   std::unique_ptr<SequentialExecutionPlan> plan_;
 
  public:
-  PlannerTest() : model_("test"), graph_{model_.MainGraph()}, state_{execution_providers_, false} {
+  PlannerTest() : model_("test"), graph_(model_.MainGraph()), tp_("test", 1), state_(execution_providers_, false, &tp_) {
     std_kernel_ = KernelDefBuilder().SetName("Transpose").Build();
     in_place_kernel_ = KernelDefBuilder().SetName("Clip").MayInplace(0, 0).Build();
     CPUExecutionProviderInfo epi;
diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc
index 48e2ce8c3b0e6..62f87ebd073a1 100644
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@@ -36,7 +36,12 @@ std::unique_ptr<IExecutionProvider> CreateCPUExecutionProvider() {
   return std::make_unique<CPUExecutionProvider>(info);
 }
 
-TEST(ExecutionFrameTest, TensorAllocationTest) {
+class ExecutionFrameTest : public ::testing::Test {  // fixture for the TEST_F cases in this file
+ protected:
+  concurrency::ThreadPool tp_{"test", 1};  // pool named "test", size 1; &tp_ is passed to each SessionState
+};
+
+TEST_F(ExecutionFrameTest, TensorAllocationTest) {
   onnxruntime::Model model("test");
   onnxruntime::Graph& graph = model.MainGraph();
   TypeProto tensor_float;
@@ -57,7 +62,7 @@ TEST(ExecutionFrameTest, TensorAllocationTest) {
   status = kernel_registry_manager.RegisterKernels(execution_providers);
   EXPECT_TRUE(status.IsOK()) << status.ErrorMessage();
 
-  SessionState state{execution_providers, true};
+  SessionState state{execution_providers, true, &tp_};
   state.SetGraphViewer(std::make_unique<GraphViewer>(graph));
 
   OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()};
@@ -111,7 +116,7 @@ TEST(ExecutionFrameTest, TensorAllocationTest) {
   EXPECT_EQ(tensor2->template Data<float>(), p_tensor->template Data<float>());
 }
 
-TEST(ExecutionFrameTest, FeedInDataTest) {
+TEST_F(ExecutionFrameTest, FeedInDataTest) {
   onnxruntime::Model model("test");
   onnxruntime::Graph& graph = model.MainGraph();
   TypeProto tensor_float;
@@ -140,7 +145,7 @@ TEST(ExecutionFrameTest, FeedInDataTest) {
   execution_providers.Add(xp_typ, std::move(cpu_xp));
   EXPECT_TRUE(kernel_registry_manager.RegisterKernels(execution_providers).IsOK());
 
-  SessionState state{execution_providers, true};
+  SessionState state{execution_providers, true, &tp_};
   state.SetGraphViewer(std::make_unique<GraphViewer>(graph));
 
   OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()};
@@ -160,7 +165,7 @@ TEST(ExecutionFrameTest, FeedInDataTest) {
   EXPECT_EQ(p_tensor_arg_0->MutableData<float>(), value.GetMutable<Tensor>()->MutableData<float>());
 }
 
-TEST(ExecutionFrameTest, MemPatternTest) {
+TEST_F(ExecutionFrameTest, MemPatternTest) {
   auto cpu_xp = CreateCPUExecutionProvider();
   auto xp_type = cpu_xp->Type();
   std::unordered_map<std::string, int> domain_to_version;
@@ -192,7 +197,7 @@ TEST(ExecutionFrameTest, MemPatternTest) {
   execution_providers.Add(xp_type, std::move(cpu_xp));
   kernel_registry_manager.RegisterKernels(execution_providers);
   //1. prepare input
-  SessionState state{execution_providers, true};
+  SessionState state{execution_providers, true, &tp_};
   state.SetGraphViewer(std::make_unique<GraphViewer>(graph));
 
   OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()};
@@ -264,7 +269,7 @@ TEST(ExecutionFrameTest, MemPatternTest) {
   EXPECT_EQ(p->GetBlock(4)->offset_, 64);
 }
 
-TEST(ExecutionFrameTest, BadModelInvalidDimParamUsage) {
+TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
   // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs.
   // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we
   // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out.
diff --git a/onnxruntime/test/framework/math_test.cc b/onnxruntime/test/framework/math_test.cc
index 97104647334d4..2c254bc3cc823 100644
--- a/onnxruntime/test/framework/math_test.cc
+++ b/onnxruntime/test/framework/math_test.cc
@@ -17,12 +17,24 @@
 
 #include "core/util/math.h"
 #include <gtest/gtest.h>
+#include "core/platform/threadpool.h"
 #include "core/util/math_cpuonly.h"
 namespace onnxruntime {
 
 #define VECTOR_HEAD(x) x.size() > 0 ? &x[0] : NULL
 
-TEST(MathTest, GemmNoTransNoTrans) {
+// The test parameter is the thread pool size; a size of 1 runs Gemm without a thread pool (nullptr).
+class MathGemmTest : public testing::TestWithParam<int> {
+ protected:
+  static concurrency::ThreadPool* CreateThreadPool(int size) {
+    if (size == 1)
+      return nullptr;
+    return new concurrency::ThreadPool("test", size);
+  }
+  std::unique_ptr<concurrency::ThreadPool> tp{CreateThreadPool(GetParam())};
+};
+
+TEST_P(MathGemmTest, GemmNoTransNoTrans) {
   auto& provider = CPUMathUtil::Instance();
   std::vector<float> X(50);  // 5 * 10
   std::vector<float> W(60);  // 10 * 6
@@ -40,34 +52,35 @@ TEST(MathTest, GemmNoTransNoTrans) {
   const float kOne = 1.0;
   const float kPointFive = 0.5;
   const float kZero = 0.0;
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y),
-                                 &provider);
+  math::Gemm<float>(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y),
+                    tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 10) << i;
   }
   // Test Accumulate
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive,
-                                 VECTOR_HEAD(Y), &provider);
+  math::Gemm<float>(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive,
+                    VECTOR_HEAD(Y), tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 15) << i;
   }
   // Test Accumulate
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasNoTrans, 5, 6, 10,
-                                 kPointFive,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y),
-                                 &provider);
+  math::Gemm<float>(CblasNoTrans, CblasNoTrans, 5, 6, 10,
+                    kPointFive,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y),
+                    tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 20) << i;
   }
 }
 
-TEST(MathTest, GemmNoTransTrans) {
+TEST_P(MathGemmTest, GemmNoTransTrans) {
   auto& provider = CPUMathUtil::Instance();
+
   std::vector<float> X(50);  // 5 * 10
   std::vector<float> W(60);  // 10 * 6
   std::vector<float> Y(30);  // 5 * 6
@@ -84,30 +97,33 @@ TEST(MathTest, GemmNoTransTrans) {
   const float kOne = 1.0;
   const float kPointFive = 0.5;
   const float kZero = 0.0;
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasTrans, 5, 6, 10, kOne,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y),
-                                 &provider);
+  math::Gemm<float>(CblasNoTrans, CblasTrans, 5, 6, 10, kOne,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y),
+                    tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 10) << i;
   }
   // Test Accumulate
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasTrans, 5, 6, 10, kOne,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive,
-                                 VECTOR_HEAD(Y), &provider);
+  math::Gemm<float>(CblasNoTrans, CblasTrans, 5, 6, 10, kOne,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive,
+                    VECTOR_HEAD(Y), tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 15) << i;
   }
-  math::Gemm<float, CPUMathUtil>(CblasNoTrans, CblasTrans, 5, 6, 10, kPointFive,
-                                 VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y),
-                                 &provider);
+  math::Gemm<float>(CblasNoTrans, CblasTrans, 5, 6, 10, kPointFive,
+                    VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y),
+                    tp.get());
   EXPECT_EQ(Y.size(), 30);
   for (size_t i = 0; i < Y.size(); ++i) {
     EXPECT_EQ(Y[i], 20) << i;
   }
 }
 
+INSTANTIATE_TEST_CASE_P(MathGemmTests, MathGemmTest,
+                        testing::Values(1, 4));
+
 TEST(MathTest, GemvNoTrans) {
   auto& provider = CPUMathUtil::Instance();
   std::vector<float> A(50);  // 5 * 10
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index 6064b74630897..dec86627b06fe 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -35,11 +35,12 @@ class TestOpKernel : public OpKernel {
 };
 
 TEST(SessionStateTest, AddGetKernelTest) {
+  concurrency::ThreadPool tp{"test", 1};
   ONNX_OPERATOR_SCHEMA(Variable)
       .SetDoc("Input variable.")
       .Output(0, "output_1", "docstr for output_1.", "tensor(int32)");
   ExecutionProviders execution_providers;
-  SessionState s{execution_providers, true};
+  SessionState s{execution_providers, true, &tp};
 
   onnxruntime::Model model("graph_1");
   auto& graph = model.MainGraph();
@@ -70,68 +71,80 @@ TEST(SessionStateTest, AddGetKernelTest) {
   EXPECT_EQ(orig_num_outputs, test_kernel->Node().OutputDefs().size());
 }
 
+namespace {
+class TestParam {
+ public:
+  int ir_version;
+  bool enable_mem_pattern;
+};
+TestParam param_list[] = {{3, true}, {4, true}, {3, false}, {4, false}};
+}  // namespace
+class SessionStateTestP : public testing::TestWithParam<TestParam> {
+};
 // Test that we separate out constant and non-constant initializers correctly
-TEST(SessionStateTest, TestInitializerProcessing) {
-  std::vector<int> ir_versions = {3, 4};
-  for (auto ir_version : ir_versions) {
-    std::string model_path = "testdata/optional_inputs_ir" + std::to_string(ir_version) + ".onnx";
-    Status status;
-    std::shared_ptr<Model> model;
-    ASSERT_TRUE((status = Model::Load(model_path, model)).IsOK()) << status;
-    Graph& graph = model->MainGraph();
-    // take a copy as this gets cleared during session state initialization
-    InitializedTensorSet initializers = graph.GetAllInitializedTensors();
-
-    const bool enable_mem_pattern = false;
-    ExecutionProviders execution_providers;
-    CPUExecutionProviderInfo epi{false};
-    status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    KernelRegistryManager krm;
-    status = krm.RegisterKernels(execution_providers);
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    SessionState session_state(execution_providers, enable_mem_pattern);
-    SessionStateInitializer session_initializer(enable_mem_pattern, ToWideString(model_path), graph,
-                                                session_state, execution_providers, krm);
-
-    GraphPartitioner partitioner(krm, execution_providers);
-    status = partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr());
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    status = session_initializer.CreatePlan(nullptr, nullptr, true);
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    status = session_initializer.InitializeAndSave(nullptr);
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    const auto& initialized_tensors = session_state.GetInitializedTensors();
-    const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors();
-
-    ASSERT_EQ(initializers.size(), initialized_tensors.size())
-        << "SessionState should have an entry for all initializers in Graph.";
-
-    if (ir_version < 4) {
-      ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size())
-          << "All initializers should be considered constant if IR version < 4.";
-    } else {
-      const auto& name_to_idx = session_state.GetOrtValueNameIdxMap();
-
-      for (auto entry : initializers) {
-        int idx;
-        name_to_idx.GetIdx(entry.first, idx);
-
-        bool found = initialized_tensors.find(idx) != initialized_tensors.cend();
-        ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors";
-
-        if (graph_utils::IsConstantInitializer(graph, entry.first, false)) {
-          found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend();
-          ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors";
-        }
+TEST_P(SessionStateTestP, TestInitializerProcessing) {
+  const TestParam& param = GetParam();
+  concurrency::ThreadPool tp{"test", 1};
+
+  std::string model_path = "testdata/optional_inputs_ir" + std::to_string(param.ir_version) + ".onnx";
+  Status status;
+  std::shared_ptr<Model> model;
+  ASSERT_TRUE((status = Model::Load(model_path, model)).IsOK()) << status;
+  Graph& graph = model->MainGraph();
+  // take a copy as this gets cleared during session state initialization
+  InitializedTensorSet initializers = graph.GetAllInitializedTensors();
+
+  ExecutionProviders execution_providers;
+  CPUExecutionProviderInfo epi{false};
+  status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
+  ASSERT_TRUE(status.IsOK()) << status;
+
+  KernelRegistryManager krm;
+  status = krm.RegisterKernels(execution_providers);
+  ASSERT_TRUE(status.IsOK()) << status;
+
+  SessionState session_state(execution_providers, param.enable_mem_pattern, &tp);
+  SessionStateInitializer session_initializer(param.enable_mem_pattern, ToWideString(model_path), graph,
+                                              session_state, execution_providers, krm);
+
+  GraphPartitioner partitioner(krm, execution_providers);
+  status = partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr());
+  ASSERT_TRUE(status.IsOK()) << status;
+
+  status = session_initializer.CreatePlan(nullptr, nullptr, true);
+  ASSERT_TRUE(status.IsOK()) << status;
+
+  status = session_initializer.InitializeAndSave(nullptr);
+  ASSERT_TRUE(status.IsOK()) << status;
+
+  const auto& initialized_tensors = session_state.GetInitializedTensors();
+  const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors();
+
+  ASSERT_EQ(initializers.size(), initialized_tensors.size())
+      << "SessionState should have an entry for all initializers in Graph.";
+
+  if (param.ir_version < 4) {
+    ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size())
+        << "All initializers should be considered constant if IR version < 4.";
+  } else {
+    const auto& name_to_idx = session_state.GetOrtValueNameIdxMap();
+
+    for (auto entry : initializers) {
+      int idx;
+      name_to_idx.GetIdx(entry.first, idx);
+
+      bool found = initialized_tensors.find(idx) != initialized_tensors.cend();
+      ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors";
+
+      if (graph_utils::IsConstantInitializer(graph, entry.first, false)) {
+        found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend();
+        ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors";
       }
     }
   }
 }
+
+INSTANTIATE_TEST_CASE_P(SessionStateTests, SessionStateTestP,
+                        testing::ValuesIn(param_list));
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/mlas/unittest.cpp b/onnxruntime/test/mlas/unittest.cpp
index 36c96c819c87d..2b999f3b97d95 100644
--- a/onnxruntime/test/mlas/unittest.cpp
+++ b/onnxruntime/test/mlas/unittest.cpp
@@ -26,11 +26,16 @@ Module Name:
 #else
 #include <sys/mman.h>
 #endif
+#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL)
+#include "core/platform/threadpool.h"
+#endif
 
 #if !defined(_countof)
 #define _countof(_Array) (sizeof(_Array) / sizeof(_Array[0]))
 #endif
 
+MLAS_THREADPOOL* threadpool = nullptr;
+
 class MatrixGuardBuffer
 {
 public:
@@ -225,7 +230,7 @@ class MlasSgemmTest : public MlasTestBase
         std::fill_n(C, M * N, -0.5f);
         std::fill_n(CReference, M * N, -0.5f);
 
-        MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nullptr);
+        MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool);
         ReferenceSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, CReference, ldc);
 
         for (size_t f = 0; f < M * N; f++) {
@@ -667,7 +672,7 @@ class MlasConv2DTest : public MlasTestBase
                 }
 
                 MlasSgemm(CblasNoTrans, CblasNoTrans, FilterCount, OutputSize, K, 1.0f,
-                    filter, K, Im2Col, OutputSize, 0.0f, Output, OutputSize, nullptr);
+                    filter, K, Im2Col, OutputSize, 0.0f, Output, OutputSize, threadpool);
 
                 //
                 // Apply the bias.
@@ -1072,7 +1077,7 @@ class MlasPool2DTest : public MlasTestBase
         float* Output
         )
     {
-        MlasPool(PoolingKind, 2, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr);
+        MlasPool(PoolingKind, 2, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool);
     }
 
     void
@@ -1417,7 +1422,7 @@ class MlasPool3DTest : public MlasTestBase
         float* Output = BufferOutput.GetBuffer(OutputBufferElements);
         float* OutputReference = BufferOutputReference.GetBuffer(OutputBufferElements);
 
-        MlasPool(MlasMaximumPooling, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr);
+        MlasPool(MlasMaximumPooling, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool);
         ReferenceMaximumPool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference);
 
         if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) {
@@ -1425,7 +1430,7 @@ class MlasPool3DTest : public MlasTestBase
                 InputChannels, InputDepth, InputHeight, InputWidth, KernelDepth, KernelHeight, KernelWidth);
         }
 
-        MlasPool(MlasAveragePoolingExcludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr);
+        MlasPool(MlasAveragePoolingExcludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool);
         ReferenceAveragePool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference, false);
 
         if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) {
@@ -1433,7 +1438,7 @@ class MlasPool3DTest : public MlasTestBase
                 InputChannels, InputDepth, InputHeight, InputWidth, KernelDepth, KernelHeight, KernelWidth);
         }
 
-        MlasPool(MlasAveragePoolingIncludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr);
+        MlasPool(MlasAveragePoolingIncludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool);
         ReferenceAveragePool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference, true);
 
         if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) {
@@ -1781,28 +1786,37 @@ main(
     void
     )
 {
-    printf("SGEMM tests.\n");
-    std::make_unique<MlasSgemmTest>()->ExecuteShort();
-
-    printf("Conv2D tests.\n");
-    std::make_unique<MlasConv2DTest>()->ExecuteShort();
-    if (MlasNchwcGetBlockSize() > 1) {
-        std::make_unique<MlasNchwcConv2DTest>()->ExecuteShort();
-    }
-
-    printf("Pool2D tests.\n");
-    std::make_unique<MlasPool2DTest>()->ExecuteShort();
-    if (MlasNchwcGetBlockSize() > 1) {
-        std::make_unique<MlasNchwcPool2DTest>()->ExecuteShort();
-    }
+    for (int i = 0; i != 2; ++i) {
+        printf("SGEMM tests.\n");
+        std::make_unique<MlasSgemmTest>()->ExecuteShort();
+
+        printf("Conv2D tests.\n");
+        std::make_unique<MlasConv2DTest>()->ExecuteShort();
+        if (MlasNchwcGetBlockSize() > 1) {
+            std::make_unique<MlasNchwcConv2DTest>()->ExecuteShort();
+        }
 
-    printf("Pool3D tests.\n");
-    std::make_unique<MlasPool3DTest>()->ExecuteShort();
+        printf("Pool2D tests.\n");
+        std::make_unique<MlasPool2DTest>()->ExecuteShort();
+        if (MlasNchwcGetBlockSize() > 1) {
+            std::make_unique<MlasNchwcPool2DTest>()->ExecuteShort();
+        }
 
-    printf("Activation tests.\n");
-    std::make_unique<MlasActivationTest>()->ExecuteShort();
+        printf("Pool3D tests.\n");
+        std::make_unique<MlasPool3DTest>()->ExecuteShort();
 
-    printf("Done.\n");
+        printf("Activation tests.\n");
+        std::make_unique<MlasActivationTest>()->ExecuteShort();
 
+        printf("Done.\n");
+#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL)
+        threadpool = new onnxruntime::concurrency::ThreadPool("test", 2);
+#else
+	    break;
+#endif
+	}
+#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL)
+    delete threadpool;
+#endif
     return 0;
 }
diff --git a/onnxruntime/test/onnx/microbenchmark/model_init.cc b/onnxruntime/test/onnx/microbenchmark/model_init.cc
index 5a841276e06bf..ecf48fa6554ce 100644
--- a/onnxruntime/test/onnx/microbenchmark/model_init.cc
+++ b/onnxruntime/test/onnx/microbenchmark/model_init.cc
@@ -107,12 +107,12 @@ Status CreateExecutionProviders(std::unique_ptr<ExecutionProviders>* ret) {
   return Status::OK();
 }
 
-Status CreateKernelRegistryManagerFromModel(std::unique_ptr<KernelRegistryManager>* ret, Model* model) {
+Status CreateKernelRegistryManagerFromModel(std::unique_ptr<KernelRegistryManager>* ret, Model* model, concurrency::ThreadPool& tp) {
   std::unique_ptr<ExecutionProviders> execution_providers;
   ORT_RETURN_IF_ERROR(CreateExecutionProviders(&execution_providers));
   std::unique_ptr<KernelRegistryManager> kernel_registry_manager = std::make_unique<KernelRegistryManager>();
   ORT_RETURN_IF_ERROR(kernel_registry_manager->RegisterKernels(*execution_providers));
-  SessionState s{*execution_providers, true};
+  SessionState s{*execution_providers, true, &tp};
   s.SetLogger(logging::LoggingManager::DefaultLogger());
 
   ORT_RETURN_IF_ERROR(model->MainGraph().Resolve());
@@ -125,7 +125,8 @@ Status CreateKernelRegistryManagerFromModel(std::unique_ptr<KernelRegistryManage
 
 static void SearchKernelRegistry_IMPL(benchmark::State& state, Model* model) {
   std::unique_ptr<KernelRegistryManager> kernel_registry_manager;
-  auto st = CreateKernelRegistryManagerFromModel(&kernel_registry_manager, model);
+  concurrency::ThreadPool tp{"test", 1};
+  auto st = CreateKernelRegistryManagerFromModel(&kernel_registry_manager, model, tp);
   if (!st.IsOK()) throw std::runtime_error("failed");
   for (auto _ : state) {
     for (const auto& n : model->MainGraph().Nodes()) {
@@ -175,11 +176,12 @@ static void BM_PartitionModel_tiny_yolo(benchmark::State& state) {
   std::unique_ptr<KernelRegistryManager> kernel_registry_manager = std::make_unique<KernelRegistryManager>();
   status = kernel_registry_manager->RegisterKernels(*execution_providers);
   if (!status.IsOK()) throw std::runtime_error("RegisterKernels failed");
+  concurrency::ThreadPool tp{"test", 1};
 
   for (auto _ : state) {
     state.PauseTiming();
     std::shared_ptr<onnxruntime::Model> model = std::make_shared<onnxruntime::Model>(model_proto);
-    SessionState s{*execution_providers, true};
+    SessionState s{*execution_providers, true, &tp};
     s.SetLogger(logging::LoggingManager::DefaultLogger());
     BM_BREAK_IF_ERROR(model->MainGraph().Resolve());
     s.SetGraphViewer(std::make_unique<GraphViewer>(model->MainGraph()));
@@ -205,11 +207,12 @@ static void BM_PartitionModel_inception_v4(benchmark::State& state) {
   std::unique_ptr<KernelRegistryManager> kernel_registry_manager = std::make_unique<KernelRegistryManager>();
   status = kernel_registry_manager->RegisterKernels(*execution_providers);
   if (!status.IsOK()) throw std::runtime_error("RegisterKernels failed");
+  concurrency::ThreadPool tp{"test", 1};
 
   for (auto _ : state) {
     state.PauseTiming();
     std::shared_ptr<onnxruntime::Model> model = std::make_shared<onnxruntime::Model>(model_proto);
-    SessionState s{*execution_providers, true};
+    SessionState s{*execution_providers, true, &tp};
     s.SetLogger(logging::LoggingManager::DefaultLogger());
     BM_BREAK_IF_ERROR(model->MainGraph().Resolve());
     s.SetGraphViewer(std::make_unique<GraphViewer>(model->MainGraph()));
diff --git a/onnxruntime/test/onnx/microbenchmark/modeltest.cc b/onnxruntime/test/onnx/microbenchmark/modeltest.cc
index 4f256393ca5c1..d9f32ad744b06 100644
--- a/onnxruntime/test/onnx/microbenchmark/modeltest.cc
+++ b/onnxruntime/test/onnx/microbenchmark/modeltest.cc
@@ -50,7 +50,8 @@ BENCHMARK(BM_CreateSession_WithGPU);
 
 static void BM_CreateSession(benchmark::State& state) {
   const ORTCHAR_T* model_path = ORT_TSTR("../models/opset8/test_bvlc_alexnet/model.onnx");
-  OrtSessionOptions* session_option = OrtCreateSessionOptions();
+  OrtSessionOptions* session_option;
+  ORT_BREAK_ON_ERROR(OrtCreateSessionOptions(&session_option));
   for (auto _ : state) {
     OrtSession* session;
     ORT_BREAK_ON_ERROR(OrtCreateSession(env, model_path, session_option, &session));
diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc
index 27442273f26fa..8b17df4aa1ae0 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@@ -407,7 +407,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
         return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Size overflow");
       }
       size_t size_to_allocate;
-      GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_to_allocate);
+      ORT_RETURN_IF_ERROR(GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_to_allocate));
 
       if (preallocated && preallocated_size < size_to_allocate)
         return Status(common::ONNXRUNTIME, common::FAIL, MakeString("The buffer planner is not consistent with tensor buffer size, expected ", size_to_allocate, ", got ", preallocated_size));
diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc
index aad97cfaf8754..0273378194b5d 100644
--- a/onnxruntime/test/providers/cpu/math/softmax_test.cc
+++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc
@@ -194,23 +194,23 @@ TEST(SoftmaxOperator, InvalidAxis) {
 
 TEST(SoftmaxOperator, TestInputTooLarge) {
   float* ignored = nullptr;
-
+  concurrency::ThreadPool tp("", 1);
   // N > INT32_MAX
   int64_t N = int64_t(INT32_MAX) + 1;
   int64_t D = 1;
-  auto status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored);
+  auto status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp);
   EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT);
 
   // D > INT32_MAX
   N = 1;
   D = int64_t(INT32_MAX) + 1;
-  status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored);
+  status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp);
   EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT);
 
   // N * D > INT32_MAX
   N = int64_t(INT32_MAX) / 2;
   D = 3;
-  status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored);
+  status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp);
   EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT);
 
   /*
diff --git a/onnxruntime/test/providers/memcpy_test.cc b/onnxruntime/test/providers/memcpy_test.cc
index c43779875fb02..133c2873e20ad 100644
--- a/onnxruntime/test/providers/memcpy_test.cc
+++ b/onnxruntime/test/providers/memcpy_test.cc
@@ -22,11 +22,13 @@ void PutAllNodesOnOneProvider(Graph& graph, const std::string& provider_type) {
 }
 }  // namespace
 TEST(MemcpyTest, copy1) {
+  concurrency::ThreadPool tp{"test", 1};
+
   ExecutionProviders execution_providers;
   CPUExecutionProviderInfo epi;
   auto st = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
   ASSERT_TRUE(st.IsOK()) << st.ErrorMessage();
-  SessionState s{execution_providers, true};
+  SessionState s{execution_providers, true, &tp};
   s.SetLogger(logging::LoggingManager::DefaultLogger());
   KernelRegistryManager kernel_registry_manager;
   kernel_registry_manager.RegisterKernels(execution_providers);

From 11e1a7ff59ade6d5b5726077d56a11d422dd2ad9 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Thu, 15 Aug 2019 18:13:04 -0700
Subject: [PATCH 2/7] fix warning

---
 onnxruntime/core/session/inference_session.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index f8bd6cf57bf0d..7fcbe0ad87138 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -407,6 +407,10 @@ class InferenceSession {
   // The list of execution providers.
   ExecutionProviders execution_providers_;
 
+ private:
+  // Threadpool for this session
+  std::unique_ptr<onnxruntime::concurrency::ThreadPool> thread_pool_;
+
  protected:
   // Immutable state for each op in the model. Shared by all executors.
   // It has a dependency on execution_providers_.
@@ -433,8 +437,6 @@ class InferenceSession {
   std::unordered_map<std::string, InputDefMetaData> input_def_map_;
   OutputDefList output_def_list_;
 
-  // Threadpool for this session
-  std::unique_ptr<onnxruntime::concurrency::ThreadPool> thread_pool_;
   // Data transfer manager.
   DataTransferManager data_transfer_mgr_;
 

From 4b736bfe2460f5ac146f19d1af4d7025289d2c0d Mon Sep 17 00:00:00 2001
From: Changming Sun <me@sunchangming.com>
Date: Thu, 15 Aug 2019 20:22:35 -0700
Subject: [PATCH 3/7] revert

---
 include/onnxruntime/core/session/environment.h       | 12 +-----------
 include/onnxruntime/core/session/onnxruntime_c_api.h |  4 ++--
 onnxruntime/core/session/environment.cc              |  1 -
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h
index e11eb66072d1a..f36ebb60d0011 100644
--- a/include/onnxruntime/core/session/environment.h
+++ b/include/onnxruntime/core/session/environment.h
@@ -5,11 +5,8 @@
 
 #include <atomic>
 #include <memory>
-#include <mutex>
-#include <thread>
 #include "core/common/common.h"
 #include "core/common/status.h"
-#include "core/platform/threadpool.h"
 
 namespace onnxruntime {
 /**
@@ -32,20 +29,13 @@ class Environment {
      Returns whether any runtime environment instance has been initialized.
   */
   static bool IsInitialized() { return is_initialized_; }
-  concurrency::ThreadPool* GetThreadPool() {
-    std::call_once(tp_once_, [this]() {
-      tp_ = new concurrency::ThreadPool("default", std::max<int>(std::thread::hardware_concurrency() - 1, 1));
-    });
-    return tp_;
-  }
 
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment);
 
   Environment() = default;
   Status Initialize();
-  concurrency::ThreadPool* tp_ = nullptr;
-  std::once_flag tp_once_;
+
   static std::atomic<bool> is_initialized_;
 };
 }  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 74cd44bd2ab39..87e143b8bda10 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -144,7 +144,7 @@ typedef enum OrtErrorCode {
 
 // The actual types defined have an Ort prefix
 ORT_RUNTIME_CLASS(Env);
-ORT_RUNTIME_CLASS(Status);
+ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success
 ORT_RUNTIME_CLASS(Provider);
 ORT_RUNTIME_CLASS(AllocatorInfo);
 ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool)
@@ -342,7 +342,7 @@ ORT_API_STATUS(OrtGetStringTensorDataLength, _In_ const OrtValue* value, _Out_ s
  * \param value A tensor created from OrtCreateTensor... function.
  * \param s_len total data length, get it from OrtGetStringTensorDataLength
  */
-ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _In_ void* s, size_t s_len,
+ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _Out_ void* s, size_t s_len,
                _Out_ size_t* offsets, size_t offsets_len);
 
 /**
diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc
index 539d92bd4323a..d1f9041c9253f 100644
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@@ -87,7 +87,6 @@ Internal copy node
 }
 
 Environment::~Environment() {
-  delete tp_;
   ::google::protobuf::ShutdownProtobufLibrary();
 }
 

From ad8351bc1300d5a6944dd35b0c78a2b2fb3cadf7 Mon Sep 17 00:00:00 2001
From: Changming Sun <me@sunchangming.com>
Date: Thu, 15 Aug 2019 20:24:08 -0700
Subject: [PATCH 4/7] format code

---
 include/onnxruntime/core/session/onnxruntime_c_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 87e143b8bda10..8566125ac14d1 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -144,10 +144,10 @@ typedef enum OrtErrorCode {
 
 // The actual types defined have an Ort prefix
 ORT_RUNTIME_CLASS(Env);
-ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success
+ORT_RUNTIME_CLASS(Status);  // nullptr for Status* indicates success
 ORT_RUNTIME_CLASS(Provider);
 ORT_RUNTIME_CLASS(AllocatorInfo);
-ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool)
+ORT_RUNTIME_CLASS(Session);  //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool)
 ORT_RUNTIME_CLASS(Value);
 ORT_RUNTIME_CLASS(RunOptions);
 ORT_RUNTIME_CLASS(TypeInfo);

From dff70df296bad1edeaa60e23619c63ed95d9e184 Mon Sep 17 00:00:00 2001
From: Changming Sun <me@sunchangming.com>
Date: Thu, 15 Aug 2019 20:24:38 -0700
Subject: [PATCH 5/7] revert more

---
 include/onnxruntime/core/session/onnxruntime_cxx_api.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 992afe2c6fa89..e1397105c3bef 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -99,7 +99,6 @@ struct Env;
 struct TypeInfo;
 struct Value;
 
-//Don't put such an object as a global(or thread local) variable in a DLL
 struct Env : Base<OrtEnv> {
   Env(nullptr_t) {}
   Env(OrtLoggingLevel default_logging_level, _In_ const char* logid);
@@ -157,7 +156,6 @@ struct SessionOptions : Base<OrtSessionOptions> {
   SessionOptions& Add(OrtCustomOpDomain* custom_op_domain);
 };
 
-//Don't put such an object as a global(or thread local) variable in a DLL
 struct Session : Base<OrtSession> {
   explicit Session(nullptr_t) {}
   Session(Env& env, const ORTCHAR_T* model_path, const SessionOptions& options);

From 399e81b6db66e616fccd6be59921b84217b00a4a Mon Sep 17 00:00:00 2001
From: Changming Sun <me@sunchangming.com>
Date: Thu, 15 Aug 2019 22:20:14 -0700
Subject: [PATCH 6/7] update

---
 onnxruntime/core/providers/cpu/object_detection/roialign.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
index 9453039aa8753..4d27e957e9f44 100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
@@ -268,7 +268,14 @@ void RoiAlignForward(
       }    // for ph
     }      // for c
   };       // for n
-  const_cast<ThreadPool*>(ttp)->ParallelFor(static_cast<int32_t>(n_rois), work_object);
+  if (ttp != nullptr) {
+    const_cast<ThreadPool*>(ttp)->ParallelFor(static_cast<int32_t>(n_rois), work_object);
+  } else {
+    // No thread pool available: run the per-ROI work serially on the calling thread
+    // so the output is still computed instead of being silently skipped.
+    const int32_t roi_count = static_cast<int32_t>(n_rois);
+    for (int32_t roi = 0; roi < roi_count; ++roi) work_object(roi);
+  }
 }
 }  // namespace
 

From 11d4f67ce290aa1f602cb1998d403ffccd1222d5 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Fri, 16 Aug 2019 11:32:37 -0700
Subject: [PATCH 7/7] update

---
 onnxruntime/core/session/inference_session.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 1d7623d0165a6..6dc370b797f1c 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -92,11 +92,9 @@ inline std::basic_string<T> GetCurrentTimeString() {
 }
 
 concurrency::ThreadPool* CreateThreadPool(int size) {
-  if (size == 1)
-    return nullptr;
   if (size <= 0)
-    size = std::max<int>(std::thread::hardware_concurrency() - 1, 1);
-  return new concurrency::ThreadPool("SESSION", size);
+    size = std::thread::hardware_concurrency() / 2;
+  return size > 0 ? new concurrency::ThreadPool("SESSION", size) : nullptr;
 }
 
 }  // namespace