From 401a79c6d49afa0fd1a6b4797eb7e67ab7e75ab3 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 17:08:00 -0700 Subject: [PATCH 1/7] update --- cmake/CMakeLists.txt | 5 - .../onnxruntime/core/framework/op_kernel.h | 2 +- include/onnxruntime/core/framework/tensor.h | 4 +- .../onnxruntime/core/session/environment.h | 12 +- .../core/session/onnxruntime_c_api.h | 19 +-- .../core/session/onnxruntime_cxx_api.h | 2 + .../cpu/attnlstm/attention_wrapper.cc | 25 ++-- .../cpu/attnlstm/attention_wrapper.h | 4 +- .../cpu/attnlstm/bahdanau_attention.cc | 34 ++--- .../cpu/attnlstm/bahdanau_attention.h | 3 +- .../cpu/attnlstm/deep_cpu_attn_lstm.cc | 23 +-- .../cpu/attnlstm/uni_dir_attn_lstm.cc | 8 +- .../cpu/attnlstm/uni_dir_attn_lstm.h | 4 +- onnxruntime/contrib_ops/cpu/nchwc_ops.cc | 3 - onnxruntime/contrib_ops/cpu/nchwc_ops.h | 2 + .../contrib_ops/cpu/word_conv_embedding.cc | 12 +- .../contrib_ops/cpu/word_conv_embedding.h | 5 +- onnxruntime/core/framework/bfc_arena.h | 2 +- onnxruntime/core/framework/error_code.cc | 5 +- onnxruntime/core/framework/mem_pattern.h | 4 +- .../framework/op_kernel_context_internal.h | 4 +- .../core/framework/parallel_executor.h | 1 - onnxruntime/core/framework/session_state.h | 10 +- onnxruntime/core/framework/tensor.cc | 4 +- onnxruntime/core/mlas/lib/mlasi.h | 6 - onnxruntime/core/mlas/lib/platform.cpp | 19 --- onnxruntime/core/mlas/lib/threading.cpp | 92 +----------- onnxruntime/core/providers/cpu/math/gemm.h | 8 +- .../core/providers/cpu/math/logsoftmax.cc | 7 +- onnxruntime/core/providers/cpu/math/matmul.cc | 7 +- .../core/providers/cpu/math/matmul_helper.h | 5 +- .../core/providers/cpu/math/softmax.cc | 6 +- .../core/providers/cpu/math/softmax_shared.cc | 5 +- .../core/providers/cpu/math/softmax_shared.h | 5 +- onnxruntime/core/providers/cpu/nn/conv.cc | 23 +-- .../core/providers/cpu/nn/conv_transpose.cc | 9 +- onnxruntime/core/providers/cpu/nn/pool.cc | 2 +- onnxruntime/core/providers/cpu/nn/pool_base.h | 13 +- .../core/providers/cpu/rnn/deep_cpu_gru.cc | 29 ++-- .../core/providers/cpu/rnn/deep_cpu_lstm.cc | 32 +++-- .../core/providers/cpu/rnn/deep_cpu_lstm.h | 4 +- onnxruntime/core/providers/cpu/rnn/rnn.cc | 11 +- .../core/providers/cpu/rnn/rnn_helpers.h | 6 +- .../core/providers/cpu/tensor/cast_op.cc | 4 +- onnxruntime/core/session/environment.cc | 1 + onnxruntime/core/session/inference_session.cc | 33 ++--- onnxruntime/core/session/onnxruntime_c_api.cc | 16 +-- onnxruntime/core/util/math.h | 9 +- onnxruntime/core/util/math_cpu.cc | 55 ++++---- .../test/framework/allocation_planner_test.cc | 3 +- .../test/framework/execution_frame_test.cc | 19 ++- onnxruntime/test/framework/math_test.cc | 58 +++++--- .../test/framework/session_state_test.cc | 131 ++++++++++-------- onnxruntime/test/mlas/unittest.cpp | 64 +++++---- .../test/onnx/microbenchmark/model_init.cc | 13 +- .../test/onnx/microbenchmark/modeltest.cc | 3 +- onnxruntime/test/onnx/tensorprotoutils.cc | 2 +- .../test/providers/cpu/math/softmax_test.cc | 8 +- onnxruntime/test/providers/memcpy_test.cc | 4 +- 59 files changed, 463 insertions(+), 446 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d9c6068c0d1c4..9b084286b4c6d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -50,7 +50,6 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF) option(onnxruntime_USE_NSYNC "Build with NSYNC support. This option only takes effect on Linux" OFF) option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON) option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF) -option(onnxruntime_USE_MLAS "Use optimized blas library for GEMM and 2D Convolution" ON) option(onnxruntime_USE_MKLDNN "Build with MKL-DNN support" OFF) option(onnxruntime_USE_MKLML "Build MKL-DNN with MKL-ML binary dependency" OFF) option(onnxruntime_USE_AUTOML "Build AutoML support" ON) @@ -368,10 +367,6 @@ if (onnxruntime_RUN_ONNX_TESTS) add_definitions(-DORT_RUN_EXTERNAL_ONNX_TESTS) endif() -if (onnxruntime_USE_MLAS) - add_definitions(-DUSE_MLAS) -endif() - #Adjust warning flags if (WIN32) add_definitions(-DPLATFORM_WINDOWS -DNOGDI -DNOMINMAX -D_USE_MATH_DEFINES) diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index d396551a1b407..6e98dbc20588b 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -210,7 +210,7 @@ struct KernelCreateInfo { : kernel_def(std::move(definition)), kernel_create_func(create_func) {} - KernelCreateInfo(KernelCreateInfo&& other) + KernelCreateInfo(KernelCreateInfo&& other) noexcept : kernel_def(std::move(other.kernel_def)), kernel_create_func(std::move(other.kernel_create_func)) {} }; diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index 35eb359c714a3..31a43c7d905cb 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -78,9 +78,9 @@ class Tensor final { //Move is allowed ORT_DISALLOW_COPY_AND_ASSIGNMENT(Tensor); - Tensor(Tensor&& other); + Tensor(Tensor&& other) noexcept; - Tensor& operator=(Tensor&& other); + Tensor& operator=(Tensor&& other) noexcept; /** Returns the data type. diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h index f36ebb60d0011..e11eb66072d1a 100644 --- a/include/onnxruntime/core/session/environment.h +++ b/include/onnxruntime/core/session/environment.h @@ -5,8 +5,11 @@ #include #include +#include +#include #include "core/common/common.h" #include "core/common/status.h" +#include "core/platform/threadpool.h" namespace onnxruntime { /** @@ -29,13 +32,20 @@ class Environment { Returns whether any runtime environment instance has been initialized. */ static bool IsInitialized() { return is_initialized_; } + concurrency::ThreadPool* GetThreadPool() { + std::call_once(tp_once_, [this]() { + tp_ = new concurrency::ThreadPool("default", std::max(std::thread::hardware_concurrency() - 1, 1)); + }); + return tp_; + } private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment); Environment() = default; Status Initialize(); - + concurrency::ThreadPool* tp_ = nullptr; + std::once_flag tp_once_; static std::atomic is_initialized_; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 2ff53c9b50fe1..74cd44bd2ab39 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -23,6 +23,10 @@ extern "C" { #define _Inout_ #define _Inout_opt_ #define _Frees_ptr_opt_ +#define _Ret_maybenull_ +#define _Ret_notnull_ +#define _Check_return_ +#define _Success_(X) #define ORT_ALL_ARGS_NONNULL __attribute__((nonnull)) #else #include @@ -127,11 +131,11 @@ typedef enum OrtErrorCode { ORT_EXPORT RETURN_TYPE ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION #define ORT_API_STATUS(NAME, ...) \ - ORT_EXPORT OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION ORT_MUST_USE_RESULT + ORT_EXPORT _Check_return_ _Success_(return == 0) _Ret_maybenull_ OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION ORT_MUST_USE_RESULT // Used in *.cc files. Almost as same as ORT_API_STATUS, except without ORT_MUST_USE_RESULT #define ORT_API_STATUS_IMPL(NAME, ...) \ - ORT_EXPORT OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION + ORT_EXPORT _Check_return_ _Success_(return == 0) _Ret_maybenull_ OrtStatus* ORT_API_CALL NAME(__VA_ARGS__) NO_EXCEPTION #define ORT_RUNTIME_CLASS(X) \ struct Ort##X; \ @@ -140,12 +144,11 @@ typedef enum OrtErrorCode { // The actual types defined have an Ort prefix ORT_RUNTIME_CLASS(Env); -ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success +ORT_RUNTIME_CLASS(Status); ORT_RUNTIME_CLASS(Provider); ORT_RUNTIME_CLASS(AllocatorInfo); -ORT_RUNTIME_CLASS(Session); +ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool) ORT_RUNTIME_CLASS(Value); -ORT_RUNTIME_CLASS(ValueList); ORT_RUNTIME_CLASS(RunOptions); ORT_RUNTIME_CLASS(TypeInfo); ORT_RUNTIME_CLASS(TensorTypeAndShapeInfo); @@ -339,7 +342,7 @@ ORT_API_STATUS(OrtGetStringTensorDataLength, _In_ const OrtValue* value, _Out_ s * \param value A tensor created from OrtCreateTensor... function. * \param s_len total data length, get it from OrtGetStringTensorDataLength */ -ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _Out_ void* s, size_t s_len, +ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _In_ void* s, size_t s_len, _Out_ size_t* offsets, size_t offsets_len); /** @@ -444,8 +447,8 @@ ORT_API(const char*, OrtGetVersionString); /** * \param msg A null-terminated string. Its content will be copied into the newly created OrtStatus */ -ORT_API(OrtStatus*, OrtCreateStatus, OrtErrorCode code, _In_ const char* msg) -ORT_ALL_ARGS_NONNULL; +ORT_EXPORT _Check_return_ _Ret_notnull_ OrtStatus* ORT_API_CALL OrtCreateStatus(OrtErrorCode code, _In_ const char* msg) NO_EXCEPTION + ORT_ALL_ARGS_NONNULL; ORT_API(OrtErrorCode, OrtGetErrorCode, _In_ const OrtStatus* status) ORT_ALL_ARGS_NONNULL; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index e1397105c3bef..992afe2c6fa89 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -99,6 +99,7 @@ struct Env; struct TypeInfo; struct Value; +//Don't put such an object as a global(or thread local) variable in a DLL struct Env : Base { Env(nullptr_t) {} Env(OrtLoggingLevel default_logging_level, _In_ const char* logid); @@ -156,6 +157,7 @@ struct SessionOptions : Base { SessionOptions& Add(OrtCustomOpDomain* custom_op_domain); }; +//Don't put such an object as a global(or thread local) variable in a DLL struct Session : Base { explicit Session(nullptr_t) {} Session(Env& env, const ORTCHAR_T* model_path, const SessionOptions& options); diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc index 4555713a59fe1..8757ccb35f771 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.cc @@ -16,7 +16,7 @@ template AttentionWrapper::AttentionWrapper(AllocatorPtr alloc, const logging::Logger& logger, int batch_size, int attn_context_depth, int attn_layer_depth, int inner_cell_hidden_size, bool has_attn_layer, - const IAttentionMechanism& attention_mechanism) + const IAttentionMechanism& attention_mechanism, concurrency::ThreadPool* threadpool) : allocator_(alloc), logger_(logger), batch_size_(batch_size), @@ -24,7 +24,8 @@ AttentionWrapper::AttentionWrapper(AllocatorPtr alloc, const logging::Logger& attn_layer_depth_(attn_layer_depth), inner_cell_hidden_size_(inner_cell_hidden_size), has_attn_layer_(has_attn_layer), - attention_mechanism_(attention_mechanism) { + attention_mechanism_(attention_mechanism), + ttp_(threadpool) { auto mem_max_steps = attention_mechanism_.GetMaxMemorySteps(); prev_alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, prev_alignments_ptr_, true); alignments_ = Allocate(allocator_, batch_size_ * mem_max_steps, alignments_ptr_, true); @@ -37,11 +38,11 @@ template void AttentionWrapper::ProcessOutput(const gsl::span& rnn_cell_output) { if (has_attn_layer_) { // rnn_cell_output * cell_weights, (part of the attention layer above the attention mechanism). - math::GemmEx(CblasNoTrans, CblasNoTrans, - batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0}, - rnn_cell_output.data(), inner_cell_hidden_size_, - attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0}, - attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance()); + math::GemmEx(CblasNoTrans, CblasNoTrans, + batch_size_, attn_layer_depth_, inner_cell_hidden_size_, T{1.0}, + rnn_cell_output.data(), inner_cell_hidden_size_, + attn_layer_cell_weights_.data(), attn_layer_depth_, T{0.0}, + attn_states_.data(), attn_layer_depth_, ttp_); } // Get the context which is calculated within attention mechanism. @@ -54,11 +55,11 @@ void AttentionWrapper::ProcessOutput(const gsl::span& rnn_cell_outpu //concat([p_cell_output, context]) * stack([attn_layer_cell_weights_, attn_layer_attn_weights_]) = // p_cell_output * attn_layer_cell_weights_ + context * attn_layer_attn_weights_ // The first part is calulated above. Here just add the later. - math::GemmEx(CblasNoTrans, CblasNoTrans, - batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0}, - attn_context_.data(), attn_context_depth_, - attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0}, - attn_states_.data(), attn_layer_depth_, &CPUMathUtil::Instance()); + math::GemmEx(CblasNoTrans, CblasNoTrans, + batch_size_, attn_layer_depth_, attn_context_depth_, T{1.0}, + attn_context_.data(), attn_context_depth_, + attn_layer_attn_weights_.data(), attn_layer_depth_, T{1.0}, + attn_states_.data(), attn_layer_depth_, ttp_); } } diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h index 2469a7b99a3fb..b6cc06c040e3a 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h +++ b/onnxruntime/contrib_ops/cpu/attnlstm/attention_wrapper.h @@ -8,6 +8,7 @@ #include "core/common/common.h" #include "core/common/logging/logging.h" #include "core/framework/allocator.h" +#include "core/platform/threadpool.h" namespace onnxruntime { namespace contrib { @@ -22,7 +23,7 @@ class AttentionWrapper { int attn_layer_depth, int inner_cell_hidden_size, bool has_attn_layer, - const IAttentionMechanism& attention_mechanism); + const IAttentionMechanism& attention_mechanism, concurrency::ThreadPool* threadpool); virtual ~AttentionWrapper() = default; @@ -69,6 +70,7 @@ class AttentionWrapper { bool has_attn_layer_; const IAttentionMechanism& attention_mechanism_; + concurrency::ThreadPool* ttp_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc index 932ac263f8e22..74ad84b5af839 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.cc @@ -15,8 +15,8 @@ namespace contrib { template BahdanauAttention::BahdanauAttention(AllocatorPtr allocator, const logging::Logger& logger, int batch_size, int max_memory_step, int memory_depth, - int query_depth, int attn_depth, bool normalize) - : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize) { + int query_depth, int attn_depth, bool normalize, concurrency::ThreadPool* threadpool) + : allocator_(allocator), logger_(logger), batch_size_(batch_size), max_memory_steps_(max_memory_step), memory_depth_(memory_depth), query_depth_(query_depth), attn_depth_(attn_depth), normalize_(normalize), ttp_(threadpool) { values_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * memory_depth_, values_ptr_, true); keys_ = Allocate(allocator_, batch_size_ * max_memory_steps_ * attn_depth_, keys_ptr_, true); processed_query_ = Allocate(allocator_, batch_size_ * attn_depth_, processed_query_ptr_, true); @@ -72,11 +72,11 @@ void BahdanauAttention::PrepareMemory( "Real memory steps ", mem_steps, " is not in (0, ", max_memory_steps_, "]"); } - math::GemmEx(CblasNoTrans, CblasNoTrans, - batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0}, - memory.data(), memory_depth_, - memory_layer_weights_.data(), attn_depth_, T{0.0}, - keys_.data(), attn_depth_, &CPUMathUtil::Instance()); + math::GemmEx(CblasNoTrans, CblasNoTrans, + batch_size_ * max_memory_steps_, attn_depth_, memory_depth_, T{1.0}, + memory.data(), memory_depth_, + memory_layer_weights_.data(), attn_depth_, T{0.0}, + keys_.data(), attn_depth_, ttp_); } template @@ -115,11 +115,11 @@ void BahdanauAttention::Compute( const gsl::span& output, const gsl::span& aligns) const { //process query in dense query layer without bias - math::GemmEx(CblasNoTrans, CblasNoTrans, - batch_size_, attn_depth_, query_depth_, T{1.0}, - queries.data(), query_depth_, - query_layer_weights_.data(), attn_depth_, T{0.0}, - processed_query_.data(), attn_depth_, &CPUMathUtil::Instance()); + math::GemmEx(CblasNoTrans, CblasNoTrans, + batch_size_, attn_depth_, query_depth_, T{1.0}, + queries.data(), query_depth_, + query_layer_weights_.data(), attn_depth_, T{0.0}, + processed_query_.data(), attn_depth_, ttp_); std::fill(aligns.begin(), aligns.end(), T{}); @@ -146,11 +146,11 @@ void BahdanauAttention::Compute( // Calculate the context auto outspan = output.subspan(b * memory_depth_); auto values = values_.subspan(b * max_memory_steps_ * memory_depth_); - math::GemmEx(CblasNoTrans, CblasNoTrans, - 1, memory_depth_, max_memory_steps_, T{1.0}, - alignments, max_memory_steps_, - values.data(), memory_depth_, T{0.0}, - outspan.data(), memory_depth_, &CPUMathUtil::Instance()); + math::GemmEx(CblasNoTrans, CblasNoTrans, + 1, memory_depth_, max_memory_steps_, T{1.0}, + alignments, max_memory_steps_, + values.data(), memory_depth_, T{0.0}, + outspan.data(), memory_depth_, ttp_); } } diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h index 755af6ba6d5c3..c2bfee15c5bcc 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h +++ b/onnxruntime/contrib_ops/cpu/attnlstm/bahdanau_attention.h @@ -23,7 +23,7 @@ class BahdanauAttention : public IAttentionMechanism { int memory_depth, int query_depth, int attn_depth, - bool normalize); + bool normalize, concurrency::ThreadPool* threadpool); void SetWeights( const gsl::span& attn_weights, @@ -77,6 +77,7 @@ class BahdanauAttention : public IAttentionMechanism { gsl::span mem_seq_lengths_; bool normalize_; + concurrency::ThreadPool* ttp_; }; } // namespace contrib diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc index 7f7102475c620..50e98f834260b 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/deep_cpu_attn_lstm.cc @@ -8,7 +8,9 @@ #include "core/common/common.h" #include "core/common/logging/logging.h" +#include "core/platform/threadpool.h" #include "core/framework/allocator.h" +#include "core/framework/op_kernel_context_internal.h" namespace onnxruntime { namespace contrib { @@ -70,6 +72,9 @@ static gsl::span SecondHalfSpan(const gsl::span& dspan) { template Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { + auto ctx_internal = static_cast(&context); + concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool(); + auto& logger = context.Logger(); // original lstm processing @@ -236,7 +241,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { memory_depth, query_depth, am_attn_size, - false); + false, thread_pool); fam.SetWeights( FirstHalfSpan(am_v_weights.DataAsSpan()), @@ -252,7 +257,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { attn_layer_depth, hidden_size_, has_attention_layer, - fam); + fam, thread_pool); faw.SetWeights(FirstHalfSpan(attn_layer_weights_span)); UniDirectionalAttnLstm fw( @@ -263,7 +268,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], - clip_, ttp_); + clip_, thread_pool); BahdanauAttention bam( alloc, @@ -273,7 +278,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { memory_depth, query_depth, am_attn_size, - false); + false, thread_pool); bam.SetWeights( SecondHalfSpan(am_v_weights.DataAsSpan()), SecondHalfSpan(am_query_layer_weights.DataAsSpan()), @@ -288,7 +293,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { attn_layer_depth, hidden_size_, has_attention_layer, - bam); + bam, thread_pool); baw.SetWeights(SecondHalfSpan(attn_layer_weights_span)); UniDirectionalAttnLstm bw( @@ -299,7 +304,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[3], activation_funcs_.Entries()[4], activation_funcs_.Entries()[5], - clip_, ttp_); + clip_, thread_pool); fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, hidden_weights_2, output_2, hidden_output_2, last_cell_2); @@ -313,7 +318,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { memory_depth, query_depth, am_attn_size, - false); + false, thread_pool); fam.SetWeights( am_v_weights.DataAsSpan(), @@ -329,7 +334,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { attn_layer_depth, hidden_size_, has_attention_layer, - fam); + fam, thread_pool); faw.SetWeights(attn_layer_weights_span); @@ -341,7 +346,7 @@ Status DeepCpuAttnLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], - clip_, ttp_); + clip_, thread_pool); fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); } diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc index caa05f9d5ceff..4183b6e2d6de4 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc +++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.cc @@ -45,7 +45,7 @@ UniDirectionalAttnLstm::UniDirectionalAttnLstm(AllocatorPtr allocator, const ActivationFuncs::Entry& activation_func_g, const ActivationFuncs::Entry& activation_func_h, const float clip, - onnxruntime::concurrency::ThreadPool& ttp) + onnxruntime::concurrency::ThreadPool* ttp) : allocator_(allocator), logger_(logger), seq_length_(seq_length), @@ -254,7 +254,7 @@ void UniDirectionalAttnLstm::Compute(const gsl::span& inputs_arg, input_weights.cbegin(), input_weights.cend(), // W[iofc]^T input_size_ + attention_size_, T{0.0}, output_iofc_.begin(), output_iofc_.end(), - hidden_size_x4); + hidden_size_x4, ttp_); DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4); @@ -296,7 +296,7 @@ void UniDirectionalAttnLstm::Compute(const gsl::span& inputs_arg, input_weights.cbegin() + input_size_, input_weights.cend(), // WA[iofc] input_size_ + attention_size_, T{1.0}, step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T) - hidden_size_x4); + hidden_size_x4, ttp_); // calculate Xt*(W[iofc]^T) + Ht-1*R[iofc] ComputeGemm(batch_size_, hidden_size_x4, hidden_size_, T{1.0}, @@ -305,7 +305,7 @@ void UniDirectionalAttnLstm::Compute(const gsl::span& inputs_arg, recurrent_weights.cbegin(), recurrent_weights.cend(), // R[iofc] hidden_size_, T{1.0}, step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T) - hidden_size_x4); + hidden_size_x4, ttp_); span_T_iter batched_output, batched_output_end; if (output_sequence) { diff --git a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h index 5a8e4e3224a25..2d3a6f20fe1e9 100644 --- a/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h +++ b/onnxruntime/contrib_ops/cpu/attnlstm/uni_dir_attn_lstm.h @@ -51,7 +51,7 @@ class UniDirectionalAttnLstm { const ActivationFuncs::Entry& activation_func_g, const ActivationFuncs::Entry& activation_func_h, const float clip, - onnxruntime::concurrency::ThreadPool& ttp); + onnxruntime::concurrency::ThreadPool* ttp); void Compute(const gsl::span& inputs, const gsl::span& sequence_lengths, @@ -151,7 +151,7 @@ class UniDirectionalAttnLstm { AttentionWrapper& attention_wrapper_; - onnxruntime::concurrency::ThreadPool& ttp_; + onnxruntime::concurrency::ThreadPool* ttp_; }; } // namespace detail diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc index b5625551ad104..3b14b21a79533 100644 --- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc @@ -170,9 +170,6 @@ Status NchwcPoolBase::NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind ORT_ENFORCE(X_shape.NumDimensions() == 4); ORT_ENFORCE((X_shape[1] % MlasNchwcGetBlockSize()) == 0); - if (!global_pooling_) { - ORT_RETURN_IF_NOT(kernel_shape_.size() == 2, "kernel_shape num_dims is not compatible with X num_dims."); - } std::vector pads = pads_; std::vector output_dims = PoolBase::SetOutputSize(X_shape, X_shape[1], &pads, dilations_, ceil_mode_); diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.h b/onnxruntime/contrib_ops/cpu/nchwc_ops.h index 65045cd0eeb85..b9f8993114094 100644 --- a/onnxruntime/contrib_ops/cpu/nchwc_ops.h +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.h @@ -50,6 +50,8 @@ class NchwcConv : public OpKernel, public ConvBase { class NchwcPoolBase : public PoolBase { public: NchwcPoolBase(const OpKernelInfo& info) : PoolBase(info) { + if (!global_pooling_) + ORT_ENFORCE(kernel_shape_.size() == 2, "kernel_shape num_dims is not compatible with X num_dims."); } Status NchwcPool(OpKernelContext* context, MLAS_POOLING_KIND kind) const; diff --git a/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc b/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc index 7d7f577d5e3a1..3213ff4fc1db3 100644 --- a/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc +++ b/onnxruntime/contrib_ops/cpu/word_conv_embedding.cc @@ -6,6 +6,7 @@ #include "core/util/math.h" #include "core/util/math_cpuonly.h" #include "core/mlas/inc/mlas.h" +#include "core/framework/op_kernel_context_internal.h" namespace onnxruntime { namespace contrib { @@ -45,7 +46,7 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation( int64_t char_embedding_size, int64_t filter_width, int64_t num_filters, - float* output) const { + float* output, concurrency::ThreadPool* tp) const { int64_t input_word_size = word_len * char_embedding_size; int64_t unfolded_width = word_len - filter_width + 1; int64_t unfolded_kernal_size = filter_width * char_embedding_size; @@ -83,12 +84,12 @@ void WordConvEmbedding::ComputeConvMaxPoolWithActivation( tmp_word_inx++; } - math::GemmEx( + math::GemmEx( CblasNoTrans, CblasTrans, static_cast(words_unfolded_width), static_cast(num_filters), static_cast(unfolded_kernal_size), 1.0f, unfolded_buffer_p.get(), static_cast(unfolded_kernal_size), weights, static_cast(unfolded_kernal_size), 0.0f, - conv_buf_p, static_cast(num_filters), &CPUMathUtil::Instance()); + conv_buf_p, static_cast(num_filters), tp); for (int64_t unfolded_inx = 0; unfolded_inx < words_unfolded_width; unfolded_inx++) for (int64_t filter_inx = 0; filter_inx < num_filters; filter_inx++) { @@ -160,6 +161,9 @@ Status WordConvEmbedding::ValidateInputShape(const TensorShape& w_conv_shape, co } Status WordConvEmbedding::Compute(OpKernelContext* ctx) const { + auto ctx_internal = static_cast(ctx); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + // original lstm processing const Tensor& sequence = *(ctx->Input(0)); // sequence: [sequence_length, word_length] const Tensor& w_conv = *(ctx->Input(1)); // conv weight: [M, C/group, kH, kW] @@ -216,7 +220,7 @@ Status WordConvEmbedding::Compute(OpKernelContext* ctx) const { char_embedding_size, filter_width, filter_size, - Y->MutableData()); + Y->MutableData(), tp); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cpu/word_conv_embedding.h b/onnxruntime/contrib_ops/cpu/word_conv_embedding.h index e74afab169fd8..5ee4127e3bfb9 100644 --- a/onnxruntime/contrib_ops/cpu/word_conv_embedding.h +++ b/onnxruntime/contrib_ops/cpu/word_conv_embedding.h @@ -8,6 +8,9 @@ #include "core/framework/tensor.h" namespace onnxruntime { +namespace concurrency { +class ThreadPool; +} namespace contrib { class WordConvEmbedding final : public OpKernel { @@ -38,7 +41,7 @@ class WordConvEmbedding final : public OpKernel { int64_t char_embedding_size, int64_t filter_width, int64_t num_filters, - float* output) const; + float* output, onnxruntime::concurrency::ThreadPool* tp) const; void CalculateLengthOfEachWordInSequence( const int* seq_ptr, int* words_len_ptr, diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index 664f6fa72a04b..bdc6496c63205 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -244,7 +244,7 @@ class BFCArena : public IArenaAllocator { ~AllocationRegion() { delete[] handles_; } - AllocationRegion(AllocationRegion&& other) { Swap(other); } + AllocationRegion(AllocationRegion&& other) noexcept { Swap(other); } AllocationRegion& operator=(AllocationRegion&& other) { Swap(other); diff --git a/onnxruntime/core/framework/error_code.cc b/onnxruntime/core/framework/error_code.cc index 2cf11f4e1de8e..c727b7464f3ac 100644 --- a/onnxruntime/core/framework/error_code.cc +++ b/onnxruntime/core/framework/error_code.cc @@ -12,11 +12,12 @@ struct OrtStatus { char msg[1]; // a null-terminated string }; -ORT_API(OrtStatus*, OrtCreateStatus, OrtErrorCode code, _In_ const char* msg) { +//Even we say it may not return NULL, indeed it may. +ORT_EXPORT _Check_return_ _Ret_notnull_ OrtStatus* ORT_API_CALL OrtCreateStatus(OrtErrorCode code, _In_ const char* msg) NO_EXCEPTION { assert(!(code == 0 && msg != nullptr)); size_t clen = strlen(msg); OrtStatus* p = reinterpret_cast(::malloc(sizeof(OrtStatus) + clen)); - if (p == nullptr) return nullptr; // OOM + if (p == nullptr) return nullptr; // OOM. What we can do here? abort()? p->code = code; memcpy(p->msg, msg, clen); p->msg[clen] = '\0'; diff --git a/onnxruntime/core/framework/mem_pattern.h b/onnxruntime/core/framework/mem_pattern.h index 57d9e99360b13..2aa1e3cad32eb 100644 --- a/onnxruntime/core/framework/mem_pattern.h +++ b/onnxruntime/core/framework/mem_pattern.h @@ -20,11 +20,11 @@ class MemoryPattern { public: MemoryPattern() = default; - MemoryPattern(MemoryPattern&& rhs) + MemoryPattern(MemoryPattern&& rhs) noexcept : patterns_{std::move(rhs.patterns_)}, peak_size_{std::move(rhs.peak_size_)} {} - MemoryPattern& operator=(MemoryPattern&& rhs) { + MemoryPattern& operator=(MemoryPattern&& rhs) noexcept { patterns_ = std::move(rhs.patterns_); peak_size_ = std::move(rhs.peak_size_); return *this; diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 02515ba39a160..b837356504d36 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -5,6 +5,7 @@ #include "core/framework/op_kernel.h" #include "core/framework/session_state.h" +#include "core/session/onnxruntime_c_api.h" // onnxruntime internal OpKernelContext derived class to provide additional // APIs that aren't desirable to add to the public OpKernelContext API @@ -57,7 +58,8 @@ class OpKernelContextInternal : public OpKernelContext { const bool& GetTerminateFlag() const noexcept { return terminate_flag_; } - const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); } + _Ret_maybenull_ const onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() const { return session_state_.GetThreadPool(); } + _Ret_maybenull_ onnxruntime::concurrency::ThreadPool* GetOperatorThreadPool() { return session_state_.GetThreadPool(); } private: const SessionState& session_state_; diff --git a/onnxruntime/core/framework/parallel_executor.h b/onnxruntime/core/framework/parallel_executor.h index 5f34309937bac..74d3fbce3d8d4 100644 --- a/onnxruntime/core/framework/parallel_executor.h +++ b/onnxruntime/core/framework/parallel_executor.h @@ -21,7 +21,6 @@ class ExecutionFrame; class ParallelExecutor : public IExecutor { public: - ParallelExecutor(const bool& terminate_flag = false) : terminate_flag_{terminate_flag} {} ParallelExecutor(const SessionState& session_state, const bool& terminate_flag = false); common::Status Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index 92a6f107e5058..0f64b2b943c08 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -43,8 +43,8 @@ struct MemoryPatternGroup; */ class SessionState { public: - SessionState(const ExecutionProviders& execution_providers, bool enable_mem_pattern) - : execution_providers_{execution_providers}, enable_mem_pattern_(enable_mem_pattern) {} + SessionState(const ExecutionProviders& execution_providers, bool enable_mem_pattern, concurrency::ThreadPool* thread_pool) + : execution_providers_{execution_providers}, enable_mem_pattern_(enable_mem_pattern), thread_pool_(thread_pool) {} ~SessionState() { for (auto& kvp : deleter_for_initialized_tensors_) { @@ -175,8 +175,7 @@ class SessionState { SessionState* GetMutableSubgraphSessionState(onnxruntime::NodeIndex index, const std::string& attribute_name); - onnxruntime::concurrency::ThreadPool* GetThreadPool() const { return thread_pool_; } - void SetThreadPool(onnxruntime::concurrency::ThreadPool* p_pool) { thread_pool_ = p_pool; } + concurrency::ThreadPool* GetThreadPool() const { return thread_pool_; } bool ExportDll() const { return export_fused_dll_; } void SetExportDllFlag(bool flag) { export_fused_dll_ = flag; } @@ -232,7 +231,8 @@ class SessionState { std::unordered_map>>; SubgraphSessionStateMap subgraph_session_states_; - onnxruntime::concurrency::ThreadPool* thread_pool_ = nullptr; + //It could be NULL + concurrency::ThreadPool* const thread_pool_; bool export_fused_dll_ = false; FuncManager fused_funcs_mgr_; diff --git a/onnxruntime/core/framework/tensor.cc b/onnxruntime/core/framework/tensor.cc index d0085c0fe6c1a..692232a6a8abc 100644 --- a/onnxruntime/core/framework/tensor.cc +++ b/onnxruntime/core/framework/tensor.cc @@ -47,7 +47,7 @@ void Tensor::Init(MLDataType p_type, const TensorShape& shape, void* p_raw_data, byte_offset_ = offset; } -Tensor::Tensor(Tensor&& other) +Tensor::Tensor(Tensor&& other) noexcept : p_data_(other.p_data_), buffer_deleter_(other.buffer_deleter_), shape_(other.shape_), @@ -61,7 +61,7 @@ Tensor::Tensor(Tensor&& other) other.byte_offset_ = 0; } -Tensor& Tensor::operator=(Tensor&& other) { +Tensor& Tensor::operator=(Tensor&& other) noexcept { if (this != &other) { ReleaseBuffer(); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index b191c155928d9..1b9c6e505affb 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -92,8 +92,6 @@ Module Name: #if defined(_OPENMP) #include -#elif defined(_WIN32) -#define MLAS_USE_WIN32_THREADPOOL #endif // @@ -423,10 +421,6 @@ struct MLAS_PLATFORM { uint32_t NchwcBlockSize; uint32_t PreferredBufferAlignment; #endif - -#if defined(MLAS_USE_WIN32_THREADPOOL) - int32_t MaximumThreadCount; -#endif }; extern MLAS_PLATFORM MlasPlatform; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 4f99d50fb27b0..d4f3324f10ed6 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -192,25 +192,6 @@ Return Value: } #endif - -#if defined(MLAS_USE_WIN32_THREADPOOL) - - // - // Retrieve the number of processors in the system. - // - - SYSTEM_INFO SystemInfo; - - GetSystemInfo(&SystemInfo); - - if (SystemInfo.dwNumberOfProcessors <= MLAS_MAXIMUM_THREAD_COUNT) { - this->MaximumThreadCount = int32_t(SystemInfo.dwNumberOfProcessors); - } else { - this->MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT; - } - -#endif - } size_t diff --git a/onnxruntime/core/mlas/lib/threading.cpp b/onnxruntime/core/mlas/lib/threading.cpp index 858b72722e8bc..ef30de9499bb2 100644 --- a/onnxruntime/core/mlas/lib/threading.cpp +++ b/onnxruntime/core/mlas/lib/threading.cpp @@ -16,59 +16,6 @@ Module Name: #include "mlasi.h" -#if defined(MLAS_USE_WIN32_THREADPOOL) - -// -// Define the parameters to execute threaded work using the Windows thread pool -// library. -// - -struct MLAS_THREADED_WORK_BLOCK { - volatile LONG Counter; - PMLAS_THREADED_ROUTINE ThreadedRoutine; - void* Context; -}; - -void -CALLBACK -MlasThreadedWorkCallback( - PTP_CALLBACK_INSTANCE Instance, - void* Context, - PTP_WORK WorkObject - ) -/*++ - -Routine Description: - - This routine is invoked from a worker thread to execute one iteration of a - batch of threaded work. - -Arguments: - - Instance - Supplies the callback instance object. - - Context - Supplies the pointer to the parameters for the operation. - - WorkObject - Supplies the threadpool work object. - -Return Value: - - None. - ---*/ -{ - MLAS_UNREFERENCED_PARAMETER(Instance); - MLAS_UNREFERENCED_PARAMETER(WorkObject); - - MLAS_THREADED_WORK_BLOCK* WorkBlock = (MLAS_THREADED_WORK_BLOCK*)Context; - - LONG Index = InterlockedIncrement(&WorkBlock->Counter) - 1; - - WorkBlock->ThreadedRoutine(WorkBlock->Context, Index); -} - -#endif - void MlasExecuteThreaded( MLAS_THREADED_ROUTINE ThreadedRoutine, @@ -99,48 +46,11 @@ MlasExecuteThreaded( } #endif -#if defined(MLAS_USE_WIN32_THREADPOOL) // - // Schedule the threaded iterations using a work object. + // Fallback to OpenMP or a serialized implementation. // - MLAS_THREADED_WORK_BLOCK WorkBlock; - - PTP_WORK WorkObject = CreateThreadpoolWork(MlasThreadedWorkCallback, &WorkBlock, nullptr); - - if (WorkObject != nullptr) { - - WorkBlock.Counter = 0; - WorkBlock.ThreadedRoutine = ThreadedRoutine; - WorkBlock.Context = Context; - - for (int32_t tid = 1; tid < Iterations; tid++) { - SubmitThreadpoolWork(WorkObject); - } - - // - // Execute the remaining iteration on this thread. - // - - ThreadedRoutine(Context, Iterations - 1); - - // - // Wait for the work object callbacks to complete. - // - - WaitForThreadpoolWorkCallbacks(WorkObject, FALSE); - CloseThreadpoolWork(WorkObject); - - return; - } - - // - // Fallback to a serialized implementation. - // - -#endif - // // Execute the routine for the specified number of iterations. // diff --git a/onnxruntime/core/providers/cpu/math/gemm.h b/onnxruntime/core/providers/cpu/math/gemm.h index a3aa724ab410d..225754141a6d7 100644 --- a/onnxruntime/core/providers/cpu/math/gemm.h +++ b/onnxruntime/core/providers/cpu/math/gemm.h @@ -8,6 +8,7 @@ #include "core/util/math.h" #include "core/util/math_cpuonly.h" #include "gemm_helper.h" +#include "core/framework/op_kernel_context_internal.h" namespace onnxruntime { @@ -27,6 +28,9 @@ class Gemm : public OpKernel { } Status Compute(OpKernelContext* context) const override { + auto ctx_internal = static_cast(context); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + const auto X = context->Input(0); const auto W = context->Input(1); const auto B = context->Input(2); @@ -64,7 +68,7 @@ class Gemm : public OpKernel { } // W * x - math::Gemm( + math::Gemm( trans_A_, trans_B_, M, @@ -75,7 +79,7 @@ class Gemm : public OpKernel { W->template Data(), beta_, y_data, - &CPUMathUtil::Instance()); + tp); FuseActivation(activation_, y_data, M * N, leaky_relu_alpha_); diff --git a/onnxruntime/core/providers/cpu/math/logsoftmax.cc b/onnxruntime/core/providers/cpu/math/logsoftmax.cc index 281031e71568e..19fbb9897c699 100644 --- a/onnxruntime/core/providers/cpu/math/logsoftmax.cc +++ b/onnxruntime/core/providers/cpu/math/logsoftmax.cc @@ -4,6 +4,8 @@ #include "core/providers/cpu/math/logsoftmax.h" #include "core/framework/op_kernel.h" +#include "core/framework/op_kernel_context_internal.h" + #include "core/providers/common.h" #include "core/providers/cpu/math/softmax_shared.h" #include "core/util/math.h" @@ -12,6 +14,9 @@ namespace onnxruntime { template <> Status LogSoftmax::Compute(OpKernelContext* ctx) const { + auto ctx_internal = static_cast(ctx); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + const auto* tensor_pointer = ctx->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& X = *tensor_pointer; @@ -32,7 +37,7 @@ Status LogSoftmax::Compute(OpKernelContext* ctx) const { const bool logarithmic = true; auto status = SoftmaxCPU(N, D, X.template Data(), Ydata, - scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data()); + scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data(), tp); return status; } diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc index 539157e92bd95..4f4bacc34baeb 100644 --- a/onnxruntime/core/providers/cpu/math/matmul.cc +++ b/onnxruntime/core/providers/cpu/math/matmul.cc @@ -1,6 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. - +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/cpu/math/matmul.h" #include "core/util/math.h" @@ -53,6 +53,9 @@ ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL( template Status MatMul::Compute(OpKernelContext* ctx) const { + auto ctx_internal = static_cast(ctx); + concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool(); + const auto* left_X = ctx->Input(0); const auto* right_X = ctx->Input(1); @@ -69,7 +72,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { static_cast(helper.K()), left_X->template Data() + helper.LeftOffsets()[i], right_X->template Data() + helper.RightOffsets()[i], - Y->template MutableData() + helper.OutputOffsets()[i]); + Y->template MutableData() + helper.OutputOffsets()[i], thread_pool); } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/math/matmul_helper.h b/onnxruntime/core/providers/cpu/math/matmul_helper.h index af82037a7c465..e5095e0ea1382 100644 --- a/onnxruntime/core/providers/cpu/math/matmul_helper.h +++ b/onnxruntime/core/providers/cpu/math/matmul_helper.h @@ -29,9 +29,8 @@ class MatMulComputeHelper { M_ = left_shape.SizeToDimension(left_num_dims - 1); K_ = left_shape[left_num_dims - 1]; N_ = right_shape[right_num_dims - 1]; - std::vector output_dims = left_shape.GetDims(); - output_dims[left_num_dims - 1] = N_; - output_shape_ = TensorShape(output_dims); + output_shape_ = left_shape; + output_shape_[left_num_dims - 1] = N_; output_offsets_ = {0}; left_offsets_ = {0}; right_offsets_ = {0}; diff --git a/onnxruntime/core/providers/cpu/math/softmax.cc b/onnxruntime/core/providers/cpu/math/softmax.cc index 9242967901e46..542e20e79f79c 100644 --- a/onnxruntime/core/providers/cpu/math/softmax.cc +++ b/onnxruntime/core/providers/cpu/math/softmax.cc @@ -4,6 +4,7 @@ #include "core/providers/cpu/math/softmax.h" #include "core/framework/op_kernel.h" +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/common.h" #include "core/providers/cpu/math/softmax_shared.h" #include "core/util/math.h" @@ -12,6 +13,9 @@ namespace onnxruntime { template <> Status Softmax::Compute(OpKernelContext* ctx) const { + auto ctx_internal = static_cast(ctx); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + const auto* tensor_pointer = ctx->Input(0); if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch"); const Tensor& X = *tensor_pointer; @@ -34,7 +38,7 @@ Status Softmax::Compute(OpKernelContext* ctx) const { const bool logarithmic = false; auto status = SoftmaxCPU(N, D, X.template Data(), Ydata, - scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data()); + scale_.data(), sum_multiplier_.data(), logarithmic, rowmax_.data(), tp); return status; } diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc index 7dd3a10cfc598..18277f6b4137c 100644 --- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc +++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc @@ -31,6 +31,7 @@ #endif #include "core/providers/cpu/math/softmax_shared.h" + #include "core/util/math.h" #include "core/util/math_cpuonly.h" @@ -46,7 +47,7 @@ common::Status SoftmaxCPU(const int64_t N, float* scale, const float* sum_multiplier, bool logarithmic, - float* rowmax) { + float* rowmax, onnxruntime::concurrency::ThreadPool* tp) { // the Math functions SoftmaxCPU uses only support int32_t as input, so enforce that if (N * D > INT32_MAX || N > INT32_MAX || D > INT32_MAX) { std::ostringstream ss; @@ -65,7 +66,7 @@ common::Status SoftmaxCPU(const int64_t N, // Put the intermediate result X - max(X) into Y by first copying X to Y, and then subtracting max from each entry gsl::copy(gsl::make_span(Xdata, nd), gsl::make_span(Ydata, nd)); - math::Gemm(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, nullptr); + math::Gemm(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, tp); // Exponentiation math::Exp(nd, Ydata, Ydata, nullptr); diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.h b/onnxruntime/core/providers/cpu/math/softmax_shared.h index 3439b9717f051..26ffeb193fe4f 100644 --- a/onnxruntime/core/providers/cpu/math/softmax_shared.h +++ b/onnxruntime/core/providers/cpu/math/softmax_shared.h @@ -6,6 +6,9 @@ #include "core/common/status.h" namespace onnxruntime { +namespace concurrency { +class ThreadPool; +} /** Calculate Softmax using CPU memory. @param N Number of rows @@ -18,5 +21,5 @@ Calculate Softmax using CPU memory. @param rowmax Storage for calculation of maximum in each row. Size must be >= N. */ common::Status SoftmaxCPU(int64_t N, int64_t D, const float* Xdata, float* Ydata, float* scale, - const float* sum_multiplier, bool logarithmic, float* rowmax); + const float* sum_multiplier, bool logarithmic, float* rowmax, concurrency::ThreadPool* tp); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index c3acbd02a62c5..c0091936704d8 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -14,6 +14,7 @@ * limitations under the License. */ /* Modifications Copyright (c) Microsoft. */ +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/cpu/nn/conv.h" #include "core/framework/op_kernel_context_internal.h" @@ -24,6 +25,8 @@ namespace onnxruntime { template Status Conv::Compute(OpKernelContext* context) const { size_t num_inputs = OpKernel::Node().InputDefs().size(); + auto ctx_internal = static_cast(context); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); const auto* X = context->Input(0); const auto* W = context->Input(1); @@ -116,7 +119,7 @@ Status Conv::Compute(OpKernelContext* context) const { col_buffer_data, &CPUMathUtil::Instance()); } - math::Gemm( + math::Gemm( CblasNoTrans, CblasNoTrans, M / group_, @@ -127,7 +130,7 @@ Status Conv::Compute(OpKernelContext* context) const { col_buffer_data, 0, Ydata + group_id * Y_offset, - &CPUMathUtil::Instance()); + tp); } if (B != nullptr) { @@ -144,6 +147,9 @@ Status Conv::Compute(OpKernelContext* context) const { } Status Conv::Compute(OpKernelContext* context) const { + auto ctx_internal = static_cast(context); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + size_t num_inputs = OpKernel::Node().InputDefs().size(); const auto* X = context->Input(0); const auto* W = context->Input(1); @@ -186,11 +192,6 @@ Status Conv::Compute(OpKernelContext* context) const { const size_t kernel_rank = kernel_shape.size(); if (kernel_rank == 2 || kernel_rank == 3) { - // Get access to the internal threadpool - // Temporarily derive concurrency parameters without access to session state - auto ctx_internal = static_cast(context); - auto thread_pool = ctx_internal->GetOperatorThreadPool(); - MLAS_CONV_PARAMETERS Parameters; size_t WorkingBufferSize; MlasConvPrepare(&Parameters, @@ -207,7 +208,7 @@ Status Conv::Compute(OpKernelContext* context) const { static_cast(M / group_), &activation_, &WorkingBufferSize, - const_cast(thread_pool)); + tp); auto working_data = WorkingBufferSize > 0 ? alloc->Alloc(sizeof(float) * WorkingBufferSize) : nullptr; BufferUniquePtr working_buffer(working_data, BufferDeleter(alloc)); @@ -218,7 +219,7 @@ Status Conv::Compute(OpKernelContext* context) const { Bdata, static_cast(working_buffer.get()), Ydata, - const_cast(thread_pool)); + tp); } else { const int64_t input_image_size = input_shape.Size(); const int64_t output_image_size = output_shape.Size(); @@ -253,7 +254,7 @@ Status Conv::Compute(OpKernelContext* context) const { static_cast(kernel_shape.size()), col_buffer_data, &CPUMathUtil::Instance()); - math::Gemm( + math::Gemm( CblasNoTrans, CblasNoTrans, M / group_, @@ -264,7 +265,7 @@ Status Conv::Compute(OpKernelContext* context) const { col_buffer_data, 0, Ydata + group_id * Y_offset, - &CPUMathUtil::Instance()); + tp); } MlasActivation(&activation_, Ydata, Bdata, M, output_image_size, output_image_size); diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc index 14f13ccd20198..9fd9cd1502147 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc @@ -16,6 +16,8 @@ /* Modifications Copyright (c) Microsoft. */ #include "core/providers/cpu/nn/conv_transpose.h" +#include "core/framework/op_kernel_context_internal.h" + #include "core/util/math.h" #include "core/util/math_cpuonly.h" @@ -228,6 +230,9 @@ Status ConvTranspose::Compute(OpKernelContext* context) const { template Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const { + auto ctx_internal = static_cast(context); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); + size_t num_inputs = OpKernel::Node().InputDefs().size(); Prepare p; bool has_bias = dynamic_padding ? num_inputs == 4 : num_inputs == 3; @@ -254,7 +259,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ for (auto image_id = 0; image_id < p.N; ++image_id) { for (int group_id = 0; group_id < group_; ++group_id) { // Weight term - math::Gemm( + math::Gemm( CblasTrans, CblasNoTrans, kernel_dim, @@ -265,7 +270,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ Xdata + group_id * X_offset, 0, col_buffer_data, - &CPUMathUtil::Instance()); + tp); // Col2im math::Col2im( diff --git a/onnxruntime/core/providers/cpu/nn/pool.cc b/onnxruntime/core/providers/cpu/nn/pool.cc index 367a9256a0c16..47bc8fc856bb3 100644 --- a/onnxruntime/core/providers/cpu/nn/pool.cc +++ b/onnxruntime/core/providers/cpu/nn/pool.cc @@ -190,7 +190,7 @@ Status PoolBase::Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const // Get access to the internal threadpool // Temporarily derive concurrency parameters without access to session state auto ctx_internal = static_cast(context); - auto thread_pool = ctx_internal->GetOperatorThreadPool(); + concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool(); MlasPool(kind, pooling_dims, diff --git a/onnxruntime/core/providers/cpu/nn/pool_base.h b/onnxruntime/core/providers/cpu/nn/pool_base.h index 43f81982dd3a9..606ac909f08f1 100644 --- a/onnxruntime/core/providers/cpu/nn/pool_base.h +++ b/onnxruntime/core/providers/cpu/nn/pool_base.h @@ -99,10 +99,13 @@ class LpPool { }; class PoolBase { + private: + static bool IsGlobalPooling(const std::string& op_name) { + return op_name == "GlobalAveragePool" || op_name == "GlobalMaxPool" || op_name == "GlobalLpPool"; + } + protected: - PoolBase(const OpKernelInfo& info) { - op_name_ = info.GetKernelDef().OpName(); - global_pooling_ = (op_name_ == "GlobalAveragePool" || op_name_ == "GlobalMaxPool" || op_name_ == "GlobalLpPool"); + PoolBase(const OpKernelInfo& info) : op_name_(info.GetKernelDef().OpName()), global_pooling_(IsGlobalPooling(op_name_)) { int end; info.GetKernelDef().SinceVersion(&start_version_, &end); @@ -256,8 +259,8 @@ class PoolBase { Status Compute(OpKernelContext* context, MLAS_POOLING_KIND kind) const; protected: - std::string op_name_; - bool global_pooling_{}; + const std::string op_name_; + const bool global_pooling_; bool count_include_pad_{}; int64_t storage_order_{0}; // MaxPool_8 only. 0 is row major, and 1 is column major. Default is 0. int64_t ceil_mode_{0}; // Introduced in MaxPool_10 diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc index c5be268f59e2d..0dd13269bfacd 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_gru.cc @@ -1,5 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/platform/threadpool.h" +#include "core/framework/op_kernel_context_internal.h" // there's no way to use a raw pointer as the copy destination with std::copy_n // (which gsl::copy uses with span::data() which returns a raw pointer) with the 14.11 toolset @@ -167,7 +169,8 @@ class UniDirectionalGru { UniDirectionalGru(AllocatorPtr allocator, int seq_length, int batch_size, int input_size, int hidden_size, bool linear_before_reset, Direction direction, const gsl::span& bias, const gsl::span& initial_hidden_state, const ActivationFuncs::Entry& activation_func_f, - const ActivationFuncs::Entry& activation_func_g, float clip); + const ActivationFuncs::Entry& activation_func_g, float clip, + onnxruntime::concurrency::ThreadPool* ttp); void Compute(const gsl::span& inputs, const gsl::span& sequence_lengths, int num_directions, const gsl::span& input_weights, const gsl::span& recurrent_weights, @@ -233,6 +236,8 @@ class UniDirectionalGru { deepcpu::GruOutputGateFuncPtr output_gate_{}; void AllocateBuffers(); + + onnxruntime::concurrency::ThreadPool* ttp_; }; } // namespace detail @@ -263,6 +268,9 @@ Status DeepCpuGruOp::Compute(OpKernelContext* context) const { template Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const { + auto ctx_internal = static_cast(&context); + concurrency::ThreadPool* thread_pool = ctx_internal->GetOperatorThreadPool(); + const Tensor& X = *context.Input(0); // inputs. [seq_length, batch_size, input_size] const Tensor& W = *context.Input(1); // weights. [num_directions, 3*hidden_size, input_size] const Tensor& R = *context.Input(2); // recurrence weights. [num_directions, 3*hidden_size, hidden_size] @@ -367,7 +375,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const { linear_before_reset_, Direction::kForward, bias_1, initial_hidden_1, activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], - clip_); + clip_, thread_pool); fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1); @@ -375,7 +383,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const { linear_before_reset_, Direction::kReverse, bias_2, initial_hidden_2, activation_funcs_.Entries()[2], activation_funcs_.Entries()[3], - clip_); + clip_, thread_pool); bw.Compute(input, sequence_lens_span, num_directions_, input_weights_2, recurrent_weights_2, output_2, hidden_output_2); } else { @@ -383,7 +391,7 @@ Status DeepCpuGruOp::ComputeImpl(OpKernelContext& context) const { linear_before_reset_, direction_, bias_1, initial_hidden_1, activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], - clip_); + clip_, thread_pool); gru_p.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1); } @@ -412,7 +420,7 @@ UniDirectionalGru::UniDirectionalGru(AllocatorPtr allocator, const gsl::span& initial_hidden_state, const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g, - const float clip) + const float clip, onnxruntime::concurrency::ThreadPool* ttp) : allocator_(allocator), seq_length_(seq_length), batch_size_(batch_size), @@ -421,7 +429,8 @@ UniDirectionalGru::UniDirectionalGru(AllocatorPtr allocator, linear_before_reset_(linear_before_reset), clip_(clip), direction_(direction), - use_bias_(!bias.empty()) { + use_bias_(!bias.empty()), + ttp_(ttp) { clip_with_bias_ptr_ = use_bias_ ? deepcpu::clip_add_bias : deepcpu::clip_ignore_bias; // setup activation function pointers and alpha/beta values to use with them @@ -540,7 +549,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, input_weights.cbegin(), input_weights.cend(), input_size_, beta, outputZRH_.begin(), outputZRH_.end(), - hidden_size_x3); + hidden_size_x3, ttp_); DumpMatrix("inputs with weights applied", outputZRH_.data(), seq_length_ * batch_size_ * 3, hidden_size_); @@ -606,7 +615,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, recurrent_weightsZR.cbegin(), recurrent_weightsZR.cend(), hidden_size_, beta, outputZRH_.begin() + out_added_offset, outputZRH_.end(), - hidden_size_x3); + hidden_size_x3, ttp_); DumpMatrix("Ht-1 * R[zr] + Xt*(W[zr]^T)" + seqno_str, outputZRH_.data() + out_added_offset, batch_size_, hidden_size_x2, 0, hidden_size_x3); @@ -622,7 +631,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, recurrent_weightsH.cbegin(), recurrent_weightsH.cend(), // Rh^T hidden_size_, beta, linear_output_.begin(), linear_output_.end(), // pre: Rbh, post:output - hidden_size_); + hidden_size_, ttp_); DumpMatrix("Ht-1 * (Rh^T) + Rbh " + seqno_str, linear_output_.data(), batch_size_, hidden_size_); } @@ -693,7 +702,7 @@ void UniDirectionalGru::Compute(const gsl::span& inputs_arg, recurrent_weightsH.cbegin(), recurrent_weightsH.cend(), // Rh^T hidden_size_, beta, out_H, outputZRH_.end(), - hidden_size_x3); + hidden_size_x3, ttp_); } DumpMatrix("Xt*(Wh^T) + (" + label + ")" + seqno_str, outputZRH_.data() + out_added_offset, diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc index 8f4e8236981f8..682dabd9262ca 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.cc @@ -9,6 +9,9 @@ #pragma warning(disable : 4996) #endif +#include "core/platform/threadpool.h" +#include "core/framework/op_kernel_context_internal.h" + #include "core/providers/cpu/rnn/deep_cpu_lstm.h" #include "core/common/common.h" @@ -193,7 +196,8 @@ class UniDirectionalLstm { const gsl::span& initial_hidden_state, const gsl::span& initial_cell_state, const ActivationFuncs::Entry& activation_func_f, const ActivationFuncs::Entry& activation_func_g, const ActivationFuncs::Entry& activation_func_h, float clip, - onnxruntime::concurrency::ThreadPool& ttp); + concurrency::ThreadPool& lstm_tp_, + concurrency::ThreadPool* mlas_tp_); void Compute(const gsl::span& inputs, const gsl::span& sequence_lengths, int num_directions, const gsl::span& input_weights, const gsl::span& recurrent_weights, @@ -275,7 +279,8 @@ class UniDirectionalLstm { ActivationInfo activation_g_; ActivationInfo activation_h_; - onnxruntime::concurrency::ThreadPool& ttp_; + concurrency::ThreadPool& lstm_tp_; + concurrency::ThreadPool* mlas_tp_; }; } // namespace detail @@ -309,6 +314,9 @@ DeepCpuLstmOp::Compute(OpKernelContext* context) const { template Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { + auto ctx_internal = static_cast(&context); + concurrency::ThreadPool* mlas_thread_pool = ctx_internal->GetOperatorThreadPool(); + auto& logger = context.Logger(); const Tensor& X = *context.Input(0); // inputs. [seq_length, batch_size, input_size] @@ -452,7 +460,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], - clip_, ttp_); + clip_, lstm_tp_, mlas_thread_pool); detail::UniDirectionalLstm bw(alloc, logger, seq_length, batch_size, input_size, hidden_size_, Direction::kReverse, input_forget_, @@ -460,7 +468,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[3], activation_funcs_.Entries()[4], activation_funcs_.Entries()[5], - clip_, ttp_); + clip_, lstm_tp_, mlas_thread_pool); fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); @@ -473,7 +481,7 @@ Status DeepCpuLstmOp::ComputeImpl(OpKernelContext& context) const { activation_funcs_.Entries()[0], activation_funcs_.Entries()[1], activation_funcs_.Entries()[2], - clip_, ttp_); + clip_, lstm_tp_, mlas_thread_pool); fw.Compute(input, sequence_lens_span, num_directions_, input_weights_1, recurrent_weights_1, output_1, hidden_output_1, last_cell_1); @@ -546,7 +554,8 @@ UniDirectionalLstm::UniDirectionalLstm(AllocatorPtr allocator, const ActivationFuncs::Entry& activation_func_g, const ActivationFuncs::Entry& activation_func_h, const float clip, - onnxruntime::concurrency::ThreadPool& ttp) + concurrency::ThreadPool& lstm_tp, + concurrency::ThreadPool* mlas_tp) : allocator_(allocator), logger_(logger), seq_length_(seq_length), @@ -558,7 +567,8 @@ UniDirectionalLstm::UniDirectionalLstm(AllocatorPtr allocator, clip_(clip), use_bias_(!bias.empty()), use_peepholes_(!peephole_weights.empty()), - ttp_(ttp) { + lstm_tp_(lstm_tp), + mlas_tp_(mlas_tp) { activation_f_ = {deepcpu::ActivationFuncByName(activation_func_f.name), activation_func_f.alpha, activation_func_f.beta}; @@ -774,7 +784,7 @@ void UniDirectionalLstm::Compute(const gsl::span& inputs_arg, input_weights.cbegin(), input_weights.cend(), // W[iofc] input_size_, beta, output_iofc_.begin(), output_iofc_.end(), - hidden_size_x4); + hidden_size_x4, mlas_tp_); DumpMatrix("Xt*(W[iofc]^T)", output_iofc_.data(), total_rows, hidden_size_x4); @@ -823,7 +833,7 @@ void UniDirectionalLstm::Compute(const gsl::span& inputs_arg, recurrent_weights.cbegin(), recurrent_weights.cend(), // R[iofc] hidden_size_, beta, step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T) - hidden_size_x4); + hidden_size_x4, mlas_tp_); DumpMatrix("Xt*(W[iofc]^T) + Ht-t*R[iofc]" + row_str, &*step_out_IOFC, local_fused_hidden_rows, hidden_size_x4); @@ -874,7 +884,7 @@ void UniDirectionalLstm::Compute(const gsl::span& inputs_arg, } }; - ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, ttp_, logger_); + ExecuteLambdaInParallel("Processing batch", hidden_gemm_and_activations, batch_size_, fused_hidden_rows, lstm_tp_, logger_); } else { span_T_iter c_prev = batched_internal_state_prev_one_step.begin(); @@ -901,7 +911,7 @@ void UniDirectionalLstm::Compute(const gsl::span& inputs_arg, recurrent_weights.cbegin(), recurrent_weights.cend(), // R[iofc] hidden_size_, beta, step_out_IOFC, output_iofc_.end(), // input contains Xt*(W[iofc]^T) - hidden_size_x4); + hidden_size_x4, mlas_tp_); span_T_iter batched_output; span_T_iter batched_output_end; diff --git a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h index 606dfbf5b190c..faf32e3a77a2f 100644 --- a/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h +++ b/onnxruntime/core/providers/cpu/rnn/deep_cpu_lstm.h @@ -82,8 +82,8 @@ class DeepCpuLstmOp final : public OpKernel { // across them. mutable due to this. // The alternative would be to create a threadpool in each call to Compute but that would incur thread creation // cost on every call. - mutable onnxruntime::concurrency::ThreadPool ttp_{"DEEPCPU_LSTM", - static_cast(std::thread::hardware_concurrency())}; + mutable onnxruntime::concurrency::ThreadPool lstm_tp_{"DEEPCPU_LSTM", + static_cast(std::thread::hardware_concurrency())}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/rnn/rnn.cc b/onnxruntime/core/providers/cpu/rnn/rnn.cc index 4030d65a94d45..1576b5192da68 100644 --- a/onnxruntime/core/providers/cpu/rnn/rnn.cc +++ b/onnxruntime/core/providers/cpu/rnn/rnn.cc @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/cpu/rnn/rnn.h" #include "core/providers/cpu/rnn/rnn_activation_functors.h" @@ -99,6 +100,8 @@ using EigenMatrixMapRowMajor = Eigen::Map< template <> Status RNN::Compute(OpKernelContext* ctx) const { using namespace rnn::detail; + auto ctx_internal = static_cast(ctx); + concurrency::ThreadPool* tp = ctx_internal->GetOperatorThreadPool(); // inputs const Tensor& X = *ctx->Input(0); @@ -160,7 +163,7 @@ Status RNN::Compute(OpKernelContext* ctx) const { } // X * W[direction]^t + B - math::Gemm( + math::Gemm( CblasNoTrans, CblasTrans, static_cast(seq_length * batch_size), @@ -171,7 +174,7 @@ Status RNN::Compute(OpKernelContext* ctx) const { W.template Data() + direction * hidden_size_ * input_size, 1, x_matmul_w_buffer_data, - &CPUMathUtil::Instance()); + tp); for (int64_t t = 0; t < seq_length; t++) { int64_t time_step = isReverse ? (seq_length - t - 1) : t; @@ -192,7 +195,7 @@ Status RNN::Compute(OpKernelContext* ctx) const { if (h_prev != nullptr) { // H_t_1 * R[direction]^t - math::Gemm( + math::Gemm( CblasNoTrans, CblasTrans, static_cast(batch_size), @@ -203,7 +206,7 @@ Status RNN::Compute(OpKernelContext* ctx) const { R.template Data() + direction * hidden_size_ * hidden_size_, 0, Y_buffer_data_current_frame, - &CPUMathUtil::Instance()); + tp); } else { math::Set(batch_size * hidden_size_, 0, Y_buffer_data_current_frame, &CPUMathUtil::Instance()); } diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h index 2e3e5f88d72ec..f1038e63a350e 100644 --- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h +++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h @@ -159,7 +159,7 @@ void ComputeGemm(const int M, const float beta, TSpanCIter C, TSpanCIter C_end, - const int ldc) { + const int ldc, concurrency::ThreadPool* tp) { // validate all the inputs // need to use the lda/ldb/ldc strides which should be >= the columns for the span ORT_ENFORCE(lda >= K && ldb >= K && ldc >= N); @@ -167,12 +167,12 @@ void ComputeGemm(const int M, ORT_ENFORCE(B + (N * ldb - (ldb - K)) <= B_end); ORT_ENFORCE(C + (M * ldc - (ldc - N)) <= C_end); - ::onnxruntime::math::GemmEx( + ::onnxruntime::math::GemmEx( CblasNoTrans, CblasTrans, M, N, K, alpha, &*A, lda, &*B, ldb, beta, - &*C, ldc, &CPUMathUtil::Instance()); + &*C, ldc, tp); } // helper to convert a span to a raw pointer diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index c326d25ef17a0..0f8da8eaff2a6 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -10,7 +10,7 @@ #include "Eigen/src/Core/arch/GPU/Half.h" #include "core/common/common.h" -#if defined(USE_MLAS) && defined(_M_AMD64) +#if defined(_M_AMD64) #include "core/mlas/inc/mlas.h" #endif @@ -40,7 +40,7 @@ inline void CastData(const Tensor* in, Tensor* out, const Tens auto out_data = out->template MutableData(); auto in_data = in->template Data(); auto shape_size = shape.Size(); -#if defined(USE_MLAS) && defined(_M_AMD64) +#if defined(_M_AMD64) MlasConvertHalfToFloatBuffer(&in_data[0].val, out_data, shape_size); #else auto in_vector = ConstEigenVectorMap(static_cast(static_cast(in_data)), shape_size); diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index d1f9041c9253f..539d92bd4323a 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -87,6 +87,7 @@ Internal copy node } Environment::~Environment() { + delete tp_; ::google::protobuf::ShutdownProtobufLibrary(); } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 678163f92c1fa..1d7623d0165a6 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -90,15 +90,26 @@ inline std::basic_string GetCurrentTimeString() { OrtStrftime(time_str, sizeof(time_str), GetDateFormatString(), &local_tm); return std::basic_string(time_str); } + +concurrency::ThreadPool* CreateThreadPool(int size) { + if (size == 1) + return nullptr; + if (size <= 0) + size = std::max(std::thread::hardware_concurrency() - 1, 1); + return new concurrency::ThreadPool("SESSION", size); +} + } // namespace InferenceSession::InferenceSession(const SessionOptions& session_options, logging::LoggingManager* logging_manager) : session_options_{session_options}, - graph_transformation_mgr_{session_options_.max_num_graph_transformation_steps}, + graph_transformation_mgr_{session_options.max_num_graph_transformation_steps}, logging_manager_{logging_manager}, + thread_pool_(CreateThreadPool(session_options.session_thread_pool_size)), session_state_(execution_providers_, - session_options.enable_mem_pattern && session_options.enable_sequential_execution), + session_options.enable_mem_pattern && session_options.enable_sequential_execution, + thread_pool_.get()), insert_cast_transformer_{"CastFloat16Transformer"} { ORT_ENFORCE(Environment::IsInitialized(), "Environment must be initialized before creating an InferenceSession."); @@ -106,18 +117,6 @@ InferenceSession::InferenceSession(const SessionOptions& session_options, InitLogger(logging_manager); session_state_.SetDataTransferMgr(&data_transfer_mgr_); - - // The threadpool is currently evolving. We will always create a per session threadpool. - // Beyond this, we will create a global thread pool to share across sessions. - { - int pool_size = session_options_.session_thread_pool_size <= 0 - ? std::thread::hardware_concurrency() / 2 - : session_options_.session_thread_pool_size; - - thread_pool_ = std::make_unique("SESSION", pool_size); - } - - session_state_.SetThreadPool(thread_pool_.get()); session_profiler_.Initialize(session_logger_); session_state_.SetProfiler(session_profiler_); if (session_options.enable_profiling) { @@ -398,11 +397,9 @@ common::Status InferenceSession::CreateSubgraphSessionState(Graph& graph, Sessio ORT_ENFORCE(subgraph, "Main Graph instance should have populated all subgraphs when being resolved."); auto subgraph_session_state = - std::make_unique(execution_providers_, session_state.GetEnableMemoryPattern()); + std::make_unique(execution_providers_, session_state.GetEnableMemoryPattern(), session_state.GetThreadPool()); subgraph_session_state->SetProfiler(session_profiler_); subgraph_session_state->SetLogger(*session_logger_); - // Pass threadpool to subgraph - subgraph_session_state->SetThreadPool(session_state.GetThreadPool()); // Pass data transfer manager to subgraph. subgraph_session_state->SetDataTransferMgr(&session_state.GetDataTransferMgr()); // Pass fused function manager to subgraph @@ -534,7 +531,7 @@ common::Status InferenceSession::Initialize() { ORT_RETURN_IF_ERROR(Model::Save(*model_, session_options_.optimized_model_filepath)); } else { LOGS(*session_logger_, WARNING) << "Serializing Optimized ONNX model with Graph Optimization" - " level greater than 2 is not supported."; + " level greater than 2 is not supported."; } } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 219002507b8ec..80f550cfdccb2 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -67,9 +67,9 @@ struct OrtEnv { auto v = reinterpret_cast(value); \ auto& tensor = v->Get(); -#define TENSOR_READWRITE_API_BEGIN \ - API_IMPL_BEGIN \ - auto v = reinterpret_cast<::OrtValue*>(value); \ +#define TENSOR_READWRITE_API_BEGIN \ + API_IMPL_BEGIN \ + auto v = (value); \ auto tensor = v->GetMutable(); class LoggingWrapper : public ISink { @@ -416,7 +416,7 @@ ORT_API_STATUS_IMPL(OrtCreateSessionFromArray, _In_ const OrtEnv* env, _In_ cons } ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess, - _In_ const OrtRunOptions* run_options, + _In_opt_ const OrtRunOptions* run_options, _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len, _In_ const char* const* output_names1, size_t output_names_len, _Outptr_ OrtValue** output) { API_IMPL_BEGIN @@ -449,7 +449,7 @@ ORT_API_STATUS_IMPL(OrtRun, _Inout_ OrtSession* sess, std::vector fetches(output_names_len); for (size_t i = 0; i != output_names_len; ++i) { if (output[i] != nullptr) { - ::OrtValue& value = *reinterpret_cast<::OrtValue*>(output[i]); + ::OrtValue& value = *(output[i]); if (value.Fence()) value.Fence()->BeforeUsingAsOutput(onnxruntime::kCpuExecutionProvider, queue_id); fetches[i] = value; @@ -520,9 +520,9 @@ ORT_API_STATUS_IMPL(OrtGetStringTensorContent, _In_ const OrtValue* value, if ((!_status.IsOK())) return ToOrtStatus(_status); \ } while (0) -#define DEFINE_RELEASE_ORT_OBJECT_FUNCTION(INPUT_TYPE, REAL_TYPE) \ - ORT_API(void, OrtRelease##INPUT_TYPE, Ort##INPUT_TYPE* value) { \ - delete reinterpret_cast(value); \ +#define DEFINE_RELEASE_ORT_OBJECT_FUNCTION(INPUT_TYPE, REAL_TYPE) \ + ORT_API(void, OrtRelease##INPUT_TYPE, _Frees_ptr_opt_ Ort##INPUT_TYPE* value) { \ + delete reinterpret_cast(value); \ } ORT_API_STATUS_IMPL(OrtSessionGetInputCount, _In_ const OrtSession* sess, _Out_ size_t* out) { diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h index 21593c69326ee..7e200b7b6a7ad 100644 --- a/onnxruntime/core/util/math.h +++ b/onnxruntime/core/util/math.h @@ -35,6 +35,9 @@ extern "C" { #include "core/framework/tensor.h" namespace onnxruntime { +namespace concurrency { +class ThreadPool; +} enum StorageOrder { UNKNOWN = 0, @@ -74,7 +77,7 @@ void MatMul( int K, const T* A, const T* B, - T* C); + T* C, concurrency::ThreadPool* threadpool); // Decaf gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. @@ -90,7 +93,7 @@ void Gemm( const T* B, float beta, T* C, - Provider* provider); + Provider*); // We also provide a gemm that has explicit lda, ldb and ldc specified. // In most cases you probably want to use the function above, though. @@ -109,7 +112,7 @@ void GemmEx( T beta, T* C, int ldc, - Provider* provider); + Provider*); // Gemv always takes in a M*N matrix A, and depending on whether we set TransA // to Trans, the output is: diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 822d58ec63140..9cf66e0037852 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -20,16 +20,17 @@ #include "core/util/math_cpuonly.h" #include "core/mlas/inc/mlas.h" #include "Eigen/src/Core/arch/GPU/Half.h" +using onnxruntime::concurrency::ThreadPool; namespace onnxruntime { namespace math { // MatMul implementation purely based on Eigen. -#define EIGEN_MATMUL_FUNCTION(T) \ - template <> \ - void MatMul(int M, int N, int K, const T* A, const T* B, T* C) { \ - auto C_mat = EigenMatrixMap(C, N, M); \ - C_mat.noalias() = ConstEigenMatrixMap(B, N, K) * ConstEigenMatrixMap(A, K, M); \ +#define EIGEN_MATMUL_FUNCTION(T) \ + template <> \ + void MatMul(int M, int N, int K, const T* A, const T* B, T* C, concurrency::ThreadPool*) { \ + auto C_mat = EigenMatrixMap(C, N, M); \ + C_mat.noalias() = ConstEigenMatrixMap(B, N, K) * ConstEigenMatrixMap(A, K, M); \ } EIGEN_MATMUL_FUNCTION(int32_t) @@ -44,7 +45,7 @@ EIGEN_MATMUL_FUNCTION(uint64_t) // CBLAS call or the Eigen implementation. //////////////////////////////////////////////////////////////////////////////// // when USE_MKLML is defined, use cblas APIs for MKLML -#if defined(USE_EIGEN_FOR_BLAS) && !defined(USE_MKLML_FOR_BLAS) +#if !defined(USE_MKLML_FOR_BLAS) // Caffe2 gemm provides a simpler interface to the gemm functions, with the // limitation that the data has to be contiguous in memory. @@ -62,28 +63,26 @@ EIGEN_MATMUL_FUNCTION(uint64_t) // (transpose) if the argument TransA or TransB is set to CblasNoTrans or // CblasTrans, respectively, for each of A and B. template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, - float* C, CPUMathUtil* /*provider*/) { +void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, + const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, + float* C, ThreadPool* threadpool) { int lda = static_cast((TransA == CblasNoTrans) ? K : M); int ldb = static_cast((TransB == CblasNoTrans) ? N : K); - // TODO: Make this use the operator threadpool - MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, nullptr); + MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, threadpool); } template <> -void MatMul(int M, int N, int K, const float* A, const float* B, float* C) { - // TODO: Make this use the operator threadpool - MlasSgemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, nullptr); +void MatMul(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool* threadpool) { + MlasSgemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, threadpool); } EIGEN_MATMUL_FUNCTION(double) template <> -void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K, - float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, - int ldc, CPUMathUtil*) { - MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nullptr); +void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, + int ldc, ThreadPool* threadpool) { + MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool); } template <> @@ -125,12 +124,12 @@ void Gemv(const CBLAS_TRANSPOSE TransA, int M, int N, float SPECIALIZED_AXPY(float) #undef SPECIALIZED_AXPY -#else // USE_EIGEN_FOR_BLAS +#else template <> -void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, - const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, - float* C, CPUMathUtil* /*context*/) { +void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, + const int64_t N, const int64_t K, float alpha, const float* A, const float* B, float beta, + float* C, ThreadPool* /*context*/) { int lda = gsl::narrow_cast((TransA == CblasNoTrans) ? K : M); int ldb = gsl::narrow_cast((TransB == CblasNoTrans) ? N : K); cblas_sgemm(CblasRowMajor, TransA, TransB, @@ -142,19 +141,19 @@ void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOS } template <> -void MatMul(int M, int N, int K, const float* A, const float* B, float* C) { +void MatMul(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool*) { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N); } template <> -void MatMul(int M, int N, int K, const double* A, const double* B, double* C) { +void MatMul(int M, int N, int K, const double* A, const double* B, double* C, ThreadPool*) { cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N); } template <> -void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K, - float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, - int ldc, CPUMathUtil* /*context*/) { +void GemmEx(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, + int ldc, ThreadPool* /*context*/) { cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -177,7 +176,7 @@ void Gemv(const CBLAS_TRANSPOSE TransA, int M, int N, float CAFFE2_SPECIALIZED_AXPY(float, s) #undef CAFFE2_SPECIALIZED_AXPY -#endif // USE_EIGEN_FOR_BLAS +#endif #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ template <> \ diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index c555c9ba21900..e51cf53a8235a 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -155,12 +155,13 @@ class PlannerTest : public ::testing::Test { std::vector> op_kernel_infos_; std::vector> kernel_bindings_; ExecutionProviders execution_providers_; + concurrency::ThreadPool tp_; SessionState state_; ShapeMap shape_map_; std::unique_ptr plan_; public: - PlannerTest() : model_("test"), graph_{model_.MainGraph()}, state_{execution_providers_, false} { + PlannerTest() : model_("test"), graph_(model_.MainGraph()), tp_("test", 1), state_(execution_providers_, false, &tp_) { std_kernel_ = KernelDefBuilder().SetName("Transpose").Build(); in_place_kernel_ = KernelDefBuilder().SetName("Clip").MayInplace(0, 0).Build(); CPUExecutionProviderInfo epi; diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index 48e2ce8c3b0e6..62f87ebd073a1 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -36,7 +36,12 @@ std::unique_ptr CreateCPUExecutionProvider() { return std::make_unique(info); } -TEST(ExecutionFrameTest, TensorAllocationTest) { +class ExecutionFrameTest : public ::testing::Test { + protected: + concurrency::ThreadPool tp_{"test", 1}; +}; + +TEST_F(ExecutionFrameTest, TensorAllocationTest) { onnxruntime::Model model("test"); onnxruntime::Graph& graph = model.MainGraph(); TypeProto tensor_float; @@ -57,7 +62,7 @@ TEST(ExecutionFrameTest, TensorAllocationTest) { status = kernel_registry_manager.RegisterKernels(execution_providers); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); - SessionState state{execution_providers, true}; + SessionState state{execution_providers, true, &tp_}; state.SetGraphViewer(std::make_unique(graph)); OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()}; @@ -111,7 +116,7 @@ TEST(ExecutionFrameTest, TensorAllocationTest) { EXPECT_EQ(tensor2->template Data(), p_tensor->template Data()); } -TEST(ExecutionFrameTest, FeedInDataTest) { +TEST_F(ExecutionFrameTest, FeedInDataTest) { onnxruntime::Model model("test"); onnxruntime::Graph& graph = model.MainGraph(); TypeProto tensor_float; @@ -140,7 +145,7 @@ TEST(ExecutionFrameTest, FeedInDataTest) { execution_providers.Add(xp_typ, std::move(cpu_xp)); EXPECT_TRUE(kernel_registry_manager.RegisterKernels(execution_providers).IsOK()); - SessionState state{execution_providers, true}; + SessionState state{execution_providers, true, &tp_}; state.SetGraphViewer(std::make_unique(graph)); OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()}; @@ -160,7 +165,7 @@ TEST(ExecutionFrameTest, FeedInDataTest) { EXPECT_EQ(p_tensor_arg_0->MutableData(), value.GetMutable()->MutableData()); } -TEST(ExecutionFrameTest, MemPatternTest) { +TEST_F(ExecutionFrameTest, MemPatternTest) { auto cpu_xp = CreateCPUExecutionProvider(); auto xp_type = cpu_xp->Type(); std::unordered_map domain_to_version; @@ -192,7 +197,7 @@ TEST(ExecutionFrameTest, MemPatternTest) { execution_providers.Add(xp_type, std::move(cpu_xp)); kernel_registry_manager.RegisterKernels(execution_providers); //1. prepare input - SessionState state{execution_providers, true}; + SessionState state{execution_providers, true, &tp_}; state.SetGraphViewer(std::make_unique(graph)); OrtValueNameIdxMap& mlvalue_name_idx_map{state.GetOrtValueNameIdxMap()}; @@ -264,7 +269,7 @@ TEST(ExecutionFrameTest, MemPatternTest) { EXPECT_EQ(p->GetBlock(4)->offset_, 64); } -TEST(ExecutionFrameTest, BadModelInvalidDimParamUsage) { +TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) { // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs. // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out. diff --git a/onnxruntime/test/framework/math_test.cc b/onnxruntime/test/framework/math_test.cc index 97104647334d4..2c254bc3cc823 100644 --- a/onnxruntime/test/framework/math_test.cc +++ b/onnxruntime/test/framework/math_test.cc @@ -17,12 +17,24 @@ #include "core/util/math.h" #include +#include "core/platform/threadpool.h" #include "core/util/math_cpuonly.h" namespace onnxruntime { #define VECTOR_HEAD(x) x.size() > 0 ? &x[0] : NULL -TEST(MathTest, GemmNoTransNoTrans) { +//parameter is thread pool size +class MathGemmTest : public testing::TestWithParam { + protected: + static concurrency::ThreadPool* CreateThreadPool(int size) { + if (size == 1) + return nullptr; + return new concurrency::ThreadPool("test", size); + } + std::unique_ptr tp{CreateThreadPool(GetParam())}; +}; + +TEST_P(MathGemmTest, GemmNoTransNoTrans) { auto& provider = CPUMathUtil::Instance(); std::vector X(50); // 5 * 10 std::vector W(60); // 10 * 6 @@ -40,34 +52,35 @@ TEST(MathTest, GemmNoTransNoTrans) { const float kOne = 1.0; const float kPointFive = 0.5; const float kZero = 0.0; - math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne, - VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y), - &provider); + math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne, + VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y), + tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 10) << i; } // Test Accumulate - math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne, - VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive, - VECTOR_HEAD(Y), &provider); + math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, kOne, + VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive, + VECTOR_HEAD(Y), tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 15) << i; } // Test Accumulate - math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, - kPointFive, - VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y), - &provider); + math::Gemm(CblasNoTrans, CblasNoTrans, 5, 6, 10, + kPointFive, + VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y), + tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 20) << i; } } -TEST(MathTest, GemmNoTransTrans) { +TEST_P(MathGemmTest, GemmNoTransTrans) { auto& provider = CPUMathUtil::Instance(); + std::vector X(50); // 5 * 10 std::vector W(60); // 10 * 6 std::vector Y(30); // 5 * 6 @@ -84,30 +97,33 @@ TEST(MathTest, GemmNoTransTrans) { const float kOne = 1.0; const float kPointFive = 0.5; const float kZero = 0.0; - math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kOne, - VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y), - &provider); + math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kOne, + VECTOR_HEAD(X), VECTOR_HEAD(W), kZero, VECTOR_HEAD(Y), + tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 10) << i; } // Test Accumulate - math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kOne, - VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive, - VECTOR_HEAD(Y), &provider); + math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kOne, + VECTOR_HEAD(X), VECTOR_HEAD(W), kPointFive, + VECTOR_HEAD(Y), tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 15) << i; } - math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kPointFive, - VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y), - &provider); + math::Gemm(CblasNoTrans, CblasTrans, 5, 6, 10, kPointFive, + VECTOR_HEAD(X), VECTOR_HEAD(W), kOne, VECTOR_HEAD(Y), + tp.get()); EXPECT_EQ(Y.size(), 30); for (size_t i = 0; i < Y.size(); ++i) { EXPECT_EQ(Y[i], 20) << i; } } +INSTANTIATE_TEST_CASE_P(MathGemmTests, MathGemmTest, + testing::Values(1, 4)); + TEST(MathTest, GemvNoTrans) { auto& provider = CPUMathUtil::Instance(); std::vector A(50); // 5 * 10 diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 6064b74630897..dec86627b06fe 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -35,11 +35,12 @@ class TestOpKernel : public OpKernel { }; TEST(SessionStateTest, AddGetKernelTest) { + concurrency::ThreadPool tp{"test", 1}; ONNX_OPERATOR_SCHEMA(Variable) .SetDoc("Input variable.") .Output(0, "output_1", "docstr for output_1.", "tensor(int32)"); ExecutionProviders execution_providers; - SessionState s{execution_providers, true}; + SessionState s{execution_providers, true, &tp}; onnxruntime::Model model("graph_1"); auto& graph = model.MainGraph(); @@ -70,68 +71,80 @@ TEST(SessionStateTest, AddGetKernelTest) { EXPECT_EQ(orig_num_outputs, test_kernel->Node().OutputDefs().size()); } +namespace { +class TestParam { + public: + int ir_version; + bool enable_mem_pattern; +}; +TestParam param_list[] = {{3, true}, {4, true}, {3, false}, {4, false}}; +} // namespace +class SessionStateTestP : public testing::TestWithParam { +}; // Test that we separate out constant and non-constant initializers correctly -TEST(SessionStateTest, TestInitializerProcessing) { - std::vector ir_versions = {3, 4}; - for (auto ir_version : ir_versions) { - std::string model_path = "testdata/optional_inputs_ir" + std::to_string(ir_version) + ".onnx"; - Status status; - std::shared_ptr model; - ASSERT_TRUE((status = Model::Load(model_path, model)).IsOK()) << status; - Graph& graph = model->MainGraph(); - // take a copy as this gets cleared during session state initialization - InitializedTensorSet initializers = graph.GetAllInitializedTensors(); - - const bool enable_mem_pattern = false; - ExecutionProviders execution_providers; - CPUExecutionProviderInfo epi{false}; - status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(epi)); - ASSERT_TRUE(status.IsOK()) << status; - - KernelRegistryManager krm; - status = krm.RegisterKernels(execution_providers); - ASSERT_TRUE(status.IsOK()) << status; - - SessionState session_state(execution_providers, enable_mem_pattern); - SessionStateInitializer session_initializer(enable_mem_pattern, ToWideString(model_path), graph, - session_state, execution_providers, krm); - - GraphPartitioner partitioner(krm, execution_providers); - status = partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()); - ASSERT_TRUE(status.IsOK()) << status; - - status = session_initializer.CreatePlan(nullptr, nullptr, true); - ASSERT_TRUE(status.IsOK()) << status; - - status = session_initializer.InitializeAndSave(nullptr); - ASSERT_TRUE(status.IsOK()) << status; - - const auto& initialized_tensors = session_state.GetInitializedTensors(); - const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors(); - - ASSERT_EQ(initializers.size(), initialized_tensors.size()) - << "SessionState should have an entry for all initializers in Graph."; - - if (ir_version < 4) { - ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size()) - << "All initializers should be considered constant if IR version < 4."; - } else { - const auto& name_to_idx = session_state.GetOrtValueNameIdxMap(); - - for (auto entry : initializers) { - int idx; - name_to_idx.GetIdx(entry.first, idx); - - bool found = initialized_tensors.find(idx) != initialized_tensors.cend(); - ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors"; - - if (graph_utils::IsConstantInitializer(graph, entry.first, false)) { - found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend(); - ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors"; - } +TEST_P(SessionStateTestP, TestInitializerProcessing) { + const TestParam& param = GetParam(); + concurrency::ThreadPool tp{"test", 1}; + + std::string model_path = "testdata/optional_inputs_ir" + std::to_string(param.ir_version) + ".onnx"; + Status status; + std::shared_ptr model; + ASSERT_TRUE((status = Model::Load(model_path, model)).IsOK()) << status; + Graph& graph = model->MainGraph(); + // take a copy as this gets cleared during session state initialization + InitializedTensorSet initializers = graph.GetAllInitializedTensors(); + + ExecutionProviders execution_providers; + CPUExecutionProviderInfo epi{false}; + status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(epi)); + ASSERT_TRUE(status.IsOK()) << status; + + KernelRegistryManager krm; + status = krm.RegisterKernels(execution_providers); + ASSERT_TRUE(status.IsOK()) << status; + + SessionState session_state(execution_providers, param.enable_mem_pattern, &tp); + SessionStateInitializer session_initializer(param.enable_mem_pattern, ToWideString(model_path), graph, + session_state, execution_providers, krm); + + GraphPartitioner partitioner(krm, execution_providers); + status = partitioner.Partition(graph, session_state.ExportDll(), session_state.GetMutableFuncMgr()); + ASSERT_TRUE(status.IsOK()) << status; + + status = session_initializer.CreatePlan(nullptr, nullptr, true); + ASSERT_TRUE(status.IsOK()) << status; + + status = session_initializer.InitializeAndSave(nullptr); + ASSERT_TRUE(status.IsOK()) << status; + + const auto& initialized_tensors = session_state.GetInitializedTensors(); + const auto& const_initialized_tensors = session_state.GetConstantInitializedTensors(); + + ASSERT_EQ(initializers.size(), initialized_tensors.size()) + << "SessionState should have an entry for all initializers in Graph."; + + if (param.ir_version < 4) { + ASSERT_EQ(initialized_tensors.size(), const_initialized_tensors.size()) + << "All initializers should be considered constant if IR version < 4."; + } else { + const auto& name_to_idx = session_state.GetOrtValueNameIdxMap(); + + for (auto entry : initializers) { + int idx; + name_to_idx.GetIdx(entry.first, idx); + + bool found = initialized_tensors.find(idx) != initialized_tensors.cend(); + ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state initialized tensors"; + + if (graph_utils::IsConstantInitializer(graph, entry.first, false)) { + found = const_initialized_tensors.find(idx) != const_initialized_tensors.cend(); + ASSERT_TRUE(found) << "Missing entry for " << entry.first << " in session state const initialized tensors"; } } } } + +INSTANTIATE_TEST_CASE_P(SessionStateTests, SessionStateTestP, + testing::ValuesIn(param_list)); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/unittest.cpp b/onnxruntime/test/mlas/unittest.cpp index 36c96c819c87d..2b999f3b97d95 100644 --- a/onnxruntime/test/mlas/unittest.cpp +++ b/onnxruntime/test/mlas/unittest.cpp @@ -26,11 +26,16 @@ Module Name: #else #include #endif +#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL) +#include "core/platform/threadpool.h" +#endif #if !defined(_countof) #define _countof(_Array) (sizeof(_Array) / sizeof(_Array[0])) #endif +MLAS_THREADPOOL* threadpool = nullptr; + class MatrixGuardBuffer { public: @@ -225,7 +230,7 @@ class MlasSgemmTest : public MlasTestBase std::fill_n(C, M * N, -0.5f); std::fill_n(CReference, M * N, -0.5f); - MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, nullptr); + MlasSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool); ReferenceSgemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, CReference, ldc); for (size_t f = 0; f < M * N; f++) { @@ -667,7 +672,7 @@ class MlasConv2DTest : public MlasTestBase } MlasSgemm(CblasNoTrans, CblasNoTrans, FilterCount, OutputSize, K, 1.0f, - filter, K, Im2Col, OutputSize, 0.0f, Output, OutputSize, nullptr); + filter, K, Im2Col, OutputSize, 0.0f, Output, OutputSize, threadpool); // // Apply the bias. @@ -1072,7 +1077,7 @@ class MlasPool2DTest : public MlasTestBase float* Output ) { - MlasPool(PoolingKind, 2, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr); + MlasPool(PoolingKind, 2, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool); } void @@ -1417,7 +1422,7 @@ class MlasPool3DTest : public MlasTestBase float* Output = BufferOutput.GetBuffer(OutputBufferElements); float* OutputReference = BufferOutputReference.GetBuffer(OutputBufferElements); - MlasPool(MlasMaximumPooling, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr); + MlasPool(MlasMaximumPooling, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool); ReferenceMaximumPool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference); if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) { @@ -1425,7 +1430,7 @@ class MlasPool3DTest : public MlasTestBase InputChannels, InputDepth, InputHeight, InputWidth, KernelDepth, KernelHeight, KernelWidth); } - MlasPool(MlasAveragePoolingExcludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr); + MlasPool(MlasAveragePoolingExcludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool); ReferenceAveragePool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference, false); if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) { @@ -1433,7 +1438,7 @@ class MlasPool3DTest : public MlasTestBase InputChannels, InputDepth, InputHeight, InputWidth, KernelDepth, KernelHeight, KernelWidth); } - MlasPool(MlasAveragePoolingIncludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, nullptr); + MlasPool(MlasAveragePoolingIncludePad, 3, InputShape, KernelShape, Padding, StrideShape, OutputShape, Input, Output, threadpool); ReferenceAveragePool3D(InputShape, KernelShape, Padding, StrideShape, Input, OutputReference, true); if (memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)) != 0) { @@ -1781,28 +1786,37 @@ main( void ) { - printf("SGEMM tests.\n"); - std::make_unique()->ExecuteShort(); - - printf("Conv2D tests.\n"); - std::make_unique()->ExecuteShort(); - if (MlasNchwcGetBlockSize() > 1) { - std::make_unique()->ExecuteShort(); - } - - printf("Pool2D tests.\n"); - std::make_unique()->ExecuteShort(); - if (MlasNchwcGetBlockSize() > 1) { - std::make_unique()->ExecuteShort(); - } + for (int i = 0; i != 2; ++i) { + printf("SGEMM tests.\n"); + std::make_unique()->ExecuteShort(); + + printf("Conv2D tests.\n"); + std::make_unique()->ExecuteShort(); + if (MlasNchwcGetBlockSize() > 1) { + std::make_unique()->ExecuteShort(); + } - printf("Pool3D tests.\n"); - std::make_unique()->ExecuteShort(); + printf("Pool2D tests.\n"); + std::make_unique()->ExecuteShort(); + if (MlasNchwcGetBlockSize() > 1) { + std::make_unique()->ExecuteShort(); + } - printf("Activation tests.\n"); - std::make_unique()->ExecuteShort(); + printf("Pool3D tests.\n"); + std::make_unique()->ExecuteShort(); - printf("Done.\n"); + printf("Activation tests.\n"); + std::make_unique()->ExecuteShort(); + printf("Done.\n"); +#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL) + threadpool = new onnxruntime::concurrency::ThreadPool("test", 2); +#else + break; +#endif + } +#if !defined(MLAS_NO_ONNXRUNTIME_THREADPOOL) + delete threadpool; +#endif return 0; } diff --git a/onnxruntime/test/onnx/microbenchmark/model_init.cc b/onnxruntime/test/onnx/microbenchmark/model_init.cc index 5a841276e06bf..ecf48fa6554ce 100644 --- a/onnxruntime/test/onnx/microbenchmark/model_init.cc +++ b/onnxruntime/test/onnx/microbenchmark/model_init.cc @@ -107,12 +107,12 @@ Status CreateExecutionProviders(std::unique_ptr* ret) { return Status::OK(); } -Status CreateKernelRegistryManagerFromModel(std::unique_ptr* ret, Model* model) { +Status CreateKernelRegistryManagerFromModel(std::unique_ptr* ret, Model* model, concurrency::ThreadPool& tp) { std::unique_ptr execution_providers; ORT_RETURN_IF_ERROR(CreateExecutionProviders(&execution_providers)); std::unique_ptr kernel_registry_manager = std::make_unique(); ORT_RETURN_IF_ERROR(kernel_registry_manager->RegisterKernels(*execution_providers)); - SessionState s{*execution_providers, true}; + SessionState s{*execution_providers, true, &tp}; s.SetLogger(logging::LoggingManager::DefaultLogger()); ORT_RETURN_IF_ERROR(model->MainGraph().Resolve()); @@ -125,7 +125,8 @@ Status CreateKernelRegistryManagerFromModel(std::unique_ptr kernel_registry_manager; - auto st = CreateKernelRegistryManagerFromModel(&kernel_registry_manager, model); + concurrency::ThreadPool tp{"test", 1}; + auto st = CreateKernelRegistryManagerFromModel(&kernel_registry_manager, model, tp); if (!st.IsOK()) throw std::runtime_error("failed"); for (auto _ : state) { for (const auto& n : model->MainGraph().Nodes()) { @@ -175,11 +176,12 @@ static void BM_PartitionModel_tiny_yolo(benchmark::State& state) { std::unique_ptr kernel_registry_manager = std::make_unique(); status = kernel_registry_manager->RegisterKernels(*execution_providers); if (!status.IsOK()) throw std::runtime_error("RegisterKernels failed"); + concurrency::ThreadPool tp{"test", 1}; for (auto _ : state) { state.PauseTiming(); std::shared_ptr model = std::make_shared(model_proto); - SessionState s{*execution_providers, true}; + SessionState s{*execution_providers, true, &tp}; s.SetLogger(logging::LoggingManager::DefaultLogger()); BM_BREAK_IF_ERROR(model->MainGraph().Resolve()); s.SetGraphViewer(std::make_unique(model->MainGraph())); @@ -205,11 +207,12 @@ static void BM_PartitionModel_inception_v4(benchmark::State& state) { std::unique_ptr kernel_registry_manager = std::make_unique(); status = kernel_registry_manager->RegisterKernels(*execution_providers); if (!status.IsOK()) throw std::runtime_error("RegisterKernels failed"); + concurrency::ThreadPool tp{"test", 1}; for (auto _ : state) { state.PauseTiming(); std::shared_ptr model = std::make_shared(model_proto); - SessionState s{*execution_providers, true}; + SessionState s{*execution_providers, true, &tp}; s.SetLogger(logging::LoggingManager::DefaultLogger()); BM_BREAK_IF_ERROR(model->MainGraph().Resolve()); s.SetGraphViewer(std::make_unique(model->MainGraph())); diff --git a/onnxruntime/test/onnx/microbenchmark/modeltest.cc b/onnxruntime/test/onnx/microbenchmark/modeltest.cc index 4f256393ca5c1..d9f32ad744b06 100644 --- a/onnxruntime/test/onnx/microbenchmark/modeltest.cc +++ b/onnxruntime/test/onnx/microbenchmark/modeltest.cc @@ -50,7 +50,8 @@ BENCHMARK(BM_CreateSession_WithGPU); static void BM_CreateSession(benchmark::State& state) { const ORTCHAR_T* model_path = ORT_TSTR("../models/opset8/test_bvlc_alexnet/model.onnx"); - OrtSessionOptions* session_option = OrtCreateSessionOptions(); + OrtSessionOptions* session_option; + ORT_BREAK_ON_ERROR(OrtCreateSessionOptions(&session_option)); for (auto _ : state) { OrtSession* session; ORT_BREAK_ON_ERROR(OrtCreateSession(env, model_path, session_option, &session)); diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 27442273f26fa..8b17df4aa1ae0 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -407,7 +407,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Size overflow"); } size_t size_to_allocate; - GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_to_allocate); + ORT_RETURN_IF_ERROR(GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_to_allocate)); if (preallocated && preallocated_size < size_to_allocate) return Status(common::ONNXRUNTIME, common::FAIL, MakeString("The buffer planner is not consistent with tensor buffer size, expected ", size_to_allocate, ", got ", preallocated_size)); diff --git a/onnxruntime/test/providers/cpu/math/softmax_test.cc b/onnxruntime/test/providers/cpu/math/softmax_test.cc index aad97cfaf8754..0273378194b5d 100644 --- a/onnxruntime/test/providers/cpu/math/softmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/softmax_test.cc @@ -194,23 +194,23 @@ TEST(SoftmaxOperator, InvalidAxis) { TEST(SoftmaxOperator, TestInputTooLarge) { float* ignored = nullptr; - + concurrency::ThreadPool tp("", 1); // N > INT32_MAX int64_t N = int64_t(INT32_MAX) + 1; int64_t D = 1; - auto status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored); + auto status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp); EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT); // D > INT32_MAX N = 1; D = int64_t(INT32_MAX) + 1; - status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored); + status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp); EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT); // N * D > INT32_MAX N = int64_t(INT32_MAX) / 2; D = 3; - status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored); + status = SoftmaxCPU(N, D, ignored, ignored, ignored, ignored, true, ignored, &tp); EXPECT_EQ(status.Code(), common::INVALID_ARGUMENT); /* diff --git a/onnxruntime/test/providers/memcpy_test.cc b/onnxruntime/test/providers/memcpy_test.cc index c43779875fb02..133c2873e20ad 100644 --- a/onnxruntime/test/providers/memcpy_test.cc +++ b/onnxruntime/test/providers/memcpy_test.cc @@ -22,11 +22,13 @@ void PutAllNodesOnOneProvider(Graph& graph, const std::string& provider_type) { } } // namespace TEST(MemcpyTest, copy1) { + concurrency::ThreadPool tp{"test", 1}; + ExecutionProviders execution_providers; CPUExecutionProviderInfo epi; auto st = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(epi)); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); - SessionState s{execution_providers, true}; + SessionState s{execution_providers, true, &tp}; s.SetLogger(logging::LoggingManager::DefaultLogger()); KernelRegistryManager kernel_registry_manager; kernel_registry_manager.RegisterKernels(execution_providers); From 11e1a7ff59ade6d5b5726077d56a11d422dd2ad9 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 18:13:04 -0700 Subject: [PATCH 2/7] fix warning --- onnxruntime/core/session/inference_session.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index f8bd6cf57bf0d..7fcbe0ad87138 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -407,6 +407,10 @@ class InferenceSession { // The list of execution providers. ExecutionProviders execution_providers_; + private: + // Threadpool for this session + std::unique_ptr thread_pool_; + protected: // Immutable state for each op in the model. Shared by all executors. // It has a dependency on execution_providers_. @@ -433,8 +437,6 @@ class InferenceSession { std::unordered_map input_def_map_; OutputDefList output_def_list_; - // Threadpool for this session - std::unique_ptr thread_pool_; // Data transfer manager. DataTransferManager data_transfer_mgr_; From 4b736bfe2460f5ac146f19d1af4d7025289d2c0d Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 20:22:35 -0700 Subject: [PATCH 3/7] revert --- include/onnxruntime/core/session/environment.h | 12 +----------- include/onnxruntime/core/session/onnxruntime_c_api.h | 4 ++-- onnxruntime/core/session/environment.cc | 1 - 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/include/onnxruntime/core/session/environment.h b/include/onnxruntime/core/session/environment.h index e11eb66072d1a..f36ebb60d0011 100644 --- a/include/onnxruntime/core/session/environment.h +++ b/include/onnxruntime/core/session/environment.h @@ -5,11 +5,8 @@ #include #include -#include -#include #include "core/common/common.h" #include "core/common/status.h" -#include "core/platform/threadpool.h" namespace onnxruntime { /** @@ -32,20 +29,13 @@ class Environment { Returns whether any runtime environment instance has been initialized. */ static bool IsInitialized() { return is_initialized_; } - concurrency::ThreadPool* GetThreadPool() { - std::call_once(tp_once_, [this]() { - tp_ = new concurrency::ThreadPool("default", std::max(std::thread::hardware_concurrency() - 1, 1)); - }); - return tp_; - } private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Environment); Environment() = default; Status Initialize(); - concurrency::ThreadPool* tp_ = nullptr; - std::once_flag tp_once_; + static std::atomic is_initialized_; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 74cd44bd2ab39..87e143b8bda10 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -144,7 +144,7 @@ typedef enum OrtErrorCode { // The actual types defined have an Ort prefix ORT_RUNTIME_CLASS(Env); -ORT_RUNTIME_CLASS(Status); +ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success ORT_RUNTIME_CLASS(Provider); ORT_RUNTIME_CLASS(AllocatorInfo); ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool) @@ -342,7 +342,7 @@ ORT_API_STATUS(OrtGetStringTensorDataLength, _In_ const OrtValue* value, _Out_ s * \param value A tensor created from OrtCreateTensor... function. * \param s_len total data length, get it from OrtGetStringTensorDataLength */ -ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _In_ void* s, size_t s_len, +ORT_API_STATUS(OrtGetStringTensorContent, _In_ const OrtValue* value, _Out_ void* s, size_t s_len, _Out_ size_t* offsets, size_t offsets_len); /** diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 539d92bd4323a..d1f9041c9253f 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -87,7 +87,6 @@ Internal copy node } Environment::~Environment() { - delete tp_; ::google::protobuf::ShutdownProtobufLibrary(); } From ad8351bc1300d5a6944dd35b0c78a2b2fb3cadf7 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 20:24:08 -0700 Subject: [PATCH 4/7] format code --- include/onnxruntime/core/session/onnxruntime_c_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 87e143b8bda10..8566125ac14d1 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -144,10 +144,10 @@ typedef enum OrtErrorCode { // The actual types defined have an Ort prefix ORT_RUNTIME_CLASS(Env); -ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success +ORT_RUNTIME_CLASS(Status); // nullptr for Status* indicates success ORT_RUNTIME_CLASS(Provider); ORT_RUNTIME_CLASS(AllocatorInfo); -ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool) +ORT_RUNTIME_CLASS(Session); //Don't call OrtReleaseSession from Dllmain (because session owns a thread pool) ORT_RUNTIME_CLASS(Value); ORT_RUNTIME_CLASS(RunOptions); ORT_RUNTIME_CLASS(TypeInfo); From dff70df296bad1edeaa60e23619c63ed95d9e184 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 20:24:38 -0700 Subject: [PATCH 5/7] revert more --- include/onnxruntime/core/session/onnxruntime_cxx_api.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 992afe2c6fa89..e1397105c3bef 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -99,7 +99,6 @@ struct Env; struct TypeInfo; struct Value; -//Don't put such an object as a global(or thread local) variable in a DLL struct Env : Base { Env(nullptr_t) {} Env(OrtLoggingLevel default_logging_level, _In_ const char* logid); @@ -157,7 +156,6 @@ struct SessionOptions : Base { SessionOptions& Add(OrtCustomOpDomain* custom_op_domain); }; -//Don't put such an object as a global(or thread local) variable in a DLL struct Session : Base { explicit Session(nullptr_t) {} Session(Env& env, const ORTCHAR_T* model_path, const SessionOptions& options); From 399e81b6db66e616fccd6be59921b84217b00a4a Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 15 Aug 2019 22:20:14 -0700 Subject: [PATCH 6/7] update --- onnxruntime/core/providers/cpu/object_detection/roialign.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index 9453039aa8753..4d27e957e9f44 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -268,7 +268,7 @@ void RoiAlignForward( } // for ph } // for c }; // for n - const_cast(ttp)->ParallelFor(static_cast(n_rois), work_object); + if (ttp != nullptr) const_cast(ttp)->ParallelFor(static_cast(n_rois), work_object); } } // namespace From 11d4f67ce290aa1f602cb1998d403ffccd1222d5 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 16 Aug 2019 11:32:37 -0700 Subject: [PATCH 7/7] update --- onnxruntime/core/session/inference_session.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 1d7623d0165a6..6dc370b797f1c 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -92,11 +92,9 @@ inline std::basic_string GetCurrentTimeString() { } concurrency::ThreadPool* CreateThreadPool(int size) { - if (size == 1) - return nullptr; if (size <= 0) - size = std::max(std::thread::hardware_concurrency() - 1, 1); - return new concurrency::ThreadPool("SESSION", size); + size = std::thread::hardware_concurrency() / 2; + return size > 0 ? new concurrency::ThreadPool("SESSION", size) : nullptr; } } // namespace