From 4ea200e8f1bf5cefe538f8d2409960c0db5ab561 Mon Sep 17 00:00:00 2001
From: Leo Zhao
Date: Mon, 24 Jun 2019 12:22:10 +0800
Subject: [PATCH 01/21] clear cache when tid == 1 and cache size exceeds max
 capacity

test=develop
---
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc  |  8 ++++---
 .../fluid/operators/mkldnn/pool_mkldnn_op.cc  | 13 +++++++----
 paddle/fluid/platform/device_context.cc       | 23 +++++++++++++++----
 paddle/fluid/platform/device_context.h        |  2 +-
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 647e09a92911e..e20dfb3568275 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -221,6 +221,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         user_weights_memory_p, pipeline, is_test);

     std::shared_ptr<mkldnn::memory> dst_memory_p;
+    std::shared_ptr<mkldnn::memory> user_residual_memory_p;

     if (fuse_residual_conn) {
       auto residual_param = ctx.Input<Tensor>("ResidualData");
@@ -243,7 +244,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         auto user_residual_md = platform::MKLDNNMemDesc(
             residual_data_tz, residual_data_type, residual_param->format());

-        auto user_residual_memory_p = handler.AcquireResidualDataMemory(
+        user_residual_memory_p = handler.AcquireResidualDataMemory(
             user_residual_md, to_void_cast<T>(residual_param_data));

         dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
@@ -263,14 +264,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

     // create convolution op primitive
     std::shared_ptr<mkldnn::convolution_forward> conv_p;
+    std::shared_ptr<mkldnn::memory> user_bias_memory_p, bias_memory_p;
     if (bias) {
       const T* bias_data = bias->data<T>();
       auto user_bias_md = platform::MKLDNNMemDesc(
           {bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      auto user_bias_memory_p =
+      user_bias_memory_p =
           handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));

-      auto bias_memory_p =
+      bias_memory_p =
           handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
       conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
                                           bias_memory_p, dst_memory_p);
diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index c635fd11c37ae..1ceedc63d8100 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -128,6 +128,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key_pool_workspace_memory =
         key + "@pool_workspace_memory";

+    std::shared_ptr<mkldnn::memory> src_memory, dst_memory;
+    std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd;
+    std::shared_ptr<mkldnn::memory> pool_src_memory_p, pool_dst_memory_p;
     auto pool_p = std::static_pointer_cast<mkldnn::pooling_forward>(
         dev_ctx.GetBlob(key_pool_p));
     if (pool_p == nullptr) {
@@ -150,7 +153,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       auto propagation = src_md.data.data_type == mkldnn_f32 ?
          mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
-      std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
+      pool_pd =
           CreatePrimitiveDesc(src_md, dst_md, propagation, strides,
                               padding_left_top, padding_right_bottom, ksize,
                               pooling_type, mkldnn_engine, ceil_mode, is_test);
@@ -158,9 +161,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       // save pool_pd into global device context to be referred in backward path
       if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);

-      auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
-                                                 to_void_cast<T>(input_data));
-      auto dst_memory =
+      src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
+                                            to_void_cast<T>(input_data));
+      dst_memory =
           std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);

       dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
@@ -186,7 +189,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
           (memory::format)dst_memory->get_primitive_desc().desc().data.format;
     } else {
       // Primitives already exist
-      auto pool_src_memory_p =
+      pool_src_memory_p =
           std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
       PADDLE_ENFORCE(pool_src_memory_p != nullptr,
                      "Fail to find pooling src mem_p in device context");
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 4f048d44685a8..7ba3a7a52bd13 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -407,6 +407,8 @@ thread_local int cur_thread_id = 0;

 void set_cur_thread_id(int tid) { cur_thread_id = tid; }
 int get_cur_thread_id(void) { return cur_thread_id; }
+#define MKLDNN_CAP 100
+#define MKLDNN_CLEAR_PERCENTAGE 10

 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
@@ -429,14 +431,23 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   }

   // Find Key in found (or newly created) KeyBlob
-  auto key_it = pBlob->find(name);
+  auto key_it = std::find_if(
+      pBlob->begin(), pBlob->end(),
+      [=](std::pair<std::string, std::shared_ptr<void>> const& obj) {
+        return obj.first == name;
+      });

   if (key_it == pBlob->end()) {
-    (*pBlob)[name] = data;  // create new blob
+    if ((tid == 1) && (pBlob->size() >= MKLDNN_CAP)) {
+      VLOG(3) << "remove head " << pBlob->begin()->first << " in SetBlob\n";
+      pBlob->erase(pBlob->begin());
+      // pBlob->clear();
+    }
+    pBlob->push_back(std::make_pair(name, data));
   } else {
     key_it->second = data;  // set data to existing blob
   }
-
+  VLOG(3) << "SetBlob " << name << "\n";
   // lock will be automatically released when out of scope
   return;
 }
@@ -456,7 +467,11 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
   pBlob = map_it->second;

   // Find Blob via name
-  auto key_it = pBlob->find(name);
+  auto key_it = std::find_if(
+      pBlob->begin(), pBlob->end(),
+      [=](std::pair<std::string, std::shared_ptr<void>> const& obj) {
+        return obj.first == name;
+      });

   if (key_it == pBlob->end()) return nullptr;

diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 812181563e6e5..628273a110e3f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -378,7 +378,7 @@ struct DefaultDeviceContextType {
 #endif

 #ifdef PADDLE_WITH_MKLDNN
-using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using KeyBlob = std::vector<std::pair<std::string, std::shared_ptr<void>>>;
 using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;

 void set_cur_thread_id(int);
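Note on the patch above: the eviction scheme amounts to a bounded, insertion-ordered cache. Keys are kept in a vector so that pBlob->begin() is always the oldest entry. A minimal self-contained sketch of the same idea, using simplified stand-in types rather than the actual Paddle classes:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <utility>
    #include <vector>

    // Simplified stand-in for the KeyBlob introduced in this patch.
    using KeyBlob = std::vector<std::pair<std::string, std::shared_ptr<void>>>;
    constexpr size_t kCap = 100;  // plays the role of MKLDNN_CAP

    void SetBlob(KeyBlob* blob, const std::string& name,
                 std::shared_ptr<void> data) {
      for (auto& kv : *blob) {  // linear lookup, as with std::find_if above
        if (kv.first == name) {
          kv.second = std::move(data);
          return;
        }
      }
      if (blob->size() >= kCap) {
        blob->erase(blob->begin());  // evict the oldest entry (FIFO order)
      }
      blob->emplace_back(name, std::move(data));
    }

    int main() {
      KeyBlob cache;
      SetBlob(&cache, "conv_p", std::make_shared<int>(1));
      std::cout << "cached entries: " << cache.size() << "\n";  // prints 1
    }

The trade-off, revisited by patch 09 below, is that every lookup is now O(n) over the vector.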
++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7ba3a7a52bd13..831bf1eaf875f 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -439,7 +439,8 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, if (key_it == pBlob->end()) { if ((tid == 1) && (pBlob->size() >= MKLDNN_CAP)) { - VLOG(3) << "remove head " << pBlob->begin()->first << " in SetBlob\n"; + VLOG(3) << "SetBlob: tid=" << tid << ", remove head blob " + << pBlob->begin()->first << "\n"; pBlob->erase(pBlob->begin()); // pBlob->clear(); } @@ -447,7 +448,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, } else { key_it->second = data; // set data to existing blob } - VLOG(3) << "SetBlob " << name << "\n"; + VLOG(3) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n"; // lock will be automatically released when out of scope return; } @@ -463,7 +464,10 @@ std::shared_ptr MKLDNNDeviceContext::GetBlob( // Find KeyBlob for current thread firstly auto map_it = pMap->find(tid); - if (map_it == pMap->end()) return nullptr; + if (map_it == pMap->end()) { + VLOG(3) << "GetBlob: tid=" << tid << ", miss tid\n"; + return nullptr; + } pBlob = map_it->second; // Find Blob via name @@ -473,8 +477,12 @@ std::shared_ptr MKLDNNDeviceContext::GetBlob( return obj.first == name; }); - if (key_it == pBlob->end()) return nullptr; + if (key_it == pBlob->end()) { + VLOG(3) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n"; + return nullptr; + } + VLOG(3) << "GetBlob tid=" << tid << ", get blob=" << name << "\n"; // lock will be automatically released when out of scope return key_it->second; } From 14c5b2ea03a8ec73bf803df9a639ace01e1d0b81 Mon Sep 17 00:00:00 2001 From: Leo Zhao Date: Tue, 25 Jun 2019 13:29:40 +0800 Subject: [PATCH 03/21] 1. Add new interface in AnalysisConfig to set mkldnn thread id 2. Few fix in concat/pool mkldnn kernel for key generation 3. Enable cache clearing mechanism test=develop --- paddle/fluid/inference/api/analysis_config.cc | 10 ++++++ .../fluid/inference/api/analysis_predictor.cc | 35 +++++++++++++++++-- .../inference/api/paddle_analysis_config.h | 4 +++ .../operators/mkldnn/concat_mkldnn_op.cc | 7 ++++ .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 7 ++++ paddle/fluid/platform/device_context.cc | 7 ++-- paddle/fluid/platform/mkldnn_reuse.h | 3 ++ 7 files changed, 67 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 890c90697bcd5..b1221984f66b5 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -114,6 +114,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); + CP_MEMBER(mkldnn_thread_id_); // Quantization related. 
   CP_MEMBER(use_mkldnn_quantizer_);
   CP_MEMBER(mkldnn_quantizer_config_);
@@ -161,6 +162,15 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }

+void AnalysisConfig::SetMKLDNNThreadId(int id) {
+#ifdef PADDLE_WITH_MKLDNN
+  mkldnn_thread_id_ = id;
+#else
+  LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id";
+  mkldnn_thread_id_ = 0;
+#endif
+}
+
 void AnalysisConfig::EnableMkldnnQuantizer() {
 #ifdef PADDLE_WITH_MKLDNN
   if (!mkldnn_quantizer_config_)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5d9d5a3178aaa..e5f1c87024cfc 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -197,6 +197,16 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  // TODO(intel): will refactor this code later
+  // Make sure it does not conflict with the AnalysisPredictor::SetMkldnnThreadID case
+  VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id="
+          << paddle::platform::get_cur_thread_id()
+          << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
+  if (paddle::platform::get_cur_thread_id() == 0)
+    paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
+#endif
+
   VLOG(3) << "Predictor::predict";
   inference::Timer timer;
   timer.tic();
@@ -238,7 +248,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
-
+#ifdef PADDLE_WITH_MKLDNN
+  // TODO(intel): will refactor this code later
+  // reset thread id to avoid confusion when thread is reused from pool again
+  // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
+  if (paddle::platform::get_cur_thread_id() == -1)
+    paddle::platform::set_cur_thread_id(0);
+#endif
   return true;
 }

@@ -595,6 +611,15 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(

 bool AnalysisPredictor::ZeroCopyRun() {
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  // TODO(intel): will refactor this code later
+  // Make sure it does not conflict with the AnalysisPredictor::SetMkldnnThreadID case
+  VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id="
+          << paddle::platform::get_cur_thread_id()
+          << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
+  if (paddle::platform::get_cur_thread_id() == 0)
+    paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
+#endif
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
@@ -603,7 +628,13 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
-
+#ifdef PADDLE_WITH_MKLDNN
+  // TODO(intel): will refactor this code later
+  // reset thread id to avoid confusion when thread is reused from pool again
+  // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
+  if (paddle::platform::get_cur_thread_id() == -1)
+    paddle::platform::set_cur_thread_id(0);
+#endif
   return true;
 }

diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index e3682d27054a1..43fd321fa27ae 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -182,6 +182,9 @@ struct AnalysisConfig {
   /** A boolean state telling whether to use the MKLDNN.
    */
   bool mkldnn_enabled() const { return use_mkldnn_; }
+  /** Set MKLDNN thread id.
+   */
+  void SetMKLDNNThreadId(int id);

   /** Set and get the number of cpu math library threads.
    */
@@ -287,6 +290,7 @@ struct AnalysisConfig {
   bool use_ngraph_{false};
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
+  int mkldnn_thread_id_{0};

   bool model_from_memory_{false};

diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index a855ba8475a1b..ac9164a77f893 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -81,6 +81,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
   platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
   platform::MKLDNNHandler::AppendKey(&key,
                                      std::to_string(multi_input[0]->format()));
+  if (platform::get_cur_thread_id() != -1) {
+    auto tid = std::this_thread::get_id();
+    std::stringstream ss;
+    ss << tid;
+    platform::MKLDNNHandler::AppendKey(&key, "-t:");
+    platform::MKLDNNHandler::AppendKey(&key, ss.str());
+  }
   return key;
 }

diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
index 1ceedc63d8100..5f797f3581ec5 100644
--- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
@@ -48,6 +48,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
   platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
   platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt));
   platform::MKLDNNHandler::AppendKey(&key, suffix);
+  if (platform::get_cur_thread_id() != -1) {
+    auto tid = std::this_thread::get_id();
+    std::stringstream ss;
+    ss << tid;
+    platform::MKLDNNHandler::AppendKey(&key, "-t:");
+    platform::MKLDNNHandler::AppendKey(&key, ss.str());
+  }
   return key;
 }

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 831bf1eaf875f..a54e3d4d6d9a3 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -407,8 +407,7 @@ thread_local int cur_thread_id = 0;

 void set_cur_thread_id(int tid) { cur_thread_id = tid; }
 int get_cur_thread_id(void) { return cur_thread_id; }
-#define MKLDNN_CAP 100
-#define MKLDNN_CLEAR_PERCENTAGE 10
+#define MKLDNN_CAP 10000

 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
@@ -438,11 +437,11 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   });

   if (key_it == pBlob->end()) {
-    if ((tid == 1) && (pBlob->size() >= MKLDNN_CAP)) {
+    // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity
+    if ((tid == -1) && (pBlob->size() > MKLDNN_CAP)) {
       VLOG(3) << "SetBlob: tid=" << tid << ", remove head blob "
tid << ", remove head blob " << pBlob->begin()->first << "\n"; pBlob->erase(pBlob->begin()); - // pBlob->clear(); } pBlob->push_back(std::make_pair(name, data)); } else { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f1fb6b156aedc..76302d2bc4246 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -38,6 +38,9 @@ class MKLDNNHandler { std::stringstream ss; ss << tid; key_ = key_common_ + "-t:" + ss.str(); + if (platform::get_cur_thread_id() == -1) { + key_ = key_common_; + } } std::shared_ptr AcquireSrcMemory( From 29ca76079317b04dd8ca9f03501904f962776184 Mon Sep 17 00:00:00 2001 From: Leo Zhao Date: Tue, 25 Jun 2019 14:09:48 +0800 Subject: [PATCH 04/21] change to use VLOG(2) test=develop --- paddle/fluid/inference/api/analysis_predictor.cc | 12 ++++++++---- paddle/fluid/platform/device_context.cc | 11 ++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e5f1c87024cfc..e839b6c2c93cf 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -200,7 +200,7 @@ bool AnalysisPredictor::Run(const std::vector &inputs, #ifdef PADDLE_WITH_MKLDNN // TODO(intel): will refactor this code later // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case - VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id=" + VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id=" << paddle::platform::get_cur_thread_id() << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n"; if (paddle::platform::get_cur_thread_id() == 0) @@ -252,8 +252,10 @@ bool AnalysisPredictor::Run(const std::vector &inputs, // TODO(intel): will refactor this code later // reset thread id to avoid confusion when thread is reused from pool again // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only - if (paddle::platform::get_cur_thread_id() == -1) + if (paddle::platform::get_cur_thread_id() == -1) { + VLOG(2) << "Clear previous mkldnn thread id -1\n"; paddle::platform::set_cur_thread_id(0); + } #endif return true; } @@ -614,7 +616,7 @@ bool AnalysisPredictor::ZeroCopyRun() { #ifdef PADDLE_WITH_MKLDNN // TODO(intel): will refactor this code later // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case - VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id=" + VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id=" << paddle::platform::get_cur_thread_id() << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n"; if (paddle::platform::get_cur_thread_id() == 0) @@ -632,8 +634,10 @@ bool AnalysisPredictor::ZeroCopyRun() { // TODO(intel): will refactor this code later // reset thread id to avoid confusion when thread is reused from pool again // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only - if (paddle::platform::get_cur_thread_id() == -1) + if (paddle::platform::get_cur_thread_id() == -1) { + VLOG(2) << "Clear previous mkldnn thread id setting\n"; paddle::platform::set_cur_thread_id(0); + } #endif return true; } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a54e3d4d6d9a3..41cdc92e20d2a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -425,6 +425,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, // 1st time to set blob in current thread pBlob = std::shared_ptr(new KeyBlob()); 
From 29ca76079317b04dd8ca9f03501904f962776184 Mon Sep 17 00:00:00 2001
From: Leo Zhao
Date: Tue, 25 Jun 2019 14:09:48 +0800
Subject: [PATCH 04/21] change to use VLOG(2)

test=develop
---
 paddle/fluid/inference/api/analysis_predictor.cc | 12 ++++++++----
 paddle/fluid/platform/device_context.cc          | 11 ++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index e5f1c87024cfc..e839b6c2c93cf 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -200,7 +200,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
 #ifdef PADDLE_WITH_MKLDNN
   // TODO(intel): will refactor this code later
   // Make sure it does not conflict with the AnalysisPredictor::SetMkldnnThreadID case
-  VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id="
+  VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
           << paddle::platform::get_cur_thread_id()
           << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
   if (paddle::platform::get_cur_thread_id() == 0)
@@ -252,8 +252,10 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   // TODO(intel): will refactor this code later
   // reset thread id to avoid confusion when thread is reused from pool again
   // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
-  if (paddle::platform::get_cur_thread_id() == -1)
+  if (paddle::platform::get_cur_thread_id() == -1) {
+    VLOG(2) << "Clear previous mkldnn thread id -1\n";
     paddle::platform::set_cur_thread_id(0);
+  }
 #endif
   return true;
 }
@@ -614,7 +616,7 @@ bool AnalysisPredictor::ZeroCopyRun() {
 #ifdef PADDLE_WITH_MKLDNN
   // TODO(intel): will refactor this code later
   // Make sure it does not conflict with the AnalysisPredictor::SetMkldnnThreadID case
-  VLOG(3) << "AnalysisPredictor::Run get_cur_thread_id="
+  VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
           << paddle::platform::get_cur_thread_id()
           << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
   if (paddle::platform::get_cur_thread_id() == 0)
@@ -632,8 +634,10 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // TODO(intel): will refactor this code later
   // reset thread id to avoid confusion when thread is reused from pool again
   // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
-  if (paddle::platform::get_cur_thread_id() == -1)
+  if (paddle::platform::get_cur_thread_id() == -1) {
+    VLOG(2) << "Clear previous mkldnn thread id setting\n";
     paddle::platform::set_cur_thread_id(0);
+  }
 #endif
   return true;
 }

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a54e3d4d6d9a3..41cdc92e20d2a 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -425,6 +425,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
     // 1st time to set blob in current thread
     pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
     (*pMap)[tid] = pBlob;
+    VLOG(2) << "SetBlob: tid=" << tid << ", add new tid\n";
   } else {
     pBlob = map_it->second;
   }
@@ -440,7 +440,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   if (key_it == pBlob->end()) {
     // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity
     if ((tid == -1) && (pBlob->size() > MKLDNN_CAP)) {
-      VLOG(3) << "SetBlob: tid=" << tid << ", remove head blob "
+      VLOG(2) << "SetBlob: tid=" << tid << ", remove head blob "
               << pBlob->begin()->first << "\n";
       pBlob->erase(pBlob->begin());
     }
@@ -447,7 +448,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   } else {
     key_it->second = data;  // set data to existing blob
   }
-  VLOG(3) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n";
+  VLOG(2) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n";
   // lock will be automatically released when out of scope
   return;
 }
@@ -464,7 +465,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
   // Find KeyBlob for current thread firstly
   auto map_it = pMap->find(tid);
   if (map_it == pMap->end()) {
-    VLOG(3) << "GetBlob: tid=" << tid << ", miss tid\n";
+    VLOG(2) << "GetBlob: tid=" << tid << ", miss tid\n";
     return nullptr;
   }
   pBlob = map_it->second;
@@ -477,11 +478,11 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
   if (key_it == pBlob->end()) {
-    VLOG(3) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n";
+    VLOG(2) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n";
     return nullptr;
   }

-  VLOG(3) << "GetBlob tid=" << tid << ", get blob=" << name << "\n";
+  VLOG(2) << "GetBlob tid=" << tid << ", get blob=" << name << "\n";
   // lock will be automatically released when out of scope
   return key_it->second;
 }
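For reference, VLOG levels in glog-based code are silent by default; lowering these traces from level 3 to level 2 makes them easier to enable selectively. A minimal sketch of how such traces are surfaced, assuming a standard glog setup rather than anything Paddle-specific:

    #include <glog/logging.h>

    int main(int argc, char* argv[]) {
      google::InitGoogleLogging(argv[0]);
      FLAGS_logtostderr = true;  // print to stderr instead of log files
      FLAGS_v = 2;               // equivalent to running with --v=2 / GLOG_v=2
      VLOG(2) << "SetBlob: tid=" << 0 << ", add blob=" << "conv_p";  // printed
      VLOG(3) << "this one stays silent at v=2";
      return 0;
    }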
From 76db898aebe38b90e52e97289edc44ced873c60a Mon Sep 17 00:00:00 2001
From: luotao1
Date: Wed, 26 Jun 2019 00:09:31 +0800
Subject: [PATCH 05/21] detect model test for dynamic shape

---
 .../fluid/inference/tests/api/CMakeLists.txt  |   3 +
 .../tests/api/analyzer_detect_tester.cc       | 150 ++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 paddle/fluid/inference/tests/api/analyzer_detect_tester.cc

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 243f5cef00835..ec33df962e46e 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -157,6 +157,9 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)

+# detect
+inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc)
+
 ### Image classification tests with fake data
 set(IMG_CLASS_TEST_APP "test_analyzer_image_classification")
 set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc")

diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
new file mode 100644
index 0000000000000..d09f1ff81a218
--- /dev/null
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+DEFINE_string(infer_shape, "", "data shape file");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
+
+Record ProcessALine(const std::string &line, const std::string &shape_line) {
+  VLOG(3) << "process a line";
+  std::vector<std::string> columns;
+
+  Record record;
+  std::vector<std::string> data_strs;
+  split(line, ' ', &data_strs);
+  for (auto &d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector<std::string> shape_strs;
+  split(shape_line, ' ', &shape_strs);
+  for (auto &s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  // VLOG(3) << "data size " << record.data.size();
+  // VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(2) << "data shape size " << record.shape[3];
+  return record;
+}
+
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchIrDebug();
+  cfg->SwitchSpecifyInputNames(false);
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  std::string line;
+  std::ifstream file(FLAGS_infer_data);
+  std::string shape_line;
+  std::ifstream infer_file(FLAGS_infer_shape);
+
+  int iteration = FLAGS_test_all_data ? 1000 : 1;
+  for (int k = 0; k < iteration; k++) {
+    std::getline(file, line);
+    std::getline(infer_file, shape_line);
+    auto record = ProcessALine(line, shape_line);
+
+    PaddleTensor input;
+    input.shape = record.shape;
+    input.dtype = PaddleDType::FLOAT32;
+    size_t input_size = record.data.size() * sizeof(float);
+    input.data.Resize(input_size);
+    memcpy(input.data.data(), record.data.data(), input_size);
+    std::vector<PaddleTensor> input_slots;
+    input_slots.assign({input});
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy for profiling independently.
+// ocr, mobilenet and se_resnext50
+void profile(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
+  }
+  // cfg.pass_builder()->TurnOnDebug();
+  std::vector<std::vector<PaddleTensor>> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                 input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+TEST(Analyzer_vis, profile) { profile(); }
+
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
+
+// Check the fuse status
+TEST(Analyzer_vis, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  int num_ops;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
+  GetFuseStatis(predictor.get(), &num_ops);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+TEST(Analyzer_vis, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
+// Compare Deterministic result
+TEST(Analyzer_vis, compare_determine) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+                       input_slots_all);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
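The new tester pairs each data line with a shape line from a separate file. A small self-contained version of that parsing step, using std::istringstream instead of the split() helper from tester_helper.h:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct Record {
      std::vector<float> data;
      std::vector<int> shape;
    };

    Record ParseLine(const std::string& line, const std::string& shape_line) {
      Record r;
      std::istringstream ds(line), ss(shape_line);
      float f;
      while (ds >> f) r.data.push_back(f);   // space-separated float values
      int i;
      while (ss >> i) r.shape.push_back(i);  // space-separated dimensions
      return r;
    }

    int main() {
      auto r = ParseLine("0.5 1.5 2.5", "1 3 18 128");
      std::cout << r.data.size() << " values, " << r.shape.size() << " dims\n";
    }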
From 6a97049076230ef136f586dfdb16811a18802183 Mon Sep 17 00:00:00 2001
From: luotao1
Date: Wed, 26 Jun 2019 19:17:04 +0800
Subject: [PATCH 06/21] load input data one by one

---
 .../tests/api/analyzer_detect_tester.cc | 110 +++++++-----------
 1 file changed, 45 insertions(+), 65 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index d09f1ff81a218..5ee418534f6c5 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <iostream>
 #include <string>
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 DEFINE_string(infer_shape, "", "data shape file");
+DEFINE_int32(sample, 1, "number of sample");

 namespace paddle {
@@ -45,7 +46,8 @@ Record ProcessALine(const std::string &line, const std::string &shape_line) {
   }
   // VLOG(3) << "data size " << record.data.size();
   // VLOG(3) << "data shape size " << record.shape.size();
-  VLOG(2) << "data shape size " << record.shape[3];
+  // VLOG(2) << "data shape size " << record.shape[3];
+  LOG(INFO) << "data shape size " << record.shape[3];
   return record;
 }

@@ -57,28 +59,19 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
 }

-void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
-  std::string line;
-  std::ifstream file(FLAGS_infer_data);
-  std::string shape_line;
-  std::ifstream infer_file(FLAGS_infer_shape);
-
-  int iteration = FLAGS_test_all_data ? 1000 : 1;
-  for (int k = 0; k < iteration; k++) {
-    std::getline(file, line);
-    std::getline(infer_file, shape_line);
-    auto record = ProcessALine(line, shape_line);
-
-    PaddleTensor input;
-    input.shape = record.shape;
-    input.dtype = PaddleDType::FLOAT32;
-    size_t input_size = record.data.size() * sizeof(float);
-    input.data.Resize(input_size);
-    memcpy(input.data.data(), record.data.data(), input_size);
-    std::vector<PaddleTensor> input_slots;
-    input_slots.assign({input});
-    (*inputs).emplace_back(input_slots);
-  }
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
+              const std::string &line, const std::string &shape_line) {
+  auto record = ProcessALine(line, shape_line);
+
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
 }

 // Easy for profiling independently.
@@ -92,59 +85,46 @@ void profile(bool use_mkldnn = false) {
   }
   // cfg.pass_builder()->TurnOnDebug();
   std::vector<std::vector<PaddleTensor>> outputs;
-
   std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                 input_slots_all, &outputs, FLAGS_num_threads);
-}

-TEST(Analyzer_vis, profile) { profile(); }
+  Timer run_timer;
+  double elapsed_time = 0;

-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
-#endif
-
-// Check the fuse status
-TEST(Analyzer_vis, fuse_statis) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  int num_ops;
+  int iterations = FLAGS_sample;
+  int num_times = FLAGS_repeat;
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
-  GetFuseStatis(predictor.get(), &num_ops);
-}
+  outputs.resize(iterations);
+
+  for (int j = 0; j < num_times; j++) {
+    std::ifstream file(FLAGS_infer_data);
+    std::ifstream infer_file(FLAGS_infer_shape);
+    std::string line;
+    std::string shape_line;
+
+    for (int i = 0; i < iterations; i++) {
+      std::getline(file, line);
+      std::getline(infer_file, shape_line);
+      SetInput(&input_slots_all, line, shape_line);
+
+      run_timer.tic();
+      predictor->Run(input_slots_all[i], &outputs[i], FLAGS_batch_size);
+      elapsed_time += run_timer.toc();
+    }
+    file.close();
+    infer_file.close();
+  }

-// Compare result of NativeConfig and AnalysisConfig
-void compare(bool use_mkldnn = false) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-  if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
-  }
+  auto batch_latency = elapsed_time / (iterations * num_times);
+  PrintTime(FLAGS_batch_size, num_times, FLAGS_num_threads, 0, batch_latency,
+            iterations, VarType::FP32);
+}

-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareNativeAndAnalysis(
-      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
-}
+TEST(Analyzer_vis, profile) { profile(); }

-TEST(Analyzer_vis, compare) { compare(); }
 #ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
+TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
 #endif

-// Compare Deterministic result
-TEST(Analyzer_vis, compare_determine) {
-  AnalysisConfig cfg;
-  SetConfig(&cfg);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  SetInput(&input_slots_all);
-  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-                       input_slots_all);
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
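run_timer above is the Timer helper from tester_helper.h; the measurement pattern itself is just tic/toc around each Run() call, excluding input loading. An equivalent sketch with std::chrono, under the assumption that only the prediction call should be timed:

    #include <chrono>
    #include <iostream>

    int main() {
      double elapsed_ms = 0;
      const int iterations = 100;
      for (int i = 0; i < iterations; i++) {
        // ... load one input line here (excluded from the timing) ...
        auto start = std::chrono::steady_clock::now();
        // ... predictor->Run(...) would go here ...
        auto end = std::chrono::steady_clock::now();
        elapsed_ms +=
            std::chrono::duration<double, std::milli>(end - start).count();
      }
      std::cout << "avg latency: " << elapsed_ms / iterations << " ms\n";
    }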
From 1a473737a8ad6cc94d01c0b469a31e5e6d0b3ce7 Mon Sep 17 00:00:00 2001
From: luotao1
Date: Wed, 26 Jun 2019 20:06:27 +0800
Subject: [PATCH 07/21] each iteration use new threads

---
 .../tests/api/analyzer_detect_tester.cc | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 5ee418534f6c5..72889f9a61dea 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -95,6 +95,8 @@ void profile(bool use_mkldnn = false) {
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
   outputs.resize(iterations);

+  std::vector<std::thread> threads;
+
   for (int j = 0; j < num_times; j++) {
     std::ifstream file(FLAGS_infer_data);
     std::ifstream infer_file(FLAGS_infer_shape);
@@ -102,14 +104,20 @@ void profile(bool use_mkldnn = false) {
     std::string shape_line;

     for (int i = 0; i < iterations; i++) {
-      std::getline(file, line);
-      std::getline(infer_file, shape_line);
-      SetInput(&input_slots_all, line, shape_line);
-
-      run_timer.tic();
-      predictor->Run(input_slots_all[i], &outputs[i], FLAGS_batch_size);
-      elapsed_time += run_timer.toc();
+      threads.emplace_back([&, i]() {
+        std::getline(file, line);
+        std::getline(infer_file, shape_line);
+        SetInput(&input_slots_all, line, shape_line);
+
+        run_timer.tic();
+        predictor->Run(input_slots_all[i], &outputs[i], FLAGS_batch_size);
+        elapsed_time += run_timer.toc();
+      });
+      LOG(INFO) << "threads size: " << threads.size();
+      threads[0].join();
+      threads.clear();
     }

     file.close();
     infer_file.close();
   }

From d27c75705bf7bef619993f746c5b05f043e20759 Mon Sep 17 00:00:00 2001
From: luotao1
Date: Thu, 27 Jun 2019 16:13:04 +0800
Subject: [PATCH 08/21] fix input_slot_all memory leak

---
 .../fluid/inference/tests/api/analyzer_detect_tester.cc | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 72889f9a61dea..7213e119eaef3 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -46,8 +46,7 @@ Record ProcessALine(const std::string &line, const std::string &shape_line) {
   }
   // VLOG(3) << "data size " << record.data.size();
   // VLOG(3) << "data shape size " << record.shape.size();
-  // VLOG(2) << "data shape size " << record.shape[3];
-  LOG(INFO) << "data shape size " << record.shape[3];
+  // LOG(INFO) << "data shape size " << record.shape[3];
   return record;
 }

@@ -81,7 +80,6 @@ void profile(bool use_mkldnn = false) {
   SetConfig(&cfg);
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
   }
   // cfg.pass_builder()->TurnOnDebug();
   std::vector<std::vector<PaddleTensor>> outputs;
@@ -110,12 +108,13 @@ void profile(bool use_mkldnn = false) {
         SetInput(&input_slots_all, line, shape_line);

         run_timer.tic();
-        predictor->Run(input_slots_all[i], &outputs[i], FLAGS_batch_size);
+        predictor->Run(input_slots_all[0], &outputs[0], FLAGS_batch_size);
         elapsed_time += run_timer.toc();
       });
-      LOG(INFO) << "threads size: " << threads.size();
       threads[0].join();
       threads.clear();
+      if (i % 100 == 0) LOG(INFO) << i << " samples";
+      std::vector<std::vector<PaddleTensor>>().swap(input_slots_all);
     }

     file.close();
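The swap idiom at the end of the loop is what actually releases memory: clear() keeps a vector's capacity, while swapping with a temporary frees it. A compact illustration:

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<std::vector<int>> slots(1000, std::vector<int>(1000));
      slots.clear();  // size becomes 0, but capacity (heap block) is retained
      std::cout << "capacity after clear: " << slots.capacity() << "\n";
      std::vector<std::vector<int>>().swap(slots);  // capacity released
      std::cout << "capacity after swap: " << slots.capacity() << "\n";  // 0
    }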
From 634d8c62f44ac737b2a3d2ed1ae4f609652da8de Mon Sep 17 00:00:00 2001
From: luotao1
Date: Thu, 27 Jun 2019 18:07:28 +0800
Subject: [PATCH 09/21] change KeyBlob from vector to map for speedup

---
 paddle/fluid/platform/device_context.cc | 14 +++-----------
 paddle/fluid/platform/device_context.h  |  2 +-
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6b3317ef2247f..61c9586e6b7c6 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -433,11 +433,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   }

   // Find Key in found (or newly created) KeyBlob
-  auto key_it = std::find_if(
-      pBlob->begin(), pBlob->end(),
-      [=](std::pair<std::string, std::shared_ptr<void>> const& obj) {
-        return obj.first == name;
-      });
+  auto key_it = pBlob->find(name);

   if (key_it == pBlob->end()) {
     // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity
@@ -446,7 +442,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
               << pBlob->begin()->first << "\n";
       pBlob->erase(pBlob->begin());
     }
-    pBlob->push_back(std::make_pair(name, data));
+    (*pBlob)[name] = data;
   } else {
     key_it->second = data;  // set data to existing blob
   }
@@ -473,11 +469,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
   pBlob = map_it->second;

   // Find Blob via name
-  auto key_it = std::find_if(
-      pBlob->begin(), pBlob->end(),
-      [=](std::pair<std::string, std::shared_ptr<void>> const& obj) {
-        return obj.first == name;
-      });
+  auto key_it = pBlob->find(name);

   if (key_it == pBlob->end()) {
     VLOG(2) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n";

diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index cf1c29a5d9b18..0da64aea4297d 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -378,7 +378,7 @@ struct DefaultDeviceContextType {
 #endif

 #ifdef PADDLE_WITH_MKLDNN
-using KeyBlob = std::vector<std::pair<std::string, std::shared_ptr<void>>>;
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
 using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;

 void set_cur_thread_id(int);
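Switching KeyBlob back to std::unordered_map makes each lookup amortized O(1) instead of a linear scan. One consequence worth noting: pBlob->begin() on an unordered_map is no longer the oldest entry, so the capacity check above now evicts an arbitrary element rather than following FIFO order. A small sketch of the lookup difference:

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    int main() {
      std::unordered_map<std::string, std::shared_ptr<void>> cache;
      cache["conv_p"] = std::make_shared<int>(1);

      auto it = cache.find("conv_p");  // amortized O(1) hash lookup
      if (it != cache.end()) {
        std::cout << "hit: " << it->first << "\n";
      }
      // cache.begin() points at an unspecified element here, not the oldest.
    }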
From 2de7f415fb3abfb327381e3bc078d752c067b3b5 Mon Sep 17 00:00:00 2001
From: Pawel Piotrowicz
Date: Thu, 27 Jun 2019 14:08:45 +0200
Subject: [PATCH 10/21] PaddlePaddle memory leak

test=develop
---
 paddle/fluid/framework/details/op_registry.h  |  5 +++--
 paddle/fluid/framework/op_desc.cc             |  2 +-
 paddle/fluid/framework/op_info.h              | 10 ++++++----
 paddle/fluid/framework/op_proto_maker.cc      |  5 +++--
 paddle/fluid/framework/op_proto_maker.h       | 10 ++++++----
 paddle/fluid/framework/op_proto_maker_test.cc | 12 ++++++------
 6 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index 0f03ca51da778..519e007161c1c 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <string>
 #include <tuple>
 #include <type_traits>
@@ -161,8 +162,8 @@ struct OpInfoFiller {
 template <typename T>
 struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
-    info->proto_ = new proto::OpProto;
-    info->checker_ = new OpAttrChecker();
+    info->proto_ = std::make_shared<proto::OpProto>();
+    info->checker_ = std::make_shared<OpAttrChecker>();
     T maker;
     maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);

diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 1ea93b7638a85..bf38bfa07e9fd 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -668,7 +668,7 @@ static void InitInferShapeFuncs() {
 void OpDesc::CheckAttrs() {
   PADDLE_ENFORCE(!Type().empty(),
                  "CheckAttr() can not be called before type is setted.");
-  auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
+  auto checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
     // not by users.

diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index daa72769c4957..ad21d6b7d86ed 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -15,8 +15,10 @@ limitations under the License. */
 #pragma once
 #include <functional>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <vector>

 #include "paddle/fluid/framework/attribute.h"
@@ -36,13 +38,13 @@ class InferShapeBase {
 struct OpInfo {
   OpCreator creator_;
   GradOpMakerFN grad_op_maker_;
-  proto::OpProto* proto_{nullptr};
-  OpAttrChecker* checker_{nullptr};
+  std::shared_ptr<proto::OpProto> proto_;
+  std::shared_ptr<OpAttrChecker> checker_;
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
   InferInplaceOpFN infer_inplace_;
   InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
-
+  OpInfo() : proto_{nullptr}, checker_{nullptr} {}
   // NOTE(zjl): this flag is added to check whether
   // the grad maker is the default one.
   bool use_default_grad_op_desc_maker_{false};
@@ -70,7 +72,7 @@ struct OpInfo {
     return grad_op_maker_;
   }

-  const OpAttrChecker* Checker() const { return checker_; }
+  const std::shared_ptr<OpAttrChecker> Checker() const { return checker_; }

   const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
     return infer_no_need_buffer_vars_;

diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 2311614c335a5..06ffdea636578 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -56,8 +56,9 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }

-void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
-                                        OpAttrChecker* attr_checker) {
+void OpProtoAndCheckerMaker::operator()(
+    std::shared_ptr<proto::OpProto> proto,
+    std::shared_ptr<OpAttrChecker> attr_checker) {
   proto_ = proto;
   op_checker_ = attr_checker;
   Make();

diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 5f3ce60e1d94e..3e415d0ba904e 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -13,7 +13,9 @@ limitations under the License. */
 #pragma once

+#include <memory>
 #include <string>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -49,7 +51,8 @@ class OpProtoAndCheckerMaker {
   static const char *OpNamescopeAttrName() { return "op_namescope"; }
   static const char *OpCreationCallstackAttrName() { return "op_callstack"; }

-  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
+  void operator()(std::shared_ptr<proto::OpProto> proto,
+                  std::shared_ptr<OpAttrChecker> attr_checker);

   virtual void Make() = 0;

@@ -99,9 +102,8 @@ class OpProtoAndCheckerMaker {
  private:
   void CheckNoDuplicatedInOutAttrs();
   void Validate();
-
-  proto::OpProto *proto_;
-  OpAttrChecker *op_checker_;
+  std::shared_ptr<proto::OpProto> proto_;
+  std::shared_ptr<OpAttrChecker> op_checker_;
   bool validated_{false};
 };
 }  // namespace framework

diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index a8030d377fdb4..e28048137c5fc 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -25,10 +25,10 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedAttr) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
+  auto op_proto = std::make_shared<paddle::framework::proto::OpProto>();
+  auto op_checker = std::make_shared<paddle::framework::OpAttrChecker>();
   TestAttrProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  ASSERT_THROW(proto_maker(op_proto, op_checker),
                paddle::platform::EnforceNotMet);
 }

@@ -41,9 +41,9 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedInOut) {
-  paddle::framework::proto::OpProto op_proto;
-  paddle::framework::OpAttrChecker op_checker;
+  auto op_proto = std::make_shared<paddle::framework::proto::OpProto>();
+  auto op_checker = std::make_shared<paddle::framework::OpAttrChecker>();
   TestAttrProtoMaker proto_maker;
-  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+  ASSERT_THROW(proto_maker(op_proto, op_checker),
                paddle::platform::EnforceNotMet);
 }
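The leak fixed here is the classic raw-pointer-ownership problem: OpInfo held `new`ed proto/checker pointers that nothing ever deleted. Moving to std::shared_ptr ties the lifetime to the last user. A reduced sketch of the before/after shape of the fix (names simplified, not the actual Paddle types):

    #include <memory>

    struct Proto {};
    struct Checker {};

    // Before: raw pointers, never freed -> leaked when the registry went away.
    struct InfoLeaky {
      Proto* proto_{nullptr};
      Checker* checker_{nullptr};
    };

    // After: shared ownership, freed automatically with the last reference.
    struct Info {
      std::shared_ptr<Proto> proto_;
      std::shared_ptr<Checker> checker_;
    };

    int main() {
      Info info;
      info.proto_ = std::make_shared<Proto>();
      info.checker_ = std::make_shared<Checker>();
      // Both objects are released when `info` (and any copies) go out of scope.
    }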
From e8305ea4b0a85da2aab8f182f3bb9534e62023a3 Mon Sep 17 00:00:00 2001
From: luotao1
Date: Fri, 28 Jun 2019 00:39:50 +0800
Subject: [PATCH 11/21] use input_shape to setBlob

---
 .../fluid/inference/api/analysis_predictor.cc |  8 ++++
 .../tests/api/analyzer_detect_tester.cc       |  1 +
 paddle/fluid/platform/device_context.cc       | 44 +++++++++++++++----
 paddle/fluid/platform/device_context.h        |  5 ++-
 4 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index e839b6c2c93cf..99f764573bc1e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -205,6 +205,14 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
           << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
   if (paddle::platform::get_cur_thread_id() == 0)
     paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
+  if (paddle::platform::get_cur_thread_id() == -1) {
+    std::stringstream ss;
+    for (size_t i = 0; i < inputs[0].shape.size(); ++i) {
+      ss << inputs[0].shape[i] << "-";
+    }
+    VLOG(2) << "Set input shape=" << ss.str();
+    paddle::platform::set_cur_input_shape_str(ss.str());
+  }
 #endif

   VLOG(3) << "Predictor::predict";

diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 7213e119eaef3..10947d4594ae0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -80,6 +80,7 @@ void profile(bool use_mkldnn = false) {
   SetConfig(&cfg);
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
+    cfg.SetMKLDNNThreadId(-1);
   }
   // cfg.pass_builder()->TurnOnDebug();
   std::vector<std::vector<PaddleTensor>> outputs;

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 61c9586e6b7c6..9cd7fe3364b64 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -403,11 +403,16 @@ MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
 namespace {
 // Current thread's id.
 thread_local int cur_thread_id = 0;
+thread_local std::string cur_input_shape_str = "";
 }

 void set_cur_thread_id(int tid) { cur_thread_id = tid; }
 int get_cur_thread_id(void) { return cur_thread_id; }
+void set_cur_input_shape_str(std::string input_shape_str) {
+  cur_input_shape_str = input_shape_str;
+}
+std::string get_cur_input_shape_str(void) { return cur_input_shape_str; }
-#define MKLDNN_CAP 10000
+#define MKLDNN_CAP 10

 void MKLDNNDeviceContext::ResetBlobMap() const { p_blobmap_->clear(); }

@@ -415,6 +420,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
   BlobMap* pMap = p_blobmap_.get();
   std::shared_ptr<KeyBlob> pBlob = nullptr;
+  std::shared_ptr<Blob> blob = nullptr;

   int tid = platform::get_cur_thread_id();
@@ -438,19 +444,29 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
     pBlob = map_it->second;
   }

+  std::string cur_input_shape_str = platform::get_cur_input_shape_str();
   // Find Key in found (or newly created) KeyBlob
-  auto key_it = pBlob->find(name);
+  auto key_it = pBlob->find(cur_input_shape_str);

   if (key_it == pBlob->end()) {
     // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity
     if ((tid == -1) && (pBlob->size() > MKLDNN_CAP)) {
-      VLOG(2) << "SetBlob: tid=" << tid << ", remove head blob "
-              << pBlob->begin()->first << "\n";
-      pBlob->erase(pBlob->begin());
+      VLOG(2) << "tid=" << tid
+              << ", remove all head blob of shape: " << pBlob->begin()->first
+              << "\n";
+      pBlob->erase(pBlob->begin()->first);
     }
-    (*pBlob)[name] = data;
+    blob = std::shared_ptr<Blob>(new Blob());
+    (*pBlob)[cur_input_shape_str] = blob;
+  } else {
+    blob = key_it->second;
+  }
+
+  // Find Blob via name
+  auto blob_it = blob->find(name);
+  if (blob_it == blob->end()) {
+    (*blob)[name] = data;
   } else {
-    key_it->second = data;  // set data to existing blob
+    blob_it->second = data;  // set data to existing blob
   }
   VLOG(2) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n";
   // lock will be automatically released when out of scope
@@ -461,6 +477,7 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
   BlobMap* pMap = p_blobmap_.get();
   std::shared_ptr<KeyBlob> pBlob = nullptr;
+  std::shared_ptr<Blob> blob = nullptr;

   int tid = platform::get_cur_thread_id();
@@ -473,12 +490,21 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     VLOG(2) << "GetBlob: tid=" << tid << ", miss tid\n";
     return nullptr;
   }
+  std::string cur_input_shape_str = platform::get_cur_input_shape_str();
   pBlob = map_it->second;

+  auto pBlob_it = pBlob->find(cur_input_shape_str);
+  if (pBlob_it == pBlob->end()) {
+    VLOG(2) << "GetBlob: tid=" << cur_input_shape_str
+            << ", miss input_shape_str\n";
+    return nullptr;
+  }
+  blob = pBlob_it->second;
+
   // Find Blob via name
-  auto key_it = pBlob->find(name);
+  auto key_it = blob->find(name);

-  if (key_it == pBlob->end()) {
+  if (key_it == blob->end()) {
     VLOG(2) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n";
     return nullptr;
   }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 0da64aea4297d..5571035f51090 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -378,11 +378,14 @@ struct DefaultDeviceContextType {
 #endif

 #ifdef PADDLE_WITH_MKLDNN
-using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using Blob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<Blob>>;
 using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;

 void set_cur_thread_id(int);
 int get_cur_thread_id(void);
+void set_cur_input_shape_str(std::string input_shape_str);
+std::string get_cur_input_shape_str(void);

 class MKLDNNDeviceContext : public CPUDeviceContext {
  public:

From 725f45101f20989e6e0f3eab65026f01f2c3e9de Mon Sep 17 00:00:00 2001
From: luotao1
Date: Fri, 28 Jun 2019 13:06:23 +0800
Subject: [PATCH 12/21] refine device_context, make BlobMap etc. cleaner

---
 paddle/fluid/platform/device_context.cc | 61 +++++++++++++------------
 paddle/fluid/platform/device_context.h  | 12 +++--
 2 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 9cd7fe3364b64..295b3ea53be2b 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -401,8 +401,12 @@ MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
 }

 namespace {
-// Current thread's id.
+// Current thread's id. -1 means cache clearing mode.
 thread_local int cur_thread_id = 0;
+// Current data input shape string.
+// - If cur_thread_id != -1, it is an empty string by default.
+// - Else, for a 4-dimension input [1, 3, 18, 128],
+//   cur_input_shape_str = 1-3-18-128- .
 thread_local std::string cur_input_shape_str = "";
 }

@@ -419,46 +423,46 @@ void MKLDNNDeviceContext::ResetBlobMap() const { p_blobmap_->clear(); }

 void MKLDNNDeviceContext::SetBlob(const std::string& name,
                                   std::shared_ptr<void> data) const {
   BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<ShapeBlob> sBlob = nullptr;
   std::shared_ptr<KeyBlob> pBlob = nullptr;
-  std::shared_ptr<Blob> blob = nullptr;

   int tid = platform::get_cur_thread_id();

   std::lock_guard<std::mutex> lock(*p_mutex_);

-  // Find KeyBlob for current thread
+  // Find ShapeBlob for current thread
   auto map_it = pMap->find(tid);

   if (map_it == pMap->end()) {
     // 1st time to set blob in current thread
-    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
-    (*pMap)[tid] = pBlob;
+    sBlob = std::shared_ptr<ShapeBlob>(new ShapeBlob());
+    (*pMap)[tid] = sBlob;
     VLOG(2) << "SetBlob: tid=" << tid << ", add new tid\n";
   } else {
-    pBlob = map_it->second;
+    sBlob = map_it->second;
   }

+  // Find KeyBlob for current input shape
   std::string cur_input_shape_str = platform::get_cur_input_shape_str();
-  // Find Key in found (or newly created) KeyBlob
-  auto key_it = pBlob->find(cur_input_shape_str);
+  auto key_it = sBlob->find(cur_input_shape_str);

-  if (key_it == pBlob->end()) {
-    // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity
-    if ((tid == -1) && (pBlob->size() > MKLDNN_CAP)) {
+  if (key_it == sBlob->end()) {
+    // tid = -1 means cache clearing mode, MKLDNN_CAP defines max pblob capacity
+    if ((tid == -1) && (sBlob->size() > MKLDNN_CAP)) {
       VLOG(2) << "tid=" << tid
-              << ", remove all head blob of shape: " << pBlob->begin()->first
-              << "\n";
-      pBlob->erase(pBlob->begin()->first);
+              << ", remove all head blob of shape: " << sBlob->begin()->first;
+      sBlob->erase(sBlob->begin()->first);
     }
-    blob = std::shared_ptr<Blob>(new Blob());
-    (*pBlob)[cur_input_shape_str] = blob;
+    pBlob = std::shared_ptr<KeyBlob>(new KeyBlob());
+    (*sBlob)[cur_input_shape_str] = pBlob;
   } else {
-    blob = key_it->second;
+    pBlob = key_it->second;
   }

   // Find Blob via name
-  auto blob_it = blob->find(name);
-  if (blob_it == blob->end()) {
-    (*blob)[name] = data;
+  auto blob_it = pBlob->find(name);
+  if (blob_it == pBlob->end()) {
+    (*pBlob)[name] = data;
   } else {
     blob_it->second = data;  // set data to existing blob
   }
   VLOG(2) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n";
   // lock will be automatically released when out of scope
   return;
 }

@@ -470,34 +474,35 @@ std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
     const std::string& name) const {
   BlobMap* pMap = p_blobmap_.get();
+  std::shared_ptr<ShapeBlob> sBlob = nullptr;
   std::shared_ptr<KeyBlob> pBlob = nullptr;
-  std::shared_ptr<Blob> blob = nullptr;

   int tid = platform::get_cur_thread_id();

   std::lock_guard<std::mutex> lock(*p_mutex_);

-  // Find KeyBlob for current thread firstly
+  // Find ShapeBlob for current thread firstly
   auto map_it = pMap->find(tid);
   if (map_it == pMap->end()) {
     VLOG(2) << "GetBlob: tid=" << tid << ", miss tid\n";
     return nullptr;
   }
   std::string cur_input_shape_str = platform::get_cur_input_shape_str();
-  pBlob = map_it->second;
+  sBlob = map_it->second;

-  auto pBlob_it = pBlob->find(cur_input_shape_str);
-  if (pBlob_it == pBlob->end()) {
+  // Find KeyBlob for current input shape secondly
+  auto sBlob_it = sBlob->find(cur_input_shape_str);
+  if (sBlob_it == sBlob->end()) {
     VLOG(2) << "GetBlob: tid=" << cur_input_shape_str
             << ", miss input_shape_str\n";
     return nullptr;
   }
-  blob = pBlob_it->second;
+  pBlob = sBlob_it->second;

   // Find Blob via name
-  auto key_it = blob->find(name);
+  auto key_it = pBlob->find(name);

-  if (key_it == blob->end()) {
+  if (key_it == pBlob->end()) {
     VLOG(2) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n";
     return nullptr;
   }

diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 5571035f51090..5afecb4fbf165 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -378,9 +378,15 @@ struct DefaultDeviceContextType {
 #endif

 #ifdef PADDLE_WITH_MKLDNN
+// The following three maps are used to cache MKLDNN primitives.
+// Their relations are:
+// - BlobMap = Map<cur_thread_id, ShapeBlob>
+// - ShapeBlob = Map<cur_input_shape_str, KeyBlob>
+// - KeyBlob = Map<blob_name, blob>
+// Where:
-using Blob = std::unordered_map<std::string, std::shared_ptr<void>>;
-using KeyBlob = std::unordered_map<std::string, std::shared_ptr<Blob>>;
-using BlobMap = std::unordered_map<int, std::shared_ptr<KeyBlob>>;
+using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
+using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;
+using BlobMap = std::unordered_map<int, std::shared_ptr<ShapeBlob>>;

 void set_cur_thread_id(int);
 int get_cur_thread_id(void);
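With the ShapeBlob level in place, a cached primitive is addressed by three coordinates: thread id, input shape string, and blob name. A self-contained sketch of a lookup through the same three-level structure (typedefs copied from the patch above; the lookup helper is illustrative, not the Paddle method):

    #include <iostream>
    #include <memory>
    #include <string>
    #include <unordered_map>

    using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
    using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;
    using BlobMap = std::unordered_map<int, std::shared_ptr<ShapeBlob>>;

    std::shared_ptr<void> GetBlob(const BlobMap& map, int tid,
                                  const std::string& shape,
                                  const std::string& name) {
      auto t = map.find(tid);           // 1) thread id
      if (t == map.end()) return nullptr;
      auto s = t->second->find(shape);  // 2) shape string, e.g. "1-3-18-128-"
      if (s == t->second->end()) return nullptr;
      auto k = s->second->find(name);   // 3) blob name
      return k == s->second->end() ? nullptr : k->second;
    }

    int main() {
      BlobMap map;  // empty, so the lookup misses
      std::cout << (GetBlob(map, 1, "1-3-18-128-", "conv_p") == nullptr) << "\n";
    }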
@@ -621,15 +634,6 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 
 bool AnalysisPredictor::ZeroCopyRun() {
   paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
-#ifdef PADDLE_WITH_MKLDNN
-  // TODO(intel): will refactor this code later
-  // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case
-  VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
-          << paddle::platform::get_cur_thread_id()
-          << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
-  if (paddle::platform::get_cur_thread_id() == 0)
-    paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
-#endif
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
@@ -638,15 +642,6 @@ bool AnalysisPredictor::ZeroCopyRun() {
   // recover the cpu_math_library_num_threads to 1, in order to avoid thread
   // conflict when integrating it into deployment service.
   paddle::platform::SetNumThreads(1);
-#ifdef PADDLE_WITH_MKLDNN
-  // TODO(intel): will refactor this code later
-  // reset thread id to avoid confusion when thread is reused from pool again
-  // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
-  if (paddle::platform::get_cur_thread_id() == -1) {
-    VLOG(2) << "Clear previous mkldnn thread id setting\n";
-    paddle::platform::set_cur_thread_id(0);
-  }
-#endif
   return true;
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index b5e134ced70f8..ccfe21293465c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -111,6 +111,11 @@ class AnalysisPredictor : public PaddlePredictor {
   template <typename T>
   void GetFetchOne(const framework::LoDTensor &fetchs,
                    PaddleTensor *output_data);
+  // Pre-process and Post-process for Mkldnn multi-thread and dynamic shape
+  // input. Used in AnalysisPredictor::Run(), do not support
+  // AnalysisPredictor::ZeroCopyRun() now.
+  void MkldnnPreRun(const std::vector<PaddleTensor> &inputs);
+  void MkldnnPostRun();
 
 #if PADDLE_WITH_TENSORRT
   // When we use Paddle-TRT INT8 engine, we need to generate calibration table
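Before patch 14 makes the shape-cache capacity configurable, it is worth restating how a lookup walks the three-level cache that patch 12 introduced. A self-contained sketch, assuming the typedefs above (FindBlob is an illustrative helper, not a function in the patches; the real logic lives in MKLDNNDeviceContext::GetBlob and takes a lock first):

    #include <memory>
    #include <string>
    #include <unordered_map>

    using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
    using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;
    using BlobMap = std::unordered_map<int, std::shared_ptr<ShapeBlob>>;

    // Walks thread id -> input shape -> blob name, returning nullptr on any
    // miss, mirroring the three early returns in GetBlob above.
    std::shared_ptr<void> FindBlob(const BlobMap& cache, int tid,
                                   const std::string& shape_str,
                                   const std::string& name) {
      auto s_it = cache.find(tid);                // level 1: per-thread ShapeBlob
      if (s_it == cache.end()) return nullptr;
      auto k_it = s_it->second->find(shape_str);  // level 2: per-shape KeyBlob
      if (k_it == s_it->second->end()) return nullptr;
      auto b_it = k_it->second->find(name);       // level 3: named blob
      return b_it == k_it->second->end() ? nullptr : b_it->second;
    }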
From 266a2010863a124838add3fa865cebc60230101e Mon Sep 17 00:00:00 2001
From: luotao1
Date: Fri, 28 Jun 2019 18:40:10 +0800
Subject: [PATCH 14/21] add EnableMKLDNN(int mkldnn_input_shape_cache_size = 1) interface

---
 paddle/fluid/inference/api/analysis_config.cc     | 14 +++-----------
 paddle/fluid/inference/api/analysis_predictor.cc  | 13 ++++++++-----
 .../fluid/inference/api/paddle_analysis_config.h  |  8 ++++++--
 .../inference/tests/api/analyzer_detect_tester.cc |  3 +--
 paddle/fluid/platform/device_context.cc           | 15 +++++++++++----
 paddle/fluid/platform/device_context.h            |  2 ++
 6 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index b1221984f66b5..5797e97b97208 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -114,7 +114,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
-  CP_MEMBER(mkldnn_thread_id_);
+  CP_MEMBER(mkldnn_input_shape_cache_size_);
   // Quantization related.
   CP_MEMBER(use_mkldnn_quantizer_);
   CP_MEMBER(mkldnn_quantizer_config_);
@@ -151,9 +151,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   Update();
 }
 
-void AnalysisConfig::EnableMKLDNN() {
+void AnalysisConfig::EnableMKLDNN(int mkldnn_input_shape_cache_size) {
 #ifdef PADDLE_WITH_MKLDNN
   use_mkldnn_ = true;
+  mkldnn_input_shape_cache_size_ = mkldnn_input_shape_cache_size;
 #else
   LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
   use_mkldnn_ = false;
@@ -162,15 +163,6 @@ void AnalysisConfig::EnableMKLDNN() {
   Update();
 }
 
-void AnalysisConfig::SetMKLDNNThreadId(int id) {
-#ifdef PADDLE_WITH_MKLDNN
-  mkldnn_thread_id_ = id;
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id";
-  mkldnn_thread_id_ = 0;
-#endif
-}
-
 void AnalysisConfig::EnableMkldnnQuantizer() {
 #ifdef PADDLE_WITH_MKLDNN
   if (!mkldnn_quantizer_config_)
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 65a5d062dbb12..1067fec4f9314 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -195,13 +195,16 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
 
 void AnalysisPredictor::MkldnnPreRun(const std::vector<PaddleTensor> &inputs) {
   // TODO(intel, luotao): will refactor this code later
-  // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case
+  // Make sure it does not conflict with AnalysisPredictor::SetMkldnnThreadID case
   VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
-          << paddle::platform::get_cur_thread_id()
-          << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
-  if (paddle::platform::get_cur_thread_id() == 0)
-    paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
+          << paddle::platform::get_cur_thread_id();
   // -1 means cache cleaning mode.
+  if (paddle::platform::get_cur_thread_id() == 0 &&
+      config_.mkldnn_input_shape_cache_size_ > 1) {
+    paddle::platform::set_cur_thread_id(-1);
+    paddle::platform::set_cur_input_shape_cache_size(
+        config_.mkldnn_input_shape_cache_size_);
+  }
   // Set current_input_shape for caching dynamic shape.
   // Only used when batch_size=1.
   if (paddle::platform::get_cur_thread_id() == -1) {
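For reference, the per-shape cache key that MkldnnPreRun computes in the hunk above is just the dash-joined input dimensions. A small sketch under that assumption (ShapeKey is our name for it; the patch builds the string inline with a std::stringstream):

    #include <sstream>
    #include <string>
    #include <vector>

    // Dash-joined dimensions, exactly the format logged as "Set input shape=".
    std::string ShapeKey(const std::vector<int>& shape) {
      std::stringstream ss;
      for (size_t i = 0; i < shape.size(); ++i) ss << shape[i] << "-";
      return ss.str();
    }
    // e.g. ShapeKey({1, 3, 18, 128}) == "1-3-18-128-"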
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 43fd321fa27ae..da981dbac333d 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -177,14 +177,15 @@ struct AnalysisConfig {
   bool ngraph_enabled() const { return use_ngraph_; }
 
   /** Turn on MKLDNN.
+   * And set the cache size of different input shapes for MKLDNN.
+   * Default 1 means fixed input shape, not dynamic shape.
    */
-  void EnableMKLDNN();
+  void EnableMKLDNN(int mkldnn_input_shape_cache_size = 1);
   /** A boolean state telling whether to use the MKLDNN.
    */
   bool mkldnn_enabled() const { return use_mkldnn_; }
 
   /** Set MKLDNN thread id.
    */
-  void SetMKLDNNThreadId(int id);
 
   /** Set and get the number of cpu math library threads.
    */
@@ -317,8 +318,11 @@ struct AnalysisConfig {
   std::vector<std::string> anakin_ops_filter_;
   std::map<std::string, std::string> engine_opt_info_;
 
+  // mkldnn related.
+  int mkldnn_input_shape_cache_size_{1};
   bool use_mkldnn_quantizer_{false};
   std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
+
   // If the config is already used on a predictor, it becomes invalid.
   mutable bool is_valid_{true};
   // Any config can only be used with one predictor.
diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
index 10947d4594ae0..e496030c79895 100644
--- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
@@ -79,8 +79,7 @@ void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
   if (use_mkldnn) {
-    cfg.EnableMKLDNN();
-    cfg.SetMKLDNNThreadId(-1);
+    cfg.EnableMKLDNN(10);
   }
   // cfg.pass_builder()->TurnOnDebug();
   std::vector<std::vector<PaddleTensor>> outputs;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 295b3ea53be2b..f275681e97eda 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -408,7 +408,10 @@ thread_local int cur_thread_id = 0;
 // - Else, for a 4-dimension input [1, 3, 18, 128],
 //   cur_input_shape_str = 1-3-18-128- .
 thread_local std::string cur_input_shape_str = "";
-}
+// the cache size of different input shapes for MKLDNN.
+// Default 1 means fixed input shape, not dynamic shape.
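Patch 14 replaces the hard-coded MKLDNN_CAP with the per-thread capacity declared just below, which the SetBlob hunk further down consumes. A simplified sketch of that consumption (MaybeEvict is an illustrative name; note that std::unordered_map::begin() points at an arbitrary element, so this bounds the cache size but is not a strict FIFO, despite the "remove all head blob" log message):

    #include <memory>
    #include <string>
    #include <unordered_map>

    using KeyBlob = std::unordered_map<std::string, std::shared_ptr<void>>;
    using ShapeBlob = std::unordered_map<std::string, std::shared_ptr<KeyBlob>>;

    // In cache clearing mode (tid == -1), drop one cached shape once the
    // number of distinct input shapes exceeds the configured capacity.
    void MaybeEvict(ShapeBlob* s_blob, int tid, size_t capacity) {
      if (tid == -1 && s_blob->size() > capacity) {
        s_blob->erase(s_blob->begin());  // an arbitrary entry, not the oldest
      }
    }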
+thread_local int cur_input_shape_cache_size = 1; +} // namespace void set_cur_thread_id(int tid) { cur_thread_id = tid; } int get_cur_thread_id(void) { return cur_thread_id; } @@ -416,7 +419,10 @@ void set_cur_input_shape_str(std::string input_shape_str) { cur_input_shape_str = input_shape_str; } std::string get_cur_input_shape_str(void) { return cur_input_shape_str; } -#define MKLDNN_CAP 10 +void set_cur_input_shape_cache_size(int input_shape_cache_size) { + cur_input_shape_cache_size = input_shape_cache_size; +} +int get_cur_input_shape_cache_size(void) { return cur_input_shape_cache_size; } void MKLDNNDeviceContext::ResetBlobMap() const { p_blobmap_->clear(); } @@ -447,8 +453,9 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, auto key_it = sBlob->find(cur_input_shape_str); if (key_it == sBlob->end()) { - // tid = -1 means cache clearing mode, MKLDNN_CAP defines max pblob capacity - if ((tid == -1) && (sBlob->size() > MKLDNN_CAP)) { + // tid = -1 means cache clearing mode, cur_input_shape_cache_size defines + // max pblob capacity + if ((tid == -1) && (sBlob->size() > cur_input_shape_cache_size)) { VLOG(2) << "tid=" << tid << ", remove all head blob of shape: " << sBlob->begin()->first; sBlob->erase(sBlob->begin()->first); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 5afecb4fbf165..5d726119eeb1d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -392,6 +392,8 @@ void set_cur_thread_id(int); int get_cur_thread_id(void); void set_cur_input_shape_str(std::string input_shape_str); std::string get_cur_input_shape_str(void); +void set_cur_input_shape_cache_size(int input_shape_cache_size); +int get_cur_input_shape_cache_size(void); class MKLDNNDeviceContext : public CPUDeviceContext { public: From 086f3471eeb8aa28705b346245e7dddf9acf978e Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 28 Jun 2019 20:24:32 +0800 Subject: [PATCH 15/21] Revert "PaddlePaddle memory leak test=develop" --- paddle/fluid/framework/details/op_registry.h | 5 ++--- paddle/fluid/framework/op_desc.cc | 2 +- paddle/fluid/framework/op_info.h | 10 ++++------ paddle/fluid/framework/op_proto_maker.cc | 5 ++--- paddle/fluid/framework/op_proto_maker.h | 10 ++++------ paddle/fluid/framework/op_proto_maker_test.cc | 12 ++++++------ 6 files changed, 19 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 519e007161c1c..0f03ca51da778 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include #include #include #include @@ -162,8 +161,8 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->proto_ = std::make_shared(); - info->checker_ = std::make_shared(); + info->proto_ = new proto::OpProto; + info->checker_ = new OpAttrChecker(); T maker; maker(info->proto_, info->checker_); info->proto_->set_type(op_type); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index bf38bfa07e9fd..1ea93b7638a85 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -668,7 +668,7 @@ static void InitInferShapeFuncs() { void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), "CheckAttr() can not be called before type is setted."); - auto checker = OpInfoMap::Instance().Get(Type()).Checker(); + auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); if (checker == nullptr) { // checker is not configured. That operator could be generated by Paddle, // not by users. diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index ad21d6b7d86ed..daa72769c4957 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -15,10 +15,8 @@ limitations under the License. */ #pragma once #include #include -#include #include #include -#include #include #include "paddle/fluid/framework/attribute.h" @@ -38,13 +36,13 @@ class InferShapeBase { struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; - std::shared_ptr proto_; - std::shared_ptr checker_; + proto::OpProto* proto_{nullptr}; + OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; InferInplaceOpFN infer_inplace_; InferNoNeedBufferVarsFN infer_no_need_buffer_vars_; - OpInfo() : proto_{nullptr}, checker_{nullptr} {} + // NOTE(zjl): this flag is added to check whether // the grad maker is the default one. bool use_default_grad_op_desc_maker_{false}; @@ -72,7 +70,7 @@ struct OpInfo { return grad_op_maker_; } - const std::shared_ptr Checker() const { return checker_; } + const OpAttrChecker* Checker() const { return checker_; } const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const { return infer_no_need_buffer_vars_; diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 97550771f311a..27922c730471a 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -57,9 +57,8 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } -void OpProtoAndCheckerMaker::operator()( - std::shared_ptr proto, - std::shared_ptr attr_checker) { +void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, + OpAttrChecker* attr_checker) { proto_ = proto; op_checker_ = attr_checker; Make(); diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index e2462bc496acd..bf6528b2377dc 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -13,9 +13,7 @@ limitations under the License. 
*/ #pragma once -#include #include -#include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" @@ -54,8 +52,7 @@ class OpProtoAndCheckerMaker { static const char *OpNamescopeAttrName() { return "op_namescope"; } static const char *OpCreationCallstackAttrName() { return "op_callstack"; } - void operator()(std::shared_ptr proto, - std::shared_ptr attr_checker); + void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); virtual void Make() = 0; @@ -105,8 +102,9 @@ class OpProtoAndCheckerMaker { private: void CheckNoDuplicatedInOutAttrs(); void Validate(); - std::shared_ptr proto_; - std::shared_ptr op_checker_; + + proto::OpProto *proto_; + OpAttrChecker *op_checker_; bool validated_{false}; }; } // namespace framework diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index e28048137c5fc..a8030d377fdb4 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -25,10 +25,10 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedAttr) { - auto op_proto = std::make_shared(); - auto op_checker = std::make_shared(); + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; TestAttrProtoMaker proto_maker; - ASSERT_THROW(proto_maker(op_proto, op_checker), + ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } @@ -41,9 +41,9 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedInOut) { - auto op_proto = std::make_shared(); - auto op_checker = std::make_shared(); + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; TestAttrProtoMaker proto_maker; - ASSERT_THROW(proto_maker(op_proto, op_checker), + ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } From 6d5a84170b2db4f2f377177299e330c78b260fbc Mon Sep 17 00:00:00 2001 From: luotao1 Date: Sat, 29 Jun 2019 10:46:22 +0800 Subject: [PATCH 16/21] make unit-test support head-checker --- paddle/fluid/inference/tests/api/CMakeLists.txt | 1 + .../inference/tests/api/analyzer_detect_tester.cc | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 1ab4f215c7152..293c13e870fda 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -157,6 +157,7 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # detect inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc) +target_link_libraries(test_analyzer_detect tcmalloc) ### Image classification tests with fake data set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc index e496030c79895..6a0d802383300 100644 --- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
 */
+#include
 #include
 #include
 #include
@@ -135,3 +136,17 @@ TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
 } // namespace analysis
 } // namespace inference
 } // namespace paddle
+
+// following lines are used for pprof
+int main(int argc, char **argv) {
+  HeapLeakChecker heap_checker("test_foo");
+  FLAGS_infer_model = "third_party/inference_demo/face_model/densebox";
+  FLAGS_infer_data = "third_party/inference_demo/face_model/detect_input.txt";
+  FLAGS_infer_shape = "third_party/inference_demo/face_model/shape.txt";
+  FLAGS_paddle_num_threads = 4;
+  FLAGS_repeat = 1;
+  FLAGS_batch_size = 1;
+  FLAGS_sample = 10;
+  paddle::inference::analysis::profile(true);
+  std::cout << heap_checker.NoLeaks() << std::endl;
+}

From d91c910f96a7dfe60d1515dc98abb4998094a181 Mon Sep 17 00:00:00 2001
From: Leo Zhao
Date: Mon, 1 Jul 2019 11:12:51 +0800
Subject: [PATCH 17/21] use static variable to do cache instead of thread local in thread frequent switching case to avoid memory leak test=develop

---
 .../fluid/framework/transfer_scope_cache.cc | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
index e52a8317e2113..a2b9a5e171362 100644
--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -17,14 +17,31 @@ namespace paddle {
 namespace framework {
 
+static std::unordered_map<size_t, Scope*>* static_transfer_data_cache = nullptr;
+static std::unordered_set<Scope*>* static_transfer_scope_cache = nullptr;
+
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
-  thread_local auto* x = new std::unordered_map<size_t, Scope*>;
-  return *x;
+  // if get_cur_thread_id() == -1, means not use thread local method to do cache
+  if (platform::get_cur_thread_id() == -1) {
+    if (!static_transfer_data_cache)
+      static_transfer_data_cache = new std::unordered_map<size_t, Scope*>;
+    return *static_transfer_data_cache;
+  } else {
+    thread_local auto* x = new std::unordered_map<size_t, Scope*>;
+    return *x;
+  }
 }
 
 std::unordered_set<Scope*>& global_transfer_scope_cache() {
-  thread_local auto* x = new std::unordered_set<Scope*>;
-  return *x;
+  // if get_cur_thread_id() == -1, means not use thread local method to do cache
+  if (platform::get_cur_thread_id() == -1) {
+    if (!static_transfer_scope_cache)
+      static_transfer_scope_cache = new std::unordered_set<Scope*>;
+    return *static_transfer_scope_cache;
+  } else {
+    thread_local auto* x = new std::unordered_set<Scope*>;
+    return *x;
+  }
 }
 
 Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1,

From d6597b925642abf2704e63aff7fb62fa95684cda Mon Sep 17 00:00:00 2001
From: Leo Zhao
Date: Mon, 1 Jul 2019 11:50:53 +0800
Subject: [PATCH 18/21] use macro to control code given it is specific for mkldnn test=develop

---
 paddle/fluid/framework/transfer_scope_cache.cc | 16 ++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc
index a2b9a5e171362..74f0e8a140cb6 100644
--- a/paddle/fluid/framework/transfer_scope_cache.cc
+++ b/paddle/fluid/framework/transfer_scope_cache.cc
@@ -17,31 +17,43 @@ namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_MKLDNN
 static std::unordered_map<size_t, Scope*>* static_transfer_data_cache = nullptr;
 static std::unordered_set<Scope*>* static_transfer_scope_cache = nullptr;
+#endif
 
 std::unordered_map<size_t, Scope*>& global_transfer_data_cache() {
+#ifdef PADDLE_WITH_MKLDNN
+  // if
get_cur_thread_id() == -1, means not using thread local method to do + // cache if (platform::get_cur_thread_id() == -1) { if (!static_transfer_data_cache) static_transfer_data_cache = new std::unordered_map; return *static_transfer_data_cache; } else { +#endif thread_local auto* x = new std::unordered_map; return *x; +#ifdef PADDLE_WITH_MKLDNN } +#endif } std::unordered_set& global_transfer_scope_cache() { - // if get_cur_thread_id() == -1, means not use thread local method to do cache +#ifdef PADDLE_WITH_MKLDNN + // if get_cur_thread_id() == -1, means not using thread local method to do + // cache if (platform::get_cur_thread_id() == -1) { if (!static_transfer_scope_cache) static_transfer_scope_cache = new std::unordered_set; return *static_transfer_scope_cache; } else { +#endif thread_local auto* x = new std::unordered_set; return *x; +#ifdef PADDLE_WITH_MKLDNN } +#endif } Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, From 5480edf3d659dc41551f853ff225a6f31d3a1d1a Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 8 Jul 2019 10:20:29 +0800 Subject: [PATCH 19/21] fix conflict with develop --- paddle/fluid/inference/api/analysis_predictor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2488ad1af2ac0..be46c8d3e8587 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -194,7 +194,7 @@ void AnalysisPredictor::MkldnnPreRun(const std::vector &inputs) { if (paddle::platform::get_cur_mkldnn_session_id() == 0 && config_.mkldnn_input_shape_cache_size_ > 1) { paddle::platform::set_cur_mkldnn_session_id(-1); - paddle::platform::set_cur_input_shape_cache_size( + paddle::platform::set_cur_input_shape_cache_capacity( config_.mkldnn_input_shape_cache_size_); } // Set current_input_shape for caching dynamic shape. From 65e38651134352368dd486e515b490b1e2ad80ca Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 11 Jul 2019 21:01:47 +0800 Subject: [PATCH 20/21] checkout develop paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc --- paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 9980a6ba48ab1..ea0abf930e7f5 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -162,7 +162,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto propagation = src_md.data.data_type == mkldnn_f32 ? 
mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, propagation, strides, padding_left_top, padding_right_bottom, ksize, pooling_type, mkldnn_engine, ceil_mode, is_test); From 330207c69869f579e2d90e2c75db65a3929493eb Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 11 Jul 2019 21:04:28 +0800 Subject: [PATCH 21/21] clean detect unit-test --- .../fluid/inference/tests/api/CMakeLists.txt | 2 +- .../tests/api/analyzer_detect_tester.cc | 30 ++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index df727a55f07c7..7898933223da2 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -157,7 +157,7 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # detect inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc) -target_link_libraries(test_analyzer_detect tcmalloc) +#target_link_libraries(test_analyzer_detect tcmalloc) ### Image classification tests with fake data set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc index 6a0d802383300..ea68d89870fe2 100644 --- a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +// #include #include #include #include @@ -80,7 +80,8 @@ void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { - cfg.EnableMKLDNN(10); + cfg.EnableMKLDNN(); + cfg.SetMkldnnCacheCapacity(10); } // cfg.pass_builder()->TurnOnDebug(); std::vector> outputs; @@ -138,15 +139,16 @@ TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); } } // namespace paddle // following lines are used for pprof -int main(int argc, char **argv) { - HeapLeakChecker heap_checker("test_foo"); - FLAGS_infer_model = "third_party/inference_demo/face_model/densebox"; - FLAGS_infer_data = "third_party/inference_demo/face_model/detect_input.txt"; - FLAGS_infer_shape = "third_party/inference_demo/face_model/shape.txt"; - FLAGS_paddle_num_threads = 4; - FLAGS_repeat = 1; - FLAGS_batch_size = 1; - FLAGS_sample = 10; - paddle::inference::analysis::profile(true); - std::cout << heap_checker.NoLeaks() << std::endl; -} +// int main(int argc, char **argv) { +// HeapLeakChecker heap_checker("test_foo"); +// FLAGS_infer_model = "third_party/inference_demo/face_model/densebox"; +// FLAGS_infer_data = +// "third_party/inference_demo/face_model/detect_input.txt"; +// FLAGS_infer_shape = "third_party/inference_demo/face_model/shape.txt"; +// FLAGS_paddle_num_threads = 4; +// FLAGS_repeat = 1; +// FLAGS_batch_size = 1; +// FLAGS_sample = 10; +// paddle::inference::analysis::profile(true); +// std::cout << heap_checker.NoLeaks() << std::endl; +// }
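With patch 21 applied, the user-facing recipe for dynamic-shape MKLDNN inference is the pair of calls exercised in the tester above. A hedged end-to-end sketch, assuming the AnalysisConfig API as it stands at the end of this series (the model path is a placeholder):

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig cfg;
      cfg.SetModel("path/to/model");   // placeholder, e.g. the densebox model
      cfg.EnableMKLDNN();              // final API takes no cache-size argument
      cfg.SetMkldnnCacheCapacity(10);  // keep at most 10 input shapes cached
      auto predictor = paddle::CreatePaddlePredictor(cfg);
      // Feed inputs via predictor->Run(...); once more than 10 distinct input
      // shapes have been seen, cached per-shape MKLDNN primitives are evicted.
      return 0;
    }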