diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h
index 0c54201c94034..f9411aa73fad4 100644
--- a/paddle/fluid/distributed/fleet_executor/carrier.h
+++ b/paddle/fluid/distributed/fleet_executor/carrier.h
@@ -75,6 +75,11 @@ class Carrier final {
 
   bool IsInit() const;
 
+  // NOTE: This mutex is used in the interceptor's RunOps function.
+  // It prevents forward ops and backward ops from running simultaneously,
+  // which could otherwise lead to a random hang for some sync ops.
+  std::mutex run;
+
   DISABLE_COPY_AND_ASSIGN(Carrier);
 
  private:
diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
index 35905125a0a43..98583de84e7ea 100644
--- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
+++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h"
+#include "paddle/fluid/distributed/fleet_executor/carrier.h"
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -169,6 +170,8 @@ void ComputeInterceptor::ReplyCompletedToUpStream() {
 }
 
 void ComputeInterceptor::RunOps() {
+  Carrier& carrier_instance = Carrier::Instance();
+  std::unique_lock<std::mutex> lock(carrier_instance.run);
   VLOG(3) << "ComputeInterceptor " << interceptor_id_
           << " running ops for the " << step_ + 1 << " time.";
   for (auto op : node_->ops()) {
diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
index 9d475d96e56ce..723bf5387c60a 100644
--- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc
+++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+#ifdef PADDLE_WITH_IPU
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
   void operator()(const paddle::platform::NPUPinnedPlace& place) {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index fe29792b6e75c..2a5b158d315c3 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -29,15 +29,11 @@
 namespace paddle {
 namespace framework {
 
+/* --- Static maps to handle corner cases --- */
 static std::unordered_map<std::string, paddle::framework::AttributeMap>
     operators_with_attrs = {};
 
-static std::unordered_set<std::string> operators_to_skip = {
-    "minus",
-};
-
 static std::unordered_set<std::string> operators_to_codegen = {};
-static std::unordered_set<std::string> skipped_operators = {};
 
 static std::string LegalizeVariableName(const std::string& var_name) {
   std::string ret = var_name;
@@ -45,6 +41,132 @@ static std::string LegalizeVariableName(const std::string& var_name) {
   return ret;
 }
 
+/* --- Helper Objects --- */
+class ForwardGenerationInfo {
+ public:
+  const std::string& GetOpType() const { return op_type_; }
+  void SetOpType(const std::string& op_type) { op_type_ = op_type; }
+
+  const std::unordered_map<std::string, size_t>& GetFwdInputsNamePosMap()
+      const {
+    return fwd_inputs_name_pos_map_;
+  }
+  std::unordered_map<std::string, size_t>* GetMutableFwdInputsNamePosMap() {
+    return &fwd_inputs_name_pos_map_;
+  }
+
+  const std::unordered_map<std::string, size_t>& GetFwdOutputsNamePosMap()
+      const {
+    return fwd_outputs_name_pos_map_;
+  }
+  std::unordered_map<std::string, size_t>* GetMutableFwdOutputsNamePosMap() {
+    return &fwd_outputs_name_pos_map_;
+  }
+
+  const std::vector<proto::OpProto::Var>& GetInVars() const { return in_vars_; }
+  std::vector<proto::OpProto::Var>* GetMutableInVars() { return &in_vars_; }
+
+  const std::vector<proto::OpProto::Var>& GetOutVars() const {
+    return out_vars_;
+  }
+  std::vector<proto::OpProto::Var>* GetMutableOutVars() { return &out_vars_; }
+
+ private:
+  std::string op_type_;
+  std::unordered_map<std::string, size_t> fwd_inputs_name_pos_map_;
+  std::unordered_map<std::string, size_t> fwd_outputs_name_pos_map_;
+  std::vector<proto::OpProto::Var> in_vars_;
+  std::vector<proto::OpProto::Var> out_vars_;
+};
+
+class GradNodeGenerationInfo {
+  class OpBaseGenerationInfo {
+   public:
+    const std::string& GetOpBaseType() const { return op_base_type_; }
+    void SetOpBaseType(const std::string& op_type) { op_base_type_ = op_type; }
+
+    const std::map<std::string, std::string>& GetGradOutsSlotnameMap() const {
+      return grad_outs_slotname_map_;
+    }
+    std::map<std::string, std::string>* GetMutableGradOutsSlotnameMap() {
+      return &grad_outs_slotname_map_;
+    }
+
+    const std::map<std::string, std::string>& GetGradInsFwdSlotnameMap() const {
+      return grad_ins_fwd_slotname_map_;
+    }
+    std::map<std::string, std::string>* GetMutableGradInsFwdSlotnameMap() {
+      return &grad_ins_fwd_slotname_map_;
+    }
+
+    const std::map<std::string, std::string>& GetGradInsGradSlotnameMap()
+        const {
+      return grad_ins_grad_slotname_map_;
+    }
+    std::map<std::string, std::string>* GetMutableGradInsGradSlotnameMap() {
+      return &grad_ins_grad_slotname_map_;
+    }
+
+    const std::map<
+        std::string,
+        std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>&
+    GetGradIns() const {
+      return grad_ins_;
+    }
+    std::map<std::string,
+             std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>*
+    GetMutableGradIns() {
+      return &grad_ins_;
+    }
+
+    const std::map<
+        std::string,
+        std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>&
+    GetGradOuts() const {
+      return grad_outs_;
+    }
+    std::map<std::string,
+             std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>*
+    GetMutableGradOuts() {
+      return &grad_outs_;
+    }
+
+   private:
+    std::string op_base_type_;
+    std::map<std::string, std::string> grad_outs_slotname_map_;
+    std::map<std::string, std::string> grad_ins_fwd_slotname_map_;
+    std::map<std::string, std::string> grad_ins_grad_slotname_map_;
+    std::map<std::string,
+             std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>
+        grad_ins_;
+    std::map<std::string,
+             std::vector<std::shared_ptr<paddle::imperative::VariableWrapper>>>
+        grad_outs_;
+  };
+
+ public:
+  const std::string& GetFwdOpType() const { return fwd_op_type_; }
+  void SetFwdOpType(const std::string& op_type) { fwd_op_type_ = op_type; }
+
+  bool GenerateForwardOnly() const { return generate_forward_only_; }
+  void SetGenerateForwardOnly(bool generate_forward_only) {
+    generate_forward_only_ = generate_forward_only;
+  }
+
+  const std::vector<OpBaseGenerationInfo>& GetOpBaseInfos() const {
+    return op_base_infos_;
+  }
+  std::vector<OpBaseGenerationInfo>* GetMutableOpBaseInfos() {
+    return &op_base_infos_;
+  }
+
+ private:
+  std::string fwd_op_type_;
+  bool generate_forward_only_ = false;
+  std::vector<OpBaseGenerationInfo> op_base_infos_;
+};
+
+/* --- Helper Functions --- */
 static std::string AttrTypeToString(const proto::AttrType& type) {
   std::string ret;
   switch (type) {
@@ -348,7 +470,6 @@ static bool CheckOpProto(proto::OpProto* op_proto) {
   VLOG(1) << "------ Analyzing Op ------: " << op_type;
 
   if (!operators_to_codegen.count(op_type)) return false;
-  if (operators_to_skip.count(op_type)) return false;
 
   return true;
 }
@@ -356,15 +477,16 @@ static bool CheckOpProto(proto::OpProto* op_proto) {
 /* --------------------------------------- */
 /* --------- Preprocess Ins/Outs --------- */
 /* --------------------------------------- */
-static void PurifyForwardOpProto(
-    const proto::OpProto& op_proto,
-    std::unordered_map<std::string, size_t>*
fwd_inputs_name_pos_map, - std::unordered_map* fwd_outputs_name_pos_map, - std::vector* in_vars, - std::vector* out_vars) { +static void PurifyForwardOpProto(const proto::OpProto& op_proto, + ForwardGenerationInfo* fwd_info) { // Op Name const std::string op_name = op_proto.type(); + auto* in_vars = fwd_info->GetMutableInVars(); + auto* out_vars = fwd_info->GetMutableOutVars(); + auto* fwd_inputs_name_pos_map = fwd_info->GetMutableFwdInputsNamePosMap(); + auto* fwd_outputs_name_pos_map = fwd_info->GetMutableFwdOutputsNamePosMap(); + // Handle dispensable inputs for (const proto::OpProto::Var& input : op_proto.inputs()) { std::string input_name = input.name(); @@ -426,6 +548,104 @@ static void PurifyForwardOpProto( } } +static void PurifyGradNodeGenerationInfo(const proto::OpProto& op_proto, + GradNodeGenerationInfo* bwd_info) { + auto* op_base_infos = bwd_info->GetMutableOpBaseInfos(); + for (auto& iter : *op_base_infos) { + std::map* grad_outs_slotname_map = + iter.GetMutableGradOutsSlotnameMap(); + std::map* grad_ins_fwd_slotname_map = + iter.GetMutableGradInsFwdSlotnameMap(); + std::map* grad_ins_grad_slotname_map = + iter.GetMutableGradInsGradSlotnameMap(); + std::map>>* + grad_ins = iter.GetMutableGradIns(); + std::map>>* + grad_outs = iter.GetMutableGradOuts(); + + // Op Name + const std::string op_name = op_proto.type(); + + // Handle dispensable inputs + for (const proto::OpProto::Var& input : op_proto.inputs()) { + std::string input_name = input.name(); + + // Delete dispensable tensor unless specified in op_ins_map + if (input.dispensable()) { + if (!op_ins_map.count(op_name) || + !op_ins_map[op_name].count(input_name)) { + VLOG(6) << "Removing Dispensable Input: " << input_name; + + // grad_outs_slotname_map + auto grad_outs_slotname_map_purified = *grad_outs_slotname_map; + for (const auto& iter : *grad_outs_slotname_map) { + const std::string& grad_output_name = iter.first; + const std::string& matched_input_name = iter.second; + if (matched_input_name == input_name) { + grad_outs_slotname_map_purified.erase(grad_output_name); + + PADDLE_ENFORCE( + grad_outs->count(grad_output_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient output name in grad_outs.")); + // grad_outs + grad_outs->erase(grad_output_name); + } + } + *grad_outs_slotname_map = grad_outs_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(input_name)) + grad_ins_fwd_slotname_map->erase(input_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(input_name)) grad_ins->erase(input_name); + } + } + } + + for (const proto::OpProto::Var& output : op_proto.outputs()) { + std::string output_name = output.name(); + + // Delete dispensable tensor unless specified in op_outs_map + if (output.dispensable()) { + if (!op_outs_map.count(op_name) || + !op_outs_map[op_name].count(output_name)) { + VLOG(6) << "Removing Dispensable Output: " << output_name; + + // grad_ins_grad_slotname_map + auto grad_ins_grad_slotname_map_purified = + *grad_ins_grad_slotname_map; + for (const auto& iter : *grad_ins_grad_slotname_map) { + const std::string& grad_input_name = iter.first; + const std::string& matched_output_name = iter.second; + if (matched_output_name == output_name) { + grad_ins_grad_slotname_map_purified.erase(grad_input_name); + + PADDLE_ENFORCE( + grad_ins->count(grad_input_name) > 0, + paddle::platform::errors::Fatal( + "Unable to find gradient input name in grad_ins.")); + // grad_ins + 
grad_ins->erase(grad_input_name); + } + } + *grad_ins_grad_slotname_map = grad_ins_grad_slotname_map_purified; + + // grad_ins_fwd_slotname_map: output as tensorwrapper + if (grad_ins_fwd_slotname_map->count(output_name)) + grad_ins_fwd_slotname_map->erase(output_name); + + // grad_ins: output as tensorwrapper + if (grad_ins->count(output_name)) grad_ins->erase(output_name); + } + } + } + } +} + static void PurifyGradOpProto( const proto::OpProto& op_proto, std::map* grad_outs_slotname_map, @@ -520,31 +740,22 @@ static void PurifyGradOpProto( /* --------- Collect Info --------- */ /* -------------------------------- */ static void CollectForwardInformationFromOpInfo( - const paddle::framework::OpInfo& op_info, - std::vector* in_vars, - std::vector* out_vars) { + const paddle::framework::OpInfo& op_info, ForwardGenerationInfo* fwd_info) { const proto::OpProto& op_proto = *op_info.proto_; + + fwd_info->SetOpType(op_proto.type()); + for (const proto::OpProto::Var& input : op_proto.inputs()) { - in_vars->push_back(input); + fwd_info->GetMutableInVars()->push_back(input); } for (const proto::OpProto::Var& output : op_proto.outputs()) { - out_vars->push_back(output); + fwd_info->GetMutableOutVars()->push_back(output); } } static bool CollectGradInformationFromOpInfo( - const paddle::framework::OpInfo& op_info, bool* generate_forward_only, - std::vector* grad_op_types, // grad - std::map* grad_outs_slotname_map, // grad - std::map* grad_ins_fwd_slotname_map, // grad - std::map* grad_ins_grad_slotname_map, // grad - std::map>>* - grad_ins, // grad - std::map>>* - grad_outs // grad - ) { + const paddle::framework::OpInfo& op_info, + GradNodeGenerationInfo* bwd_info) { const proto::OpProto& op_proto = *op_info.proto_; const std::string& op_type = op_proto.type(); std::vector dims = {1, 1, 1, 1}; @@ -645,7 +856,7 @@ static bool CollectGradInformationFromOpInfo( /* ------ Run GradOpMaker ------ */ if (!op_info.dygraph_grad_op_maker_) { VLOG(6) << op_type << " has no GradOpMaker"; - *generate_forward_only = true; + bwd_info->SetGenerateForwardOnly(true); return false; } @@ -656,32 +867,31 @@ static bool CollectGradInformationFromOpInfo( if (!grad_node) { VLOG(6) << "Got nullptr GradOpNode for " << op_type << " likely registered EmptyGradOpMaker"; - *generate_forward_only = true; + bwd_info->SetGenerateForwardOnly(true); return false; } - /* - if (grad_node->size() > 1) { - // Backward attributes can be super complicated - VLOG(6) << "Skip GradOpNode with multiple OpBases for now: " << op_type; - skipped_operators.insert(op_type); - return false; - } - */ - VLOG(6) << "Prepared GradOpNode"; - /* ---- Collect Default Attr Map ---- */ + /* ---- Collect OpBase's op_types ---- */ + bwd_info->SetFwdOpType(op_type); + auto* op_base_infos = bwd_info->GetMutableOpBaseInfos(); + op_base_infos->resize(grad_node->size()); for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { // Each OpBase + int index = std::distance(grad_node->begin(), iter); paddle::imperative::OpBase& op_base = *iter; - grad_op_types->push_back(op_base.Type()); + (*op_base_infos)[index].SetOpBaseType(op_base.Type()); } /* ------ Get Grad ins/outs ---- */ // In case of multiple OpBase, stitch all the respective ins/outs into one VLOG(6) << "In function size: " << grad_node->size(); for (auto iter = grad_node->begin(); iter < grad_node->end(); iter++) { + int index = std::distance(grad_node->begin(), iter); + auto* op_base_grad_ins = (*op_base_infos)[index].GetMutableGradIns(); + auto* op_base_grad_outs = 
(*op_base_infos)[index].GetMutableGradOuts(); + const paddle::imperative::OpBase& op_base = *iter; const std::map& g_ins = op_base.GetInsMap(); @@ -689,34 +899,47 @@ static bool CollectGradInformationFromOpInfo( g_outs = op_base.GetOutsMap(); for (const auto& it : g_ins) { - if (!grad_ins->count(it.first)) (*grad_ins)[it.first] = {}; + if (!op_base_grad_ins->count(it.first)) + (*op_base_grad_ins)[it.first] = {}; + for (auto vw_iter = it.second.begin(); vw_iter != it.second.end(); vw_iter++) { std::shared_ptr vw = *vw_iter; - (*grad_ins)[it.first].push_back(vw); + + (*op_base_grad_ins)[it.first].push_back(vw); + + VLOG(6) << "GradIns Name: " << it.first; } } for (const auto& it : g_outs) { - if (!grad_outs->count(it.first)) (*grad_outs)[it.first] = {}; + if (!op_base_grad_outs->count(it.first)) + (*op_base_grad_outs)[it.first] = {}; + for (auto vw_iter = it.second.begin(); vw_iter != it.second.end(); vw_iter++) { std::shared_ptr vw = *vw_iter; - (*grad_outs)[it.first].push_back(vw); + + (*op_base_grad_outs)[it.first].push_back(vw); + + VLOG(6) << "GradOuts Name: " << it.first; } } } /* ------ Slot Name Matching ---- */ - // grad_ins -> fwd_ins, fwd_outs - SlotNameMatching(*grad_ins, fwd_ins, fwd_outs, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map); - VLOG(6) << "Finished Slotname Matching for Grad_Ins"; - - // grad_outs -> fwd_ins, fwd_outs - SlotNameMatching(*grad_outs, fwd_ins, fwd_outs, grad_outs_slotname_map, - grad_outs_slotname_map); - VLOG(6) << "Finished Slotname Matching for Grad_Outs"; + for (auto& iter : *op_base_infos) { + // grad_ins -> fwd_ins, fwd_outs + SlotNameMatching(iter.GetGradIns(), fwd_ins, fwd_outs, + iter.GetMutableGradInsFwdSlotnameMap(), + iter.GetMutableGradInsGradSlotnameMap()); + + // grad_outs -> fwd_ins, fwd_outs + SlotNameMatching(iter.GetGradOuts(), fwd_ins, fwd_outs, + iter.GetMutableGradOutsSlotnameMap(), + iter.GetMutableGradOutsSlotnameMap()); + } + VLOG(6) << "Finished Slotname Matching"; return true; } @@ -725,13 +948,20 @@ static bool CollectGradInformationFromOpInfo( /* --------- CodeGen: Forward GradNode Creation ------ */ /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( - const std::unordered_map& fwd_inputs_name_pos_map, - const std::unordered_map& fwd_outputs_name_pos_map, - const std::map& grad_ins_fwd_slotname_map, - const std::string& op_type, const std::vector& in_vars, - const std::vector& out_vars) { + const ForwardGenerationInfo& fwd_info, + const GradNodeGenerationInfo& bwd_info) { VLOG(6) << "Generating GradNode Creation codes"; + const std::string& op_type = fwd_info.GetOpType(); + const std::unordered_map& fwd_inputs_name_pos_map = + fwd_info.GetFwdInputsNamePosMap(); + const std::unordered_map& fwd_outputs_name_pos_map = + fwd_info.GetFwdOutputsNamePosMap(); + const std::vector& in_vars = fwd_info.GetInVars(); + const std::vector& out_vars = fwd_info.GetOutVars(); + + const auto& op_base_infos = bwd_info.GetOpBaseInfos(); + // [Generation] Construct GradOpNode // Run ComputeRequiredGrad @@ -817,12 +1047,17 @@ static std::string GenerateGradNodeCreationContent( // [GradOpNode] Set TensorWrappers grad_node_creation_str += " // Set Tensor Wrappers\n"; - for (auto& kv : grad_ins_fwd_slotname_map) { - const std::string& tensor_wrapper_name = kv.second; - const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name); + for 
(const auto& iter : op_base_infos) { + const std::map& grad_ins_fwd_slotname_map = + iter.GetGradInsFwdSlotnameMap(); + for (auto& kv : grad_ins_fwd_slotname_map) { + const std::string& tensor_wrapper_name = kv.second; + const char* SET_TENSOR_WRAPPER_TEMPLATE = + " grad_node->SetTensorWrapper%s(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE, + tensor_wrapper_name, tensor_wrapper_name); + } } grad_node_creation_str += "\n"; VLOG(6) << "Generated SetTensorWrapper"; @@ -892,22 +1127,17 @@ static std::string GenerateGradNodeCreationContent( /* --------- CodeGen: Forward ----- */ /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( - bool generate_forward_only, - const std::unordered_map& fwd_inputs_name_pos_map, - const std::unordered_map& fwd_outputs_name_pos_map, - const std::map& grad_ins_fwd_slotname_map, - const std::map& grad_ins_grad_slotname_map, - const std::map& grad_outs_slotname_map, - const std::map< - std::string, - std::vector>>& - grad_ins, - const std::map< - std::string, - std::vector>>& - grad_outs, - const std::string& op_type, const std::vector& in_vars, - const std::vector& out_vars) { + const ForwardGenerationInfo& fwd_info, + const GradNodeGenerationInfo& bwd_info) { + /* --- Process Forward Info ---*/ + const std::string& op_type = fwd_info.GetOpType(); + const std::unordered_map& fwd_inputs_name_pos_map = + fwd_info.GetFwdInputsNamePosMap(); + const std::unordered_map& fwd_outputs_name_pos_map = + fwd_info.GetFwdOutputsNamePosMap(); + const std::vector& in_vars = fwd_info.GetInVars(); + const std::vector& out_vars = fwd_info.GetOutVars(); + /* // Forward Function Example: std::tuple, Tensor, vector> @@ -999,24 +1229,53 @@ static std::pair GenerateForwardFunctionContents( for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; - if (output.duplicable()) { - outnum = output_name + "Num"; - - const char* FWD_NUM_ARG_TEMPLATE = ", size_t %s"; - std::string arg_str = - paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); - dygraph_function_args_str += arg_str; - const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput(%s) },"; - outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, - output_name, outnum); + if (op_passing_outs_map[op_type].count(output_name)) { + const std::string output_var_name = output_name + "Var"; + + // Pass Output from function argument, + // in form of shared_ptr/vector> + if (output.duplicable()) { + const char* FWD_NUM_ARG_TEMPLATE = + ", std::vector>& %s"; + std::string arg_str = + paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + dygraph_function_args_str += arg_str; + + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + } else { + const char* FWD_NUM_ARG_TEMPLATE = + ", std::shared_ptr& %s"; + std::string arg_str = + paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + dygraph_function_args_str += arg_str; + + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", {%s} },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + } + } else { - const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." 
- "GenerateUniqueName())}},"; - outs_contents_str += - paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name); + if (output.duplicable()) { + outnum = output_name + "Num"; + + const char* FWD_NUM_ARG_TEMPLATE = ", size_t %s"; + std::string arg_str = + paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, outnum); + dygraph_function_args_str += arg_str; + const char* FWD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput(%s) },"; + outs_contents_str += paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, + output_name, outnum); + } else { + const char* FWD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance()." + "GenerateUniqueName())}},"; + outs_contents_str += + paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name); + } } } if (outs_contents_str.size() > 0) @@ -1084,10 +1343,9 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Converted Output VarBase to EagerTensor(s)"; // [Generation] ComputeRequireGrad -> GradNodeCreation - if (!generate_forward_only) { - std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, - grad_ins_fwd_slotname_map, op_type, in_vars, out_vars); + if (!bwd_info.GenerateForwardOnly()) { + std::string grad_node_creation_body_str = + GenerateGradNodeCreationContent(fwd_info, bwd_info); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; VLOG(6) << "Generated GradNode Creation codes"; @@ -1162,22 +1420,16 @@ static std::pair GenerateForwardFunctionContents( /* --------- CodeGen: GradNode::operator() ------ */ /* ---------------------------------------------- */ static std::string GenerateGradNodeCCContents( - const std::vector& grad_op_types, - const std::unordered_map& fwd_inputs_name_pos_map, - const std::unordered_map& fwd_outputs_name_pos_map, - const std::map& grad_ins_fwd_slotname_map, - const std::map& grad_ins_grad_slotname_map, - const std::map& grad_outs_slotname_map, - const std::map< - std::string, - std::vector>>& - grad_ins, - const std::map< - std::string, - std::vector>>& - grad_outs, - const std::string& op_type, const std::vector& in_vars, - const std::vector& out_vars) { + const ForwardGenerationInfo& fwd_info, + const GradNodeGenerationInfo& bwd_info) { + /* --- Process Forward Info --- */ + const std::string& fwd_op_type = fwd_info.GetOpType(); + const std::unordered_map& fwd_inputs_name_pos_map = + fwd_info.GetFwdInputsNamePosMap(); + const std::unordered_map& fwd_outputs_name_pos_map = + fwd_info.GetFwdOutputsNamePosMap(); + const std::vector& in_vars = fwd_info.GetInVars(); + VLOG(6) << "Generating Grad Node CC"; /* [Outline] @@ -1224,227 +1476,247 @@ static std::string GenerateGradNodeCCContents( */ std::string generated_grad_function_body = ""; + size_t outs_size = 0; + const auto& op_base_infos = bwd_info.GetOpBaseInfos(); + for (size_t i = 0; i < op_base_infos.size(); i++) { + const auto& op_base_info = op_base_infos[i]; + + const auto& grad_ins_fwd_slotname_map = + op_base_info.GetGradInsFwdSlotnameMap(); + const auto& grad_ins_grad_slotname_map = + op_base_info.GetGradInsGradSlotnameMap(); + const auto& grad_outs_slotname_map = op_base_info.GetGradOutsSlotnameMap(); + const auto& grad_ins = op_base_info.GetGradIns(); + const auto& grad_outs = op_base_info.GetGradOuts(); + + const std::string& op_base_type = op_base_info.GetOpBaseType(); + const std::string& ins_name = "ins" + std::to_string(i); + const std::string& outs_name = "outs" + 
std::to_string(i); + + outs_size += grad_outs.size(); + + // [Generation] Get Ins Map + std::string ins_contents_str = ""; + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + const char* GRAD_INS_FWD_CONTENT_TEMPLATE = + "{ \"%s\", " + "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(" + "&" + "this->%s, " + "nullptr)) },"; + ins_contents_str += + paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, + grad_input_name, struct_fwd_input_name); + + } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { + // Fwd Tensor's Grad + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; + ins_contents_str += + paddle::string::Sprintf(GRAD_INS_GRAD_CONTENT_TEMPLATE, + grad_input_name, fwd_output_position); - // [Generation] Get Tracer - generated_grad_function_body += "\n"; - generated_grad_function_body += "\n"; - - // [Generation] Get Ins Map - std::string ins_contents_str = ""; - for (auto iter : grad_ins) { - const std::string& grad_input_name = iter.first; - - if (grad_ins_fwd_slotname_map.count(grad_input_name)) { - // Fwd Tensor - std::string struct_fwd_input_name = - grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; - const char* GRAD_INS_FWD_CONTENT_TEMPLATE = - "{ \"%s\", " - "egr::EagerUtils::SyncToVars(egr::EagerUtils::RecoverTensorWrapper(&" - "this->%s, " - "nullptr)) },"; - ins_contents_str += - paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, - grad_input_name, struct_fwd_input_name); - - } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { - // Fwd Tensor's Grad - size_t fwd_output_position = fwd_outputs_name_pos_map.at( - grad_ins_grad_slotname_map.at(grad_input_name)); - const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },"; - ins_contents_str += paddle::string::Sprintf( - GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); - - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." - "Unable to find forward slot name that matches %s", - grad_input_name)); + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." 
+ "Unable to find forward slot name that matches %s", + grad_input_name)); + } + } + if (ins_contents_str.size() > 0) + ins_contents_str.pop_back(); // // Remove trailing "," + + const char* BWD_INS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string ins_map_str = paddle::string::Sprintf( + BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); + generated_grad_function_body += ins_map_str; + + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } - } - if (ins_contents_str.size() > 0) - ins_contents_str.pop_back(); // // Remove trailing "," - - const char* BWD_INS_MAP_TEMPLATE = - " std::map>> ins = { " - "%s };\n"; - std::string ins_map_str = - paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_contents_str); - generated_grad_function_body += ins_map_str; - - VLOG(6) << "Generated Ins Map"; - - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); - } - - std::string outs_contents_str = ""; - for (auto iter : grad_outs) { - const std::string& grad_output_name = iter.first; - - if (grad_outs_slotname_map.count(grad_output_name)) { - // Fwd Tensor - const std::string& fwd_name = grad_outs_slotname_map.at(grad_output_name); - - /* Handle Special Case: "PullSparseOp", etc - - Forward: - - Ids W - | | - PullSparseOp - | - Out - - Backward: - - Ids GradOut W - | | | - PullSparseGradOp - | - GradOut - - Its grad output "GradOut" corresponds to forward output "Out", - where there is a hiden inplace involved. So we find "GradOut"'s index - in - grads, and perform the inplace operation by constructing outs = - {{"Out", grads[i]}} - - GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] - outs = {{"Out", grads[i]}} - - For returns, append "GradOut" to the very end of return list. 
- */ - if (!fwd_inputs_name_pos_map.count(fwd_name)) { - PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); - - size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); - std::string grad_ptr_name = fwd_name + "_ptrs"; - const char* GET_GRADS_PTR_TEMPLATE = - " std::vector> %s;\n" - " for(const auto& t : grads[%d]) {\n " - "%s.emplace_back(std::move(std::make_shared(t)));" - "\n }\n"; - std::string grads_ptr_str = - paddle::string::Sprintf(GET_GRADS_PTR_TEMPLATE, grad_ptr_name, - grads_position, grad_ptr_name); - generated_grad_function_body += grads_ptr_str; - generated_grad_function_body += "\n"; - - const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grad_ptr_name); - } else { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - if (duplicable_input_name_set.count(fwd_name)) { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " - "this->OutputMeta()[%d].Size() ) },"; + std::string outs_contents_str = ""; + for (auto iter : grad_outs) { + const std::string& grad_output_name = iter.first; + + if (grad_outs_slotname_map.count(grad_output_name)) { + // Fwd Tensor + const std::string& fwd_name = + grad_outs_slotname_map.at(grad_output_name); + + /* Handle Special Case: "PullSparseOp", etc + + Forward: + + Ids W + | | + PullSparseOp + | + Out + + Backward: + + Ids GradOut W + | | | + PullSparseGradOp + | + GradOut + + Its grad output "GradOut" corresponds to forward output "Out", + where there is a hiden inplace involved. So we find "GradOut"'s + index + in + grads, and perform the inplace operation by constructing outs = + {{"Out", grads[i]}} + + GradOut -> Out -> fwd_output_pos -> grads position -> grads[i] + outs = {{"Out", grads[i]}} + + For returns, append "GradOut" to the very end of return list. + */ + if (!fwd_inputs_name_pos_map.count(fwd_name)) { + PADDLE_ENFORCE( + fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + + size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); + std::string grad_ptr_name = fwd_name + "_ptrs"; + const char* GET_GRADS_PTR_TEMPLATE = + " std::vector> %s;\n" + " for(const auto& t : grads[%d]) {\n " + "%s.emplace_back(std::move(std::make_shared(t))" + ");" + "\n }\n"; + std::string grads_ptr_str = + paddle::string::Sprintf(GET_GRADS_PTR_TEMPLATE, grad_ptr_name, + grads_position, grad_ptr_name); + generated_grad_function_body += grads_ptr_str; + generated_grad_function_body += "\n"; + + const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },"; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grad_ptr_name); + } else { - const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " - "{std::make_shared(egr::Controller::Instance()." 
- "GenerateUniqueName())}},"; - outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + if (duplicable_input_name_set.count(fwd_name)) { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::ConstructDuplicableOutput( " + "this->OutputMeta()[%d].Size() ) },"; + outs_contents_str += + paddle::string::Sprintf(GRAD_OUTS_CONTENT_TEMPLATE, + grad_output_name, fwd_input_position); + } else { + const char* GRAD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", " + "{std::make_shared(egr::Controller::Instance(" + ")." + "GenerateUniqueName())}},"; + outs_contents_str += paddle::string::Sprintf( + GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + } } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Detected mismatched slot names." + "Unable to find forward slot name that matches %s", + grad_output_name)); } - } else { - PADDLE_THROW(platform::errors::Fatal( - "Detected mismatched slot names." - "Unable to find forward slot name that matches %s", - grad_output_name)); } - } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," + if (outs_contents_str.size() > 0) + outs_contents_str.pop_back(); // // Remove trailing "," - const char* BWD_OUTS_MAP_TEMPLATE = - " std::map>> outs = { " - "%s };\n"; - std::string outs_map_str = - paddle::string::Sprintf(BWD_OUTS_MAP_TEMPLATE, outs_contents_str); - generated_grad_function_body += outs_map_str; - generated_grad_function_body += "\n"; - - VLOG(6) << "Generated Outs Map"; + const char* BWD_OUTS_MAP_TEMPLATE = + " std::map>> %s = { " + "%s };\n"; + std::string outs_map_str = paddle::string::Sprintf( + BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += "\n"; - // [Generation] Get Attrs Map - std::string trace_opbase_str = ""; - for (size_t i = 0; i < grad_op_types.size(); i++) { - const std::string& op_base_type = grad_op_types[i]; + VLOG(6) << "Generated Outs Map"; + // [Generation] Get Attrs Map const char* TRACE_OP_TEMPLATE = " // Pass the entire attribute map to TraceOp\n" " // The underlying kernel will pickup whatever attribute they need " "at runtime\n" - " egr::legacy::RunOp(\"%s\", ins, outs, this->attr_map_,\n" + " egr::legacy::RunOp(\"%s\", %s, %s, this->attr_map_,\n" " egr::Controller::Instance().GetExpectedPlace(),\n" " &this->default_attr_map_, false, {});\n"; - trace_opbase_str = paddle::string::Sprintf(TRACE_OP_TEMPLATE, op_base_type); - } + std::string trace_opbase_str = paddle::string::Sprintf( + TRACE_OP_TEMPLATE, op_base_type, ins_name, outs_name); - generated_grad_function_body += trace_opbase_str; + generated_grad_function_body += trace_opbase_str; - VLOG(6) << "Generated Attrs Map"; + VLOG(6) << "Generated Attrs Map"; - // [Generation] Get Return - std::string outputs_str = ""; - size_t num_appended_outputs = 0; - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + // [Generation] Get Return + std::string outputs_str = ""; + size_t num_appended_outputs = 0; + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - if (fwd_inputs_name_pos_map.count(fwd_name)) { - size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = 
egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, - fwd_input_position, grad_out_name); - num_appended_outputs++; - } else { - PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), - paddle::platform::errors::Fatal( - "fwd_name not found in fwd_inputs_name_pos_map nor " - "fwd_outputs_name_pos_map")); + if (fwd_inputs_name_pos_map.count(fwd_name)) { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + outputs_str += paddle::string::Sprintf( + BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + num_appended_outputs++; + } else { + PADDLE_ENFORCE(fwd_outputs_name_pos_map.count(fwd_name), + paddle::platform::errors::Fatal( + "fwd_name not found in fwd_inputs_name_pos_map nor " + "fwd_outputs_name_pos_map")); + } } - } - /* Handle Special Case: "PullSparseOp", etc - For returns, append "GradOut" to the very end of return list. */ - for (auto iter : grad_outs) { - const std::string& grad_out_name = iter.first; - const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); - - if (fwd_outputs_name_pos_map.count(fwd_name)) { - const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, num_appended_outputs, grad_out_name); - num_appended_outputs++; + /* Handle Special Case: "PullSparseOp", etc + For returns, append "GradOut" to the very end of return list. */ + for (auto iter : grad_outs) { + const std::string& grad_out_name = iter.first; + const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name); + + if (fwd_outputs_name_pos_map.count(fwd_name)) { + const char* BWD_OUTPUT_TEMPLATE = + " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + outputs_str += + paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, num_appended_outputs, + outs_name, grad_out_name); + num_appended_outputs++; + } } + + generated_grad_function_body += outputs_str; + generated_grad_function_body += "\n"; } const char* BWD_RETURN_TEMPLATE = - " std::vector> " - "outputs(outs.size());\n%s\n " - "return outputs;"; - std::string return_str = - paddle::string::Sprintf(BWD_RETURN_TEMPLATE, outputs_str); - - generated_grad_function_body += "\n"; - generated_grad_function_body += return_str; + " std::vector> outputs(%d);\n" + " %s\n" + " return outputs;\n"; + generated_grad_function_body = paddle::string::Sprintf( + BWD_RETURN_TEMPLATE, outs_size, generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = @@ -1452,7 +1724,7 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::operator()(const " "std::vector>& grads) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( - GRAD_FUNCTION_TEMPLATE, op_type, generated_grad_function_body); + GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); VLOG(6) << "Generated returns"; @@ -1463,9 +1735,14 @@ static std::string GenerateGradNodeCCContents( /* --------- CodeGen: GradNode Header ------ */ /* ----------------------------------------- */ static std::string GenerateGradNodeHeaderContents( - const std::map& grad_ins_fwd_slotname_map, - const std::string& op_type, const std::vector& in_vars, - const std::vector& out_vars) { + const ForwardGenerationInfo& fwd_info, + const GradNodeGenerationInfo& bwd_info) { + const std::string& op_type = fwd_info.GetOpType(); + const std::vector& 
in_vars = fwd_info.GetInVars(); + const std::vector& out_vars = fwd_info.GetOutVars(); + + const auto& op_base_infos = bwd_info.GetOpBaseInfos(); + VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = @@ -1522,55 +1799,60 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; - for (const auto& kv : grad_ins_fwd_slotname_map) { - const std::string& tensor_wrapper_name = kv.second; - const std::string& struct_tensor_wrapper_name = kv.second + "_"; - - std::string tensor_wrapper_arg_str; - std::string tensor_wrapper_body_str; - if (duplicable_tensors.count(tensor_wrapper_name)) { - const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = - "const std::vector& %s"; - tensor_wrapper_arg_str = paddle::string::Sprintf( - ATTR_TENSOR_WRAPPER_ARG_TEMPLATE, tensor_wrapper_name); - - const char* TENSOR_WRAPPER_MEMBER_TEMPLATE = - " std::vector %s;\n"; - tensor_wrapper_members_str += paddle::string::Sprintf( - TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); - - const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "for(const auto& eager_tensor : %s) {\n" - " %s.emplace_back( egr::TensorWrapper(eager_tensor, true " - "/*full_reserved*/) );\n" - " }\n"; - tensor_wrapper_body_str = paddle::string::Sprintf( - SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name); + for (const auto& iter : op_base_infos) { + const std::map& grad_ins_fwd_slotname_map = + iter.GetGradInsFwdSlotnameMap(); + + for (const auto& kv : grad_ins_fwd_slotname_map) { + const std::string& tensor_wrapper_name = kv.second; + const std::string& struct_tensor_wrapper_name = kv.second + "_"; + + std::string tensor_wrapper_arg_str; + std::string tensor_wrapper_body_str; + if (duplicable_tensors.count(tensor_wrapper_name)) { + const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = + "const std::vector& %s"; + tensor_wrapper_arg_str = paddle::string::Sprintf( + ATTR_TENSOR_WRAPPER_ARG_TEMPLATE, tensor_wrapper_name); + + const char* TENSOR_WRAPPER_MEMBER_TEMPLATE = + " std::vector %s;\n"; + tensor_wrapper_members_str += paddle::string::Sprintf( + TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); + + const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = + "for(const auto& eager_tensor : %s) {\n" + " %s.emplace_back( egr::TensorWrapper(eager_tensor, true " + "/*full_reserved*/) );\n" + " }\n"; + tensor_wrapper_body_str = paddle::string::Sprintf( + SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, + struct_tensor_wrapper_name); - } else { - const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = - "const egr::EagerTensor& %s"; - tensor_wrapper_arg_str = paddle::string::Sprintf( - ATTR_TENSOR_WRAPPER_ARG_TEMPLATE, tensor_wrapper_name); - - const char* TENSOR_WRAPPER_MEMBER_TEMPLATE = - " egr::TensorWrapper %s;\n"; - tensor_wrapper_members_str += paddle::string::Sprintf( - TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); - - const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, true /*full_reserved*/);"; - tensor_wrapper_body_str = paddle::string::Sprintf( - SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name); - } - - const char* SET_TENSOR_WRAPPER_TEMPLATE = - " void SetTensorWrapper%s(%s) {\n %s\n }\n"; - set_tensor_wrappers_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, tensor_wrapper_body_str); + } else { + const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = + "const egr::EagerTensor& %s"; + 
tensor_wrapper_arg_str = paddle::string::Sprintf( + ATTR_TENSOR_WRAPPER_ARG_TEMPLATE, tensor_wrapper_name); + + const char* TENSOR_WRAPPER_MEMBER_TEMPLATE = + " egr::TensorWrapper %s;\n"; + tensor_wrapper_members_str += paddle::string::Sprintf( + TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); + + const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = + "%s = egr::TensorWrapper(%s, true /*full_reserved*/);"; + tensor_wrapper_body_str = paddle::string::Sprintf( + SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, + tensor_wrapper_name); + } + + const char* SET_TENSOR_WRAPPER_TEMPLATE = + " void SetTensorWrapper%s(%s) {\n %s\n }\n"; + set_tensor_wrappers_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_arg_str, tensor_wrapper_body_str); + } } VLOG(6) << "Generated TensorWrapper"; @@ -1682,97 +1964,62 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* ----------------------------- */ /* ---- Collect Information ---- */ /* ----------------------------- */ - std::vector grad_op_types; - std::vector in_vars; - std::vector out_vars; - std::map grad_outs_slotname_map; - std::map grad_ins_fwd_slotname_map; - std::map grad_ins_grad_slotname_map; - std::map>> - grad_ins; - std::map>> - grad_outs; + + ForwardGenerationInfo fwd_info; + GradNodeGenerationInfo bwd_info; VLOG(6) << "-------- CollectInformationFromOpInfo -------"; - CollectForwardInformationFromOpInfo(op_info, &in_vars, &out_vars); + CollectForwardInformationFromOpInfo(op_info, &fwd_info); - bool generate_forward_only = false; - bool is_available = CollectGradInformationFromOpInfo( - op_info, &generate_forward_only, &grad_op_types, - &grad_outs_slotname_map, &grad_ins_fwd_slotname_map, - &grad_ins_grad_slotname_map, &grad_ins, &grad_outs); + bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); - if (!is_available && !generate_forward_only) { + if (!is_available && !bwd_info.GenerateForwardOnly()) { VLOG(6) << "Skipped operator: " << op_type; continue; } VLOG(6) << "-------- PurifyOpProto -------"; - std::unordered_map fwd_inputs_name_pos_map; - std::unordered_map fwd_outputs_name_pos_map; - PurifyForwardOpProto(*op_proto, &fwd_inputs_name_pos_map, - &fwd_outputs_name_pos_map, &in_vars, &out_vars); - - if (!generate_forward_only) { - PurifyGradOpProto(*op_proto, &grad_outs_slotname_map, - &grad_ins_fwd_slotname_map, &grad_ins_grad_slotname_map, - &grad_ins, &grad_outs); + PurifyForwardOpProto(*op_proto, &fwd_info); + if (!bwd_info.GenerateForwardOnly()) { + PurifyGradNodeGenerationInfo(*op_proto, &bwd_info); } /* --------------------------- */ /* --------- CodeGen --------- */ /* --------------------------- */ - /* ---- forward_dygraph_functions.cc ---- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = - GenerateForwardFunctionContents( - generate_forward_only, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, - grad_outs, op_type, in_vars, out_vars); + GenerateForwardFunctionContents(fwd_info, bwd_info); fwd_function_str += body_and_declaration.first + "\n"; - /* ---- dygraph_forward_api.h ---- */ + VLOG(6) << "-------- GenerateDygraphForwardAPIContents -------"; std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; - if (generate_forward_only) continue; + if (bwd_info.GenerateForwardOnly()) continue; - /* ---- nodes.h ---- 
*/ VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - grad_node_h_str += - GenerateGradNodeHeaderContents(grad_ins_fwd_slotname_map, op_type, - in_vars, out_vars) + - "\n"; + grad_node_h_str += GenerateGradNodeHeaderContents(fwd_info, bwd_info); + grad_node_h_str += "\n"; - /* ---- nodes.cc ---- */ VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - grad_node_cc_str += GenerateGradNodeCCContents( - grad_op_types, fwd_inputs_name_pos_map, - fwd_outputs_name_pos_map, grad_ins_fwd_slotname_map, - grad_ins_grad_slotname_map, grad_outs_slotname_map, - grad_ins, grad_outs, op_type, in_vars, out_vars) + - "\n"; + grad_node_cc_str += GenerateGradNodeCCContents(fwd_info, bwd_info); + grad_node_cc_str += "\n"; VLOG(6) << op_type << ": Finished Generating Op: " << op_type; } - /* ---- dygraph_forward_function.cc ---- */ + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; GenerateForwardDygraphFile(output_dir, fwd_function_str); - /* ---- dygraph_forward_api.h ---- */ VLOG(6) << "-------- GenerateForwardHFile -------"; GenerateForwardHFile(output_dir, dygraph_forward_api_str); - /* ---- nodes.h ---- */ VLOG(6) << "-------- GenerateNodeHFile -------"; GenerateNodeHFile(output_dir, grad_node_h_str); - /* ---- nodes.cc ---- */ VLOG(6) << "-------- GenerateNodeCCFile -------"; GenerateNodeCCFile(output_dir, grad_node_cc_str); } diff --git a/paddle/fluid/eager/auto_code_generator/op_list.txt b/paddle/fluid/eager/auto_code_generator/op_list.txt index 699a84169d700..d3e835a1d0355 100644 --- a/paddle/fluid/eager/auto_code_generator/op_list.txt +++ b/paddle/fluid/eager/auto_code_generator/op_list.txt @@ -237,6 +237,7 @@ spp floor gelu retinanet_detection_output +minus push_dense silu sequence_erase diff --git a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt index 8811aa8ad38a5..516789cbb8cf7 100644 --- a/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/performance_tests/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op) +cc_library(performance_benchmark_utils SRCS benchmark_utils.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} eager_scale scale_node scale_op matmul_v2_op dygraph_function) cc_test(test_egr_performance_benchmark_eager_cpu SRCS benchmark_eager_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) cc_test(test_egr_performance_benchmark_fluid_cpu SRCS benchmark_fluid_cpu.cc DEPS performance_benchmark_utils ${eager_deps} ${fluid_deps}) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 739e05e1d7971..7380e0f129cf4 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -180,10 +180,11 @@ enum TableType { message TableParameter { optional uint64 table_id = 1; - optional string table_class = 2; - optional uint64 shard_num = 3 [ default = 1000 ]; - optional TableType type = 4; - optional TableAccessorParameter accessor = 5; + optional string table_name = 2; + optional string table_class = 3; + optional uint64 shard_num = 4 [ default = 1000 ]; + optional TableType type = 5; + optional TableAccessorParameter accessor = 6; } message TableAccessorParameter { @@ -198,7 +199,6 @@ message TableAccessorParameter { repeated TableAccessorSaveParameter table_accessor_save_param = 
8; } -// TODO(guanqun): add NaiveSGD/Adam... message SGDParameter { optional string name = 1; optional SparseNaiveSGDRuleParameter naive = 2; @@ -321,7 +321,7 @@ message DistributedStrategy { optional HybridConfig hybrid_configs = 112; optional TensorParallelConfig tensor_parallel_configs = 113; optional TrainerDescConfig trainer_desc_configs = 114; - optional TableParameter downpour_table_param = 115; + repeated TableParameter downpour_table_param = 115; optional FsClientParameter fs_client_param = 116; optional BuildStrategy build_strategy = 201; diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 71b53b8a51882..5e450234c405c 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { return device; } + inline ::DLDevice operator()(const platform::IPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::IPUPlace is not supported")); + } + inline ::DLDevice operator()(const platform::XPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::XPUPlace is not supported")); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 93f4f8952fc67..9e57261477991 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_ipu_place(place_)) { +#ifdef PADDLE_WITH_IPU + gc.reset(new IPUGarbageCollector( + BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size)); +#else + PADDLE_THROW( + platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle")); #endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b98a228868266..08055cd9a5407 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) -cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) +cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass) cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index d3cf3319adfc5..0bf30c29f3279 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ 
-17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(fc_elementwise_layernorm_fuse_pass, paddle::framework::ir::FCElementwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc new file mode 100644 index 0000000000000..9dcbbb9c9856e --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AvgShardPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter AvgShardPass::ApplyImpl"; + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + if (ipu_backend->GetIpuStrategy()->need_avg_shard) { + VLOG(10) << "start AvgShardPass"; + auto nodes = ir::TopologySortOperations(*graph); + auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + + int shard_position = nodes.size() / num_ipus; + int index_and_stage = -1; + for (int i = 0; i < nodes.size(); i++) { + if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) { + index_and_stage++; + } + nodes[i]->Op()->SetAttr("ipu_index", index_and_stage); + nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage); + } + VLOG(10) << "end AvgShardPass"; + } + + VLOG(10) << "leave AvgShardPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h new file mode 100644 index 0000000000000..b13acbd198dd5 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AvgShardPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc new file mode 100644 index 0000000000000..5dcfddf6187f2 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl"; + + std::unordered_map<OpRole, std::unordered_set<ir::Node*>> all_ops{ {OpRole::kForward, {}}, {OpRole::kBackward, {}}, {OpRole::kOptimize, {}}, {OpRole::kRPC, {}}, {OpRole::kDist, {}}, {OpRole::kLRSched, {}}, {OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}}; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role")); + if (op_role == static_cast<int>(OpRole::kForward)) { + all_ops[OpRole::kForward].insert(node); + } else if (op_role == static_cast<int>(OpRole::kBackward)) { + all_ops[OpRole::kBackward].insert(node); + } else if (op_role == static_cast<int>(OpRole::kOptimize)) { + all_ops[OpRole::kOptimize].insert(node); + } else if (op_role == static_cast<int>(OpRole::kRPC)) { + } else if (op_role == static_cast<int>(OpRole::kDist)) { + } else if (op_role == static_cast<int>(OpRole::kLRSched)) { + } else if (op_role == static_cast<int>(OpRole::kLoss)) { + all_ops[OpRole::kLoss].insert(node); + } else if (op_role == static_cast<int>(OpRole::kNotSpecified)) { + LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified "; + } + } + + std::unordered_set<ir::Node*> forward_vars; + std::unordered_set<ir::Node*> backward_vars; + std::unordered_set<ir::Node*> control_vars; + // forward_vars + for (auto& nodes : std::array<std::unordered_set<ir::Node*>, 2>{ all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) { + for (auto* node : nodes) { + for (auto* in_node : node->inputs) { + forward_vars.insert(in_node); + } + for (auto* out_node : node->outputs) { + forward_vars.insert(out_node); + } + } + } + // control_vars & backward_vars + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + if (node->IsCtrlVar()) { + control_vars.insert(node); + } + for (auto* in_node : node->inputs) { + if (all_ops[OpRole::kOptimize].count(in_node)) { + backward_vars.insert(node); + } + } + } + // all removed node + std::unordered_set<ir::Node*> rm_nodes; + for (auto* node : graph->Nodes()) { + if
(backward_vars.count(node)) { + rm_nodes.insert(node); + } else if (control_vars.count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kBackward].count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kForward].count(node) == 0 && + all_ops[OpRole::kLoss].count(node) == 0 && + forward_vars.count(node) == 0) { + rm_nodes.insert(node); + } else if (node->Name() == "feed" || node->Name() == "fetch") { + rm_nodes.insert(node); + } + } + + VLOG(10) << "Remove Node: "; + for (auto* node : rm_nodes) { + // rm node relations + for (auto* node_in : node->inputs) { + for (size_t i = 0; i < node_in->outputs.size(); ++i) { + if (node_in->outputs[i] == node) { + node_in->outputs.erase(node_in->outputs.begin() + i); + break; + } + } + } + for (auto* node_out : node->outputs) { + for (size_t i = 0; i < node_out->inputs.size(); ++i) { + if (node_out->inputs[i] == node) { + node_out->inputs.erase(node_out->inputs.begin() + i); + break; + } + } + } + VLOG(10) << "\t" << node->Name(); + graph->RemoveNode(node); + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + + VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(forward_graph_extract_pass, + paddle::framework::ir::ForwardGraphExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h new file mode 100644 index 0000000000000..afa9f1c15f2ab --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ForwardGraphExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc new file mode 100644 index 0000000000000..ceef27ac1ce3c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
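
Editor's note: the removal loop in ForwardGraphExtractPass above must unlink a node from its neighbours' adjacency lists before calling graph->RemoveNode, or dangling edge pointers would survive. A standalone sketch of that unlink-then-remove pattern (toy Node type, not the real ir::Node):

#include <algorithm>
#include <string>
#include <vector>

struct Node {  // hypothetical stand-in for ir::Node
  std::string name;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
};

// Erase `node` from every producer's output list and every consumer's
// input list, mirroring the two erase loops in the pass above.
void Unlink(Node* node) {
  for (Node* in : node->inputs) {
    auto& outs = in->outputs;
    outs.erase(std::remove(outs.begin(), outs.end(), node), outs.end());
  }
  for (Node* out : node->outputs) {
    auto& ins = out->inputs;
    ins.erase(std::remove(ins.begin(), ins.end(), node), ins.end());
  }
  node->inputs.clear();
  node->outputs.clear();
}
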
+ +#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferShapePass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferShapePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; + + auto feed_list = Get>("feed_list"); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + bool is_feed = std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end(); + if (is_feed) { + auto input_shape = node->Var()->GetShape(); + if (input_shape[0] <= -1) { + input_shape[0] = batch_size; + node->Var()->SetShape(input_shape); + } + // int64->int32 + if (node->Var()->GetDataType() == proto::VarType::INT64) { + node->Var()->SetDataType(proto::VarType::INT32); + } + } + } + + // temp scope for shape inference + std::shared_ptr scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + + auto tensor = ptr->GetMutable(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } + + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + auto op_desc = node->Op(); + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + } + } + } + } + } + // release the temp scope + scope.reset(); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferShapePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass) + .RequirePassAttr("feed_list"); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h new file mode 100644 index 0000000000000..3e8148b7f066d --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferShapePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc new file mode 100644 index 0000000000000..616139a52ac06 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const { + VLOG(10) << "enter InferencePostprocessPass::ApplyImpl"; + + std::vector<std::string> feed_list; + feed_list = Get<std::vector<std::string>>("feed_list"); + std::vector<std::string> fetch_list; + fetch_list = Get<std::vector<std::string>>("fetch_list"); + + auto *feed_var = new paddle::framework::VarDesc("feed"); + feed_var->SetType(proto::VarType::FEED_MINIBATCH); + auto *feed_var_node = graph->CreateVarNode(feed_var); + + auto *fetch_var = new paddle::framework::VarDesc("fetch"); + fetch_var->SetType(proto::VarType::FETCH_LIST); + auto *fetch_var_node = graph->CreateVarNode(fetch_var); + + for (int i = 0; i < feed_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == feed_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("feed"); + op->SetInput("X", {"feed"}); + op->SetOutput("Out", {node->Name()}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + node->inputs.push_back(op_node); + op_node->outputs.push_back(node); + feed_var_node->outputs.push_back(op_node); + op_node->inputs.push_back(feed_var_node); + break; + } + } + } + + for (int i = 0; i < fetch_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == fetch_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("fetch"); + op->SetInput("X", {node->Name()}); + op->SetOutput("Out", {"fetch"}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); +
node->outputs.push_back(op_node); + op_node->inputs.push_back(node); + fetch_var_node->inputs.push_back(op_node); + op_node->outputs.push_back(fetch_var_node); + break; + } + } + } + + VLOG(10) << "leave InferencePostprocessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_postprocess_pass, + paddle::framework::ir::InferencePostprocessPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h new file mode 100644 index 0000000000000..e80e1905d4ad7 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferencePostprocessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc new file mode 100644 index 0000000000000..d02dcce0cc62c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
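
Editor's note: the postprocess pass above re-creates the feed/fetch interface of an executable program: one feed op and one fetch op per variable, each tagged with a "col" attribute that fixes its position in the runtime feed/fetch lists. A minimal sketch of that col-indexed wiring (toy types; the real pass builds OpDesc/VarDesc graph nodes):

#include <cstdio>
#include <string>
#include <vector>

struct IoOp {          // hypothetical stand-in for the created OpDesc
  std::string type;    // "feed" or "fetch"
  std::string var;     // graph variable the op is connected to
  int col;             // position inside the feed/fetch list
};

std::vector<IoOp> MakeFeedOps(const std::vector<std::string>& feed_list) {
  std::vector<IoOp> ops;
  for (int i = 0; i < static_cast<int>(feed_list.size()); ++i) {
    ops.push_back({"feed", feed_list[i], i});  // col attr = list index
  }
  return ops;
}

int main() {
  for (const auto& op : MakeFeedOps({"image", "label"})) {
    std::printf("%s col=%d -> %s\n", op.type.c_str(), op.col, op.var.c_str());
  }
}
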
+ +#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; + + // Get a new instance of ipu_backend + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetNewInstance(); + + // Set scope + auto& scope = graph->Get(kParamScopeAttr); + ipu_backend->SetScope(scope); + + // Set ipu_strategy + static std::shared_ptr ipu_strategy_instance_( + new platform::ipu::IpuStrategy()); + ipu_strategy_instance_->is_training = false; + auto num_ipus = graph->Get("num_ipus"); + ipu_strategy_instance_->num_ipus = num_ipus; + if (num_ipus > 1) { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + + auto enable_pipelining = graph->Get("enable_pipelining"); + ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + if (enable_pipelining) { + auto batches_per_step = graph->Get("batches_per_step"); + PADDLE_ENFORCE_GE( + batches_per_step, num_ipus, + platform::errors::InvalidArgument("Batched per step should be equal or " + "greater than the number of IPUs")); + ipu_strategy_instance_->batches_per_step = batches_per_step; + } + ipu_strategy_instance_->batch_size = graph->Get("batch_size"); + ipu_strategy_instance_->need_avg_shard = graph->Get("need_avg_shard"); + + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); + + // Get feed_list and fetch list + std::vector feed_list = {}; + std::vector fetch_list = {}; + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list.push_back(""); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list.push_back(""); + } + } + } + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->outputs[0]->Name(); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->inputs[0]->Name(); + } + } + } + + // Run passes + std::vector graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass"}; + std::vector compile_pass = { + "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", + "inference_postprocess_pass"}; + for (auto pass_name : graph_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + if (pass_name == "infer_shape_pass") { + pass->Set("feed_list", new std::vector(feed_list.begin(), + feed_list.end())); + } + pass->Apply(graph); + } + + for (auto pass_name : compile_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + pass->Set("feed_list", + new std::vector(feed_list.begin(), feed_list.end())); + pass->Set("fetch_list", new std::vector(fetch_list.begin(), + fetch_list.end())); + pass->Apply(graph); + } + + VLOG(10) << "leave InferenceProcessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
+REGISTER_PASS(inference_process_pass, + paddle::framework::ir::InferenceProcessPass); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h new file mode 100644 index 0000000000000..bac0e88377f7c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferenceProcessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc new file mode 100644 index 0000000000000..5a53466089bc8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
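
Editor's note: the strategy setup in InferenceProcessPass above encodes two rules: multi-IPU execution switches popart to manual virtual graphs, and pipelining is only legal when batches_per_step covers all IPUs. A standalone sketch of those rules (field names assumed to mirror platform::ipu::IpuStrategy; not the real API):

#include <stdexcept>

enum class VirtualGraphMode { Off, Manual };

struct Strategy {  // hypothetical mirror of the IPU strategy fields used above
  int num_ipus = 1;
  int batches_per_step = 1;
  bool enable_pipelining = false;
  VirtualGraphMode mode = VirtualGraphMode::Off;
};

void Configure(Strategy* s) {
  // more than one IPU => the graph must be manually sharded across devices
  s->mode = s->num_ipus > 1 ? VirtualGraphMode::Manual : VirtualGraphMode::Off;
  // pipelining needs at least one batch per device in flight
  if (s->enable_pipelining && s->batches_per_step < s->num_ipus) {
    throw std::invalid_argument(
        "batches_per_step must be >= num_ipus when pipelining is enabled");
  }
}
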
+ +#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + ipu_backend->Compile(graph, feed_list, fetch_list); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_graph_builder_pass, + paddle::framework::ir::IpuGraphBuilderPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h new file mode 100644 index 0000000000000..6237df3648033 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuGraphBuilderPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc new file mode 100644 index 0000000000000..d3f1f1633ffc9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenerateVarName(Node *node) { + return node->Name() + "_" + std::to_string(node->id()); +} + +void IpuInplacePass::ApplyImpl(ir::Graph *graph) const { + // use this pass after forward_graph_extract_pass + // raise error if the inplaced var both in feed_list & fetch_list + VLOG(10) << "enter IpuInplacePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::map var_name; + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name.find(node->Name()) == var_name.end()) { + var_name.emplace(node->Name(), 1); + } else { + var_name[node->Name()]++; + } + } + } + + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name[node->Name()] > 1) { + auto is_feed = (std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end()) && + (node->inputs.size() == 0); + auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(), + node->Name()) != fetch_list.end()) && + (node->outputs.size() == 0); + if (!is_feed && !is_fetch && !node->Var()->Persistable()) { + auto old_name = node->Name(); + auto new_name = GenerateVarName(node); + node->RenameVar(new_name); + for (auto *op_in : node->inputs) { + op_in->Op()->RenameOutput(old_name, new_name); + } + for (auto *op_out : node->outputs) { + op_out->Op()->RenameInput(old_name, new_name); + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuInplacePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h new file mode 100644 index 0000000000000..86756276c8c3d --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuInplacePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc new file mode 100644 index 0000000000000..ba9233eeb8cb9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IPUPassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.h b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h new file mode 100644 index 0000000000000..b56d3e4c65b1c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IPUPassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const; + virtual ~IPUPassBase() {} + + protected: + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc new file mode 100644 index 0000000000000..a3e020714e1db --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
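
Editor's note: IpuInplacePass above de-duplicates variable names by renaming every shared, non-feed/fetch, non-persistable occurrence to "<name>_<id>". A compact sketch of that renaming rule (toy Var type, not the real ir::Node):

#include <map>
#include <string>
#include <vector>

struct Var {  // hypothetical stand-in for a variable node
  std::string name;
  int id = 0;
  bool persistable = false;
};

std::string GenerateVarName(const Var& v) {
  return v.name + "_" + std::to_string(v.id);  // mirrors the pass's helper
}

// Count occurrences of each name, then rename every duplicated,
// non-persistable occurrence so each node gets a unique identity.
// (Feed/fetch exemptions from the real pass are omitted here.)
void MakeNamesUnique(std::vector<Var>* vars) {
  std::map<std::string, int> uses;
  for (const Var& v : *vars) ++uses[v.name];
  for (Var& v : *vars) {
    if (uses[v.name] > 1 && !v.persistable) v.name = GenerateVarName(v);
  }
}
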
+ +#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + framework::OpDesc ipu_rt_op_desc; + ipu_rt_op_desc.SetType("ipu_runtime"); + ipu_rt_op_desc.SetInput("FeedList", feed_list); + ipu_rt_op_desc.SetOutput("FetchList", fetch_list); + ipu_rt_op_desc.Flush(); + + // Create a new node for the ipu_runtime_op. + auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); + + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto feed : feed_list) { + if (node->Name() == feed) { + IR_NODE_LINK_TO(node, ipu_rt_node); + } + } + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + IR_NODE_LINK_TO(ipu_rt_node, node); + } + } + } + } + + // set ipu_runtime_op dtype attr + if (fetch_list.size() == 1) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); + } + } + } + } + } + + // Remove unneeded nodes. + std::unordered_set marked_nodes; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() != "ipu_runtime") { + marked_nodes.insert(node); + } + } + } + + GraphSafeRemoveNodes(graph, marked_nodes); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_runtime_replacer_pass, + paddle::framework::ir::IpuRuntimeReplacerPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h new file mode 100644 index 0000000000000..ba2cc8702fa47 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuRuntimeReplacerPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc new file mode 100644 index 0000000000000..c6be2c775bd21 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + // a graph usually has multiple optimizer nodes for different parameters, + // and these nodes usually share the same type and attr values + if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) { + ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); + VLOG(10) << "found optimizer type: " << node->Op()->Type(); + + for (const std::string& attr_name : node->Op()->AttrNames()) { + auto attr_type = node->Op()->GetAttrType(attr_name); + // with adam, attrs are float + if (attr_type == proto::AttrType::FLOAT) { + auto attr_value = + BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); + ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); + } else { + VLOG(10) << "Skip " << attr_type; + } + } + + auto lr_var_name = node->Op()->Input("LearningRate"); + PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find input(LearningRate) failed.", + node->Op()->Type())); + + ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + } + + if ((op_role == static_cast<int>(framework::OpRole::kLoss))) { + VLOG(10) << "found loss op type: " << node->Op()->Type(); + auto outputs = node->Op()->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + + auto losses_name = outputs.begin()->second; + PADDLE_ENFORCE_EQ(losses_name.size(), 1, + platform::errors::InvalidArgument( + "Can only support one loss name")); + + ipu_backend->GetExecutor().SetLoss(losses_name[0]); + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) <<
DebugString(graph); + VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_extract_pass, + paddle::framework::ir::IpuOptimizerExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h new file mode 100644 index 0000000000000..fd274ded8f5bd --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuOptimizerExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc new file mode 100644 index 0000000000000..c23bfdcb154f1 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
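
Editor's note: IpuOptimizerExtractPass above forwards only the FLOAT attributes of the optimizer op to the IPU executor and skips everything else. A standalone sketch of that harvesting step (C++17; toy attribute variant, not Paddle's Attribute type):

#include <map>
#include <string>
#include <variant>

using Attr = std::variant<float, int, std::string>;

// Collect only float-valued attrs, mirroring the FLOAT-only branch above.
std::map<std::string, float> CollectFloatAttrs(
    const std::map<std::string, Attr>& attrs) {
  std::map<std::string, float> out;
  for (const auto& [name, value] : attrs) {
    if (const float* f = std::get_if<float>(&value)) out[name] = *f;
  }
  return out;
}
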
+ +#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +using paddle::platform::ipu::IpuBackend; +using framework::ir::Graph; +using framework::ir::Node; + +void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = IpuBackend::GetInstance(); + const auto* scope_ = ipu_backend->GetScope(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + if ((op_role == static_cast(framework::OpRole::kOptimize))) { + auto inputs = node->Op()->Inputs(); + if (inputs.count(platform::ipu::sBeta1Pow)) { + auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]); + auto data = var->GetMutable()->data(); + auto beta = BOOST_GET_CONST( + float, node->Op()->GetAttr(platform::ipu::sBeta1)); + + // ensure current save with beta1pow, rather than step. + // beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart + // support single Step__ + bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f); + float step = 0; + float beta_acc = beta; + while (beta_acc > data[0] && save_with_beta1pow) { + beta_acc *= beta; + step += 1; + } + + if (save_with_beta1pow) { + data[0] = step; + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_state_align_pass, + paddle::framework::ir::IpuOptimizerStateAlignPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h new file mode 100644 index 0000000000000..21a1017d88452 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * This pass should only affect optimizer that need bias correction, + * include Adam/Lamb. 
+ */ + +class IpuOptimizerStateAlignPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc new file mode 100644 index 0000000000000..c97b7fd5bcb0c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h" + +namespace paddle { +namespace framework { +namespace ir { + +using framework::ir::Graph; +using framework::ir::Node; +using platform::ipu::SymbolHandler; + +void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto nodes = graph->Nodes(); + for (auto* node : nodes) { + if (!node->IsOp()) { + continue; + } + auto* op = node->Op(); + auto op_type = op->Type(); + + ir::Node* new_node = nullptr; + SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (handler) { + VLOG(11) << "Raw Paddle Node:"; + VLOG(11) << node->Op()->Proto()->DebugString(); + new_node = handler(graph, node); + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } else { + LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(popart_canonicalization_pass, + paddle::framework::ir::PopartCanonicalizationPass); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h new file mode 100644 index 0000000000000..6690873f2a9ac --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PopartCanonicalizationPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 523c216132646..f3d96c3850656 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( LOG(WARNING) << "Pass in op compat failed."; return; } + const int kNumFields = 5; const int kTransOffset = 1; const int kTransOutOffset = 2; const int kFlattenOffset = 3; const int kFlattenOutOffset = 4; - std::vector<Node*> nodes; + std::vector<Node*> nodes; + std::vector<int> trans_axis0; + int flatten_axis0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), @@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( platform::errors::NotFound("Can not find %s in subgraph.", input_nodes[i]->name())); + if (i == 0) { + trans_axis0 = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + flatten_axis0 = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + } else { + std::vector<int> trans_axis = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All axis of transpose should be the same + if (trans_axis0 != trans_axis) return; + + int flatten_axis = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All axis of flatten should be the same + if (flatten_axis0 != flatten_axis) return; + } + nodes.push_back(subgraph.at(input_nodes[i])); nodes.push_back( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 348ca5b952bfe..39496cb26776e 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc CinnCacheKey::CinnCacheKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str) { + const std::string& arch_str, GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_tensors, arch_str); } CinnCacheKey::CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str) { + const std::string& arch_str, + GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_shapes, arch_str); } -size_t CinnCacheKey::HashGraph(const ir::Graph& graph) { - // using Dot to unqiue graph - inference::analysis::Dot dot; - std::unordered_map<const ir::Node*, std::string> node2dot; - int id = 0; - // Create nodes - // graph.Nodes() return unordered_set, the same graph may - // return different result?
- for (const ir::Node* n : graph.Nodes()) { - std::string node_id = std::to_string(id++); - dot.AddNode(node_id, {}, n->Name(), true); - node2dot[n] = node_id; - } - - // Create edges - for (const ir::Node* n : graph.Nodes()) { - const auto& src_id = node2dot.at(n); - for (auto* out : n->outputs) { - const auto& dest_id = node2dot.at(out); - dot.AddEdge(src_id, dest_id, {}); - } - } - - const std::string& viz_graph = dot.Build(); - VLOG(1) << "The hash graph:\n" << viz_graph; - - size_t hash_val = std::hash<std::string>()(viz_graph); - VLOG(4) << "The graph's hash value is: " << hash_val; - return hash_val; -} - void CinnCacheKey::SetKey( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); for (const auto& name_tensor : input_tensors) { input_shapes_[name_tensor.first] = name_tensor.second->dims(); } @@ -87,7 +64,7 @@ void CinnCacheKey::SetKey( void CinnCacheKey::SetKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); input_shapes_ = input_shapes; arch_str_ = arch_str; } @@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { } bool CinnCacheKey::operator==(const CinnCacheKey& other) const { - return graph_serialize_str_ == other.graph_serialize_str_ && + return graph_hash_val_ == other.graph_hash_val_ && input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } @@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); } - ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = hash_combine(ret, key.graph_hash_val_); ret = hash_combine(ret, string_hasher(key.arch_str_)); return ret; } +size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { + // sort graph nodes by name and id + auto compare = [](ir::Node* n1, ir::Node* n2) { + return (n1->Name() == n2->Name()) ? (n1->id() < n2->id()) : (n1->Name() < n2->Name()); + }; + + // graph.Nodes() returns an unordered_set; use an ordered set here so that + // the same graph cannot produce different results + std::set<ir::Node*, decltype(compare)> node_set(compare), + output_set(compare); + node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); + + std::string hash_str; + for (ir::Node* n : node_set) { + hash_str.append(n->Name()); + + output_set.clear(); + output_set.insert(n->outputs.begin(), n->outputs.end()); + for (auto* out : output_set) { + hash_str.append(out->Name()); + } + } + + VLOG(1) << "The hash graph:\n" << hash_str; + + size_t hash_val = std::hash<std::string>()(hash_str); + VLOG(4) << "The graph's hash value by graph structure is: " << hash_val; + return hash_val; +} + +size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) { + size_t hash_val = reinterpret_cast<size_t>(&graph); + VLOG(4) << "The graph's hash value by graph address is: " << hash_val; + return hash_val; +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 941f8e0cdecc1..67325297c4772 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -14,6 +14,7 @@ #pragma once +#include <functional> #include <map> #include "paddle/fluid/framework/ddim.h" @@ -33,14 +34,18 @@ namespace paddle2cinn { // shapes.
class CinnCacheKey { public: + using GraphHashStrategy = std::function<size_t(const ir::Graph&)>; + + explicit CinnCacheKey(GraphHashStrategy graph_hash); + CinnCacheKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); - ~CinnCacheKey() {} + ~CinnCacheKey() = default; void SetKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, @@ -58,13 +63,38 @@ class CinnCacheKey { }; private: - size_t HashGraph(const ir::Graph& graph); - - std::string graph_serialize_str_; + GraphHashStrategy graph_hash_; + size_t graph_hash_val_; std::map<std::string, DDim> input_shapes_; std::string arch_str_; }; +#define CINN_CACHE_KEY_CREATE(NAME) \ + class NAME : public CinnCacheKey { \ + public: \ + NAME() : CinnCacheKey(HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, const LoDTensor*>& input_tensors, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, DDim>& input_shapes, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \ + \ + private: \ + static size_t HashGraph(const ir::Graph& graph); \ + }; + +// Class to store the keys by graph address for compiling CINN. +CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) +// Class to store the keys by graph structure for compiling CINN. +CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure) + +#undef CINN_CACHE_KEY_CREATE + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index f13f44998211f..f9b48ef4b5ec0 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -26,8 +26,8 @@ namespace paddle { namespace framework { namespace paddle2cinn { -TEST(CinnCacheKeyTest, TestAsUnorderedKey) { - std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) { + std::unordered_set<CinnCacheKeyByStructure, CinnCacheKey::Hash> test_set; ProgramDesc empty_program; ir::Graph empty_graph(empty_program); @@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { DDim ddim = paddle::framework::make_ddim({1, 2, 3}); std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; - CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); - CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86"); EXPECT_EQ(cache_key0, cache_key1); - CinnCacheKey cache_key2(graph, feed_shapes, "x86"); - CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); - CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu"); EXPECT_NE(cache_key2, cache_key3); EXPECT_EQ(cache_key3, cache_key4); - CinnCacheKey cache_key5(empty_graph, - std::map<std::string, const LoDTensor*>(), "unk"); - CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk"); + CinnCacheKeyByStructure cache_key5( + empty_graph, std::map<std::string, const LoDTensor*>(), "unk"); + CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(), + "unk"); EXPECT_EQ(cache_key5, cache_key6); EXPECT_NE(cache_key1, cache_key3); @@ -98,6 +99,107 @@
TEST(CinnCacheKeyTest, TestAsUnorderedKey) { EXPECT_EQ(test_set.find(cache_key6), test_set.end()); } +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) { + std::unordered_set test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map feed_shapes = {{"X", ddim}}; + + CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKeyByAddress cache_key5( + empty_graph, std::map(), "unk"); + CinnCacheKeyByAddress cache_key6(empty_graph, std::map(), + "unk"); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2, cache_key6); + + test_set.insert(cache_key0); + test_set.insert(cache_key1); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key0); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +TEST(CinnCacheKeyTest, TestSameGraph) { + ProgramDesc program1; + auto *global_block1 = program1.MutableBlock(0); + auto *x1 = global_block1->Var("X"); + x1->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph1(program1); + + ProgramDesc program2; + auto *global_block2 = program2.MutableBlock(0); + auto *x2 = global_block2->Var("X"); + x2->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph2(program2); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86"); + EXPECT_NE(cache_key_by_address1, cache_key_by_address2); + + CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86"); + EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2); +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 7fc8eff3d31c9..54167d95899d6 
100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -41,6 +41,7 @@ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" @@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile( const std::map& input_tensors, const Target& target, void* stream) { VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); - CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors, + target.arch_str()); + CinnCacheKeyByStructure cur_key_by_struct; + bool exist = false; { AutoRDLock r_guard{&rwlock_}; - exist = cache_.count(cur_key) != 0; + exist = cache_by_address_.count(cur_key_by_address) != 0; + // if the graph cannot be found by address, check whether the graph + // structure has been stored in the cache. + if (!exist) { + // generate the structure cache key + cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str()); + + // if the graph structure is found, store the graph address in the + // cache for the next query. + if (cache_by_struct_.count(cur_key_by_struct) != 0) { + exist = true; + cache_by_address_[cur_key_by_address] = + cache_by_struct_.at(cur_key_by_struct).get(); + } + } } if (!exist) { std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); AutoWRLock w_guard{&rwlock_}; - if (!cache_.count(cur_key)) { - cache_[cur_key] = std::move(compiled_res); + if (!cache_by_struct_.count(cur_key_by_struct)) { + cache_by_address_[cur_key_by_address] = compiled_res.get(); + cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_[cur_key]; + const auto& cached_boj = *cache_by_address_[cur_key_by_address]; return cached_boj; } @@ -181,7 +200,8 @@ void CinnCompiler::Clear() { { AutoWRLock guard{&rwlock_}; graphs_.clear(); - cache_.clear(); + cache_by_address_.clear(); + cache_by_struct_.clear(); } real_compiled_num_.store(0); } @@ -217,6 +237,9 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; + compiled_obj->launch_context = + std::make_unique( + compiled_obj->paddle2cinn_varmap, compiled_obj->scope); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 71119acf1fb49..5070eb5ce5674 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -31,6 +31,13 @@ #include "paddle/fluid/platform/macros.h" namespace paddle { + +namespace operators { +namespace details { +class CinnLaunchContext; +} // namespace details +} // namespace operators + namespace framework { namespace paddle2cinn { @@ -39,6 +46,7 @@ struct CinnCompiledObject { std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map paddle2cinn_varmap; + std::unique_ptr launch_context; }; // Entrance to use CINN.
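The rewritten Compile above is a two-level lookup: the cheap address key is tried first, and only on a miss is the more expensive structure key computed; a structure hit back-fills the address map so the next query on the same graph object skips serialization entirely. A self-contained sketch of the pattern with generic names (not Paddle's API):

#include <memory>
#include <string>
#include <unordered_map>

struct Compiled { int id; };

std::unordered_map<const void*, Compiled*> by_address;  // non-owning aliases
std::unordered_map<std::string, std::unique_ptr<Compiled>> by_struct;  // owner

Compiled* Lookup(const void* graph_addr, const std::string& structure) {
  auto it = by_address.find(graph_addr);
  if (it != by_address.end()) return it->second;  // fast path: same object
  auto sit = by_struct.find(structure);
  if (sit != by_struct.end()) {
    by_address[graph_addr] = sit->second.get();   // promote for next query
    return sit->second.get();
  }
  return nullptr;  // caller compiles, then inserts into both maps
}

Since cache_by_struct_ holds the unique_ptr and cache_by_address_ only raw pointers, the address entries stay valid exactly as long as both maps are cleared together, which Clear() above does.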
@@ -87,9 +95,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map> graphs_; - std::unordered_map, + std::unordered_map - cache_; + cache_by_address_; + std::unordered_map, CinnCacheKey::Hash> + cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; mutable RWLock rwlock_; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 55254c65fad59..9521df651f9de 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::string KernelSignatureToString(const KernelSignature& signature) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 2e5d2260d12d4..3c28260ed278b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else if (platform::is_ipu_place(src_place) && + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } +#endif + #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { @@ -386,17 +402,33 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); } +#ifdef PADDLE_WITH_IPU + else if (platform::is_ipu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size); + } else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_ipu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); + } else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #ifdef PADDLE_WITH_XPU else if (platform::is_xpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { 
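GetKernelSignature now routes the op name through pten::TransToPtenKernelName before building the signature, because fluid op names and pten kernel names are not always identical. The real table lives in paddle/pten/core/convert_utils.cc; the sketch below only illustrates the idea, and its single entry is inferred from the reshape_op.cc hunk later in this diff rather than from the actual table:

#include <string>
#include <unordered_map>

// Hypothetical stand-in for pten::TransToPtenKernelName.
std::string TransToPtenKernelNameSketch(const std::string& fluid_op) {
  static const std::unordered_map<std::string, std::string> table = {
      {"reshape2", "reshape"},  // assumed mapping, see the reshape_op.cc hunk
  };
  auto it = table.find(fluid_op);
  return it == table.end() ? fluid_op : it->second;
}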
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - } else if (platform::is_cpu_place(src_place) && // NOLINT - platform::is_xpu_place(dst_place)) { + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size); - } else if (platform::is_xpu_place(src_place) && // NOLINT - platform::is_xpu_place(dst_place)) { + } + else if (platform::is_xpu_place(src_place) && // NOLINT + platform::is_xpu_place(dst_place)) { if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " << dst_place; @@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, } memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr, BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size); - } else { // NOLINT + } + else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); } @@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor { platform::errors::Unimplemented("Not supported on place (%s) ", npu)); // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::IPUPlace& ipu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", ipu)); + } bool GetResult(const framework::Tensor& out, const platform::NPUPinnedPlace& cpu) const { @@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> { void VisitorImpl(const platform::XPUPlace& xpu) const { PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); } + void VisitorImpl(const platform::IPUPlace& ipu) const { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); + } void VisitorImpl(const platform::CUDAPlace& gpu) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index d95c78c5db8a7..6aad54fba86e4 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> { "is not supported in imperative mode", place)); } + // there is NO support in IPUPlace + void operator()(const platform::IPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } private: int64_t numel_; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8875ef74bce14..54f46e49c4f73 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -487,6 +487,14 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? 
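The FLAGS_benchmark block added to PreparedOpRunImpl exists because GPU work is asynchronous: without draining the device queue, a host-side timer around an op measures little more than launch overhead, and a failed kernel may only surface on some later, unrelated call, which is also why GpuGetLastError is checked right after the wait. A standalone CUDA illustration with a hypothetical kernel:

#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void busy_kernel(float* x) {  // stand-in for an operator's kernels
  for (int i = 0; i < (1 << 20); ++i) x[0] += 1.0f;
}

int main() {
  float* x = nullptr;
  cudaMalloc(&x, sizeof(float));
  auto t0 = std::chrono::steady_clock::now();
  busy_kernel<<<1, 1>>>(x);  // returns immediately, kernel still running
  cudaDeviceSynchronize();   // without this, elapsed ~= launch cost only
  auto t1 = std::chrono::steady_clock::now();
  long long us =
      std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
  std::printf("elapsed: %lld us, last error: %s\n", us,
              cudaGetErrorString(cudaGetLastError()));
  cudaFree(x);
  return 0;
}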
] * diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 068de4f0435bb..9014871229b39 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -211,70 +211,6 @@ void SplitTensorsWithType( } #endif -// NOTE(liubo48): Only implement operators::math::SplitFunctor for npu now. -// If later the operators::StridedMemcpyWithAxis0 is supported, -// then this specific SplitTensorsForAllReduce can be removed. -#ifdef PADDLE_WITH_ASCEND_CL -template <> -void SplitTensorsForAllReduce( - const platform::NPUDeviceContext &context, - framework::Variable *p_dense_contents, - std::vector *p_dense_tensors) { - auto *in = p_dense_contents->GetMutable(); - std::vector outs; - std::vector shape_refer; - - outs.reserve(p_dense_tensors->size()); - shape_refer.reserve(p_dense_tensors->size()); - - for (auto &tensor : *p_dense_tensors) { - outs.emplace_back(&tensor); - shape_refer.emplace_back(&tensor); - } - operators::math::SplitFunctor - split_functor_; - split_functor_(context, *in, shape_refer, 0, &outs); -} - -template <> -void ConcatTensorsWithType( - const platform::NPUDeviceContext &context, - const std::vector &dense_tensors_, - framework::Variable *p_dense_contents, - framework::proto::VarType::Type type) { - switch (type) { - case framework::proto::VarType::FP32: - ConcatTensorsForAllReduce( - context, dense_tensors_, p_dense_contents); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it concats tensors for " - "allreduce.", - framework::DataTypeToString(type))); - } -} - -template <> -void SplitTensorsWithType( - const platform::NPUDeviceContext &context, - framework::Variable *p_dense_contents, - std::vector *p_dense_tensors, - framework::proto::VarType::Type type) { - switch (type) { - case framework::proto::VarType::FP32: - SplitTensorsForAllReduce( - context, p_dense_contents, p_dense_tensors); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Data type (%s) is not supported when it splits tensors for " - "allreduce.", - framework::DataTypeToString(type))); - } -} -#endif - void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 8569dd6347852..7c5af43816c44 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -228,7 +228,7 @@ class ElementwiseTensorOpConverter : public OpConverter { } }; - if (CheckDims(dims_x, dims_y)) { + if (dims_x.nbDims == dims_y.nbDims) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; nvinfer1::IElementWiseLayer* elet_layer = diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 2aed7ec001d2a..c836593f3f409 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -348,13 +348,14 @@ class AllocatorFacadePrivate { const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { auto id = platform::CUDAGraph::CapturingID(); auto iter = cuda_graph_allocator_map_.find(id); PADDLE_ENFORCE_NE( iter, 
cuda_graph_allocator_map_.end(), platform::errors::PermissionDenied( "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; return iter->second->allocators_; } else { return allocators_; @@ -405,7 +406,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); cuda_allocators_[p][stream] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_); + cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_); #endif #if defined(PADDLE_WITH_CUDA) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 6de32335c62b2..41dcf277d7a11 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -116,6 +116,34 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For Graphcore IPU +template <> +void *Alloc(const platform::IPUPlace &place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "IPUPlace, Allocate on cpu."; + + void *p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } + VLOG(10) << " pointer=" << p; + return p; +} +template <> +void Free(const platform::IPUPlace &place, void *p, + size_t size) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} +template <> +uint64_t Release(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Release(); +} +template <> +size_t Used(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Used(); +} + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 90a4ca73399cf..fa93bf00f2ac0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -32,9 +32,34 @@ CinnLaunchContext::CinnLaunchContext( [](const auto& name_view) { return std::string(name_view.data()); }); } -bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) { - return paddle2cinn_varmap_.count(paddle_name) > 0 && - cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0; +void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place) { + if (std::addressof(scope) == cached_scope_ && + std::addressof(place) == cached_place_) { + VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_ + << " are not changed"; + return; + } + VLOG(4) << "Captured env is updated, scope:" << cached_scope_ << "->" + << std::addressof(scope) << ", place:" << cached_place_ << "->" + << std::addressof(place); + cached_scope_ = std::addressof(scope); + cached_place_ = std::addressof(place); + cached_temp_scope_ = scope.NewTmpScope(); +} + +bool CinnLaunchContext::IsArgumentsInitialized() const { + if (hold_buffers_.empty() || name2argument_.empty()) { + return false; + } + return true; +} + +bool CinnLaunchContext::IsVariableUsed( + const std::string& paddle_var_name) const { + return paddle2cinn_varmap_.count(paddle_var_name) > 0 && + cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_var_name)) > + 0; } CinnTensor CinnLaunchContext::GetCinnTensor(const
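UpdateCapturedEnv above compares the scope and place by address rather than by value, so the buffer callbacks installed later can keep dereferencing cached_scope_/cached_place_ across runs, and re-capture happens only when the executor hands over different objects. A self-contained sketch of that pointer-identity caching (generic names):

#include <cstdio>

struct Scope {};
struct Place {};

struct CapturedEnv {
  const Scope* scope = nullptr;
  const Place* place = nullptr;

  // Returns true when derived state (temp scope, buffers) must be rebuilt.
  bool Update(const Scope& s, const Place& p) {
    if (&s == scope && &p == place) return false;  // same objects: keep caches
    scope = &s;
    place = &p;
    return true;
  }
};

int main() {
  Scope s;
  Place p;
  CapturedEnv env;
  std::printf("%d\n", env.Update(s, p));  // 1: first capture
  std::printf("%d\n", env.Update(s, p));  // 0: unchanged, nothing rebuilt
  return 0;
}

This choice is also why the cinn_launch_op test later in this diff introduces a named platform::Place lvalue (run_place) instead of passing a temporary: address-based caching needs the captured objects to stay alive and stable between runs.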
std::string& var_name) { @@ -53,99 +78,101 @@ std::unordered_set CinnLaunchContext::GetInternalVariableNames() { return all_parameters; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& paddle_var_name, const LoDTensor& paddle_tensor, + const CinnTensor& cinn_tensor) { // check dimension auto cinn_dims = framework::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( "Tensors' shape in variable(%s) are not equivalent, " "paddle's shape = [%s], but cinn's shape = [%s].", - paddle_name, paddle_tensor.dims(), cinn_dims)); + paddle_var_name, paddle_tensor.dims(), cinn_dims)); // TODO(CtfGo): check the underlying data type after CINN ready } -void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { - PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, - platform::errors::InvalidArgument( - "Paddle variable(%s) not used by cinn", paddle_name)); - - const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); - CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); +void CinnLaunchContext::AssignExternalVariable( + const std::string& paddle_var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(paddle_var_name), true, + platform::errors::InvalidArgument("Paddle variable(%s) not used by cinn", + paddle_var_name)); + + const auto& cinn_var_name = paddle2cinn_varmap_.at(paddle_var_name); + const auto& paddle_tensor = + cached_scope_->GetVar(paddle_var_name)->Get(); + CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name); + if (paddle_tensor.IsInitialized()) { + CheckTensorEquivalent(paddle_var_name, paddle_tensor, cinn_tensor); } - CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ false, - paddle_tensor); -} -void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { - PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn socpe.", cinn_name)); - CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); - } - CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ true, - paddle_tensor); -} + auto cinn_buffer = std::make_unique(); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + cinn_buffer->external_malloc = new std::function( + [this, paddle_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_scope_->GetVar(paddle_var_name)->GetMutable(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast( + tensor->mutable_data(*cached_place_)); + return 0; + }); -std::unique_ptr CinnLaunchContext::ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { - // convert paddle dimensions array to cinn format - std::vector 
cinn_dims(tensor->dims().size()); - for (auto i = 0; i < tensor->dims().size(); ++i) { - cinn_dims[i] = static_cast(tensor->dims().at(i)); - } + // external variables will be recycled by global gc, so do nothing here + cinn_buffer->external_free = new std::function( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); + return SetArgument(cinn_var_name, std::move(cinn_buffer)); +} + +void CinnLaunchContext::AssignInternalVariable( + const std::string& cinn_var_name) { + PADDLE_ENFORCE_GT( + cinn_variable_names_.count(cinn_var_name), 0, + platform::errors::InvalidArgument("Variable(%s) not found in cinn scope.", + cinn_var_name)); + CinnTensor cinn_tensor = GetCinnTensor(cinn_var_name); auto cinn_buffer = std::make_unique(); - // assign size and memory - cinn_buffer->resize(cinn_dims.data(), cinn_dims.size()); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); cinn_buffer->external_malloc = new std::function( - [place, tensor](void* ctx, cinn_buffer_t* buffer) { - buffer->memory = - reinterpret_cast(tensor->mutable_data(place)); + [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->Var(cinn_var_name)->GetMutable(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast( + tensor->mutable_data(*cached_place_)); return 0; }); - if (free_mem_callback) { - cinn_buffer->external_free = new std::function( - [tensor](void* ctx, cinn_buffer_t* buffer) { - tensor->clear(); - return 0; - }); - return cinn_buffer; - } - + // internal variables should release their buffers immediately + // once no instruction uses them cinn_buffer->external_free = new std::function( - [](void* ctx, cinn_buffer_t* buffer) { - // Do nothing + [this, cinn_var_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->GetVar(cinn_var_name)->GetMutable(); + tensor->clear(); return 0; }); - return cinn_buffer; + return SetArgument(cinn_var_name, std::move(cinn_buffer)); } -void CinnLaunchContext::SetArgument(const std::string& cinn_name, - const platform::Place& place, - bool free_mem_callback, - LoDTensor* paddle_tensor) { - auto buffer = - ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); - name2argument_.emplace(cinn_name, buffer.get()); +void CinnLaunchContext::SetArgument(const std::string& cinn_var_name, + std::unique_ptr&& buffer) { + VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" + << cinn_var_name << "), dims(" + << framework::DDim(buffer->dims, buffer->dimensions) << ")."; + + name2argument_.emplace(cinn_var_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() - << ")."; } const std::map& diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index c990255d68253..7b71d77d8b886 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -24,7 +24,7 @@ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace operators { @@ -40,16 +40,22 @@ class CinnLaunchContext { const std::unordered_map&
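Both Assign paths above now hand CINN a cinn_buffer_t whose external_malloc resolves the tensor lazily from the cached scope, and whose external_free differs: a no-op for external variables (the global GC owns them) versus tensor->clear() for internals. A self-contained imitation of that callback contract; FakeBuffer and the lambdas below are stand-ins for the CINN runtime types, not the real API:

#include <cstdio>
#include <functional>
#include <vector>

struct FakeBuffer {  // stand-in for cinn_buffer_t
  void* memory = nullptr;
  std::function<int(void*, FakeBuffer*)> external_malloc;
  std::function<int(void*, FakeBuffer*)> external_free;
};

int main() {
  std::vector<float> tensor;  // plays the role of the Paddle LoDTensor
  FakeBuffer buf;
  buf.external_malloc = [&](void* ctx, FakeBuffer* b) {
    tensor.resize(6);          // runtime calls this right before first use
    b->memory = tensor.data();
    return 0;
  };
  buf.external_free = [&](void* ctx, FakeBuffer* b) {
    tensor.clear();            // internal-variable flavor: release eagerly
    b->memory = nullptr;
    return 0;
  };
  buf.external_malloc(nullptr, &buf);
  std::printf("allocated: %p\n", buf.memory);
  buf.external_free(nullptr, &buf);
  return 0;
}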
paddle2cinn_varmap, const std::shared_ptr& cinn_scope); + // explicitly update several environment variables captured + // by callbacks of execution arguments + void UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place); + + // Return whether execution arguments have been initialized + bool IsArgumentsInitialized() const; + // Return whether a Paddle variable used on compiled kernels - bool IsVariableUsed(const std::string& var_name); + bool IsVariableUsed(const std::string& paddle_var_name) const; // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignExternalVariable(const std::string& paddle_var_name); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignInternalVariable(const std::string& cinn_var_name); // Extract internal variable names from CinnScope // by excluding used input and output variables @@ -58,10 +64,6 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map& FinalizeArguments() const; - std::vector> HandoverBuffers() { - return std::move(hold_buffers_); - } - private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -72,16 +74,15 @@ class CinnLaunchContext { const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor); - // Share the buffer of a Paddle tensor to CINN by delivering memory address - // to a cinn_buffer_t object - std::unique_ptr ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); - - // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, const platform::Place& place, - bool free_mem_callback, LoDTensor* paddle_tensor); + // Set an argument with (cinn name)->(cinn_buffer_t) pair + void SetArgument(const std::string& cinn_var_name, + std::unique_ptr&& buffer); private: + const framework::Scope* cached_scope_ = nullptr; + const platform::Place* cached_place_ = nullptr; + std::unique_ptr cached_temp_scope_ = nullptr; + // a variable name map from paddle to cinn const std::unordered_map& paddle2cinn_varmap_; // the variable scope of cinn diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index d922e8355b44c..da7640c3c0f68 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -45,81 +45,86 @@ std::unique_ptr CreateDefaultLaunchContext() { return std::make_unique(paddle2cinn_varmap, cinn_scope); } -TEST(CinnLaunchContextTest, TestIsVariableUsed) { +TEST(CinnLaunchContextTest, TestBasic) { auto launch_context = CreateDefaultLaunchContext(); - + // test IsVariableUsed ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); -} - -TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { - auto launch_context = CreateDefaultLaunchContext(); - auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 3); - EXPECT_NE(internal_variable_names.find("cinn_var2"), - internal_variable_names.end()); + // test UpdateCapturedEnv + platform::CPUPlace place; + framework::Scope scope; + ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); + //
test IsArgumentsInitialized + ASSERT_FALSE(launch_context->IsArgumentsInitialized()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1"), paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor4 = scope.Var("var4")->GetMutable(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4"), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW( - launch_context->AssignExternalVariable("cinn_var4", place, tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { + platform::CPUPlace cpu_place; + platform::Place place(cpu_place); + framework::Scope scope; auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); - platform::CPUPlace place; - framework::Scope scope; + // assign external variables auto* tensor1 = scope.Var("var1")->GetMutable(); float* data1 = tensor1->mutable_data(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - // assign external variable - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var1", place, tensor1)); - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(framework::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW( - launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); - // FinalizeArguments not missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable(); tensor3->mutable_data(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var3", place, tensor3)); + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); + + // FinalizeArguments missed check + ASSERT_THROW(launch_context->FinalizeArguments(), + paddle::platform::EnforceNotMet); + // test get internal variables + auto internal_variable_names = launch_context->GetInternalVariableNames(); + ASSERT_EQ(internal_variable_names.size(), 1); + EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); + auto* tensor2 = scope.Var("var2")->GetMutable(); + tensor2->mutable_data(framework::make_ddim({6, 7, 8}), place); + ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); + + // check argument is set correctly and alloc/free callbacks work well auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); ASSERT_EQ(name2argument.count("cinn_var1"), 1); - // check ShareTensorWithCinnBuffer + 
ASSERT_TRUE(launch_context->IsArgumentsInitialized()); + auto* cinn_buffer = static_cast(name2argument.at("cinn_var1")); - ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 813e7b1152f87..ea36a19202ef0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -31,26 +31,6 @@ namespace operators { namespace details { #ifdef PADDLE_WITH_CUDA -void CUDART_CB ReleaseScope(void* data) { - auto* temp_scope = static_cast(data); - delete temp_scope; -} - -void CUDART_CB ReleaseBuffers(void* data) { - auto* buffers = - static_cast>*>(data); - delete buffers; -} - -template <> -void ReleaseResource( - const std::vector& resources, void* stream) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast(stream), ReleaseBuffers, resources[1])); -} - template <> void* GetStream( const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 3a272916332be..170546ed23041 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS. void SetCinnRuntimeFlags(); -template -void ReleaseResource(const std::vector& resources, void* stream) { - auto* temp_scope = static_cast(resources[0]); - auto* buffers = - static_cast>*>(resources[1]); - delete temp_scope; - delete buffers; -} - template void* GetStream(const framework::ExecutionContext& ctx) { return nullptr; } #ifdef PADDLE_WITH_CUDA -template <> -void ReleaseResource( - const std::vector& resources, void* stream); - template <> void* GetStream( const framework::ExecutionContext& ctx); @@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel { compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto launch_context = std::make_unique( - cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope); - + auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_variable_names) { - if (!launch_context->IsVariableUsed(var_name)) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; - continue; + launch_context->UpdateCapturedEnv(scope, place); + if (!launch_context->IsArgumentsInitialized()) { + VLOG(4) << "CinnLaunchOp prepare arguments"; + + // 3.1 Prepare input variables: tensors of input variables have + // been initialized before the graph was compiled, so just check the + // equality between tensors of paddle and cinn.
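The deleted ReleaseResource specialization worked by enqueuing host callbacks with cudaLaunchHostFunc, which the driver runs only after the stream drains past them, so the temp scope and buffers outlived every kernel that referenced them. Now that the buffers and temporary scope are owned by the long-lived CinnLaunchContext, that machinery is no longer needed. For reference, a minimal form of the deferred-cleanup pattern being removed:

#include <cuda_runtime.h>

struct TempState { /* temp scope, buffers, ... */ };

// Host callback: the driver invokes it once the stream reaches this point.
void CUDART_CB ReleaseTempState(void* data) {
  delete static_cast<TempState*>(data);
}

void EnqueueCleanup(cudaStream_t stream, TempState* state) {
  // Deletion is deferred until every kernel queued earlier has finished.
  cudaLaunchHostFunc(stream, ReleaseTempState, state);
}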
+ for (const auto& var_name : input_variable_names) { + if (!launch_context->IsVariableUsed(var_name)) { + // some input variables are not needed by cinn because they are + // eliminated by optimization passes or some cinn operators use + // fewer variables + VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; + continue; + } + + launch_context->AssignExternalVariable(var_name); } - launch_context->AssignExternalVariable( - var_name, place, scope.GetVar(var_name)->GetMutable()); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - auto* tensor = scope.GetVar(var_name)->GetMutable(); - launch_context->AssignExternalVariable(var_name, place, tensor); - } + // 3.2 Prepare output variables: all output variables should + // be initialized and have buffers allocated before + // the runtime program starts execution, the compilation result + // includes details of their buffer assignment and we use that to + // allocate space in Paddle. For those variables already allocated, + // like persistable parameters, just check the equality between + // Paddle allocation and CINN buffer assignment. + auto output_variable_names = ctx.OutputNames(kOutputs); + for (const auto var_name : output_variable_names) { + PADDLE_ENFORCE_EQ( + launch_context->IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Output variable(%s) not used by cinn", var_name)); + + launch_context->AssignExternalVariable(var_name); + } - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->GetInternalVariableNames(); - framework::Scope* temp_scope = scope.NewTmpScope().release(); - for (const auto& var_name : internal_variable_names) { - auto* tensor = temp_scope->Var(var_name)->GetMutable(); - launch_context->AssignInternalVariable(var_name, place, tensor); + // 3.3 Prepare internal or temporary variables: Create a temporary + // scope to keep internal variables within the graph or temporary + // variables needed by the compiled runtime program in addition. + // Here we directly use the names from CinnScope as Paddle variable + // names, because they will not be used outside the graph + // and should be destructed after the computation finishes. + auto internal_variable_names = launch_context->GetInternalVariableNames(); + for (const auto& var_name : internal_variable_names) { + launch_context->AssignInternalVariable(var_name); + } } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic.
@@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel { VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); VLOG(4) << "CinnLaunchOp launch execution done."; - - // Step 6. Release some resources, such as `temp_scope` and cinn_buffers. - auto* buffers_holder = new std::vector>{ - launch_context->HandoverBuffers()}; - details::ReleaseResource({temp_scope, buffers_holder}, - stream); } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 02373c38184fc..e10fdf522ff7c 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { scope.Var(test_out_name)->GetMutable(); scope.Var(expected_out_name)->GetMutable(); - cinn_launch_op->Run(scope, place); - elementwise_add_op->Run(scope, place); + platform::Place run_place(place); + cinn_launch_op->Run(scope, run_place); + elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; TensorCopySync(scope.Var(test_out_name)->Get(), diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc new file mode 100644 index 0000000000000..2fb21ca4ea753 --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/complex_view_op.h" + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +class AsComplexOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_complex"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_complex"); + + auto in_dims = ctx->GetInputDim("X"); + const int input_rank = in_dims.size(); + PADDLE_ENFORCE_GE( + input_rank, 1, + platform::errors::InvalidArgument( + "The rank of input(X) is less than 1. " + "Expected the rank of input(X) to be equal to or greater than 1. " + "But received rank of input(X) = %d", + input_rank)); + const int last_dim_size = in_dims[input_rank - 1]; + PADDLE_ENFORCE_EQ( + last_dim_size, 2, + platform::errors::InvalidArgument( + "The size of the last dimension of input(X) " + "does not equal 2. " + "Expected the size of the last dimension of input(X) to be 2. "
+ "But received %d", + last_dim_size)); + + const framework::DDim out_dims(in_dims.Get(), input_rank - 1); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AsComplexOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of view_as_complex op."); + AddOutput("Out", "(Tensor), The output tensor of view_as_complex op."); + AddComment(R"DOC( +As_complex Operator. + +This operator is used to return a complex tensor represented +by an old-fashioned real tensor. The size of the last dimension of +the input tensor should be 2, which corresponds to 'real' and +'complex', respectively. + +)DOC"); + } +}; + +template +class AsComplexGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("as_real"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput("Out", this->InputGrad("X")); + } +}; + +class AsRealOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real"); + + auto out_dims_v = framework::vectorize(ctx->GetInputDim("X")); + out_dims_v.push_back(2); + const framework::DDim out_dims = framework::make_ddim(out_dims_v); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(framework::ToRealType(input_data_type), + ctx.GetPlace()); + } +}; + +class AsRealOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of as_real op."); + AddOutput("Out", "(Tensor), The output tensor of as_real op."); + AddComment(R"DOC( +AsReal Operator. + +This operator is used to return an old-fashioned real tensor from a +complex tensor. The size of the last dimension of the output tensor is 2, +which corresponds to 'real' and 'complex', respectively. 
+ +)DOC"); + } +}; + +template +class AsRealGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("as_complex"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput("Out", this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(as_complex, ops::AsComplexOp, ops::AsComplexOpMaker, + ops::AsComplexGradMaker, + ops::AsComplexGradMaker); + +REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker, + ops::AsRealGradMaker, + ops::AsRealGradMaker); + +REGISTER_OP_CPU_KERNEL( + as_complex, ops::AsComplexKernel, + ops::AsComplexKernel); + +REGISTER_OP_CPU_KERNEL( + as_real, ops::AsRealKernel, + ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu new file mode 100644 index 0000000000000..261881cb8d256 --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/complex_view_op.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/enforce.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + as_complex, + ops::AsComplexKernel, + ops::AsComplexKernel); + +REGISTER_OP_CUDA_KERNEL( + as_real, ops::AsRealKernel, + ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h new file mode 100644 index 0000000000000..9a8d89db40208 --- /dev/null +++ b/paddle/fluid/operators/complex_view_op.h @@ -0,0 +1,60 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +class AsComplexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + auto* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + // TensorCopy also changes output's shape & dtype + const framework::DDim out_dims_original = out->dims(); + framework::TensorCopy(*x, context.GetPlace(), out); + out->Resize(out_dims_original); // restore the shape + out->mutable_data>( + context.GetPlace()); // restore the dtype + } +}; + +template +class AsRealKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + const framework::DDim out_dims_original = out->dims(); + framework::TensorCopy(*x, context.GetPlace(), out); + out->Resize(out_dims_original); // restore the shape + out->mutable_data(context.GetPlace()); // restore the dtype + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index ad5a55aede751..2cdacfcf105af 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,8 +31,8 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/functions/cpu/elementwise.h" -#include "paddle/pten/kernels/functions/general/elementwise_base.h" +#include "paddle/pten/kernels/hybird/cpu/elementwise.h" +#include "paddle/pten/kernels/hybird/general/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 531a6ad224c6c..12fdcd40aa0b1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -23,7 +23,7 @@ limitations under the License.
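The kernels above can implement as_complex/as_real as a plain TensorCopy plus a Resize (and a dtype restore) because the two views share one byte layout: a float tensor of shape [..., 2] and a complex<float> tensor of shape [...] are bit-identical. A standalone demonstration of that equivalence:

#include <complex>
#include <cstdio>
#include <cstring>

int main() {
  // shape [3, 2] floats: each row is a (real, imaginary) pair
  float interleaved[3][2] = {{1.f, 2.f}, {3.f, 4.f}, {5.f, 6.f}};
  // shape [3] complex<float>: the same 24 bytes, reinterpreted
  std::complex<float> c[3];
  static_assert(sizeof(c) == sizeof(interleaved), "layouts must match");
  std::memcpy(c, interleaved, sizeof(interleaved));
  std::printf("%g+%gi\n", c[1].real(), c[1].imag());  // prints 3+4i
  return 0;
}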
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu index 26b3d11bc6c7b..2391d4b907a60 100644 --- a/paddle/fluid/operators/flip_op.cu +++ b/paddle/fluid/operators/flip_op.cu @@ -24,24 +24,6 @@ namespace operators { using Tensor = framework::Tensor; using CUDADeviceContext = paddle::platform::CUDADeviceContext; -template -__global__ void kernel_pointwise_flip_apply(const int N, const T* in_data, - T* out_data, int dim0, int stride0, - int dim1, int flip_dim) { - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; - idx += gridDim.x * blockDim.x) { - int dst_offset = 0; - if (flip_dim == 0) { - // flip 1st dim - dst_offset = (dim0 - 1 - idx / stride0) * stride0 + idx % stride0; - } else { - // flip last dim - dst_offset = idx / stride0 * stride0 + (dim1 - 1 - idx % stride0); - } - out_data[dst_offset] = in_data[idx]; - } -} - template __global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data, int64_t* x_shape, int64_t* x_stride, @@ -103,29 +85,6 @@ class FlipKernel std::vector x_dims_v = framework::vectorize(x_dims); std::vector x_stride_v = framework::vectorize(x_stride); - // wrap high-dims to 2-dims - if (flip_dims_size == 1 && - (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) { - int dim0 = 1, dim1 = 1; - int stride0 = 1; - if (flip_dims[0] == 0) { - dim0 = x_dims_v[0]; - stride0 = x_stride_v[0]; - for (size_t i = 1; i < total_dims; ++i) { - dim1 *= x_dims_v[i]; - } - } else { - dim1 = x_dims_v[total_dims - 1]; - for (size_t i = 0; i < total_dims - 1; ++i) { - dim0 *= x_dims_v[i]; - } - stride0 *= x_dims_v[total_dims - 1]; - } - kernel_pointwise_flip_apply< - T><<>>( - N, in_data, out_data, dim0, stride0, dim1, flip_dims[0]); - } - int bytes = total_dims * sizeof(int64_t); auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes); int64_t* x_strides_array_gpu = diff --git a/paddle/fluid/operators/ipu_runtime_op.cc b/paddle/fluid/operators/ipu_runtime_op.cc new file mode 100644 index 0000000000000..4b473da00f331 --- /dev/null +++ b/paddle/fluid/operators/ipu_runtime_op.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/ipu_runtime_op.h" + +namespace paddle { +namespace operators { + +class IpuRuntimeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Graph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::VarType::FP32); + AddComment(R"DOC( +Run graph by PopART runtime. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); + +REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel); diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu_runtime_op.h new file mode 100644 index 0000000000000..b6fc9ae98895d --- /dev/null +++ b/paddle/fluid/operators/ipu_runtime_op.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/framework/ipu/ipu_backend.h" +#include "paddle/fluid/framework/tensor.h" +#endif + +namespace paddle { +namespace operators { + +template +class IpuRuntimeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef PADDLE_WITH_IPU + auto ipu_backend = framework::ipu::IpuBackend::GetInstance(); + if (!ipu_backend->DeviceIsAttached()) { + const platform::IPUDeviceContext& ipu_ctx = + reinterpret_cast( + ctx.device_context()); + ipu_backend->AttachDevice(ipu_ctx.DeviceId()); + } + + auto inputs = ctx.MultiInput("FeedList"); + auto outputs = ctx.MultiOutput("FetchList"); + auto output_names = ctx.OutputNames("FetchList"); + VLOG(4) << "IpuRuntime Kernel, begin to run graph"; + ipu_backend->Run(inputs, outputs, ctx); + + // post-run + // resize tensor when tensor.dims() is empty + for (size_t i = 0; i < outputs.size(); ++i) { + auto* out = outputs[i]; + if (out->dims().size() == 0) { + auto tensor_dtype = out->type(); + auto sizeof_dtype = framework::SizeOfType(tensor_dtype); + int64_t dim = out->memory_size() / sizeof_dtype; + out->Resize({dim}); + VLOG(10) << "set ipu_runtime_op output: " << output_names[i] + << " dims from () to: " + << "(" << dim << ")"; + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Please compile WITH_IPU option to enable ipu_runtime op")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index cd919b18b83c8..daa4efa02ac50 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -173,6 +173,13 @@ void set_constant_with_place( platform::errors::Unimplemented("NPUPinnedPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 144db9f5a2bb4..fa214c6389c51 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index d80faab90b223..6625a4a1a753c 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -54,7 +54,7 @@ class PyLayerOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + auto data_type = paddle::framework::proto::VarType::Type::VarType_Type_FP32; return framework::OpKernelType(data_type, ctx.device_context()); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index df27083edbed5..76a14f6b1b254 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/include/core.h" #include "paddle/pten/include/math.h" -#include "paddle/pten/kernels/functions/general/reduce_impl.h" +#include "paddle/pten/kernels/hybird/general/reduce_impl.h" #if defined(__HIPCC__) || defined(__NVCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ed06fac298a8f..155eb1ebbe3db 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp { const framework::ExecutionContext &ctx) const override { auto multi_inputs = ctx.MultiInput("ShapeTensor"); if (multi_inputs.size() > 0) { - return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"}, + return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"}, {}, {"Out"}); } else if (ctx.HasInput("Shape")) { - return framework::KernelSignature("reshape.host", {"X", "Shape"}, {}, + return framework::KernelSignature("reshape_host", {"X", "Shape"}, {}, {"Out"}); } else { return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h index 533488896dfcd..0c10152c23b2a 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h @@ -222,15 +222,27 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, idx_max_v[i] = idx_max / kVSize; } - // read data from global memory + // data src AccT srcdata[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); T src_tmp[kBatchSize][kLoopsV][kVSize]; + kps::Init(&srcdata[0][0][0], kLowInf); kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + + // data dst + T out_tmp[kBatchSize][kLoopsV][kVSize]; + + // max value + AccT max[kBatchSize]; + kps::Init(&max[0], kLowInf); + + // sum value + AccT sum[kBatchSize] = {0}; + +// read data from global memory #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - int ptr = (first_batch + i) * stride; - const VecT* src_v = reinterpret_cast(&src[ptr]); + const VecT* src_v = + reinterpret_cast(&src[(first_batch + i) * stride]); VecT* reg_v = 
reinterpret_cast(&src_tmp[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); @@ -239,15 +251,12 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, } // compute max - AccT max[kBatchSize]; - kps::Init(&max[0], kLowInf); kps::Reduce, kMode::kLocalMode>(&max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum - AccT sum[kBatchSize] = {0}; for (int i = 0; i < kBatchSize; ++i) { kps::ElementwiseUnary>( &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); @@ -257,15 +266,14 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, kps::AddFunctor(), true); WarpReduceSum(sum); - // write result to global memory - T out_tmp[kBatchSize][kLoopsV][kVSize]; +// write data to global memory #pragma unroll for (int i = 0; i < kBatchSize; ++i) { + VecT* softmax_v = + reinterpret_cast(&softmax[(first_batch + i) * stride]); + VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); kps::ElementwiseUnary>( &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); - int softmax_ptr = (first_batch + i) * stride; - VecT* softmax_v = reinterpret_cast(&softmax[softmax_ptr]); - VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc index 3970acf82d3ea..8ee3b118c32f2 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @@ -18,6 +18,7 @@ namespace paddle { namespace platform { std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; +paddle::optional CUDAGraph::capturing_thread_id_{paddle::none}; void CUDAGraph::Reset() { if (is_reset_) return; @@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() { IsCapturing(), true, errors::PermissionDenied("BeginSegmentCapture should be called when CUDA " "Graph is capturing.")); + if (IsThreadLocalCapturing()) { + PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true, + platform::errors::PermissionDenied( + "When capturing CUDA Graph in the thread local mode, " + "you cannot begin segmented capturing in the thread " + "which is not the one that starts the capturing.")); + } PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture( capturing_graph_->stream_, capturing_graph_->capture_mode_)); PADDLE_ENFORCE_EQ(IsValidCapturing(), true, @@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, capturing_graph_->place_ = place; capturing_graph_->stream_ = stream; capturing_graph_->capture_mode_ = mode; + if (mode == cudaStreamCaptureModeThreadLocal) { + capturing_thread_id_ = std::this_thread::get_id(); + VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: " + << capturing_thread_id_; + } BeginSegmentCapture(); #endif } @@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() { std::unique_ptr CUDAGraph::EndCapture() { EndSegmentCapture(); + capturing_thread_id_ = paddle::none; return std::move(capturing_graph_); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h index 0856e0fad1900..ca1e7abb375cb 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT @@ -26,6 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include 
"paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" +#include "paddle/utils/optional.h" namespace paddle { namespace platform { @@ -99,6 +101,25 @@ class CUDAGraph { // supported during capturing CUDA Graph. static bool IsValidCapturing(); + static bool IsThreadLocalCapturing() { +#if CUDA_VERSION >= 10010 + return IsCapturing() && + capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal; +#else + return false; +#endif + } + + static bool IsThisThreadCapturing() { + if (UNLIKELY(IsCapturing())) { + return IsThreadLocalCapturing() + ? capturing_thread_id_.get() == std::this_thread::get_id() + : true; + } else { + return false; + } + } + private: static CUDAGraphID UniqueID() { static std::atomic id; @@ -118,6 +139,7 @@ class CUDAGraph { bool is_reset_{false}; std::mutex mtx_; + static paddle::optional capturing_thread_id_; static std::unique_ptr capturing_graph_; }; diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index d443e78ed874f..3e070da546b2a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exsits in lower or higher 16bits // of the 32bits address. @@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif +#endif CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 25629ba74d915..9be12cbf6d437 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,5 +1,5 @@ -# IPU IF(WITH_IPU) + FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) cc_library(ipu_device SRCS device.cc DEPS enforce popart) cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc new file mode 100644 index 0000000000000..5793c4c0e3ca6 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) { + auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)}, + node->outputs); + return new_node; +} + +Node *relu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_relu"); +} + +Node *tanh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tanh"); +} + +Node *log_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_log"); +} + +Node *sigmoid_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sigmoid"); +} + +Node *sqrt_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sqrt"); +} + +Node *gelu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_gelu_v2"); +} + +Node *log_softmax_handler(Graph *graph, Node *node) { + auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis")); + auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis); + return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs, + node->outputs); +} + +REGISTER_HANDLER(relu, relu_handler); +REGISTER_HANDLER(tanh, tanh_handler); +REGISTER_HANDLER(log, log_handler); +REGISTER_HANDLER(sigmoid, sigmoid_handler); +REGISTER_HANDLER(sqrt, sqrt_handler); +REGISTER_HANDLER(gelu, gelu_handler); +REGISTER_HANDLER(log_softmax, log_softmax_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc new file mode 100644 index 0000000000000..d46fc55ec6ce0 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
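The activation handlers above register themselves through REGISTER_HANDLER; a sketch of how a lowering pass would dispatch through the registry defined in this file. The SymbolHandler signature Node *(Graph *, Node *) is inferred from the handler definitions, and the calling pass itself lives elsewhere in this PR:

Node *CanonicalizeNode(Graph *graph, Node *node) {
  auto handler = GetHandler(node->Op()->Type());
  if (handler) {
    return handler(graph, node);  // e.g. rewrites a `relu` node into `popart_relu`
  }
  return nullptr;  // no popart lowering registered for this op type
}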
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// This avoids the static initialisation order fiasco, +std::unordered_map &SymbolHandlers() { + static std::unordered_map symbol_handlers; + return symbol_handlers; +} + +bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { + if (SymbolHandlers().count(symbol) != 0) { + LOG(WARNING) << "Trying to register popart handler twice for operator: " + << symbol; + return false; + } + bool new_handler = SymbolHandlers().emplace(symbol, handler).second; + return new_handler; +} + +// Return a pointer to a handler if one is registered for this kind of node or +// an empty std::function otherwise. +SymbolHandler GetHandler(const std::string &kind) { + auto it = SymbolHandlers().find(kind); + if (it != SymbolHandlers().end()) { + return it->second; + } + return {}; +} + +void ConnectNodes(Node *first_node, Node *next_node) { + first_node->outputs.push_back(next_node); + next_node->inputs.push_back(first_node); +} + +void DisConnectNodes(Node *first_node, Node *next_node) { + auto rm_by_value = [&](std::vector &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + rm_by_value(first_node->outputs, next_node); + rm_by_value(next_node->inputs, first_node); + rm_by_value(first_node->inputs, next_node); + rm_by_value(next_node->outputs, first_node); +} + +void ClearNode(Node *node) { + auto rm_by_value = [&](std::vector &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + for (auto *node_in : node->inputs) { + rm_by_value(node_in->outputs, node); + } + for (auto *node_out : node->outputs) { + rm_by_value(node_out->inputs, node); + } +} + +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override) { + if (new_op->HasAttr(attr_name) && !override) { + return; + } + if (op->HasAttr(attr_name)) { + VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type() + << " to " << new_op->Type(); + new_op->SetAttr(attr_name, op->GetAttr(attr_name)); + new_op->Flush(); + } +} + +const int VarType2OnnxDtype(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case framework::proto::VarType::BOOL: + return static_cast(ONNXDataType::BOOL); + case framework::proto::VarType::INT16: + return static_cast(ONNXDataType::INT16); + case framework::proto::VarType::INT32: + return static_cast(ONNXDataType::INT32); + case framework::proto::VarType::INT64: + return static_cast(ONNXDataType::INT64); + case framework::proto::VarType::FP16: + return static_cast(ONNXDataType::FLOAT16); + case framework::proto::VarType::FP32: + return static_cast(ONNXDataType::FLOAT); + case framework::proto::VarType::FP64: + return static_cast(ONNXDataType::DOUBLE); + case framework::proto::VarType::UINT8: + return static_cast(ONNXDataType::UINT8); + case framework::proto::VarType::INT8: + return static_cast(ONNXDataType::INT8); + case framework::proto::VarType::BF16: + return static_cast(ONNXDataType::BFLOAT16); + case framework::proto::VarType::COMPLEX64: + return static_cast(ONNXDataType::COMPLEX64); + case framework::proto::VarType::COMPLEX128: + return static_cast(ONNXDataType::COMPLEX128); + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); + } +} + +const std::string VarType2PopStr(const int type) { + auto dtype = static_cast(type); + switch (dtype) { + case framework::proto::VarType::UINT8: + 
return "UINT8"; + case framework::proto::VarType::INT8: + return "INT8"; + case framework::proto::VarType::INT16: + return "INT16"; + case framework::proto::VarType::INT32: + return "INT32"; + case framework::proto::VarType::INT64: + return "INT64"; + case framework::proto::VarType::BOOL: + return "BOOL"; + case framework::proto::VarType::FP64: + return "DOUBLE"; + case framework::proto::VarType::FP32: + return "FLOAT"; + case framework::proto::VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW( + paddle::platform::errors::Unavailable("Unsupported data type.")); + } +} + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Input(input_name).at(id); + return GetInputVarNodeByVarName(var_name, op_node); +} + +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Output(output_name).at(id); + return GetOutputVarNodeByVarName(var_name, op_node); +} + +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->inputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->outputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +const bool is_float_equal(float a, float b, float eps) { + return std::fabs(a - b) <= eps; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h new file mode 100644 index 0000000000000..c1b2bd0c8b5fd --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::OpDesc; + +#define REGISTER_HANDLER(name, func) \ + static bool __UNUSED_##name = \ + paddle::platform::ipu::RegisterHandler(#name, func) + +using SymbolHandler = std::function; + +std::unordered_map &SymbolHandlers(); + +bool RegisterHandler(const std::string &, const SymbolHandler &); + +SymbolHandler GetHandler(const std::string &); + +void ConnectNodes(Node *first_node, Node *next_node); +void DisConnectNodes(Node *first_node, Node *next_node); +void ClearNode(Node *node); +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override = false); + +const int VarType2OnnxDtype(const int type); +const std::string VarType2PopStr(const int type); + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id = 0); +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id = 0); +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node); +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node); + +const bool is_float_equal(float a, float b, float eps = 1e-8); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc new file mode 100644 index 0000000000000..f0c19cac3a6c3 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
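To make the registration mechanism concrete, this is roughly what one registration line expands to under the REGISTER_HANDLER macro above:

// REGISTER_HANDLER(relu, relu_handler); expands, approximately, to:
static bool __UNUSED_relu =
    paddle::platform::ipu::RegisterHandler("relu", relu_handler);
// Each translation unit thus populates SymbolHandlers() during static
// initialization; a duplicate symbol only logs a warning and returns false.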
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *elementwise_op_handler(Graph *graph, Node *node, + const std::string &type) { + auto *op = node->Op(); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + int64_t x_rank = x_shape.size(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + int64_t y_rank = y_shape.size(); + + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) { + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs); + return new_node; + } else { + auto y_new_shape = std::vector(x_rank, 1); + for (int i = axis; i < axis + y_rank; ++i) { + y_new_shape[i] = y_shape[i - axis]; + } + auto attrs = AttributeMap{ + {"value", y_new_shape}, + {"dims", std::vector{x_rank}}, + {"dtype", ONNXDataType::INT64}, + }; + // constant + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + // reshape + auto new_node_reshape = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("Y", node), new_node_const->outputs[0]}, {}); + // elementwise_op + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), new_node_reshape->outputs[0]}, + node->outputs); + return new_node; + } +} + +Node *elementwise_add_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_add"); +} + +Node *elementwise_sub_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_sub"); +} + +Node *elementwise_div_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_div"); +} + +Node *elementwise_mul_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mul"); +} + +Node *elementwise_min_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_min"); +} + +Node *elementwise_max_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_max"); +} + +Node *elementwise_pow_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_pow"); +} + +Node *elementwise_mod_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mod"); +} + +REGISTER_HANDLER(elementwise_add, elementwise_add_handler); +REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler); +REGISTER_HANDLER(elementwise_div, elementwise_div_handler); +REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler); +REGISTER_HANDLER(elementwise_min, elementwise_min_handler); +REGISTER_HANDLER(elementwise_max, elementwise_max_handler); +REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler); +REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc new file mode 100644 index 0000000000000..92362ebf5be7d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *equal_handler(Graph *graph, Node *node) { + auto new_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs); + return new_node; +} + +REGISTER_HANDLER(equal, equal_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc new file mode 100644 index 0000000000000..af7e4d0c7dbe9 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
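A worked example of the axis-broadcast rule that elementwise_op_handler in elementwise_ops.cc implements: Y's dims are embedded into a rank(X) shape starting at `axis`, and Y is reshaped to that shape before the binary popart op. The helper below is a self-contained restatement of the rule, not code from this patch:

#include <cstdint>
#include <vector>

std::vector<int64_t> BroadcastShapeForY(const std::vector<int64_t> &x_shape,
                                        const std::vector<int64_t> &y_shape,
                                        int axis) {
  std::vector<int64_t> y_new_shape(x_shape.size(), 1);
  for (size_t i = 0; i < y_shape.size(); ++i) {
    y_new_shape[axis + i] = y_shape[i];
  }
  return y_new_shape;
}
// BroadcastShapeForY({2, 3, 4, 5}, {3, 4}, 1) returns {1, 3, 4, 1}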
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *mean_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_reducemean", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, + { + {"keepdims", int64_t{0}}, + }); +} + +Node *pow_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, + node->outputs); + } else { + // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) + auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); + auto attrs = + MakeConstAttrMapFromValue(value_, {1}, ONNXDataType::FLOAT); + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), + new_node_const->outputs[0]}, + node->outputs); + } +} + +Node *mul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims")); + auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims")); + auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape(); + + // build the shape for reshape + std::vector reshape_shape_{}; + for (int left = 0; left < x_num_col_dims; left++) { + reshape_shape_.push_back(int64_t(x_shape_[left])); + } + for (int right = y_num_col_dims; right < y_shape_.size(); right++) { + reshape_shape_.push_back(int64_t(y_shape_[right])); + } + auto x_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)}, + {}, {{"axis", int64_t(x_num_col_dims)}}); + auto y_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)}, + {}, {{"axis", int64_t(y_num_col_dims)}}); + auto matmul = + CreateBaseOp(graph, node, "popart_matmul", + {x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {}); + + auto reshape_const = CreateConst( + graph, node, {}, {}, + {{"value", reshape_shape_}, + {"dims", std::vector{int64_t(reshape_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + return CreateBaseOp(graph, node, "popart_reshape", + {matmul->outputs[0], reshape_const->outputs[0]}, + node->outputs, {}); +} + +Node *matmul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); + auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); + auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + + int x_rank = x_shape.size(); + std::vector perm; + if (x_rank == 1) { + perm = std::vector{0}; + } else if (x_rank == 2) { + return CreateGemm(graph, node, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs, transpose_x, transpose_y, alpha); + } else if (x_rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (x_rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", x_rank)); + } + + Node *x_node = GetInputVarNode("X", node); + Node 
*y_node = GetInputVarNode("Y", node);
+  if (transpose_x) {
+    x_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("X", node)}, {}, {{"perm", perm}});
+    x_node = x_node->outputs[0];
+  }
+  if (transpose_y) {
+    y_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
+    y_node = y_node->outputs[0];
+  }
+  if (!is_float_equal(alpha, 1.0)) {
+    auto o_node =
+        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
+    auto attr = MakeConstAttrMapFromValue<float>(alpha, {1}, ONNXDataType::FLOAT);
+    auto const_node = CreateConst(graph, node, {}, {}, attr);
+    return CreateBaseOp(graph, node, "popart_mul",
+                        {o_node->outputs[0], const_node->outputs[0]},
+                        node->outputs);
+  } else {
+    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
+                        node->outputs);
+  }
+}
+
+Node *sum_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
+}
+
+Node *softmax_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
+  return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
+}
+
+Node *scale_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
+  auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
+  auto bias_after_scale_ =
+      BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
+  auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
+
+  auto new_node_bias_var =
+      CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{bias_}},
+                                        {"dims", std::vector<int64_t>{1}},
+                                        {"dtype", ONNXDataType::FLOAT}});
+  new_node_bias_var = new_node_bias_var->outputs[0];
+
+  Node *new_node_scale_var = nullptr;
+  if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
+    new_node_scale_var = GetInputVarNode("ScaleTensor", node);
+  } else {
+    new_node_scale_var =
+        CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{scale_}},
+                                          {"dims", std::vector<int64_t>{1}},
+                                          {"dtype", ONNXDataType::FLOAT}});
+    new_node_scale_var = new_node_scale_var->outputs[0];
+  }
+
+  // convert to float32
+  auto new_node_cast =
+      CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
+                 static_cast<int>(framework::proto::VarType::FP32));
+  Node *result = nullptr;
+  if (bias_after_scale_) {
+    auto new_node_mul =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_cast->outputs[0], new_node_scale_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_mul->outputs[0], new_node_bias_var}, {}, {});
+  } else {
+    auto new_node_add =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_cast->outputs[0], new_node_bias_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_add->outputs[0], new_node_scale_var}, {}, {});
+  }
+  auto result_after_cast =
+      CreateCast(graph, node, result->outputs, node->outputs,
+                 static_cast<int>(data_type_));
+  return result_after_cast;
+}
+
+Node *cross_entropy2_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
+  auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
+                             framework::proto::VarType::INT32);
+  auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
+  if (label_shape_.size() == 1) {
+    return CreateBaseOp(graph, node, "popart_nllloss",
+                        {GetInputVarNode("X", node), new_cast->outputs[0]},
+                        {GetOutputVarNode("Y", node)},
+                        {
+                            {"ignoreIndex",
ignoreIndex}, + }); + } else { + std::vector new_shape_{label_shape_[0]}; + auto const_before_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_before_loss = CreateBaseOp( + graph, node, "popart_reshape", + {new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {}); + + auto nllloss = CreateBaseOp( + graph, node, "popart_nllloss", + {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {}, + { + {"ignoreIndex", ignoreIndex}, + }); + + auto const_after_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_after_loss = + CreateBaseOp(graph, node, "popart_reshape", + {nllloss->outputs[0], const_after_loss->outputs[0]}, + {GetOutputVarNode("Y", node)}, {}); + return reshape_after_loss; + } +} + +REGISTER_HANDLER(mean, mean_handler); +REGISTER_HANDLER(pow, pow_handler); +REGISTER_HANDLER(mul, mul_handler); +REGISTER_HANDLER(matmul, matmul_handler); +REGISTER_HANDLER(sum, sum_handler); +REGISTER_HANDLER(softmax, softmax_handler); +REGISTER_HANDLER(scale, scale_handler); +REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc new file mode 100644 index 0000000000000..58f3e42b7387a --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -0,0 +1,301 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+namespace {
+
+Node *conv2d_handler(Graph *graph, Node *node) {
+  OpDesc *op = node->Op();
+  auto dilations_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("dilations"));
+  auto dilations = std::vector<int64_t>{dilations_.begin(), dilations_.end()};
+  auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
+  auto pads_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
+  if (pads_.size() == 2) {
+    pads_.push_back(pads_[0]);
+    pads_.push_back(pads_[1]);
+  }
+  auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
+  auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
+  auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
+  if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
+    return CreateConv(
+        graph, node,
+        {
+            GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
+            GetInputVarNode("Bias", node),
+        },
+        node->outputs, dilations, group_, {}, pads, stride);
+  } else {
+    return CreateConv(
+        graph, node,
+        {
+            GetInputVarNode("Input", node), GetInputVarNode("Filter", node),
+        },
+        node->outputs, dilations, group_, {}, pads, stride);
+  }
+}
+
+Node *batch_norm_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  std::vector<Node *> inputs;
+  inputs.push_back(GetInputVarNode("X", node));
+  inputs.push_back(GetInputVarNode("Scale", node));
+  inputs.push_back(GetInputVarNode("Bias", node));
+  inputs.push_back(GetInputVarNode("Mean", node));
+  inputs.push_back(GetInputVarNode("Variance", node));
+  int64_t num_outputs = 1;
+  std::vector<Node *> outputs;
+  auto is_test_type = op->GetAttrType("is_test");
+  bool is_test;
+  if (is_test_type == 0) {
+    // int
+    is_test = BOOST_GET_CONST(int, op->GetAttr("is_test"));
+  } else {
+    // bool
+    is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
+  }
+  outputs.push_back(GetOutputVarNode("Y", node));
+  if (!is_test) {
+    outputs.push_back(GetOutputVarNode("MeanOut", node));
+    outputs.push_back(GetOutputVarNode("VarianceOut", node));
+    outputs.push_back(GetOutputVarNode("SavedMean", node));
+    outputs.push_back(GetOutputVarNode("SavedVariance", node));
+    num_outputs = 5;
+  }
+  // outputs.push_back(GetOutputVarNode("ReserveSpace", node));
+  auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
+  auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
+  // data_layout
+  return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs,
+                      {
+                          {"momentum", momentum},
+                          {"epsilon", epsilon},
+                          {"num_outputs", num_outputs},
+                      });
+}
+
+Node *pool2d_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type"));
+  auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling"));
+  if (global_pooling) {
+    if (pooling_type == "max") {
+      return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs,
+                          node->outputs);
+    } else if (pooling_type == "avg") {
+      return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs,
+                          node->outputs);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "op pool2d with unknown pooling_type: %s", pooling_type));
+    }
+  }
+  if (op->HasAttr("padding_algorithm")) {
+    auto padding_algorithm =
+        BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm"));
+    if (padding_algorithm != "EXPLICIT") {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "op pool2d with unknown padding_algorithm: %s", padding_algorithm));
+    }
+  }
+
+  auto ksize = BOOST_GET_CONST(std::vector<int>, op->GetAttr("ksize"));
+  auto kernel_shape = std::vector<int64_t>{ksize.begin(), ksize.end()};
+  auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode"));
+  auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0);
+  auto paddings = BOOST_GET_CONST(std::vector<int>, op->GetAttr("paddings"));
+  auto pads = std::vector<int64_t>{paddings.begin(), paddings.end()};
+  if (pads.size() == 2) {
+    pads.push_back(paddings[0]);
+    pads.push_back(paddings[1]);
+  }
+  auto strides_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
+  auto strides = std::vector<int64_t>{strides_.begin(), strides_.end()};
+  if (pooling_type == "max") {
+    int64_t num_outputs = 1;
+    auto dilations = std::vector<int64_t>{};
+    int64_t storage_order = 0;
+    return CreateBaseOp(graph, node, "popart_maxpool", node->inputs,
+                        node->outputs,
+                        {
+                            {"num_outputs", num_outputs},
+                            {"kernel_shape", kernel_shape},
+                            {"ceil_mode", ceil_mode},
+                            {"dilations", dilations},
+                            {"pads", pads},
+                            {"storage_order", storage_order},
+                            {"strides", strides},
+                        });
+  } else if (pooling_type == "avg") {
+    int64_t count_include_pad = 0;
+    return CreateBaseOp(graph, node, "popart_averagepool", node->inputs,
+                        node->outputs,
+                        {
+                            {"kernel_shape", kernel_shape},
+                            {"ceil_mode", ceil_mode},
+                            {"count_include_pad", count_include_pad},
+                            {"pads", pads},
+                            {"strides", strides},
+                        });
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "op pool2d with unknown pooling_type: %s", pooling_type));
+  }
+}
+
+Node *group_norm_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
+  auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups"));
+  auto groups = int64_t{groups_};
+  auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}};
+
+  std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
+                                 GetInputVarNode("Scale", node),
+                                 GetInputVarNode("Bias", node)};
+  std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node),
+                                  GetOutputVarNode("Mean", node),
+                                  GetOutputVarNode("Variance", node)};
+  return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_,
+                      outputs_, attrs_);
+}
+
+Node *instance_norm_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
+  auto attrs_ = AttributeMap{{"epsilon", epsilon_}};
+
+  std::vector<Node *> inputs_ = {GetInputVarNode("X", node),
+                                 GetInputVarNode("Scale", node),
+                                 GetInputVarNode("Bias", node)};
+  std::vector<Node *> outputs_ = {GetOutputVarNode("Y", node)};
+  return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_,
+                      outputs_, attrs_);
+}
+
+Node *layer_norm_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis"));
+  auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape();
+
+  std::vector<int64_t> norm_shape_{1, 1};
+  for (int i = 0; i < input_shape_.size(); i++) {
+    if (i < begin_norm_axis_) {
+      norm_shape_[0] *= input_shape_[i];
+    } else {
+      norm_shape_[1] *= input_shape_[i];
+    }
+  }
+
+  auto attrs1 = AttributeMap{
+      {"value", norm_shape_},
+      {"dims", std::vector<int64_t>{static_cast<int64_t>(norm_shape_.size())}},
+      {"dtype", ONNXDataType::INT64}};
+  auto reshape1_const =
+      CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1);
+  auto new_node_reshape1 = CreateBaseOp(
+      graph, node, "popart_reshape",
+      {GetInputVarNode("X", node),
reshape1_const->outputs[0]}, {}, {}); + + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + int64_t groups_ = 1; + auto groupnorm_attrs_ = + AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}}; + auto out_Y_ = MakeVarNode(graph, node); + CreateBaseOp(graph, node, "popart_groupnormalization_v2", + {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}, + {out_Y_, GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}, + groupnorm_attrs_); + + auto attrs2 = AttributeMap{ + {"value", input_shape_}, + {"dims", std::vector{static_cast(input_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto reshape2_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2); + auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape", + {out_Y_, reshape2_const->outputs[0]}, + {GetOutputVarNode("Y", node)}, {}); + return new_node_reshape2; +} + +Node *dropout_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob")); + auto dropout_implementation_ = + BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation")); + auto is_test_type_ = op->GetAttrType("is_test"); + bool is_test_; + if (is_test_type_ == 0) { + // int + is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test")); + } else { + // bool + is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test")); + } + + if (is_test_) { + if (dropout_implementation_ == "upscale_in_train") { + return CreateBaseOp(graph, node, "popart_identity", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, {}); + } else if (dropout_implementation_ == "downgrade_in_infer") { + auto scale = + CreateConst(graph, node, {}, {}, + {{"value", std::vector{1 - dropout_prob_}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}); + return CreateBaseOp(graph, node, "popart_mul", + {GetInputVarNode("X", node), scale->outputs[0]}, + {GetOutputVarNode("Out", node)}, {}); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid dropout_implementation")); + } + } else { + if (dropout_implementation_ == "upscale_in_train") { + auto attrs_ = + AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}}; + return CreateBaseOp(graph, node, "popart_dropout", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, attrs_); + } else if (dropout_implementation_ == "downgrade_in_infer") { + PADDLE_THROW(platform::errors::InvalidArgument( + "Do not support downgrade_in_infer with training")); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid dropout_implementation")); + } + } +} + +REGISTER_HANDLER(pool2d, pool2d_handler); +REGISTER_HANDLER(batch_norm, batch_norm_handler); +REGISTER_HANDLER(group_norm, group_norm_handler); +REGISTER_HANDLER(instance_norm, instance_norm_handler); +REGISTER_HANDLER(layer_norm, layer_norm_handler); +REGISTER_HANDLER(conv2d, conv2d_handler); +REGISTER_HANDLER(dropout, dropout_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc new file mode 100644 index 0000000000000..b7a3a8ca7c60f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// singleton +static int var_count = 0; +static int op_count = 0; + +const std::string GenerateVarName() { + return std::string("_gen_var_") + std::to_string(var_count++); +} + +const std::string GenerateOpName() { + return std::string("_gen_op_") + std::to_string(op_count++); +} + +const std::string CreateOpIdentifyId(Node *node) { + // format: op_type|out_var0|out_var1|...|_gen_* + // this name will be used as op name when exporting onnx model from popart + auto op_type = node->Name(); + std::string op_out = ""; + for (auto *out_node : node->outputs) { + op_out += "|"; + op_out += out_node->Name(); + } + return {op_type + op_out + "|" + GenerateOpName()}; +} + +Node *MakeVarNode(Graph *graph, Node *node) { + auto var_name = GenerateVarName(); + auto var_desc = std::make_unique(var_name); + + auto var = graph->CreateVarNode(var_desc.get()); + return var; +} + +Node *MakeOpNode(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs) { + auto op_desc = std::make_unique(); + op_desc->SetType(type); + auto op = graph->CreateOpNode(op_desc.get()); + + for (auto *in : inputs) { + ConnectNodes(in, op); + } + if (outputs.empty()) { + auto var = MakeVarNode(graph, node); + ConnectNodes(op, var); + } else { + for (auto *out : outputs) { + ConnectNodes(op, out); + } + } + + // i/o + std::vector input_names; + for (auto node : op->inputs) { + input_names.push_back(node->Name()); + } + op->Op()->SetInput("__inputs__", input_names); + std::vector output_names; + for (auto node : op->outputs) { + output_names.push_back(node->Name()); + } + op->Op()->SetOutput("__outputs__", output_names); + op->Op()->Flush(); + + return op; +} + +Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type, + const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs) { + auto new_node = MakeOpNode(graph, node, type, inputs, outputs); + if (!attrs.empty()) { + new_node->Op()->SetAttrMap(attrs); + } + // deal special attr + if (!new_node->Op()->HasAttr(sIpuIndexAttr)) { + CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op()); + } + if (!new_node->Op()->HasAttr(sIpuStageAttr)) { + CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op()); + } + { + new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node)); + new_node->Op()->Flush(); + } + + return new_node; +} + +Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs) { + return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs); +} + +Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, const int otype) { + auto to = VarType2PopStr(otype); + return CreateBaseOp(graph, node, "popart_cast", inputs, outputs, + {{"to", to}}); +} + +Node 
*CreateGemm(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, int64_t transA, + int64_t transB, float alpha, float beta) { + return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs, + { + {"alpha", alpha}, + {"beta", beta}, + {"transA", transA}, + {"transB", transB}, + }); +} + +Node *CreateReshape(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &oshape) { + auto attr = AttributeMap{ + {"value", oshape}, + {"dims", std::vector{static_cast(oshape.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto new_node_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attr); + auto new_node_reshape = + CreateBaseOp(graph, node, "popart_reshape", + {inputs[0], new_node_const->outputs[0]}, outputs); + return new_node_reshape; +} + +Node *CreateConv(Graph *graph, Node *node, const std::vector &inputs, + const std::vector &outputs, + const std::vector &dilations, int64_t group, + const std::vector &kernel_shape, + const std::vector &pads, + const std::vector &strides) { + auto attrs = AttributeMap{ + {"dilations", dilations}, {"group", group}, + {"kernel_shape", kernel_shape}, {"pads", pads}, + {"strides", strides}, + }; + return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs); +} + +Node *CreateSoftmaxOpset11(Graph *graph, Node *node, + const std::vector &inputs, + const std::vector &outputs, int64_t axis) { + PADDLE_ENFORCE_EQ(inputs.size(), 1, platform::errors::InvalidArgument( + "Softmax op only support one input")); + auto x_shape = inputs[0]->Var()->GetShape(); + int x_rank = x_shape.size(); + if (axis < 0) { + axis = axis + x_rank; + } + if (axis == x_rank - 1) { + return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs, + {{"axis", int64_t{-1}}}); + } else { + auto perm = std::vector(x_rank); + std::iota(perm.begin(), perm.end(), 0); + perm[x_rank - 1] = axis; + perm[axis] = x_rank - 1; + auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose", + inputs, {}, {{"perm", perm}}); + auto new_softmax = + CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs, + {}, {{"axis", int64_t{-1}}}); + return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs, + outputs, {{"perm", perm}}); + } +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h new file mode 100644 index 0000000000000..7e70e56ef9166 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
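A note on CreateSoftmaxOpset11 above: when `axis` is not the last dimension, it swaps `axis` with the trailing axis, runs popart_softmax on axis -1, and transposes back; the same perm serves both transposes because a two-element swap is its own inverse. For rank 4 and axis 1:

// perm built by std::iota as {0, 1, 2, 3}, then perm[3] = 1 and perm[1] = 3:
//   perm = {0, 3, 2, 1}
// so softmax(x, axis=1) == transpose(softmax(transpose(x, perm), -1), perm)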
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
new file mode 100644
index 0000000000000..7e70e56ef9166
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/device/ipu/common.h"
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+
+using paddle::framework::AttributeMap;
+
+template <typename T>
+AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
+                              int dtype) {
+  return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
+}
+
+template <typename T>
+AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
+                                       int dtype) {
+  size_t size = 1;
+  for (auto &dim : dims) {
+    size *= dim;
+  }
+  return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
+}
+
+const std::string GenerateVarName();
+const std::string CreateOpIdentifyId(Node *node);
+
+Node *MakeVarNode(Graph *graph, Node *node);
+Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
+                 const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs);
+
+Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
+                   const std::vector<Node *> &inputs,
+                   const std::vector<Node *> &outputs,
+                   const AttributeMap &attrs = {});
+
+Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                  const std::vector<Node *> &outputs,
+                  const AttributeMap &attrs);
+
+// otype is proto::VarType::Type
+Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, const int otype);
+
+Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, int64_t transA = 0,
+                 int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
+
+Node *CreateReshape(Graph *graph, Node *node,
+                    const std::vector<Node *> &inputs,
+                    const std::vector<Node *> &outputs,
+                    const std::vector<int64_t> &oshape);
+
+Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs,
+                 const std::vector<int64_t> &dilations = {1, 1},
+                 int64_t group = 1,
+                 const std::vector<int64_t> &kernel_shape = {},
+                 const std::vector<int64_t> &pads = {0, 0, 0, 0},
+                 const std::vector<int64_t> &strides = {1, 1});
+
+Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
+                           const std::vector<Node *> &inputs,
+                           const std::vector<Node *> &outputs, int64_t axis);
+
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
index 4cd7f928f6e22..763c5a46abe28 100644
--- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
+++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
 OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
 OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
 OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
+
+// clang-format on
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a0c9ff09460af..206bef12aac95 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/platform/ipu/ipu_backend.h"
+#endif
 #include "glog/logging.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
        "Place %s is not supported. 
Please check that your paddle compiles " - "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that " - "your train process set the correct device id if you use Executor.", + "with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check " + "that your train process set the correct device id if you use " + "Executor.", place)); } return it->second.get().get(); @@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " "re-compile with WITH_XPU option.")); +#endif + } else if (platform::is_ipu_place(p)) { +#ifdef PADDLE_WITH_IPU + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW( + platform::errors::Unimplemented("IPUPlace is not supported. Please " + "re-compile with WITH_IPU option.")); #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL @@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } +#ifdef PADDLE_WITH_IPU +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { + int id = place.GetDeviceId(); + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + device_ = ipu_backend->GetDevice(id); +} + +Place IPUDeviceContext::GetPlace() const { return place_; } +void IPUDeviceContext::Wait() const { + /*! \brief Wait for all operations completion in the stream. */ +} + +IPUDeviceContext::~IPUDeviceContext() {} + +#endif #ifdef PADDLE_WITH_XPU XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 415babc9cb85e..ec49134b654e9 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> { void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } + void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) { return boost::apply_visitor(IsNPUPlace(), p); } +bool is_ipu_place(const Place &p) { + return boost::apply_visitor(IsIPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); } else if (is_npu_place(p1)) { return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); + } else if (is_ipu_place(p1)) { + return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 1ab2a62391157..fadc1e27e8a0a 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -95,12 +95,25 @@ struct NPUPinnedPlace { inline bool operator!=(const NPUPinnedPlace &) const { return false; } inline bool operator<(const NPUPinnedPlace &) const { return false; } }; +struct IPUPlace { + IPUPlace() : IPUPlace(0) {} + explicit IPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool 
operator==(const IPUPlace &o) const { return device == o.device; } + inline bool operator!=(const IPUPlace &o) const { return !(*this == o); } + inline bool operator<(const IPUPlace &o) const { return device < o.device; } + + int device; +}; struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; @@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return true; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return true; } +}; +struct IsIPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const IPUPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace, 
IPUPlace> { private: using PlaceBase = boost::variant; + CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>; public: Place() = default; Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT + Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT : PlaceBase(cuda_pinned_place) {} @@ -180,6 +208,7 @@ using PlaceList = std::vector; bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); bool is_npu_place(const Place &); +bool is_ipu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool is_npu_pinned_place(const Place &); @@ -228,6 +257,15 @@ struct PlaceVisitorWrapper return typename Visitor::result_type(); #endif } + typename Visitor::result_type operator()(const IPUPlace &ipu) const { +#ifdef PADDLE_WITH_IPU + return visitor_(ipu); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with IPU. Cannot visit ipu device")); + return typename Visitor::result_type(); +#endif + } typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index d86562fe73c57..7b0459e583cd8 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,2 +1,3 @@ pybind.h -op_function_impl.h \ No newline at end of file +op_function_impl.h +eager_op_function_impl.h diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 46d0bdcb46de7..222db68a1c1aa 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -32,8 +32,126 @@ #endif #include "paddle/fluid/pybind/op_function_generator.h" -std::set gen_list = {"elementwise_add", "reduce_sum", "matmul_v2", - "sigmoid"}; +std::set gen_list = { + "sigmoid", "matmul_v2", "reduce_sum", "elementwise_add", "rsqrt", + "multihead_matmul", "addmm", "gru", "round", "push_dense", "rank_attention", + "fused_embedding_fc_lstm", "where_index", "bicubic_interp", "arg_min", + "tile", "bilinear_tensor_product", "ctc_align", + "pow2_decay_with_linear_warmup", "marker", "split", "fc", + "load", "elementwise_max", "adadelta", + "tan", + "fsp", "where", "logical_xor", "multiclass_nms3", "one_hot_v2", + "sequence_softmax", "affine_channel", "triangular_solve", + "sequence_topk_avg_pooling", "space_to_depth", "reverse", + "fused_embedding_eltwise_layernorm", "expand_v2", "lgamma", "solve", + "deformable_psroi_pooling", "instance_norm", "decode_jpeg", "gather_nd", + "reduce_prod", "matrix_rank", "asin", "lstmp", "iou_similarity", + "huber_loss", "one_hot", "sequence_slice", "lookup_table", "softplus", + "depthwise_conv2d", "fused_fc_elementwise_layernorm", + "sigmoid_cross_entropy_with_logits", "exp", "scatter", "equal_all", + "searchsorted", "fusion_squared_mat_sub", "unique", "log", "conv_shift", + "smooth_l1_loss", "linear_interp_v2", + "temporal_shift", "nce", "mv", "proximal_gd", "memcpy_h2d", + "add_position_encoding", "cosh", "hash", "grad_add", "sign", "prelu", + "linspace", "fill_diagonal", "logsigmoid", "load_combine", "fetch_v2", + "randperm", "sequence_scatter", "partial_sum", "relu6", "conv3d", + "lstm_unit", 
"not_equal", "transpose2", "uniform_random_batch_size_like", + "unfold", "lrn", "softmax_with_cross_entropy", "isfinite_v2", "bernoulli", + "max_pool3d_with_index", "gaussian_random", "flatten2", + "cvm", "adamax", "masked_select", "range", "bitwise_not", "trace", + "multinomial", "modified_huber_loss", "roll", "squared_l2_distance", + "conv3d_transpose", "share_data", "fake_quantize_abs_max", + "unique_with_counts", "fill", "concat", "fill_zeros_like", + "hierarchical_sigmoid", "isinf_v2", "squeeze", "multiclass_nms2", + "bpr_loss", "fft_c2c", "bicubic_interp_v2", "reshape", "coalesce_tensor", + "roi_align", "reshape2", "reduce_any", "unstack", "scatter_nd_add", + "sequence_reshape", "bilateral_slice", "fill_any_like", "empty", + "pad_constant_like", "pool2d", "size", "imag", "eigh", "stack", + "dgc_momentum", + "generate_proposals_v2", "bitwise_or", "gru_unit", + "sampling_id", "unsqueeze2", + "sequence_enumerate", "fusion_seqconv_eltadd_relu", "bce_loss", + "generate_proposal_labels", "im2sequence", "isinf", "adagrad", + "linear_chain_crf", "retinanet_target_assign", "fusion_group", + "teacher_student_sigmoid_loss", "random_crop", "lookup_table_v2", + "detection_map", "l1_norm", "sqrt", "fused_elemwise_activation", + "slogdeterminant", "share_buffer", "bitwise_and", "diag_embed", "unbind", + "dropout", + "beam_search", "log_loss", "greater_than", "kron", "sigmoid_focal_loss", + "rmsprop", "conv2d", "uniform_random_inplace", "maxout", "linear_interp", + "auc", "logical_or", + "acos", "unpool", "cumprod", "sample_logits", "crop_tensor", + "deformable_conv", "generate_mask_labels", "locality_aware_nms", + "expand_as", "matrix_power", "greater_equal", "generate_proposals", + "bilinear_interp", "inplace_abn", "softshrink", "mul", "data_norm", + "get_tensor_from_selected_rows", "spp", "floor", "gelu", + "retinanet_detection_output", "push_dense", "silu", "sequence_erase", + "real", "nearest_interp_v2", "dgc_clip_by_norm", "squeeze2", + "strided_slice", "conj", "precision_recall", "save", + "fusion_seqexpand_concat_fc", "fake_quantize_range_abs_max", + "depthwise_conv2d_transpose", "positive_negative_pair", "square", + "var_conv_2d", "log1p", "fused_softmax_mask_upper_triangle", "clip_by_norm", + "atan2", "box_decoder_and_assign", "fft_r2c", "roi_pool", "overlap_add", + "fill_constant_batch_size_like", "fill_any", "dequantize_log", + "max_pool2d_with_index", "pad3d", "norm", "viterbi_decode", "mish", + "box_coder", "flatten", "elementwise_mod", "margin_cross_entropy", + "logical_and", "pow", "stanh", "label_smooth", "merged_momentum", + "ascend_trigger", "fused_feedforward", "rpn_target_assign", + "roi_perspective_transform", "expand", "prroi_pool", "pool3d", "memcpy", + "distribute_fpn_proposals", "frame", "bincount", "shape", "group_norm", + "resnet_unit", "sequence_expand_as", "cos_sim", "eigvals", "save_combine", + "class_center_sample", "read_file", "isfinite", "arg_max", "equal", + "fake_dequantize_max_abs", "qr", "anchor_generator", "layer_norm", + "merge_selected_rows", "less_equal", + "fusion_lstm", "lars_momentum", "hard_sigmoid", "isnan", + "elementwise_floordiv", "correlation", "histogram", "gather_tree", + "segment_pool", + "fusion_repeated_fc_relu", "nop", + "expand_as_v2", "filter_by_instag", "nll_loss", "dot", "scale", "ncclBcast", + "shuffle_batch", "ncclReduce", "diag", "multiplex", "leaky_relu", + "allclose", + "elementwise_pow", "prior_box", "p_norm", "unique_consecutive", "lod_reset", + "pad", "sequence_conv", "log10", "set_value", "bitwise_xor", "center_loss", + 
"randint", "attention_lstm", "uniform_random", "slice", "meshgrid", + "hard_swish", "sin", "mean_iou", "pad2d", "inverse", "spectral_norm", + "shuffle_channel", "psroi_pool", "seed", "ceil", "eig", "reduce_min", "cos", + "ncclAllReduce", "cudnn_lstm", "digamma", "assign_value", "increment", + "tdm_sampler", "fused_softmax_mask", "sequence_reverse", "eigvalsh", + "diagonal", "trunc", "log2", "tanh", "yolov3_loss", "graph_send_recv", + "atan", "less_than", "unsqueeze", "crf_decoding", "log_softmax", "ftrl", + "matrix_nms", "top_k_v2", "cast", "tanh_shrink", "hard_shrink", + "multiclass_nms", "fusion_transpose_flatten_concat", "sequence_unpad", + "fused_elemwise_add_activation", "frobenius_norm", "crop", "cross_entropy2", + "skip_layernorm", "tdm_child", "fused_embedding_seq_pool", "erf", + "conv2d_inception_fusion", "trilinear_interp", "logsumexp", + "fusion_seqpool_concat", "alloc_float_status", "sequence_concat", + "fusion_seqpool_cvm_concat", "similarity_focus", "argsort", + "sequence_expand", + "fused_bn_add_activation", "bilinear_interp_v2", "clip", + "deformable_conv_v1", "hinge_loss", "determinant", "conv2d_transpose", + "memcpy_d2h", "softsign", + "broadcast_tensors", "grid_sampler", "fft_c2r", "pyramid_hash", + "multi_dot", "sequence_pool", "transpose", "top_k", "dist", "affine_grid", + "gaussian_random_batch_size_like", "fake_channel_wise_dequantize_max_abs", + "reciprocal", "sequence_mask", "fill_diagonal_tensor", "abs", + "partial_concat", "elu", "index_select", "row_conv", "cross", + "elementwise_mul", "decayed_adagrad", "bipartite_match", + "fake_quantize_moving_average_abs_max", "mine_hard_examples", + "target_assign", "lstm", "truncated_gaussian_random", "match_matrix_tensor", + "elementwise_div", "kldiv_loss", "cumsum", "sum", "proximal_adagrad", + "shard_index", "selu", "mean", "gumbel_softmax", "sequence_pad", + "tree_conv", "assign", "flatten_contiguous_range", "tril_triu", "brelu", + "celu", "reduce_mean", "sinh", "rank_loss", "reduce_max", "fusion_gru", + "fill_zeros_like2", "expm1", "squared_l2_norm", "elementwise_sub", + "margin_rank_loss", "faster_tokenizer", "relu", "is_empty", "reduce_all", + "edit_distance", "bmm", "yolo_box", "soft_relu", "density_prior_box", "eye", + "swish", "cross_entropy", "dpsgd", "cholesky", "batch_fc", "nearest_interp", + "gather", "trilinear_interp_v2", "box_clip", "isnan_v2", "softmax", + "conv2d_fusion", "fused_batch_norm_act", + "index_sample", "elementwise_min", "logical_not", "collect_fpn_proposals", + "pixel_shuffle", "thresholded_relu", "polygon_box_transform", + "lookup_table_dequant", "warpctc", "fake_channel_wise_quantize_abs_max", + "dequantize_abs_max", "svd", "flip"}; // clang-format off const char* OUT_INITIALIZER_TEMPLATE = @@ -178,15 +296,7 @@ std::string GenerateOpFunctionsBody( ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, in_name, arg_idx++, dispensable); - if (input.dispensable()) { - const auto in_template = input.duplicable() - ? INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST - : INPUT_INITIALIZER_TEMPLATE_WITH_NULL; - ins_initializer_with_null += - paddle::string::Sprintf(in_template, in_name, in_name, in_name); - } else { - call_api_str += in_name + ", "; - } + call_api_str += in_name + ", "; } if (!input_args.empty() && input_args.back() == ',') { @@ -237,6 +347,8 @@ std::string GenerateOpFunctionsBody( auto dispensable = output.dispensable() ? 
"true" : "false"; ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, out_name, arg_idx++, dispensable); + + // call_api_str += out_name + ", "; } else { // There are few Operators that have duplicable output, like `Out` in // split op. We need to specify the number of variables for the @@ -281,11 +393,9 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - if (outs_num == 0) { - return_str = "Py_INCREF(Py_None);\n return Py_None;"; - } else { - return_str = "return ToPyObject(out);"; - } + + return_str = "return ToPyObject(out);"; + std::string function_args = ""; if (input_args == "") { function_args = FUNCTION_ARGS_NO_INPUT; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index eb53884186ffc..3a0990c126391 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -370,6 +370,15 @@ PyObject* ToPyObject(const platform::Place& value) { return obj.ptr(); } +PyObject* ToPyObject(const void* value) { + if (value == nullptr) { + Py_INCREF(Py_None); + return Py_None; + } + PADDLE_THROW( + platform::errors::Fatal("ToPyObject do not support void* with value.")); +} + egr::EagerTensor GetEagerTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index e72820c4dbe8c..bb1d247e59007 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -51,6 +51,7 @@ PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const platform::Place& value); +PyObject* ToPyObject(const void* value); template struct TupleEagerTensorResult { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index dc97d98e8c47f..080323bbc2541 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -252,12 +252,16 @@ static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self, InitVarBaseAndTensor(self, array, place, ""); } -static void InitVarBaseFromTensorWithArgDefault( - imperative::VarBase *self, const framework::Tensor &tensor) { +static void InitVarBaseFromTensorWithArgDefault(imperative::VarBase *self, + const framework::Tensor &tensor, + const std::string &name) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); - new (self) imperative::VarBase( - imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor")); + auto name_ = name == "" + ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; + new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); self->SetDataType(tensor.type()); @@ -275,10 +279,14 @@ static void InitVarBaseFromTensorWithArgDefault( template static void InitVarBaseFromTensorWithArg(imperative::VarBase *self, const framework::Tensor &tensor, - const P &place) { + const P &place, + const std::string &name) { VLOG(4) << "Init VarBase"; - new (self) imperative::VarBase( - imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor")); + auto name_ = name == "" + ? 
imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; + new (self) imperative::VarBase(name_); self->SetPersistable(false); self->SetType(framework::proto::VarType::LOD_TENSOR); self->SetDataType(tensor.type()); @@ -917,17 +925,18 @@ void BindImperative(py::module *m_ptr) { py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) - .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor")) + .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), + py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place")) + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place")) + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place")) + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place")) + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place")) + py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 635f3149773e8..c5277a4210395 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -75,7 +75,9 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#ifndef PADDLE_ON_INFERENCE #include "paddle/fluid/pybind/eager.h" +#endif #include "paddle/fluid/pybind/io.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND @@ -130,6 +132,10 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/ipu/ipu_backend.h" +#include "paddle/fluid/platform/ipu_info.h" +#endif #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" @@ -199,6 +205,14 @@ bool IsCompiledWithNPU() { #endif } +bool IsCompiledWithIPU() { +#ifndef PADDLE_WITH_IPU + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -541,7 +555,9 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif +#ifndef PADDLE_ON_INFERENCE BindEager(&m); +#endif BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. @@ -812,6 +828,8 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -819,7 +837,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the LoDTensor is to be set. 
        zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False.
@@ -1909,6 +1927,58 @@ All parameter, weight, gradient are variables in Paddle.
           [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
       .def("__str__", string::to_string);
 
+  // IPUPlace
+  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
+    IPUPlace is a descriptor of a device.
+    It represents an IPU device on which a tensor will be allocated and a model will run.
+
+    Examples:
+        .. code-block:: python
+          import paddle
+
+          # required: ipu
+
+          ipu_place = paddle.IPUPlace()
+
+        )DOC")
+      .def("__init__",
+           [](platform::IPUPlace &self) {
+#ifdef PADDLE_WITH_IPU
+             if (platform::GetIPUDeviceCount() == 0) {
+               LOG(ERROR) << "Cannot use IPU because there is no IPU "
+                             "detected on your "
+                             "machine.";
+               std::exit(-1);
+             }
+             // compile with ipu(0); at run time the device count comes from
+             // the user's sharding and pipelining configuration.
+             new (&self) platform::IPUPlace(0);
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use IPU because you didn't install the IPU version "
+                 "of PaddlePaddle.\n"
+                 "If you want to use IPU, please try to install the IPU "
+                 "version of PaddlePaddle by: pip install paddlepaddle*\n"
+                 "If you only have CPU, please change IPUPlace to be "
+                 "CPUPlace().\n");
+             std::exit(-1);
+#endif
+           })
+      .def("_type", &PlaceIndex)
+      .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
+      .def("_equals",
+           &IsSamePlace)
#ifdef PADDLE_WITH_IPU
+      .def("get_device_id",
+           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
+      .def("__str__", string::to_string);
+
   py::class_<platform::Place> platformplace(m, "Place");
   g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
   platformplace.def(py::init<>())
@@ -1918,6 +1988,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace)
       .def("_equals", &IsSamePlace)
       .def("_equals", &IsSamePlace)
+      .def("_equals", &IsSamePlace)
       .def("_equals", &IsSamePlace)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
@@ -1927,6 +1998,8 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) { return platform::is_xpu_place(self); })
       .def("is_npu_place",
            [](platform::Place &self) { return platform::is_npu_place(self); })
+      .def("is_ipu_place",
+           [](platform::Place &self) { return platform::is_ipu_place(self); })
       .def("is_cuda_pinned_place",
            [](platform::Place &self) {
              return platform::is_cuda_pinned_place(self);
@@ -1943,6 +2016,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) {
              return BOOST_GET_CONST(platform::NPUPlace, self).device;
            })
+      .def("ipu_device_id",
+           [](platform::Place &self) {
+             return BOOST_GET_CONST(platform::IPUPlace, self).device;
+           })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
@@ -1966,6 +2043,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self, const platform::NPUPlace &npu_place) {
              self = npu_place;
            })
+      .def("set_place",
+           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
+             self = ipu_place;
+           })
       .def("__repr__", string::to_string)
       .def("__str__", string::to_string);
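With IPUPlace added to the Place variant, C++ callers can dispatch on the new place the same way these bindings do. A minimal sketch, assuming an IPU-enabled build (variable names illustrative):

```cpp
// Dispatch on the new place type via the helpers added in place.h/.cc.
paddle::platform::Place place = paddle::platform::IPUPlace(0);
if (paddle::platform::is_ipu_place(place)) {
  // Mirrors the "ipu_device_id" binding above.
  int device_id = BOOST_GET_CONST(paddle::platform::IPUPlace, place).device;
  VLOG(3) << "tensor place " << place << ", ipu device " << device_id;
}
```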
m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_npu", IsCompiledWithNPU); + m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_cinn", IsCompiledWithCINN); @@ -2516,6 +2598,10 @@ All parameter, weight, gradient are variables in Paddle. }); #endif +#ifdef PADDLE_WITH_IPU + m.def("get_ipu_device_count", platform::GetIPUDeviceCount); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) @@ -2593,6 +2679,11 @@ All parameter, weight, gradient are variables in Paddle. bool val) { self.Set(name, new bool(val)); }) .def("set", [](ir::Pass &self, const std::string &name, int val) { self.Set(name, new int(val)); }) + .def("set", + [](ir::Pass &self, const std::string &name, + std::vector set) { + self.Set(name, new std::vector(set)); + }) .def("set", [](ir::Pass &self, const std::string &name, std::unordered_set set) { @@ -3425,6 +3516,118 @@ All parameter, weight, gradient are variables in Paddle. }) .def("device_count", &ParallelExecutor::DeviceCount); +#ifdef PADDLE_WITH_IPU + py::class_>(m, "IpuBackend") + .def(py::init(&platform::ipu::IpuBackend::GetNewInstance)) + .def("clear", &platform::ipu::IpuBackend::Clear) + .def("set_scope", &platform::ipu::IpuBackend::SetScope) + .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy); + + py::class_(m, "IpuStrategy") + .def(py::init()) + .def_property( + "num_ipus", + [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; }, + [](platform::ipu::IpuStrategy &self, int num_ipus) { + self.num_ipus = num_ipus; + }, + R"DOC( + Int type, set the number ipu we need. Default 1. + )DOC") + .def_property( + "accumulationFactor", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.accumulationFactor; + }, + [](platform::ipu::IpuStrategy &self, int accumulationFactor) { + self.popart_options_.accumulationFactor = accumulationFactor; + }, + R"DOC( + Specify the number of micro-batches to accumulate before + applying the varUpdate. Default 1. + )DOC") + .def_property("batches_per_step", + [](const platform::ipu::IpuStrategy &self) { + return self.batches_per_step; + }, + [](platform::ipu::IpuStrategy &self, int batches_per_step) { + self.batches_per_step = batches_per_step; + }, + R"DOC( + Int type, set batches_per_step. Default 1. + )DOC") + .def_property("is_training", + [](const platform::ipu::IpuStrategy &self) { + return self.is_training; + }, + [](platform::ipu::IpuStrategy &self, bool is_training) { + self.is_training = is_training; + }, + R"DOC( + Bool type, True for training, False inference. Default True. + )DOC") + .def_property( + "enable_pipelining", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.enablePipelining; + }, + [](platform::ipu::IpuStrategy &self, bool enable_pipelining) { + self.popart_options_.enablePipelining = enable_pipelining; + }, + R"DOC( + Bool type, True enable pipeline, otherwise disable. Default False. 
+ )DOC") + .def_property( + "enable_manual_shard", + [](const platform::ipu::IpuStrategy &self) { + return self.popart_options_.virtualGraphMode == + platform::ipu::VirtualGraphMode::Manual; + }, + [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) { + if (enable_ipu_shard) { + self.popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + self.popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + }, + R"DOC( + Bool type, True enable model sharding, otherwise disable. Default " + "False. + )DOC") + .def_property("need_avg_shard", + [](const platform::ipu::IpuStrategy &self) { + return self.need_avg_shard; + }, + [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { + self.need_avg_shard = need_avg_shard; + }, + R"DOC( + Bool type, True enable avg shard, otherwise disable. Default False. + )DOC") + .def_property("batch_size", + [](const platform::ipu::IpuStrategy &self) { + return self.batch_size; + }, + [](platform::ipu::IpuStrategy &self, int batch_size) { + self.batch_size = batch_size; + }, + R"DOC( + Int type, used to make batch size fixed. Default 1. + )DOC") + .def_property("enable_fp16", + [](const platform::ipu::IpuStrategy &self) { + return self.enable_fp16; + }, + [](platform::ipu::IpuStrategy &self, bool enable_fp16) { + self.enable_fp16 = enable_fp16; + }, + R"DOC( + Bool type, True enable float16 mode, otherwise disable. Default False.)DOC"); +#endif + BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 60b99a964a57f..d4fa1b2c89abf 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool( "If set true, the queue.pop will only get data from queue but not " "remove the data from queue for speed testing"); +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); + namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c5d0afb9a1716..935a6437338a7 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -313,6 +313,21 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + if (zero_copy) { + auto holder = std::make_shared>(array); + auto type = framework::ToDataType(std::type_index(typeid(T))); + self->ResetHolderWithType(holder, type); + } else { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 448bcb80aa678..7f6a64a1bb2e7 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -11,10 +11,14 @@ add_subdirectory(common) add_subdirectory(api) # pten core components add_subdirectory(core) +# pten components of specific backends +add_subdirectory(backends) # pten kernels for diff device add_subdirectory(kernels) # pten infermeta add_subdirectory(infermeta) +# pten operator definitions +add_subdirectory(ops) # pten tests 
add_subdirectory(tests) diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index 96ad9ade8e3ad..d1e60c4505d6b 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -22,6 +22,10 @@ set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) set(api_source_file_tmp ${api_source_file}.tmp) +if (NOT PYTHON_EXECUTABLE) + find_package(PythonInterp REQUIRED) +endif() + add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h new file mode 100644 index 0000000000000..8c21094a4af20 --- /dev/null +++ b/paddle/pten/api/lib/kernel_declare.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/kernel_registry.h" + +// TODO(chenweihang) After the kernel is split into a single file, +// the kernel declare statement is automatically generated according to the +// file name of the kernel, and this header file will be removed + +PT_DECLARE_KERNEL(full_like, CPU); +PT_DECLARE_KERNEL(dot, CPU); +PT_DECLARE_KERNEL(flatten, CPU); +PT_DECLARE_KERNEL(sign, CPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(full_like, CUDA); +PT_DECLARE_KERNEL(dot, CUDA); +PT_DECLARE_KERNEL(flatten, CUDA); +PT_DECLARE_KERNEL(sign, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(flatten, XPU); +#endif diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index e17b19d9f689e..bfde9b14b0020 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -25,10 +25,14 @@ limitations under the License. 
*/ #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" -PT_DECLARE_MODULE(UtilsCPU); +PT_DECLARE_KERNEL(copy, CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(UtilsCUDA); +PT_DECLARE_KERNEL(copy, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(copy, XPU); #endif namespace paddle { diff --git a/paddle/pten/kernels/functions/blas/CMakeLists.txt b/paddle/pten/backends/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/functions/blas/CMakeLists.txt rename to paddle/pten/backends/CMakeLists.txt diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index e19d0a490cef3..f02678538cb8e 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -6,7 +6,7 @@ else() cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place) endif() -cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce) +cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils) cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context) cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce) diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 0310b6e6fafb5..3b8347dec772e 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -27,13 +27,13 @@ const std::unordered_map kernel_alias_name_map = { {"fill_any_like", "full_like"}, {"fill_constant", "full"}, {"flatten_contiguous_range", "flatten"}, - // {"matmul_v2", "matmul"}, + {"matmul_v2", "matmul"}, {"reduce_mean", "mean"}, {"reduce_sum", "sum"}, {"reshape2", "reshape"}, // fluid kernel "mean/reshape/matmul/flatten/sum" should be deprecated {"flatten", "deprecated"}, - // {"matmul", "deprecated"}, + {"matmul", "deprecated"}, {"mean", "deprecated"}, {"reshape", "deprecated"}, {"sum", "deprecated"}}; diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h index dbdf90b5bdbf4..4adfb703503a8 100644 --- a/paddle/pten/core/kernel_factory.h +++ b/paddle/pten/core/kernel_factory.h @@ -265,12 +265,8 @@ class KernelFactory { KernelMap& kernels() { return kernels_; } - void InsertCompatibleOpType(const std::string& op_type) { - compatible_op_types_.insert(op_type); - } - bool HasCompatiblePtenKernel(const std::string& op_type) const { - return compatible_op_types_.count(TransToPtenKernelName(op_type)) > 0; + return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end(); } const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name, @@ -288,9 +284,6 @@ class KernelFactory { KernelFactory() = default; KernelMap kernels_; - // Used to be compatible with the original execution system and - // quickly confirm whether the new kernel can be called - std::unordered_set compatible_op_types_; }; /** operator << overload **/ diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index cd6fa80906cfb..645e77fc60f8c 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -24,6 +25,8 @@ #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_utils.h" +#include "paddle/fluid/platform/enforce.h" + namespace pten { #define BACKEND(arg__) pten::Backend::arg__ @@ -140,7 +143,6 @@ struct KernelRegistrar { Kernel kernel(kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(&kernel); - 
KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name()); KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; } }; @@ -193,64 +195,35 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/** PT_REGISTER_KERNEL + * + * The most frequently used kernel registration macro, used for kernel + * registration with only data type as template parameter, and the function + * pointer of the corresponding data type is automatically instantiated + * during registration. + */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL must be called in global namespace."); \ + _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) + #ifndef _WIN32 -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) #else -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) -#endif - -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) - /** * `template decltype(fn) fn` can work on gcc and clang, * but msvc will failed, error like: @@ -261,8 +234,30 @@ struct KernelRegistrar { * * https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua * - * So we solve the explict instantiation of kernel by CMake + * And msvc can work without template instantiation */ +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) +#endif + +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, cpp_dtype, __VA_ARGS__) #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ template decltype(meta_kernel_fn) meta_kernel_fn @@ -309,22 +304,15 @@ struct KernelRegistrar { template decltype(meta_kernel_fn) meta_kernel_fn; \ PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) -#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_REGISTRAR_INIT( \ + kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + kernel_name, \ + backend, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ __VA_ARGS__) // clang-format off @@ -333,7 +321,6 @@ struct KernelRegistrar { and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ kernel_name, \ - func_id, \ backend, \ layout, \ args_def_fn, \ @@ -342,7 +329,6 @@ struct KernelRegistrar { ...) \ PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -354,7 +340,6 @@ struct KernelRegistrar { // clang-format on #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -363,17 +348,17 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); + PT_KERNEL(meta_kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -382,8 +367,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -392,7 +377,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -400,7 +384,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -409,8 +392,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -419,7 +402,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -427,7 +409,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -436,8 +417,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -446,7 +427,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -454,7 +434,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -463,8 +442,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -473,7 +452,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -481,7 +459,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -490,8 +467,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -500,7 +477,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -508,7 +484,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -517,8 +492,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -527,7 +502,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -535,7 +509,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -544,8 +517,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -554,7 +527,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -562,7 +534,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -571,8 +542,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -581,7 +552,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -589,7 +559,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -598,8 +567,8 @@ struct KernelRegistrar { cpp_dtype, \ ...)
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -608,7 +577,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -616,7 +584,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -625,8 +592,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -635,7 +602,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -643,7 +609,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -652,8 +617,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -662,7 +627,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -670,7 +634,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -679,8 +642,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -689,7 +652,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -697,7 +659,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -706,8 +667,8 @@ struct KernelRegistrar { cpp_dtype, \ ...)
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -716,7 +677,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -724,7 +684,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -733,8 +692,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \ @@ -743,7 +702,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn<cpp_dtype>)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -751,90 +709,59 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -#define PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, backend, layout, dtype, kernel_fn) \ - _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, PT_ID, backend, layout, dtype, kernel_fn) - -#define _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, func_id, backend, layout, dtype, kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \ - template decltype(kernel_fn) kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - DATATYPE(dtype), \ - ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \ - args_def_fn, \ - PT_KERNEL(kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*) - -// use to declare symbol -#define PT_REGISTER_MODULE(name) \ - int RegisterSymbolsFor##name() { return 0; } - -#define PT_DECLARE_MODULE(name) \ - extern int RegisterSymbolsFor##name(); \ - UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name() - -// only used in cpp tests - -#define PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) -
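// For orientation, a rough sketch of how the (renamed) registrar chain above
// unrolls; `sign` stands in for any kernel and appears later in this patch as
// PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {}. Each
// _PT_KERNEL_REGISTRAR_INIT_N level peels one cpp_dtype off the variadic list
// and emits one static registrar, now named after the kernel identifier plus a
// PT_ID-generated suffix instead of the removed func_id, e.g. roughly:
//
//   static const ::pten::KernelRegistrar __reg_pt_kernel_sign_0(
//       "sign", BACKEND(CPU), DATALAYOUT(ANY),
//       ::paddle::experimental::CppTypeToDataType<float>::Type(), ...);
//   static const ::pten::KernelRegistrar __reg_pt_kernel_sign_1(
//       "sign", BACKEND(CPU), DATALAYOUT(ANY),
//       ::paddle::experimental::CppTypeToDataType<double>::Type(), ...);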
-#define _PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) - -#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, backend, layout, meta_kernel_fn) \ - _PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, PT_ID, backend, layout, meta_kernel_fn) - -#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \ - kernel_name, func_id, backend, layout, meta_kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - decltype(meta_kernel_fn) meta_kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - PT_KERNEL(meta_kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +/** PT_REGISTER_SINGLE_KERNEL + * + * Used to register a single kernel: the complete function pointer of the + * kernel is passed in, and this registration macro does not do automatic + * template instantiation. + */ +#define PT_REGISTER_SINGLE_KERNEL( \ + kernel_name, backend, layout, dtype, kernel_fn) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_single_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \ + static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + DATATYPE(dtype), \ + ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \ + args_def_fn, \ + PT_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ + void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*) + +/** PT_REGISTER_KERNEL_ALL_DTYPE + * + * Used to register a kernel that supports all data types, such as copy and + * reshape, which are not sensitive to data type.
+ */ +#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_all_dtype_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \ + static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \ + ::pten::Kernel*); \ + static const ::pten::KernelRegistrar \ + __reg_pt_kernel_all_dtype_##kernel_name( \ + #kernel_name, \ + BACKEND(backend), \ + DATALAYOUT(layout), \ + ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \ + &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \ + PT_KERNEL(kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \ + void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel) + +/** PT_DECLARE_KERNEL + * + * Used to export the symbols of the file where the kernel is located, so + * that they are not removed by the linker. + */ +#define PT_DECLARE_KERNEL(kernel_name, backend) \ + extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \ + UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \ + TouchKernelSymbolFor_##kernel_name##_##backend() + } // namespace pten diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index a0d4cba90dae7..5d3844a1dec3d 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/kernels/functions/general/elementwise_base.h" +#include "paddle/pten/kernels/hybird/general/elementwise_base.h" namespace pten { diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index ffc45a5cf23a4..ebf659da47298 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -1,22 +1,28 @@ -# pten math functions called by kernels -add_subdirectory(math) -# pten basic functions called by kernels -add_subdirectory(functions) -# pten kernels for diff device +# kernel primitive api +add_subdirectory(primitive) +# pten hybird functors and functions called by kernels +add_subdirectory(hybird) + +# pten kernels for different backends +# NOTE(chenweihang): We need to increase the compilation option of WITH_EIGEN, +# which will support splitting eigen at compile time on demand in the future +add_subdirectory(eigen) +# NOTE(chenweihang): We need to increase the compilation option of WITH_BLAS, +# which will support splitting blas at compile time on demand in the future, +# and if necessary, blas can be split into openblas and cublas +add_subdirectory(blas) add_subdirectory(cpu) if(WITH_GPU OR WITH_ROCM) - # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir + # NOTE(chenweihang): if hip can split from cuda impl, we should add hip dir add_subdirectory(cuda) endif() -# TODO(chenweihang): migrate MKLDNN Kernel in the second phase of the project if(WITH_MKLDNN) - add_subdirectory(mkldnn) + # mkldnn will be deprecated and use the new name dnnl add_subdirectory(dnnl) endif() -# TODO(chenweihang): migrate NPU Kernel in the second phase of the project if(WITH_ASCEND_CL) add_subdirectory(npu) endif() -# TODO(chenweihang): migrate XPU Kernel in the second phase of the project if(WITH_XPU) add_subdirectory(xpu) endif()
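// A hedged usage sketch of the two macros defined above; `my_copy` and
// pten::MyCopy are placeholder names, not part of this patch:
//
//   // In the kernel's translation unit:
//   PT_REGISTER_KERNEL_ALL_DTYPE(my_copy, CPU, ANY, pten::MyCopy) {}
//
//   // In a translation unit that needs the registration to survive linking:
//   PT_DECLARE_KERNEL(my_copy, CPU);
//
// The registration defines TouchKernelSymbolFor_my_copy_CPU(), and
// PT_DECLARE_KERNEL calls it from a static initializer, so the linker cannot
// discard the object file that holds the static registrar.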
diff --git a/paddle/pten/kernels/functions/cpu/CMakeLists.txt b/paddle/pten/kernels/blas/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/functions/cpu/CMakeLists.txt rename to paddle/pten/kernels/blas/CMakeLists.txt diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc index db3d5c2bf4b1f..4175203410f8d 100644 --- a/paddle/pten/kernels/cpu/creation.cc +++ b/paddle/pten/kernels/cpu/creation.cc @@ -15,7 +15,7 @@ #include "paddle/pten/kernels/cpu/creation.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/functions/eigen/fill.h" +#include "paddle/pten/kernels/hybird/eigen/fill.h" namespace pten { @@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCPU); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CPU, ANY, pten::FillAnyLike, @@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CPU, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc index ced13dc41d1ae..9f4f1be18259a 100644 --- a/paddle/pten/kernels/cpu/linalg.cc +++ b/paddle/pten/kernels/cpu/linalg.cc @@ -21,7 +21,7 @@ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/pten/kernels/functions/math/matmul_func.h" +#include "paddle/pten/kernels/hybird/math/matmul_func.h" namespace pten { @@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCPU); - using complex64 = ::paddle::platform::complex<float>; using complex128 = ::paddle::platform::complex<double>; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CPU, ANY, pten::Dot, @@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot", complex128) {} PT_REGISTER_KERNEL( - "matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) { -} + matmul, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {} diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc index bf94d00964d95..61c6cb57a9f78 100644 --- a/paddle/pten/kernels/cpu/manipulation.cc +++ b/paddle/pten/kernels/cpu/manipulation.cc @@ -16,8 +16,8 @@ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/cpu/utils.h" -#include "paddle/pten/kernels/functions/general/manipulation.h" -#include "paddle/pten/kernels/functions/math/cast_func.h" +#include "paddle/pten/kernels/hybird/general/manipulation.h" +#include "paddle/pten/kernels/hybird/math/cast_func.h" namespace pten { @@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCPU); - // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel // architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten", +PT_REGISTER_KERNEL(flatten, CPU, ANY, pten::Flatten, @@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CPU, ANY, pten::FlattenWithXShape, @@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid", int8_t, int, int64_t) {} -PT_REGISTER_KERNEL("cast", + +PT_REGISTER_KERNEL(cast, CPU, ANY, pten::Cast, @@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast", kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape".
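// The brace block trailing each registration below is the body of the
// generated args-def function, which receives `::pten::Kernel* kernel` and can
// override per-argument metadata; a minimal sketch with a placeholder name:
//
//   PT_REGISTER_KERNEL_ALL_DTYPE(my_host_kernel, CPU, ANY, pten::MyKernel) {
//     kernel->InputAt(1).SetBackend(pten::Backend::CPU);
//     kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
//   }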
-PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CPU, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CPU, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CPU, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CPU, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CPU, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CPU, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CPU, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CPU, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CPU, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index e768d4f1eff38..2d556d96c2fcf 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -15,12 +15,12 @@ #include "paddle/pten/kernels/cpu/math.h" #include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/kernels/functions/cpu/elementwise.h" -#include "paddle/pten/kernels/functions/eigen/reduce.h" -#include "paddle/pten/kernels/functions/eigen/scale.h" -#include "paddle/pten/kernels/functions/eigen/sign.h" -#include "paddle/pten/kernels/functions/general/elementwise_functor.h" -#include "paddle/pten/kernels/functions/general/reduce_impl.h" +#include "paddle/pten/kernels/hybird/cpu/elementwise.h" +#include "paddle/pten/kernels/hybird/eigen/reduce.h" +#include "paddle/pten/kernels/hybird/eigen/scale.h" +#include "paddle/pten/kernels/hybird/eigen/sign.h" +#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" +#include "paddle/pten/kernels/hybird/general/reduce_impl.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/eigen.h" @@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul) } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCPU); - using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; - -PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {} -PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {} +PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CPU, ANY, pten::Scale, @@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} - -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CPU, ANY, pten::ElementwiseAdd, @@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CPU, ANY, pten::ElementwiseSub, @@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CPU, ANY, pten::ElementwiseDiv, @@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CPU, ANY, pten::ElementwiseMul, @@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply", bool, complex64, complex128) {} - -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CPU, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc index b462ef70c2f06..500b4664d6388 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/utils.cc @@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu index 84d9fa255ce1f..dd29fd5fbb84d 100644 --- a/paddle/pten/kernels/cuda/creation.cu +++ b/paddle/pten/kernels/cuda/creation.cu @@ -15,7 +15,7 @@ #include "paddle/pten/kernels/cuda/creation.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/functions/eigen/fill.h" +#include "paddle/pten/kernels/hybird/eigen/fill.h" namespace pten { @@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCUDA); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CUDA, ANY, pten::FillAnyLike, @@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CUDA, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu index 6811afa8a49ff..2114bbcc71c75 100644 --- a/paddle/pten/kernels/cuda/linalg.cu +++ b/paddle/pten/kernels/cuda/linalg.cu @@ -15,8 +15,8 @@ #include "paddle/pten/kernels/cuda/linalg.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/functions/eigen/dot.h" -#include "paddle/pten/kernels/functions/math/matmul_func.h" +#include "paddle/pten/kernels/hybird/eigen/dot.h" +#include "paddle/pten/kernels/hybird/math/matmul_func.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/complex.h" @@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CUDA, ANY, pten::Dot, @@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot", complex64, complex128) {} -PT_REGISTER_KERNEL("matmul_v2", +PT_REGISTER_KERNEL(matmul, CUDA, ANY, pten::Matmul, diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu index 9c7fded091041..e668d1b04d723 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/cuda/manipulation.cu @@ -16,8 +16,8 @@ #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/cuda/manipulation.h" #include "paddle/pten/kernels/cuda/utils.h" -#include "paddle/pten/kernels/functions/cuda/cast_kernel_impl.h" -#include "paddle/pten/kernels/functions/general/manipulation.h" +#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h" +#include "paddle/pten/kernels/hybird/general/manipulation.h" namespace pten { @@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCUDA); - using float16 = paddle::platform::float16; -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten", + +PT_REGISTER_KERNEL(flatten, CUDA, ANY, pten::Flatten, @@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CUDA, ANY, pten::FlattenWithXShape, @@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid", int64_t) {} #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ - PT_REGISTER_KERNEL("cast", \ + PT_REGISTER_KERNEL(cast, \ CUDA, \ ANY, \ pten::Cast, \ @@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CUDA, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CUDA, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", - CUDA, - ANY, - pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CUDA, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CUDA, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CUDA, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CUDA, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CUDA, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CUDA, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CUDA, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index e7fa599cb68b1..66aaf14dcd0f6 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -15,12 +15,12 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/cuda/math.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise.h" -#include "paddle/pten/kernels/functions/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/functions/eigen/scale.h" -#include "paddle/pten/kernels/functions/eigen/sign.h" -#include "paddle/pten/kernels/functions/general/elementwise_functor.h" -#include "paddle/pten/kernels/functions/general/reduce_impl.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" +#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" +#include "paddle/pten/kernels/hybird/eigen/scale.h" +#include "paddle/pten/kernels/hybird/eigen/sign.h" +#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" +#include "paddle/pten/kernels/hybird/general/reduce_impl.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {} -PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {} +PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CUDA, ANY, pten::Scale, @@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CUDA, ANY, pten::ElementwiseAdd, @@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CUDA, ANY, pten::ElementwiseSub, @@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CUDA, ANY, pten::ElementwiseDiv, @@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CUDA, ANY, pten::ElementwiseMul, @@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CUDA, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu index 24da650d1f3eb..49027e956b2d7 100644 --- a/paddle/pten/kernels/cuda/utils.cu +++ b/paddle/pten/kernels/cuda/utils.cu @@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx, } } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCUDA); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/functions/cuda/CMakeLists.txt b/paddle/pten/kernels/dnnl/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/functions/cuda/CMakeLists.txt rename to paddle/pten/kernels/dnnl/CMakeLists.txt diff --git a/paddle/pten/kernels/functions/eigen/CMakeLists.txt b/paddle/pten/kernels/eigen/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/functions/eigen/CMakeLists.txt rename to paddle/pten/kernels/eigen/CMakeLists.txt diff --git a/paddle/pten/kernels/functions/CMakeLists.txt b/paddle/pten/kernels/functions/CMakeLists.txt deleted file mode 100644 index 
a5b4bb3e8bd96..0000000000000 --- a/paddle/pten/kernels/functions/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_subdirectory(eigen) -add_subdirectory(blas) -add_subdirectory(general) diff --git a/paddle/pten/kernels/math/cuda/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt similarity index 58% rename from paddle/pten/kernels/math/cuda/CMakeLists.txt rename to paddle/pten/kernels/hybird/CMakeLists.txt index b0be23bb09241..c82cbd1ef9e21 100644 --- a/paddle/pten/kernels/math/cuda/CMakeLists.txt +++ b/paddle/pten/kernels/hybird/CMakeLists.txt @@ -1,3 +1,8 @@ +add_subdirectory(eigen) +add_subdirectory(blas) +add_subdirectory(general) + +cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor device_context) if(WITH_GPU) nv_library(pten_transpose_cuda SRCS transpose.cu DEPS dense_tensor malloc device_context) elseif(WITH_ROCM) diff --git a/paddle/pten/kernels/functions/general/CMakeLists.txt b/paddle/pten/kernels/hybird/blas/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/functions/general/CMakeLists.txt rename to paddle/pten/kernels/hybird/blas/CMakeLists.txt diff --git a/paddle/pten/kernels/functions/blas/elementwise.h b/paddle/pten/kernels/hybird/blas/elementwise.h similarity index 100% rename from paddle/pten/kernels/functions/blas/elementwise.h rename to paddle/pten/kernels/hybird/blas/elementwise.h diff --git a/paddle/pten/kernels/mkldnn/CMakeLists.txt b/paddle/pten/kernels/hybird/cpu/CMakeLists.txt similarity index 100% rename from paddle/pten/kernels/mkldnn/CMakeLists.txt rename to paddle/pten/kernels/hybird/cpu/CMakeLists.txt diff --git a/paddle/pten/kernels/functions/cpu/elementwise.h b/paddle/pten/kernels/hybird/cpu/elementwise.h similarity index 99% rename from paddle/pten/kernels/functions/cpu/elementwise.h rename to paddle/pten/kernels/hybird/cpu/elementwise.h index 98600f29910be..e8213c8b45ddb 100644 --- a/paddle/pten/kernels/functions/cpu/elementwise.h +++ b/paddle/pten/kernels/hybird/cpu/elementwise.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/general/elementwise_base.h" +#include "paddle/pten/kernels/hybird/general/elementwise_base.h" namespace pten { diff --git a/paddle/pten/kernels/hybird/cuda/CMakeLists.txt b/paddle/pten/kernels/hybird/cuda/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/pten/kernels/functions/cuda/cast_kernel_impl.h b/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h similarity index 100% rename from paddle/pten/kernels/functions/cuda/cast_kernel_impl.h rename to paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h similarity index 90% rename from paddle/pten/kernels/functions/cuda/elementwise/elementwise.h rename to paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h index b769936089b25..9bcfa1d857ba7 100644 --- a/paddle/pten/kernels/functions/cuda/elementwise/elementwise.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h" -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h" namespace pten { diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h similarity index 99% rename from paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h rename to paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h index 40d3cf60f0926..258e8f410ebf3 100644 --- a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_broadcast.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_broadcast.cu.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" namespace pten { diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h similarity index 97% rename from paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h rename to paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h index 1c0dd56254ce1..053b53041d165 100644 --- a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/general/elementwise_base.h" +#include "paddle/pten/kernels/hybird/general/elementwise_base.h" namespace pten { namespace kps = paddle::operators::kernel_primitives; diff --git a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h similarity index 98% rename from paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h rename to paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h index 4eaf8867fd0c9..e2659271bdcd9 100644 --- a/paddle/pten/kernels/functions/cuda/elementwise/elementwise_no_broadcast.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/pten/kernels/functions/cuda/elementwise/elementwise_common.cu.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 diff --git a/paddle/pten/kernels/functions/cuda/reduce/reduce.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h similarity index 97% rename from paddle/pten/kernels/functions/cuda/reduce/reduce.h rename to paddle/pten/kernels/hybird/cuda/reduce/reduce.h index 0f7917f79e70c..1e47726333bcd 100644 --- a/paddle/pten/kernels/functions/cuda/reduce/reduce.h +++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce.h @@ -21,7 +21,7 @@ #include "paddle/pten/core/dense_tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h" +#include "paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h" namespace pten { diff --git a/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h similarity index 97% rename from paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h rename to paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h index 21663ee0388c0..1f1b8ddd5f412 100644 --- a/paddle/pten/kernels/functions/cuda/reduce/reduce_cuda_impl.h +++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h @@ -43,7 +43,7 @@ namespace cub = hipcub; #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/api/include/tensor.h" #include "paddle/pten/kernels/cuda/utils.h" -#include "paddle/pten/kernels/functions/math/cast_func.h" +#include "paddle/pten/kernels/hybird/math/cast_func.h" // Reduce split or not, Whether to use ReduceHigherDim #define REDUCE_SPLIT_BOUNDARY 512 @@ -769,6 +769,23 @@ static void LaunchReduceKernel(const Tx* x_data, } } +void TensorCopy(const DenseTensor& src, DenseTensor* dst) { + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + const paddle::platform::CUDADeviceContext* dev_ctx; + if (paddle::platform::is_gpu_place(dst->place()) || + paddle::platform::is_npu_place(dst->place())) { + dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>( + pool.Get(dst->place())); + + } else { + dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>( + pool.Get(src.place())); + } + + pten::Copy(*dev_ctx, src, false, dst); +} + template <typename Tx, typename Ty, template <typename> class ReduceOp> @@ -800,7 +817,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, if (config.reduce_num == 1) { auto out_dims = y->dims(); if (x.dtype() == y->dtype()) { - pten::Copy(*dev_ctx, x, true, y); + TensorCopy(x, y); y->Resize(out_dims); } else { PD_VISIT_ALL_TYPES(y->dtype(), "CastKernelImpl", ([&] { diff --git a/paddle/pten/kernels/hybird/eigen/CMakeLists.txt b/paddle/pten/kernels/hybird/eigen/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/pten/kernels/functions/eigen/common.h b/paddle/pten/kernels/hybird/eigen/common.h similarity index 100% rename from paddle/pten/kernels/functions/eigen/common.h rename to paddle/pten/kernels/hybird/eigen/common.h diff --git a/paddle/pten/kernels/functions/eigen/dot.h b/paddle/pten/kernels/hybird/eigen/dot.h similarity index 96% rename from paddle/pten/kernels/functions/eigen/dot.h rename to paddle/pten/kernels/hybird/eigen/dot.h index 27a0b8cf32953..eb089037fa3f3 100644 --- a/paddle/pten/kernels/functions/eigen/dot.h +++ b/paddle/pten/kernels/hybird/eigen/dot.h @@ -15,7 +15,7 @@ limitations under the License.
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/functions/eigen/elementwise.h b/paddle/pten/kernels/hybird/eigen/elementwise.h similarity index 97% rename from paddle/pten/kernels/functions/eigen/elementwise.h rename to paddle/pten/kernels/hybird/eigen/elementwise.h index 369ff36c46e7f..e67cce63d461f 100644 --- a/paddle/pten/kernels/functions/eigen/elementwise.h +++ b/paddle/pten/kernels/hybird/eigen/elementwise.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" namespace pten { namespace eigen { diff --git a/paddle/pten/kernels/functions/eigen/fill.h b/paddle/pten/kernels/hybird/eigen/fill.h similarity index 95% rename from paddle/pten/kernels/functions/eigen/fill.h rename to paddle/pten/kernels/hybird/eigen/fill.h index 122a6aef22dc6..80e310847e186 100644 --- a/paddle/pten/kernels/functions/eigen/fill.h +++ b/paddle/pten/kernels/hybird/eigen/fill.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/functions/eigen/reduce.h b/paddle/pten/kernels/hybird/eigen/reduce.h similarity index 98% rename from paddle/pten/kernels/functions/eigen/reduce.h rename to paddle/pten/kernels/hybird/eigen/reduce.h index 7bc3d2260916c..e6ab872928c77 100644 --- a/paddle/pten/kernels/functions/eigen/reduce.h +++ b/paddle/pten/kernels/hybird/eigen/reduce.h @@ -16,8 +16,8 @@ #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" -#include "paddle/pten/kernels/math/transpose.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" +#include "paddle/pten/kernels/hybird/transpose.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/functions/eigen/scale.h b/paddle/pten/kernels/hybird/eigen/scale.h similarity index 96% rename from paddle/pten/kernels/functions/eigen/scale.h rename to paddle/pten/kernels/hybird/eigen/scale.h index 49ee561df50ec..111f6c22cc35e 100644 --- a/paddle/pten/kernels/functions/eigen/scale.h +++ b/paddle/pten/kernels/hybird/eigen/scale.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/functions/eigen/sign.h b/paddle/pten/kernels/hybird/eigen/sign.h similarity index 96% rename from paddle/pten/kernels/functions/eigen/sign.h rename to paddle/pten/kernels/hybird/eigen/sign.h index 5cd620815bf26..0beebda4f39e8 100644 --- a/paddle/pten/kernels/functions/eigen/sign.h +++ b/paddle/pten/kernels/hybird/eigen/sign.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/common.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/hybird/general/CMakeLists.txt b/paddle/pten/kernels/hybird/general/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/pten/kernels/functions/general/elementwise_base.h b/paddle/pten/kernels/hybird/general/elementwise_base.h similarity index 100% rename from paddle/pten/kernels/functions/general/elementwise_base.h rename to paddle/pten/kernels/hybird/general/elementwise_base.h diff --git a/paddle/pten/kernels/functions/general/elementwise_functor.h b/paddle/pten/kernels/hybird/general/elementwise_functor.h similarity index 98% rename from paddle/pten/kernels/functions/general/elementwise_functor.h rename to paddle/pten/kernels/hybird/general/elementwise_functor.h index 973389f14797e..109f0907c6a10 100644 --- a/paddle/pten/kernels/functions/general/elementwise_functor.h +++ b/paddle/pten/kernels/hybird/general/elementwise_functor.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/blas/elementwise.h" -#include "paddle/pten/kernels/functions/eigen/elementwise.h" +#include "paddle/pten/kernels/hybird/blas/elementwise.h" +#include "paddle/pten/kernels/hybird/eigen/elementwise.h" namespace pten { namespace general { diff --git a/paddle/pten/kernels/functions/general/manipulation.h b/paddle/pten/kernels/hybird/general/manipulation.h similarity index 100% rename from paddle/pten/kernels/functions/general/manipulation.h rename to paddle/pten/kernels/hybird/general/manipulation.h diff --git a/paddle/pten/kernels/functions/general/reduce_impl.h b/paddle/pten/kernels/hybird/general/reduce_impl.h similarity index 95% rename from paddle/pten/kernels/functions/general/reduce_impl.h rename to paddle/pten/kernels/hybird/general/reduce_impl.h index a48eb7e44bdc1..50f40c5f2ca12 100644 --- a/paddle/pten/kernels/functions/general/reduce_impl.h +++ b/paddle/pten/kernels/hybird/general/reduce_impl.h @@ -16,8 +16,8 @@ #include "paddle/fluid/platform/transform.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/eigen/reduce.h" -#include "paddle/pten/kernels/functions/math/cast_func.h" +#include "paddle/pten/kernels/hybird/eigen/reduce.h" +#include "paddle/pten/kernels/hybird/math/cast_func.h" namespace pten { namespace general { diff --git a/paddle/pten/kernels/functions/math/cast_func.h b/paddle/pten/kernels/hybird/math/cast_func.h similarity index 100% rename from paddle/pten/kernels/functions/math/cast_func.h rename to paddle/pten/kernels/hybird/math/cast_func.h diff --git a/paddle/pten/kernels/functions/math/matmul_func.h b/paddle/pten/kernels/hybird/math/matmul_func.h similarity index 100% rename from paddle/pten/kernels/functions/math/matmul_func.h rename to paddle/pten/kernels/hybird/math/matmul_func.h diff --git a/paddle/pten/kernels/math/cpu/transpose.cc b/paddle/pten/kernels/hybird/transpose.cc similarity index 98% rename from paddle/pten/kernels/math/cpu/transpose.cc rename to paddle/pten/kernels/hybird/transpose.cc index c8580b8877d55..73375a6a379fe 100644 --- a/paddle/pten/kernels/math/cpu/transpose.cc +++ 
b/paddle/pten/kernels/hybird/transpose.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/pten/kernels/math/transpose.h" +#include "paddle/pten/kernels/hybird/transpose.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/kernels/math/cuda/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu similarity index 97% rename from paddle/pten/kernels/math/cuda/transpose.cu rename to paddle/pten/kernels/hybird/transpose.cu index a18efde2c2e62..bf7a1409938a6 100644 --- a/paddle/pten/kernels/math/cuda/transpose.cu +++ b/paddle/pten/kernels/hybird/transpose.cu @@ -15,8 +15,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/functions/math/cast_func.h" -#include "paddle/pten/kernels/math/transpose.h" +#include "paddle/pten/kernels/hybird/math/cast_func.h" +#include "paddle/pten/kernels/hybird/transpose.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/pten/kernels/math/transpose.h b/paddle/pten/kernels/hybird/transpose.h similarity index 100% rename from paddle/pten/kernels/math/transpose.h rename to paddle/pten/kernels/hybird/transpose.h diff --git a/paddle/pten/kernels/math/CMakeLists.txt b/paddle/pten/kernels/math/CMakeLists.txt deleted file mode 100644 index 429a6df8e2cc7..0000000000000 --- a/paddle/pten/kernels/math/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_subdirectory(cpu) -if(WITH_GPU OR WITH_ROCM) - add_subdirectory(cuda) -endif() diff --git a/paddle/pten/kernels/math/cpu/CMakeLists.txt b/paddle/pten/kernels/math/cpu/CMakeLists.txt deleted file mode 100644 index 235a49a5e4af5..0000000000000 --- a/paddle/pten/kernels/math/cpu/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor device_context) diff --git a/paddle/pten/kernels/primitive/CMakeLists.txt b/paddle/pten/kernels/primitive/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc index 352d21e6e9bf9..f361933cad45a 100644 --- a/paddle/pten/kernels/xpu/manipulation.cc +++ b/paddle/pten/kernels/xpu/manipulation.cc @@ -14,7 +14,7 @@ #include "paddle/pten/kernels/xpu/manipulation.h" #include "paddle/pten/infermeta/unary.h" -#include "paddle/pten/kernels/functions/general/manipulation.h" +#include "paddle/pten/kernels/hybird/general/manipulation.h" #include "paddle/pten/kernels/xpu/utils.h" namespace pten { @@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationXPU); - -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten_contiguous_range", +PT_REGISTER_KERNEL(flatten, XPU, ANY, pten::Flatten, @@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range", int, int64_t) {} -PT_REGISTER_KERNEL("flatten_contiguous_range.mid", +PT_REGISTER_KERNEL(flatten_mid, XPU, ANY, pten::FlattenWithXShape, @@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid", int, int64_t) {} -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". 
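// Naming note for the hunks in this file and the CPU/CUDA files above: since
// PT_REGISTER_KERNEL now takes a bare identifier and stringizes it via
// #kernel_name, the old string names become plain identifiers, with dots
// turned into underscores and legacy names normalized:
// "flatten_contiguous_range" -> flatten, "flatten_contiguous_range.mid" ->
// flatten_mid, "reshape2" -> reshape, "matmul_v2" -> matmul, and
// "reshape.host.mid" -> reshape_host_mid.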
-PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2", - XPU, - ANY, - pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {} diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc index 329dc2baf87b5..5c98217f4ec2c 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/utils.cc @@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsXPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {} diff --git a/paddle/pten/ops/CMakeLists.txt b/paddle/pten/ops/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index b6179f11b1019..227dcc6e9568d 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -21,12 +21,6 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -PT_DECLARE_MODULE(ManipulationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(ManipulationCUDA); -#endif - namespace paddle { namespace tests { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 44afeecec32da..6ffdd75f72b4e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -156,6 +156,9 @@ from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 from .tensor.manipulation import tensordot # noqa: F401 +from .tensor.manipulation import as_complex # noqa: F401 +from .tensor.manipulation import as_real # noqa: F401 + from .tensor.math import abs # noqa: F401 from .tensor.math import acos # noqa: F401 from .tensor.math import asin # noqa: F401 @@ -227,6 +230,8 @@ from .tensor.math import lerp # noqa: F401 from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 +from .tensor.math import gcd # noqa: F401 +from .tensor.math import lcm # noqa: F401 from .tensor.math import diff # noqa: F401 from .tensor.math import angle # noqa: F401 @@ -260,6 +265,7 @@ from .framework import ParamAttr # noqa: F401 from .framework import create_parameter # noqa: F401 from .framework import CPUPlace # noqa: F401 +from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 @@ -291,6 +297,7 @@ from .fluid.framework import set_flags # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 +from .device import is_compiled_with_ipu # noqa: F401 from .device import XPUPlace # noqa: F401 from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 @@ -478,6 +485,8 @@ 'atan2', 'rad2deg', 'deg2rad', + 'gcd', + 'lcm', 'expand', 'broadcast_to', 'ones_like', @@ -553,6 +562,8 @@ 'einsum', 'set_flags', 'get_flags', + 'as_complex', + 'as_real', 'diff', 'angle', ] diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 95402898589f6..0a11d59d69c94 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -28,7 +28,9 @@ 'set_device', 'get_device', 'XPUPlace', + 'IPUPlace', 'is_compiled_with_xpu', + 'is_compiled_with_ipu', 'is_compiled_with_cinn', 
'is_compiled_with_cuda', 'is_compiled_with_rocm', @@ -55,6 +57,36 @@ def is_compiled_with_npu(): return core.is_compiled_with_npu() +def is_compiled_with_ipu(): + """ + Whether paddle was built with WITH_IPU=ON to support Graphcore IPU. + + Returns (bool): `True` if IPU is supported, otherwise `False`. + + Examples: + .. code-block:: python + + import paddle + support_ipu = paddle.is_compiled_with_ipu() + """ + return core.is_compiled_with_ipu() + + +def IPUPlace(): + """ + Return a Graphcore IPU Place + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + place = paddle.device.IPUPlace() + """ + return core.IPUPlace() + + def is_compiled_with_xpu(): """ Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun @@ -143,13 +175,19 @@ def _convert_to_place(device): selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") device_id = int(selected_npus[0]) place = core.NPUPlace(device_id) + elif lower_device == 'ipu': + if not core.is_compiled_with_ipu(): + raise ValueError( + "The device should not be 'ipu', " \ + "since PaddlePaddle is not compiled with IPU") + place = core.IPUPlace() else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) avaliable_npu_device = re.match(r'npu:\d+', lower_device) if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device: raise ValueError( - "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'" + "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu" ) if avaliable_gpu_device: if not core.is_compiled_with_cuda(): @@ -183,13 +221,13 @@ def _convert_to_place(device): def set_device(device): """ - Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU. + Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU. They are represented by string identifiers. This function can specify the global device which the OP will run. Parameters: device(str): This parameter determines the specific running device. - It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``, + It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``, where ``x`` is the index of the GPUs, XPUs or NPUs. 
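For instance, a minimal sketch that exercises the new IPU entry points added in this file (assuming a build with WITH_IPU=ON; on other builds ``is_compiled_with_ipu()`` returns False and ``set_device('ipu')`` raises a ValueError):

    import paddle
    if paddle.is_compiled_with_ipu():
        paddle.device.set_device('ipu')
        place = paddle.device.IPUPlace()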
Examples: @@ -236,5 +274,10 @@ def get_device(): elif isinstance(place, core.NPUPlace): device_id = place.get_device_id() device = 'npu:' + str(device_id) + elif isinstance(place, core.IPUPlace): + num_devices = core.get_ipu_device_count() + device = "ipus:{{0-{}}}".format(num_devices - 1) + else: + raise ValueError("The device specification {} is invalid".format(place)) return device diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 786d24052e2f7..7bda6a9a28348 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -296,6 +296,83 @@ def is_output_compatible(self, dist_op): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_replicate(out_dims_mapping[-1]): + return False + + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + input_dims_mapping = [] + ordered_input_shard_dims_mapping = [] + + for dim in (x_dims_mapping + y_dims_mapping): + input_dims_mapping.append(dim) + + for item in input_dims_mapping: + if item not in ordered_input_shard_dims_mapping and item != -1: + ordered_input_shard_dims_mapping.append(item) + + for mapping in out_dims_mapping: + if mapping not in input_dims_mapping: + return False + + if is_dim_shard(x_dims_mapping[0]): + order_index = 0 + for idx, item in enumerate(out_dims_mapping): + if item != -1: + if item != ordered_input_shard_dims_mapping[order_index]: + return False + else: + order_index += 1 + if order_index != len(ordered_input_shard_dims_mapping): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[ + 1]): + return False + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_shard(x_dims_mapping[0]): + for mapping in y_dims_mapping[1:]: + if is_dim_shard(mapping) and mapping == x_dims_mapping[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -510,6 +587,95 @@ def is_output_compatible(self, dist_op): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = 
op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + if op_desc.attr('transpose_X') or op_desc.attr('transpose_Y'): + return False + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + # for gpt2, x dims > y dims, this is a temporary solution + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + # Other dimensions must be replicate except the batch dimension + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_replicate(x_dims_mapping[-1]): + return False + + if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[ + -1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + x_shard_dim_count = 0 + x_shard_dims = [] + y_shard_dim_count = 0 + y_shard_dims = [] + for dim in x_dims_mapping: + if is_dim_shard(dim): + x_shard_dim_count += 1 + x_shard_dims.append(dim) + + for dim in y_dims_mapping: + if is_dim_shard(dim): + y_shard_dim_count += 1 + y_shard_dims.append(dim) + + if not x_shard_dims and not y_shard_dims: + return False + + if x_shard_dims[-1] != y_shard_dims[0]: + return False + + if x_shard_dim_count == y_shard_dim_count: + for dim in out_dims_mapping: + if is_dim_shard(dim): + return False + if x_shard_dims != y_shard_dims: + return False + else: + if x_shard_dim_count < y_shard_dim_count: + return False + output_shard_dims = [] + for dim in out_dims_mapping: + if is_dim_shard(dim): + output_shard_dims.append(dim) + if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -710,6 +876,59 @@ def is_output_compatible(self, dist_op): return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping + ), "now just support x dims > y dims,but x:{0} and y:{1}".format( + x_dims_mapping, y_dims_mapping) + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != 
len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + if is_valid_list_index(out_dims_mapping, + -2) and is_dim_shard(out_dims_mapping[-2]): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_valid_list_index(x_dims_mapping, + -2) and is_dim_shard(x_dims_mapping[-2]): + return False + + if is_dim_shard(y_dims_mapping[-1]): + return False + + if is_valid_list_index(y_dims_mapping, + -2) and is_dim_shard(y_dims_mapping[-2]): + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -777,6 +996,86 @@ def is_output_compatible(self, dist_op): return False return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + + if op_desc.attr('trans_x') or op_desc.attr('trans_y'): + return False + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_replicate(out_dims_mapping[-1]): + return False + + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + input_dims_mapping = [] + ordered_input_shard_dims_mapping = [] + + for dim in (x_dims_mapping + y_dims_mapping): + input_dims_mapping.append(dim) + + for item in input_dims_mapping: + if item not in ordered_input_shard_dims_mapping and item != -1: + ordered_input_shard_dims_mapping.append(item) + + for mapping in out_dims_mapping: + if mapping not in input_dims_mapping: + return False + + if is_dim_shard(x_dims_mapping[0]): + order_index = 0 + for idx, item in enumerate(out_dims_mapping): + if item != -1: + if item != ordered_input_shard_dims_mapping[order_index]: + return False + else: + order_index += 1 + if order_index != len(ordered_input_shard_dims_mapping): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_dim_shard(y_dims_mapping[0]) or is_dim_replicate(y_dims_mapping[ + 1]): + return False + + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_shard(x_dims_mapping[0]): + for mapping in y_dims_mapping[1:]: + if is_dim_shard(mapping) and mapping == x_dims_mapping[0]: + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -985,6 +1284,94 @@ def is_output_compatible(self, dist_op): return False return True + def 
is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + if op_desc.attr('trans_x') or op_desc.attr('trans_y'): + return False + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping), "now just support x dims > y dims" + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + if is_dim_replicate(x_dims_mapping[-1]): + return False + + if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[ + -1]): + return False + + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + + x_shard_dim_count = 0 + x_shard_dims = [] + y_shard_dim_count = 0 + y_shard_dims = [] + for dim in x_dims_mapping: + if is_dim_shard(dim): + x_shard_dim_count += 1 + x_shard_dims.append(dim) + + for dim in y_dims_mapping: + if is_dim_shard(dim): + y_shard_dim_count += 1 + y_shard_dims.append(dim) + + if not x_shard_dims and not y_shard_dims: + return False + + if x_shard_dims[-1] != y_shard_dims[0]: + return False + + if x_shard_dim_count == y_shard_dim_count: + for dim in out_dims_mapping: + if is_dim_shard(dim): + return False + if x_shard_dims != y_shard_dims: + return False + else: + if x_shard_dim_count < y_shard_dim_count: + return False + output_shard_dims = [] + for dim in out_dims_mapping: + if is_dim_shard(dim): + output_shard_dims.append(dim) + if not output_shard_dims or output_shard_dims[0] != x_shard_dims[0]: + return False + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) @@ -1183,6 +1570,61 @@ def is_output_compatible(self, dist_op): return True + def is_auto_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + y_name = op_desc.input('Y')[0] + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name) + assert len(x_dims_mapping) >= len( + y_dims_mapping + ), "now just support x dims > y dims,but x:{0} and y:{1}".format( + x_dims_mapping, y_dims_mapping) + + if len(x_dims_mapping) == len(y_dims_mapping) and len( + x_dims_mapping) == 4: + if x_dims_mapping[:2] != y_dims_mapping[:2]: + return False + if x_dims_mapping[:2] != out_dims_mapping[:2]: + return False + x_dims_mapping = x_dims_mapping[-2:] + 
y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + elif len(x_dims_mapping) != len(y_dims_mapping) and len( + x_dims_mapping) == 3: + if x_dims_mapping[0] != out_dims_mapping[0]: + return False + x_dims_mapping = x_dims_mapping[-2:] + y_dims_mapping = y_dims_mapping[-2:] + out_dims_mapping = out_dims_mapping[-2:] + + if is_dim_shard(out_dims_mapping[-1]): + return False + + if is_valid_list_index(out_dims_mapping, + -2) and is_dim_shard(out_dims_mapping[-2]): + return False + + if is_dim_shard(x_dims_mapping[-1]): + return False + + if is_valid_list_index(x_dims_mapping, + -2) and is_dim_shard(x_dims_mapping[-2]): + return False + + if is_dim_shard(y_dims_mapping[-1]): + return False + + if is_valid_list_index(y_dims_mapping, + -2) and is_dim_shard(y_dims_mapping[-2]): + return False + + return True + def update_dims_mapping(self, dist_op): changed = False dim_changed = _update_dims_mapping_for_matmul(dist_op) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index e58b6c312fa1f..3b8b36a61e2fb 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -474,12 +474,12 @@ def set_table_config(msg, config_name, configs, index=0): for field in msg.DESCRIPTOR.fields: name = config_name + "." + field.name if field.type == FieldDescriptor.TYPE_MESSAGE: - print("message:", name) + # print("message:", name) if field.label == FieldDescriptor.LABEL_REPEATED: if name + ".num" not in configs: continue num = configs[name + ".num"] - print("message num:", name, num) + # print("message num:", name, num) for i in range(num): data = getattr(msg, field.name).add() set_table_config(data, name, configs, i) @@ -487,7 +487,7 @@ def set_table_config(msg, config_name, configs, index=0): set_table_config( getattr(msg, field.name), name, configs) else: - print("not message:", name) + # print("not message:", name) if name not in configs: continue if field.label == FieldDescriptor.LABEL_REPEATED: @@ -501,7 +501,11 @@ def set_table_config(msg, config_name, configs, index=0): if not configs: print("table configs is empty") else: - set_table_config(table_param, "table_parameters", configs) + for table_name in configs: + table_data = table_param.add() + table_data.table_name = table_name + set_table_config(table_data, "table_parameters." 
+ table_name, + configs[table_name]) @property def amp(self): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ffd24add50a4d..dc313c33ee3e2 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -27,11 +27,13 @@ import paddle import paddle.fluid as fluid from paddle import framework +from paddle.fluid import core import paddle.distributed as dist from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.internal_storage import ParamStorage -from ...meta_parallel.sharding.sharding_utils import Type +from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad # CUDA alignment 256 bytes alignment = {"gpu": 256, } @@ -99,16 +101,41 @@ def __init__(self, self.broadcast_fp16 = broadcast_fp16 self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + group, + paddle.get_device()) + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + self.offload = offload # Using for offload + self.offload_device = "cpu" + + self._master_params = {} # Update optimizer parameters and adjust parameter storage and use according to rank. self.update_opt_status() def _generate_master_params(self, trainable_params): - for param in trainable_params: - if param.dtype == Type.fp16.value: - self._optim._master_weights[param.name] = paddle.cast( - param, Type.fp32.value) + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.VarBase( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + self._optim._master_weights = self._master_params + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.name] = paddle.cast( + param, Type.fp32.value) def update_opt_status(self): """Update optimizer status and parameter storage information, and special functions to be developed. @@ -243,22 +270,43 @@ def step(self): A wrapper for Optimizer's step function to finish the update operation of the optimizer. 
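+ With offload enabled, the update runs on the CPU copy of the fp32 master weights, and the updated values are then moved back to the device and cast to each parameter's dtype.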
""" - # Synchronize optimizer parameters for the current rank - if len(self.dtype_rank_params.keys( - )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp32.value][self.rank] - elif len(self.dtype_rank_params.keys( - )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + if self.offload: + self._optim._parameter_list = [ + param for name, param in self._master_params.items() + ] else: - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + self.dtype_rank_params[ + # Synchronize optimizer parameters for the current rank + if len(self.dtype_rank_params.keys( + )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ Type.fp32.value][self.rank] + elif len(self.dtype_rank_params.keys( + )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + else: + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + self.dtype_rank_params[ + Type.fp32.value][self.rank] # Run the optimizer of the current rank step - self._optim.step() + if self.offload: + with device_guard(self.rank, self.offload_device): + self._optim.step() + + for param in self._optim._parameter_list: + self._master_params[param.name].set_value(param) + + dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda(dev_id) + .cast(dtype=param.dtype)) + self._master_params[param.name].clear_gradient(False) + else: + self._optim.step() # Synchronize all the updated shards in between the ranks self._broadcast_params() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 37b85751149f7..fd49c2a7d6586 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -112,6 +112,18 @@ def __init__( self._has_grad_storage = [] self._grad_storage_list = [] + # offload + # TODO(haohongxiang): Now it's not supported for multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = self._sharding_optimizers[0].offload + self._offload_device = "cpu" + # Set backward pass hooks self._bw_hooks = [] @@ -156,7 +168,8 @@ def clear_gradients(self): # Release grad storages for dtype in self._grad_storages.keys(): if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.zero_() + if not self._offload: + self._grad_storages[dtype][self._rank].buffer.zero_() # Release params for param in self._trainable_params: @@ -167,17 +180,24 @@ def grad_scale(self): """ Before the gradient accumulation, scale the gradient. 
""" - # Scale grad storages - for dtype in self._grad_storages.keys(): - if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.scale_( - scale=self._world_size_scaling) - - # Scale params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + if self._offload: + for param in self._trainable_params: + if param.name in self._sharding_optimizers[ + 0]._master_params.keys(): + self._sharding_optimizers[0]._master_params[ + param.name].grad.scale_(scale=self._world_size_scaling) + else: + # Scale grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version(True) def _init_internal_storage(self, needs_fresh): """ @@ -195,8 +215,14 @@ def to(self, device=None, dtype=None, blocking=True): """ Synchronously or asynchronously convert the data type of the layer, the device is not supported now. """ + assert isinstance(device, str), "Device must be type str" assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + def _fresh_trainable(self): """ Whether to update training parameters. """ @@ -283,12 +309,17 @@ def reduce(*_): self._grad_reduced[index] = False if not self._accumulate_grads: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): if dst_rank != self._rank: param.clear_gradient(False) + elif self._offload: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from(param.grad.cpu( + ).cast(dtype=Type.fp32.value)) + param.clear_gradient(False) # Synchronize the reduce parameter gradient self._tasks_flow.append( @@ -339,6 +370,15 @@ def cleanup(): grad_storage.buffer.value().get_tensor()._clear( ) + elif self._offload: + grad_storage.to(device=self._offload_device) + for param in grad_storage._params: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from( + param.grad.cast( + dtype=Type.fp32.value)) + grad_storage.buffer.value().get_tensor()._clear( + ) # Reduce the bucket grad_storage.sent = True @@ -478,7 +518,7 @@ def _build_grad_storages(self): # Rebuild fp16/fp32 grad storages for dtype in self._grad_storages.keys(): for dst_rank, grad_storage in self._grad_storages[dtype].items(): - if dst_rank != self._rank: + if self._offload or dst_rank != self._rank: grad_storage.manumal_relase() grad_storage.rebuild() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index d4c443e385f6f..651bed82396d1 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -17,10 +17,17 @@ from collections import abc from enum import Enum from 
math import inf +import numpy as np +from types import MethodType import paddle import paddle.distributed as dist +from paddle import _C_ops from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph import base as imperative_base class Taskflow: @@ -41,6 +48,88 @@ class Type(Enum): fp32 = paddle.float32 +class ShardingClipGrad: + def __init__(self, clip, group, device): + self._clip = clip + self._group = group + self._device = device + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + + sum_square_fp16 = [] + sum_square_fp32 = [] + + for p, g in params_grads: + if g is None or getattr(p, 'need_clip', True) is False: + continue + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + sum_square_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + global_norm_var = global_norm_fp16 + global_norm_fp32 + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + @contextlib.contextmanager def device_guard(dev_id, device="cpu"): origin_device = paddle.device.get_device() @@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"): yield finally: paddle.set_device(origin_device) + + +@dygraph_only +def ShardingScaler(scaler, sharding_group): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + + for group in optimizer._param_groups: + for param in group['params']: + if param._grad_ivar() is not 
None: + param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) + else: + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 + ) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 + ) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=sharding_group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index 1c51e833f53f6..1240e1492a784 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -56,53 +56,77 @@ def get_default_accessor_proto(accessor, varname, o_main_program): embedding_dim = 0 for var in o_main_program.list_vars(): if var.name == varname: - print("var:", var) - print("var.shape:", var.shape) embedding_dim = var.shape[1] - print("sparse dim:", embedding_dim) break - accessor.accessor_class = "CtrCommonAccessor" - accessor.fea_dim = embedding_dim + 2 - accessor.embedx_dim = embedding_dim - 1 - accessor.embedx_threshold = 0 + if not accessor.HasField("accessor_class"): + accessor.accessor_class = "CtrCommonAccessor" + if not accessor.HasField("fea_dim"): + accessor.fea_dim = embedding_dim + 2 + if not accessor.HasField("embedx_dim"): + accessor.embedx_dim = embedding_dim - 1 + if not accessor.HasField("embedx_threshold"): + accessor.embedx_threshold = 0 ctr_accessor_param = accessor.ctr_accessor_param - ctr_accessor_param.nonclk_coeff = 0.1 - ctr_accessor_param.click_coeff = 1.0 - ctr_accessor_param.base_threshold = 0 - ctr_accessor_param.delta_threshold = 0 - ctr_accessor_param.delta_keep_days = 16 - ctr_accessor_param.show_click_decay_rate = 1 - ctr_accessor_param.delete_threshold = 0 - ctr_accessor_param.delete_after_unseen_days = 30 - ctr_accessor_param.ssd_unseenday_threshold = 1 - - embed_sgd_param = accessor.embed_sgd_param - embed_sgd_param.name = "SparseAdaGradSGDRule" - embed_sgd_param.adagrad.learning_rate = 0.05 - embed_sgd_param.adagrad.initial_g2sum = 3.0 - embed_sgd_param.adagrad.initial_range = 0.0001 - embed_sgd_param.adagrad.weight_bounds.append(-10.0) - embed_sgd_param.adagrad.weight_bounds.append(10.0) - - embedx_sgd_param = accessor.embedx_sgd_param - embedx_sgd_param.name = "SparseAdaGradSGDRule" - embedx_sgd_param.adagrad.learning_rate = 0.05 - embedx_sgd_param.adagrad.initial_g2sum = 3.0 - 
embedx_sgd_param.adagrad.initial_range = 0.0001 - embedx_sgd_param.adagrad.weight_bounds.append(-10.0) - embedx_sgd_param.adagrad.weight_bounds.append(10.0) + if not ctr_accessor_param.HasField("nonclk_coeff"): + ctr_accessor_param.nonclk_coeff = 0.1 + if not ctr_accessor_param.HasField("click_coeff"): + ctr_accessor_param.click_coeff = 1.0 + if not ctr_accessor_param.HasField("base_threshold"): + ctr_accessor_param.base_threshold = 0 + if not ctr_accessor_param.HasField("delta_threshold"): + ctr_accessor_param.delta_threshold = 0 + if not ctr_accessor_param.HasField("delta_keep_days"): + ctr_accessor_param.delta_keep_days = 16 + if not ctr_accessor_param.HasField("show_click_decay_rate"): + ctr_accessor_param.show_click_decay_rate = 1 + if not ctr_accessor_param.HasField("delete_threshold"): + ctr_accessor_param.delete_threshold = 0 + if not ctr_accessor_param.HasField("delete_after_unseen_days"): + ctr_accessor_param.delete_after_unseen_days = 30 + if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): + ctr_accessor_param.ssd_unseenday_threshold = 1 + + for sgd_param in [accessor.embed_sgd_param, accessor.embedx_sgd_param]: + if not sgd_param.HasField("name"): + sgd_param.name = "SparseAdaGradSGDRule" + if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": + if not sgd_param.adagrad.HasField("learning_rate"): + sgd_param.adagrad.learning_rate = 0.05 + if not sgd_param.adagrad.HasField("initial_g2sum"): + sgd_param.adagrad.initial_g2sum = 3.0 + if not sgd_param.adagrad.HasField("initial_range"): + sgd_param.adagrad.initial_range = 0.0001 + if len(sgd_param.adagrad.weight_bounds) == 0: + sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseNaiveSGDRule": + if not sgd_param.naive.HasField("learning_rate"): + sgd_param.naive.learning_rate = 0.05 + if not sgd_param.naive.HasField("initial_range"): + sgd_param.naive.initial_range = 0.0001 + if len(sgd_param.naive.weight_bounds) == 0: + sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseAdamSGDRule": + if not sgd_param.adam.HasField("learning_rate"): + sgd_param.adam.learning_rate = 0.001 + if not sgd_param.adam.HasField("initial_range"): + sgd_param.adam.initial_range = 0.0001 + if not sgd_param.adam.HasField("beta1_decay_rate"): + sgd_param.adam.beta1_decay_rate = 0.9 + if not sgd_param.adam.HasField("beta2_decay_rate"): + sgd_param.adam.beta2_decay_rate = 0.999 + if not sgd_param.adam.HasField("ada_epsilon"): + sgd_param.adam.ada_epsilon = 1e-08 + if len(sgd_param.adam.weight_bounds) == 0: + sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) def check_embedding_dim(accessor, varname, o_main_program): embedding_dim = 0 for var in o_main_program.list_vars(): if var.name == varname: - print("var:", var) - print("var.shape:", var.shape) embedding_dim = var.shape[1] - print("sparse dim:", embedding_dim) break fea_dim = accessor.fea_dim if fea_dim != embedding_dim + 2: @@ -917,19 +941,14 @@ def _get_tables(): if self.compiled_strategy.is_geo_mode(): table.table_class = "SparseGeoTable" else: - import copy - table_proto = copy.deepcopy(self.context[ - "user_defined_strategy"].sparse_table_configs) - print('table proto:', table_proto) - print('table_class:', table_proto.table_class) - print('shard_num:', table_proto.shard_num) - print('table_proto.accessor:', table_proto.accessor) - print('accessor.IsInitialized', - table_proto.accessor.IsInitialized()) - print('accessor.ByteSize', - table_proto.accessor.ByteSize()) - if table_proto.table_class: - 
print('table_proto.table_class is true')
+ all_table_proto = self.context[
+ "user_defined_strategy"].sparse_table_configs
+ table_proto = all_table_proto.add()
+ for proto in all_table_proto:
+ if proto.table_name == common.table_name:
+ table_proto = proto
+ break
+ if table_proto.HasField("table_class"):
table.table_class = table_proto.table_class
else:
table.table_class = parse_table_class(
@@ -939,8 +958,7 @@ def _get_tables():
warnings.warn(
"The PS mode must use MemorySparseTable.")
- if table_proto.shard_num:
- print('table_proto.shard_num is true')
+ if table_proto.HasField("shard_num"):
table.shard_num = table_proto.shard_num
else:
table.shard_num = 1000
@@ -949,22 +967,18 @@
)
if table_proto.accessor.ByteSize() == 0:
- print('table_proto.accessor is false')
- get_default_accessor_proto(table_proto.accessor,
- common.table_name,
- self.origin_main_program)
warnings.warn(
"The accessor of sparse table is not set, use default value."
)
+ get_default_accessor_proto(table_proto.accessor,
+ common.table_name,
+ self.origin_main_program)
check_embedding_dim(table_proto.accessor, common.table_name,
self.origin_main_program)
- print('accessor.ByteSize',
- table_proto.accessor.ByteSize())
from google.protobuf import text_format
table.accessor_proto = text_format.MessageToString(
table_proto.accessor)
- print("the_one_ps table_proto:", table.accessor_proto)
else:
table.type = "PS_DENSE_TABLE"
table.table_class = "CommonDenseTable"
@@ -1275,10 +1289,8 @@ def _ps_inference_save_inference_model(self,
is_dense=False,
split_dense_table=self.role_maker._is_heter_parameter_server_mode,
use_origin_program=True)
- print("the one ps sparses:", sparses)
sparse_names = self._save_sparse_params(executor, dirname, sparses,
main_program, mode)
- print("the one ps sparse names:", sparse_names)
denses = self.compiled_strategy.get_the_one_recv_context(
is_dense=True,
@@ -1293,7 +1305,7 @@ def _ps_inference_save_inference_model(self,
filter(
TheOnePSRuntime.__exclude_vars(sparse_names),
infer_program.list_vars()))
- print("remain_vars:", [var.name for var in remaining_vars])
+
for var in remaining_vars:
tensor = var.get_value()
paddle.save(
diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py
index ff41ca217e43b..f44b57ede468b 100644
--- a/python/paddle/distributed/fleet/utils/internal_storage.py
+++ b/python/paddle/distributed/fleet/utils/internal_storage.py
@@ -50,6 +50,29 @@ def __init__(self, size, dtype, device, convert_cpu=False):
else:
self.buffer = paddle.zeros(size, dtype=dtype)
+ def to(self, device, dtype=None, keep_alignment=True):
+ """
+ Move the underlying buffer
+ """
+ assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it"
+ assert (dtype is None or dtype == Type.fp32.value or
+ dtype == Type.fp16.value), "Conversion type is not supported now"
+
+ dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+ .split(":")[1])
+
+ if self._device != device:
+ tmp_buffer = self.buffer.cuda(
+ dev_id) if device == "gpu" else self.buffer.cpu()
+ for param in self._params:
+ param.clear_gradient(False)
+ param._gradient_set_empty(False)
+ self.buffer.value().get_tensor()._clear()
+ self.buffer = tmp_buffer
+
+ if dtype is not None:
+ self.buffer = self.buffer.cast(dtype=dtype)
+
class ParamStorage(InternalStorage):
"""
@@ -60,6 +83,16 @@ def __init__(self, size, dtype, device):
super().__init__(size, dtype, device, convert_cpu=True)
self.param2align = None
+ def to(self, device, 
dtype=None, keep_alignment=True):
+ """
+ Move the underlying buffer
+ """
+
+ super().to(device, dtype)
+
+ if keep_alignment:
+ self._array_params()
+
@fluid.dygraph.no_grad
def add_rank_params(self, trainable_params, param2align):
"""
@@ -78,7 +111,7 @@ def add_rank_params(self, trainable_params, param2align):
p_shape = self._add_param_as_view(param, param2align[param.name])
cpu_param_shape.append(p_shape)
- # buffer covert from cpu to cuda
+ # buffer convert from cpu to cuda
dev_id = int(paddle.get_device().split(":")[1])
self.buffer = self.buffer.cuda(dev_id)
self._fill = 0
@@ -109,7 +142,8 @@ def _add_param_as_view(self, param, align):
param.stop_gradient = origin_state
# Copy the current param value
- dev_id = int(paddle.get_device().split(":")[1])
+ dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device()
+ .split(":")[1])
with device_guard(dev_id, "cpu"):
tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill,
var_end))
@@ -134,6 +168,18 @@ def _convert_buffer(self, param, p_shape, align):
self._fill = offset
+ @fluid.dygraph.no_grad
+ def _array_params(self):
+ """
+ Given the parameters which have been registered previously, rebuild the whole InternalStorage.
+ """
+ assert len(self._params) > 0
+ assert self.param2align is not None
+
+ self._fill = 0
+ for p in self._params:
+ self._convert_buffer(p, p.shape, self.param2align[p.name])
+
class GradStorage(InternalStorage):
"""
@@ -171,6 +217,18 @@ def can_add_grad_view(self, param, align):
param.shape) + align <= self._max_size and id(
param) not in self._param_ids
+ def to(self, device, dtype=None, keep_alignment=True):
+ """
+ Move the underlying buffer
+ """
+ if self._release:
+ self.rebuild()
+
+ super().to(device, dtype)
+
+ if keep_alignment:
+ self._array_grads()
+
@fluid.dygraph.no_grad
def add_grad(self, param, align):
"""
@@ -206,17 +264,25 @@ def rebuild(self):
"""
Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage.
"""
- assert len(self._params) > 0
if self._release:
- self.buffer = paddle.zeros(
- [self._max_size], dtype=self._params[0].dtype)
+ self.buffer = paddle.zeros([self._max_size], dtype=self._dtype)
for p in self._params:
self._add_grad_as_view(p, self._parm2align[p.name])
self._release = False
+ @fluid.dygraph.no_grad
+ def _array_grads(self):
+ """
+ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. 
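+ Unlike rebuild(), this does not reallocate the buffer; it only re-links each parameter's gradient view, e.g. after the buffer has been moved to another device.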
+ """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + @fluid.dygraph.no_grad def _add_grad_as_view(self, param, align): assert np.prod( @@ -229,8 +295,17 @@ def _add_grad_as_view(self, param, align): assert offset <= np.prod(self.buffer.shape) # Copy the current grad value to InternalStorage - assert self._device == "gpu" - tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) - param._copy_gradient_from(tmp_var) - tmp_var.value().get_tensor()._clear() + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + if self._device == "cpu": + with device_guard(dev_id, self._device): + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + + elif self._device == "gpu": + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + self._fill = offset diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e30d3e4c20a92..cf198eab1e8e0 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -305,7 +305,8 @@ def sample(self, shape, seed=0): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype, min=0., max=1., + seed=seed) * (tensor.zeros( output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5482413dbbc5d..d8ee875e768e5 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,7 @@ from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, Scope, _Scope -from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace from .incubate import fleet from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig @@ -132,6 +132,7 @@ 'CUDAPlace', 'CUDAPinnedPlace', 'NPUPlace', + 'IPUPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr', @@ -197,6 +198,11 @@ def remove_flag_if_exists(name): if os.name == 'nt': remove_flag_if_exists('cpu_deterministic') + if core.is_compiled_with_ipu(): + # Currently we request all ipu available for training and testing + # finer control of pod of IPUs will be added later + read_env_flags += [] + core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) # Note(zhouwei25): sys may not have argv in some cases, # Such as: use Python/C API to call Python from C++ diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 125d9fa88d4ae..5d29dc522b3ef 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -484,7 +484,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): model_filename=model_filename, params_filename=params_filename)) - self._gather_scales(infer_program, scope) + self._gather_scales(infer_program, scope, fetch_targets) self._set_skip_quant_attr(infer_program) @@ -520,10 +520,10 @@ def 
_is_target_layer(self, layer): return flag - def _gather_scales(self, program, scope): + def _gather_scales(self, program, scope, fetch_targets): """ Get all scales from fake ops, save them into the corresponding ops - and delete all moving_average_abs_max_scale ops. + and delete all moving_average_abs_max_scale ops. """ def _gather_input_scale(): @@ -580,6 +580,11 @@ def _gather_output_scale(): for next_op in next_ops: next_op._rename_input(out_var_name, in_var_name) + # If next_op is `fetch` and out_var_name in fetch_targets, + # fetch_targets must update to in_var_name when rename input. + for i in range(len(fetch_targets)): + if fetch_targets[i].name == out_var_name: + fetch_targets[i] = block.var(in_var_name) _gather_input_scale() _gather_output_scale() diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 3e033f70aca38..1ddb9c8e5fa9f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -410,6 +410,23 @@ def quantize(self): for op_type in self._dynamic_quantize_op_type): self._collect_dynamic_quantize_op_threshold( self._dynamic_quantize_op_type) + + # Move sub blocks persistable var to global block + global_block = self._program.global_block() + for _op in global_block.ops: + if _op.type == "while": + _block_id = _op.attr("sub_block").id + _block = self._program.block(_block_id) + persistables = [] + for _name, _var in _block.vars.items(): + if _var.persistable: + global_block._clone_variable(_var) + persistables.append(_name) + for _name in persistables: + _block._remove_var(_name) + persistables.extend(_op.input('X')) + _op.desc.set_input("X", persistables) + return self._program def save_quantized_model(self, @@ -451,10 +468,6 @@ def _load_model_data(self): model_filename=self._model_filename, params_filename=self._params_filename) - if self._program.num_blocks > 1: - _logger.error("The post training quantization requires that the " - "program only has one block.") - if self._optimize_model: self._optimize_fp32_model() @@ -505,23 +518,26 @@ def collect_var_name(var_name_list, persistable_var_names, op_type): self._quantized_act_var_name.add(var_name) persistable_var_names = _all_persistable_var_names(self._program) - for op in self._program.global_block().ops: - op_type = op.type - if self._is_full_quantize and \ - op_type not in self._quantizable_op_type: - _logger.warning(op_type + " is not supported for quantization.") - # For quantized ops, sample inputs and outputs - if op_type in self._quantizable_op_type: - collect_var_name( - _get_op_input_var_names(op), persistable_var_names, op_type) - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) - # For other op, only sample output scale - elif op_type in self._out_scale_op_list: - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + op_type = op.type + if self._is_full_quantize and \ + op_type not in self._quantizable_op_type: + _logger.warning(op_type + + " is not supported for quantization.") + # For quantized ops, sample inputs and outputs + if op_type in self._quantizable_op_type: + collect_var_name( + _get_op_input_var_names(op), persistable_var_names, + op_type) + collect_var_name( + _get_op_output_var_names(op), 
persistable_var_names, + op_type) + # For other op, only sample output scale + elif op_type in self._out_scale_op_list: + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) def _set_activation_persistable(self): ''' @@ -696,16 +712,17 @@ def _save_input_threhold(self): ''' assert self._algo == "min_max", \ "The algo should be min_max to save input threshold." - for op in self._program.global_block().ops: - if op.type in self._quantizable_op_type: - for var_name in _get_op_input_var_names(op): - assert var_name in self._quantized_var_min - assert var_name in self._quantized_var_max - op._set_attr(var_name + ".min", - self._quantized_var_min[var_name]) - op._set_attr(var_name + ".max", - self._quantized_var_max[var_name]) - op._set_attr("with_quant_attr", True) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in self._quantizable_op_type: + for var_name in _get_op_input_var_names(op): + assert var_name in self._quantized_var_min + assert var_name in self._quantized_var_max + op._set_attr(var_name + ".min", + self._quantized_var_min[var_name]) + op._set_attr(var_name + ".max", + self._quantized_var_max[var_name]) + op._set_attr("with_quant_attr", True) def _collect_activation_abs_min_max(self): ''' @@ -795,7 +812,12 @@ def _update_program(self): activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - transform_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + # Insert fake_quant/fake_dequantize op must in test graph, so + # set per graph's _for_test is True. + sub_graph._for_test = True + transform_pass.apply(sub_graph) # use AddQuantDequantPass to insert fake_quant_dequant op minor_quantizable_op_types = [] @@ -806,7 +828,10 @@ def _update_program(self): scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types) - add_quant_dequant_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + add_quant_dequant_pass.apply(sub_graph) # save threshold to scale var node if self._algo in ["KL", "hist"]: @@ -836,7 +861,11 @@ def _update_program(self): activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - freeze_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + freeze_pass.apply(sub_graph) + self._program = graph.to_program() def _save_output_threshold(self): @@ -888,13 +917,15 @@ def analysis_and_save_info(op_node, out_var_name): save_info(op_node, out_var_name, self._quantized_var_max, "out_max", "post_min_max") - for op in self._program.global_block().ops: - if op.type in (self._quantizable_op_type + self._out_scale_op_list): - out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type - for var_name in out_var_names: - analysis_and_save_info(op, var_name) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in ( + self._quantizable_op_type + self._out_scale_op_list): + out_var_names = _get_op_output_var_names(op) + assert len(out_var_names) == 1, "Post training " + \ + "quantization only support one output for " + op.type + for var_name in out_var_names: + analysis_and_save_info(op, var_name) def _collect_dynamic_quantize_op_threshold(self, 
target_ops_type): """ diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 94d7a2ed15348..494ea96979719 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -139,6 +139,7 @@ endfunction() if(WIN32) list(REMOVE_ITEM TEST_OPS test_light_nas) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist) + list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model) @@ -336,6 +337,7 @@ if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) + set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py new file mode 100644 index 0000000000000..3c3dfd08fccfa --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -0,0 +1,313 @@ +# copyright (c) 2021 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import unittest +import os +import time +import sys +import random +import math +import functools +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.dataset.common import download +from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization + +paddle.enable_static() + +random.seed(0) +np.random.seed(0) + + +class TestPostTrainingQuantization(unittest.TestCase): + def setUp(self): + self.download_path = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.download_path) + self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + self.int8_model_path = os.path.join(os.getcwd(), + "post_training_" + self.timestamp) + try: + os.system("mkdir -p " + self.int8_model_path) + except Exception as e: + print("Failed to create {} due to {}".format(self.int8_model_path, + str(e))) + sys.exit(-1) + + def tearDown(self): + try: + os.system("rm -rf {}".format(self.int8_model_path)) + except Exception as e: + print("Failed to delete {} due to {}".format(self.int8_model_path, + str(e))) + + def cache_unzipping(self, target_folder, zip_path): + cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder) + os.system(cmd) + + def download_model(self, data_url, data_md5, folder_name): + download(data_url, self.download_path, data_md5) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + print('Data is downloaded at {0}'.format(zip_path)) + + data_cache_folder = os.path.join(self.cache_folder, folder_name) + self.cache_unzipping(self.cache_folder, zip_path) + return data_cache_folder + + def run_program(self, model_path, batch_size, infer_iterations): + print("test model path:" + model_path) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + [infer_program, feed_dict, fetch_targets] = \ + fluid.io.load_inference_model(model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', executor=exe) + val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size) + + img_shape = [1, 28, 28] + test_info = [] + cnt = 0 + periods = [] + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(img_shape) for x in data]).astype("float32") + input_label = np.array([x[1] for x in data]).astype("int64") + + t1 = time.time() + out = exe.run(infer_program, + feed={feed_dict[0]: image}, + fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + + out_label = np.argmax(np.array(out[0]), axis=1) + top1_num = sum(input_label == out_label) + test_info.append(top1_num) + cnt += len(data) + + if (batch_id + 1) == infer_iterations: + break + + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) + + def generate_quantized_model(self, + model_path, + algo="KL", + quantizable_op_type=["conv2d"], + is_full_quantize=False, + is_use_cache_file=False, + is_optimize_model=False, + batch_size=10, + batch_nums=10): + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.global_scope() + val_reader = paddle.dataset.mnist.train() + + ptq = PostTrainingQuantization( + executor=exe, + model_dir=model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', + sample_generator=val_reader, + batch_size=batch_size, + batch_nums=batch_nums, + algo=algo, + quantizable_op_type=quantizable_op_type, + is_full_quantize=is_full_quantize, + optimize_model=is_optimize_model, + 
is_use_cache_file=is_use_cache_file) + ptq.quantize() + ptq.save_quantized_model( + self.int8_model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams') + + def run_test(self, + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size=10, + infer_iterations=10, + quant_iterations=5): + + origin_model_path = self.download_model(data_url, data_md5, model_name) + #origin_model_path = os.path.join(origin_model_path, model_name) + + print("Start FP32 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + origin_model_path, batch_size, infer_iterations) + + print("Start INT8 post training quantization for {0} on {1} images ...". + format(model_name, quant_iterations * batch_size)) + self.generate_quantized_model( + origin_model_path, algo, quantizable_op_type, is_full_quantize, + is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + + print("Start INT8 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (int8_throughput, int8_latency, int8_acc1) = self.run_program( + self.int8_model_path, batch_size, infer_iterations) + + print("---Post training quantization of {} method---".format(algo)) + print( + "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.". + format(model_name, batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n". + format(model_name, batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() + + delta_value = fp32_acc1 - int8_acc1 + self.assertLess(delta_value, diff_threshold) + + +class TestPostTrainingKLForWhile(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "KL" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTraininghistForWhile(TestPostTrainingQuantization): + def test_post_training_hist(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "hist" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingmseForWhile(TestPostTrainingQuantization): + def test_post_training_mse(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "mse" + 
quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingavgForWhile(TestPostTrainingQuantization): + def test_post_training_avg(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "avg" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization): + def test_post_training_min_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "min_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization): + def test_post_training_abs_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "abs_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index d8cb3e0918dc8..10a9358612960 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -273,6 +273,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( @@ -718,6 +720,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 
c4df01c4c7654..51e85901e7d55 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -547,7 +547,11 @@ def func_to_source_code(function, dedent=True): raise TypeError( "The type of 'function' should be a function or method, but received {}.". format(type(function).__name__)) - source_code = inspect.getsource(function) + source_code_list, _ = inspect.getsourcelines(function) + source_code_list = [ + line for line in source_code_list if not line.lstrip().startswith('#') + ] + source_code = ''.join(source_code_list) if dedent: source_code = textwrap.dedent(source_code) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 996cec29e185b..f308af04e5e58 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -238,7 +238,7 @@ def backward(self, grad_tensor=None, retain_graph=False): "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( grad_tensor.name, grad_tensor.shape, self.name, self.shape) - if paddle.is_compiled_with_xpu(): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): # TODO(liuyuhui): Currently only for xpu and npu. Will be removed in the future. scaled_loss = scale_loss(self) core.dygraph_run_backward([scaled_loss], [grad_tensor], diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e601c1cb4c300..d10564e21ea47 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1999,6 +1999,14 @@ def _run_using_fleet_executor(self, fetch_list=fetch_list, feed_var_name=feed_var_name, fetch_var_name=fetch_var_name) + main_block = cached_program.block(0) + for op in main_block.ops: + # set the op_role of the fetch op to Optimize to avoid + # erasing the fetched vars by gc in pipeline mode + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) self._add_program_cache(cache_key, cached_program) if cached_ctx is None: fleet_opt = program._pipeline_opt["fleet_opt"] @@ -2007,6 +2015,18 @@ def _run_using_fleet_executor(self, self._add_ctx_cache(cache_key, cached_ctx) if feed: self._feed_data(cached_program, feed, feed_var_name, cached_scope) + + from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): + lr_sheduler = program.lr_sheduler + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" + lr_value = lr_sheduler() + lr_var = program.global_block().vars[lr_sheduler._var_name] + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + tensor = core.get_variable_tensor(cached_scope, + lr_sheduler._var_name) + tensor.set(data, self.place) + cached_ctx.run() if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dfc887292e7cf..83ccd1051bb66 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1254,7 +1254,10 @@ def __iter__(self): def __next__(self): try: if self._return_list: - return self._reader.read_next_list() + data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() + return data else: return self._reader.read_next() except StopIteration: diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index
05008a3bc12f7..2b4002ab9c9d4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -30,6 +30,7 @@ seed = 2021 epoch = 2 batch_size = 32 +linear_size = 10000 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -45,12 +46,12 @@ class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): + def __init__(self, linear_size=10000, param_attr=None, bias_attr=None): super(MLP, self).__init__() - self._linear1 = Linear(10000, 10000) - self._linear2 = Linear(10000, 10000) - self._linear3 = Linear(10000, 10) + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) def forward(self, inputs): y = self._linear1(inputs) @@ -59,10 +60,10 @@ def forward(self, inputs): return y -def reader_decorator(): +def reader_decorator(linear_size=10000): def __reader__(): for _ in range(100): - img = np.random.rand(10000).astype('float32') + img = np.random.rand(linear_size).astype('float32') label = np.ones(1).astype('int64') yield img, label @@ -120,6 +121,9 @@ def train_mlp(model, use_multiprocess=True) train_loader.set_sample_list_generator(train_reader) + if sharding_stage == 2: + model.to(device="gpu") + for eop in range(epoch): model.train() @@ -153,9 +157,6 @@ def train_mlp(model, if all_test and batch_id == 2: return model.parameters() - if sharding_stage == 2: - model.to(device="gpu") - return model.parameters() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py new file mode 100644 index 0000000000000..8adcda9d24e1c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -0,0 +1,115 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
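A brief note on the dygraph_sharding_stage2.py change above: with stage-2 sharding the gradients and optimizer state are partitioned across ranks, so the parameters should already live on the accelerator before the first forward/backward pass; in the old placement, after the training loop, the move only affected the returned parameters. A minimal sketch of the intended ordering (simplified from the test, not an exact excerpt):

model = MLP(linear_size)
optimizer = optimizer_setting(model=model, use_pure_fp16=False)
optimizer = ShardingOptimizerStage2(params=model.parameters(), optim=optimizer, group=group)
model = ShardingStage2(model, optimizer, group=group)
model.to(device="gpu")   # move parameters to the device before the first step
for eop in range(epoch):
    ...                  # forward/backward/step as usual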
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 8000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler, group) + + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + model.grad_scale() + scaler.step(optimizer) + scaler.update() + model.clear_gradients() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + for j in range(len(mlp_offload_params)): + if mlp_params[i].name == mlp_offload_params[j].name: + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index c08a8d350f8aa..d4c41781078b7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code from ifelse_simple_func import dyfunc_with_if_else @@ -344,5 +345,18 
@@ def test_raise_error(self): net.foo.train() +class TestRemoveCommentInDy2St(unittest.TestCase): + def func_with_comment(self): + # Comment1 + x = paddle.to_tensor([1, 2, 3]) + # Comment2 + # Comment3 + y = paddle.to_tensor([4, 5, 6]) + + def test_remove_comment(self): + code_string = func_to_source_code(self.func_with_comment) + self.assertEqual('#' not in code_string, True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 33754fac127a4..c05ad30da2797 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -322,14 +322,14 @@ def assert_op_list(self, op_list_after_fusion): "Expected operator list after fusion is {}, but now it's {}".format( op_list_after_fusion, after_op_list), ) - def run_and_statis( - self, - quant=False, - max_examples=100, - reproduce=None, - min_success_num=25, - max_duration=180, - passes=None, ): + def run_and_statis(self, + quant=False, + max_examples=100, + reproduce=None, + min_success_num=25, + max_duration=180, + passes=None, + use_gpu_run_baseline=False): if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev": max_examples *= 10 min_success_num *= 10 @@ -354,7 +354,10 @@ def program_generator(draw): return self.sample_program_config(draw) def run_test(prog_config): - return self.run_test(quant=quant, prog_configs=[prog_config]) + return self.run_test( + quant=quant, + prog_configs=[prog_config], + use_gpu_run_baseline=use_gpu_run_baseline) generator = st.composite(program_generator) loop_func = given(generator())(run_test) @@ -371,8 +374,8 @@ def run_test(prog_config): logging.info("Number of Ran Programs: {}".format(self.num_ran_programs)) logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests)) successful_ran_programs = int(self.num_ran_programs - - self.num_ignore_tests / - self.num_predictor_kinds) + self.num_ignore_tests / max( + self.num_predictor_kinds, 1)) logging.info( "Number of successfully ran programs approximately equal to {}". 
format(successful_ran_programs)) @@ -391,7 +394,10 @@ def run_test(prog_config): format(max_duration)) assert False - def run_test(self, quant=False, prog_configs=None): + def run_test(self, + quant=False, + prog_configs=None, + use_gpu_run_baseline=False): status = True for prog_config in prog_configs: @@ -413,7 +419,9 @@ def run_test(self, quant=False, prog_configs=None): results: List[Dict[str, np.ndarray]] = [] # baseline: cpu no ir_optim run - base_config = self.create_inference_config(ir_optim=False) + + base_config = self.create_inference_config( + ir_optim=False, use_gpu=use_gpu_run_baseline) logging.info('RUN program_config: ' + str(prog_config)) results.append( self.run_test_config(model, params, prog_config, base_config, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py index 8cb6af1dcf044..96c2a175208fa 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -109,7 +109,7 @@ def teller2(program_config, predictor_config): def test(self): self.run_and_statis( quant=False, - max_examples=100, + max_examples=300, passes=["adaptive_pool2d_convert_global_pass"], min_success_num=40) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py new file mode 100644 index 0000000000000..2ccb9de5d5470 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestFCElementwiseLayerNormFusePass(PassAutoScanTest): + """ + x_var w(persistable) bias_var(persistable) + \ | / + fc + | + fc_out_var bias_var(persistable) + \ / + elementwise_add bias_var(persistable) scale_var(persistable) + \ | / + layer_norm + / | \ + Y mean_var variance_var + """ + + def sample_predictor_configs(self, program_config): + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5) + + def sample_program_config(self, draw): + # 1. 
Generate shape of input:X of fc + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + # NOTE: the drawn shape is currently pinned to a fixed value + x_shape = [2, 1] + x_rank = len(x_shape) + # 2. Generate attr:in_num_col_dims of fc + in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1)) + # 3. Generate legal shape of input:W/bias of fc + w_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + w_shape[0] = int(np.prod(x_shape[in_num_col_dims:])) + # NOTE: likewise pinned, overriding the drawn weight shape + w_shape = [1, 2] + fc_bias_shape = [w_shape[1], ] + if draw(st.booleans()): + fc_bias_shape.insert(0, 1) + # NOTE: likewise pinned, overriding the drawn bias shape + fc_bias_shape = [2, ] + fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:] + # 4. Generate legal attr:axis/shape of elementwise_add + add_bias_shape = fc_out_shape[:] + axis = draw(st.integers(min_value=-1, max_value=0)) + # 5. Generate legal shape of layer_norm + begin_norm_axis = draw( + st.integers( + min_value=1, max_value=len(fc_out_shape) - 1)) + layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))] + epsilon = 1e-5 + + fc_op = OpConfig( + "fc", + inputs={"Input": ["fc_x"], + "W": ["fc_w"], + "Bias": ["fc_bias"]}, + outputs={"Out": ["fc_out"]}, + in_num_col_dims=in_num_col_dims, + padding_weights=False, + activation_type="", + use_quantizer=False, + use_mkldnn=False, ) + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["fc_out"], + "Y": ["add_bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + layer_norm_op = OpConfig( + "layer_norm", + inputs={ + "X": ["add_out"], + "Scale": ["scale"], + "Bias": ["layer_norm_bias"] + }, + outputs={ + "Y": ["layer_norm_out"], + "Mean": ["layer_norm_mean"], + "Variance": ["layer_norm_var"] + }, + begin_norm_axis=begin_norm_axis, + epsilon=epsilon) + + ops = [fc_op, add_op, layer_norm_op] + program_config = ProgramConfig( + ops=ops, + weights={ + "fc_w": TensorConfig(shape=w_shape), + "fc_bias": TensorConfig(shape=fc_bias_shape), + "add_bias": TensorConfig(shape=add_bias_shape), + "scale": TensorConfig(shape=layer_norm_shape), + "layer_norm_bias": TensorConfig(shape=layer_norm_shape), + }, + inputs={"fc_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Y"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["fc_elementwise_layernorm_fuse_pass"], + use_gpu_run_baseline=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 83d4b7091cb32..64c3042b63cf8 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,72 +12,147 @@ # See the License for the specific language governing permissions and # limitations under the License.
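For readers unfamiliar with the pattern exercised by the fc_elementwise_layernorm_fuse_pass test above, the unfused graph computes fc, then elementwise_add, then layer_norm. A rough functional equivalent in plain Paddle (a sketch with illustrative shapes, not the test's code):

import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 4])        # fc input
w = paddle.randn([4, 8])        # fc weight
b = paddle.randn([8])           # fc bias
add_b = paddle.randn([2, 8])    # elementwise_add operand

fc_out = paddle.matmul(x, w) + b                  # fc
add_out = fc_out + add_b                          # elementwise_add
y = F.layer_norm(add_out, add_out.shape[-1:])     # layer_norm over the trailing axis
# The pass replaces this three-op chain with a single
# fused_fc_elementwise_layernorm kernel on GPU.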
+from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set import unittest -import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import PassVersionChecker +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st -class TransposeFlattenConcatFusePassTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1) - flatt2 = fluid.layers.flatten(trans2) - concat_out = fluid.layers.concat([flatt1, flatt2]) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] +class TestTransposeFlattenConcatFusePass(PassAutoScanTest): + """ + x_1_var x_2_var + | | + transpose2 transpose2 + | | + flatten2 flatten2 + \ / + flatten2_out_var flatten2_out_var + \ / + concat + """ - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + def sample_predictor_configs(self, program_config): + # TRT + # after tensorrt_subgraph_pass, this pass needs to be deleted for TRT - PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5) + def is_program_valid(self, prog_config): + concat_axis = prog_config.ops[-1].attrs["axis"] + ops_num = len(prog_config.ops) - 1 + if ops_num % 2 != 0: + return False + input_num = ops_num // 2 + flatten_shape = 0 + x_trans_axis = prog_config.ops[0].attrs["axis"] + x_flatten_axis = prog_config.ops[1].attrs["axis"] + for i in range(input_num): + input_name = "transpose2_x" + str(i) + input_shape = prog_config.inputs[input_name].shape + trans_axis = prog_config.ops[i * 2].attrs["axis"] + if x_trans_axis != trans_axis: + return False + # calculate shape after transpose + input_shape = [input_shape[j] for j in trans_axis] + # calculate shape after flatten + flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"] + if x_flatten_axis != flatten_axis: + return False + flatten_shape1 = flatten_shape2 = 1 + for j in range(len(input_shape)): + if j < flatten_axis: + flatten_shape1 *= input_shape[j] + else: + flatten_shape2 *= input_shape[j] + if concat_axis == 0: + if i == 0: + flatten_shape = flatten_shape2 + elif flatten_shape != flatten_shape2: + return False + else: + if i == 0: + flatten_shape = flatten_shape1 + elif flatten_shape != flatten_shape1: + return False + return True -class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5,
5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1, axis=2) - flatt2 = fluid.layers.flatten(trans2, axis=2) - concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) + def sample_program_config(self, draw): + times = draw(st.integers(min_value=1, max_value=6)) + concat_axis = draw(st.integers(min_value=0, max_value=1)) + ops = [] + concat_input = [] + inputs = {} + x_shape_rank = draw(st.integers(min_value=2, max_value=5)) + # Generate axis of transpose + trans_axis = [j for j in range(x_shape_rank)] + for j in range(x_shape_rank - 1): + if draw(st.booleans()): + trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j] + # Generate axis of flatten + flatten_axis = draw( + st.integers( + min_value=0, max_value=x_shape_rank - 1)) + for i in range(times): + # Generate x_shape of transpose + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), + min_size=x_shape_rank, + max_size=x_shape_rank)) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] + str_i = str(i) + transpose_op = OpConfig( + "transpose2", + inputs={"X": ["transpose2_x" + str_i], }, + axis=trans_axis, + outputs={ + "Out": ["trans_out" + str_i], + "XShape": ["trans_shape" + str_i] + }, ) + ops.append(transpose_op) + flatten_op = OpConfig( + "flatten2", + inputs={"X": ["trans_out" + str_i], }, + axis=flatten_axis, + outputs={ + "Out": ["flatten2_out" + str_i], + "XShape": ["xshape" + str_i] + }, ) + concat_input.append("flatten2_out" + str_i) + ops.append(flatten_op) + inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape) - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + concat_op = OpConfig( + "concat", + inputs={ + "X": concat_input, + "AxisTensor": [], + }, + outputs={"Out": ["concat_out"]}, + axis=concat_axis, ) - self.assertTrue( - PassVersionChecker.IsCompatible( - 'transpose_flatten_concat_fuse_pass')) + ops.append(concat_op) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["transpose_flatten_concat_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 992e0353837bc..b54b923d3b086 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -317,21 +317,28 @@ def generate_input(shape): input1_shape_list = [[4, 32], [2, 4, 32], [4, 2, 4, 32]] input2_shape1_list = [[32], [4, 32], [2, 4, 32]] input2_shape2_list = [[4, 1], [2, 4, 1], [4, 2, 4, 1]] - input2_shape3_list = [[32], [2, 1, 1], [4, 2, 1, 1]] - input2_shape4_list = [[32], [4, 32], [4, 1, 1, 1]] + input2_shape3_list = [[32], [2, 1, 1], [4, 2, 1, 32]] + 
input2_shape4_list = [[32], [4, 32], [4, 1, 4, 32]] + input2_shape5_list = [[32], [2, 1, 32], [4, 1, 1, 32]] + input2_shape6_list = [[1, 32], [1, 32], [1, 1, 1, 32]] input2_shape_list = [ input2_shape1_list, input2_shape2_list, input2_shape3_list, - input2_shape4_list + input2_shape4_list, input2_shape5_list, input2_shape6_list ] axis1_list = [[-1], [1, -1], [1, -1]] axis2_list = [[-1], [0], [0]] axis3_list = [[-1], [0], [0]] axis4_list = [[-1], [-1], [0]] - axis_list = [axis1_list, axis2_list, axis3_list, axis4_list] + axis5_list = [[-1, 1], [-1, 0], [-1, 0]] + axis6_list = [[-1, 0], [-1, 1], [-1, 0]] + axis_list = [ + axis1_list, axis2_list, axis3_list, axis4_list, axis5_list, + axis6_list + ] for i in range(3): input1_shape = input1_shape_list[i] - for j in range(4): + for j in range(6): input2_shape = input2_shape_list[j][i] for op_type in ["elementwise_add", "elementwise_mul"]: for axis in axis_list[j][i]: diff --git a/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py new file mode 100644 index 0000000000000..82178e1b62dfb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_search_dist_matmul_op.py @@ -0,0 +1,397 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
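The two shape lists added above (input2_shape5_list and input2_shape6_list) follow ordinary right-aligned broadcasting, so each pair is a valid elementwise operand combination. A quick check of one of the new pairs with numpy (illustrative only):

import numpy as np

a = np.ones([4, 2, 4, 32])      # from input1_shape_list
b = np.ones([4, 1, 1, 32])      # from input2_shape5_list
print((a + b).shape)            # (4, 2, 4, 32): dims of size 1 broadcast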
+from __future__ import print_function +import unittest +import copy + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.fluid.core as core +from paddle.fluid import layers +from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImplContainer +from paddle.distributed.auto_parallel.operators.common import DistributedOperatorImpl +from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container +from paddle.distributed.auto_parallel.dist_context import DistributedContext, DistributedOperatorContext +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_op import DistributedOperator +paddle.enable_static() +device = "gpu" if core.is_compiled_with_cuda() else "cpu" + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sqrt_hidden_size = 32 + double_hidden_size = 64 + + input = static.data(name="input", shape=[8, 8, 16], dtype='int32') + input = paddle.reshape(input, [hidden_size]) + input = paddle.reshape(input, [sqrt_hidden_size, sqrt_hidden_size]) + embedding = paddle.nn.Embedding(2, batch_size, sparse=True) + input = embedding(input) + input = paddle.reshape(input, [hidden_size, batch_size]) + input = paddle.transpose(input, perm=[1, 0]) + matmulinput = static.data( + name="matmulinput", + shape=[hidden_size, hidden_size], + dtype='float32') + input = layers.matmul(x=input, y=matmulinput) + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + m = paddle.nn.Softmax() + loss = m(loss) + return loss, train_program, start_program + + +class Testcompatible(unittest.TestCase): + def test_matmulv2_matmul_2_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, startup_program) + + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 2, 6], 
dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, -1]) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1]) + self.assertTrue(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + 
DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[2].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + def test_matmulv2_matmul_1_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, startup_program) + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 6, 6], dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1]) + dist_op = DistributedOperator(op, op_dist_attr) + op_dist_attr.set_output_dims_mapping(out, [1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertTrue(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, 1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(out, [-1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, 1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, -1]) + self.assertTrue(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, -1]) + 
self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 0, -1]) + self.assertFalse(impls[1].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + def test_matmulv2_matmul_0_compatible(self): + valid_op_dist_attr_list = [] + program = paddle.static.Program() + startup_program = paddle.static.Program() + loss, program, start_program = mlp_forward(program, startup_program) + with static.program_guard(program, + start_program), utils.unique_name.guard(): + matmulx3 = static.data( + name="matmulx3", shape=[6, 2, 6], dtype='float32') + matmuly3 = static.data( + name="matmuly3", shape=[6, 6], dtype='float32') + output1 = paddle.matmul(x=matmulx3, y=matmuly3) + output_1 = layers.matmul(x=matmulx3, y=matmuly3) + matmulx4 = static.data( + name="matmulx4", shape=[6, 6, 2, 6], dtype='float32') + matmuly4 = static.data( + name="matmuly4", shape=[6, 6, 6, 6], dtype='float32') + output2 = paddle.matmul(x=matmulx4, y=matmuly4) + output_2 = layers.matmul(x=matmulx4, y=matmuly4) + ops = program.global_block().ops + vars = program.global_block().vars + for idx, op in enumerate(ops): + if op.type == 'matmul_v2' or op.type == 'matmul': + dist_op_impl_container = get_distributed_operator_impl_container( + op.type) + impls = dist_op_impl_container.get_impls() + op_dist_attr = OperatorDistributedAttribute() + X = op.input_arg_names[0] + Y = op.input_arg_names[1] + out = op.output_arg_names[0] + if len(vars[X].shape) == 2 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, 0]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [0, -1]) + op_dist_attr.set_output_dims_mapping(out, [1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 3 and len(vars[Y].shape) == 2: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 0, -1]) + self.assertFalse(impls[0].is_auto_compatible( + 
DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + if len(vars[X].shape) == 4 and len(vars[Y].shape) == 4: + op_dist_attr.set_input_dims_mapping(X, [-1, -1, -1, -1]) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, 1]) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, -1, 1]) + self.assertTrue(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [0, -1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, 1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(X, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [0, -1, -1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, 1, 1, 1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, -1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_output_dims_mapping(out, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + op_dist_attr.set_input_dims_mapping(Y, [-1, -1, 1, -1]) + self.assertFalse(impls[0].is_auto_compatible( + DistributedOperator(op, op_dist_attr))) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_complex_view_op.py b/python/paddle/fluid/tests/unittests/test_complex_view_op.py new file mode 100644 index 0000000000000..5dac121ff3e38 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_complex_view_op.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
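A note on the dims_mapping convention used throughout the compatibility tests above: -1 marks a tensor axis as replicated, while a non-negative integer names the process-mesh axis along which that tensor axis is sharded. For matmul, an implementation can only be compatible when the contracted dimension k carries the same mapping on both operands. A simplified illustration of that one rule (a sketch; the real checks live in the distributed operator implementations):

def k_dims_match(x_mapping, y_mapping):
    # for X[..., m, k] x Y[..., k, n], compare the k axes
    return x_mapping[-1] == y_mapping[-2]

print(k_dims_match([-1, 1], [1, -1]))    # True: k sharded on mesh axis 1 on both sides
print(k_dims_match([-1, 1], [-1, -1]))   # False: k sharded on X, replicated on Y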
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + +import paddle +from paddle.fluid import dygraph +from paddle import static +paddle.enable_static() + + +def ref_view_as_complex(x): + real, imag = np.take(x, 0, axis=-1), np.take(x, 1, axis=-1) + return real + 1j * imag + + +def ref_view_as_real(x): + return np.stack([x.real, x.imag], -1) + + +class TestViewAsComplexOp(OpTest): + def setUp(self): + self.op_type = "as_complex" + x = np.random.randn(10, 10, 2).astype("float64") + out_ref = ref_view_as_complex(x) + self.out_grad = np.ones( + [10, 10], dtype="float64") + 1j * np.ones( + [10, 10], dtype="float64") + self.inputs = {'X': x} + self.outputs = {'Out': out_ref} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ref_view_as_real(self.out_grad)], + user_defined_grad_outputs=[self.out_grad]) + + +class TestViewAsRealOp(OpTest): + def setUp(self): + self.op_type = "as_real" + real = np.random.randn(10, 10).astype("float64") + imag = np.random.randn(10, 10).astype("float64") + x = real + 1j * imag + out_ref = ref_view_as_real(x) + self.inputs = {'X': x} + self.outputs = {'Out': out_ref} + self.out_grad = np.ones([10, 10, 2], dtype="float64") + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + user_defined_grads=[ref_view_as_complex(self.out_grad)], + user_defined_grad_outputs=[self.out_grad]) + + +class TestViewAsComplexAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(10, 10, 2) + self.out = ref_view_as_complex(self.x) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + out_np = paddle.as_complex(x).numpy() + self.assertTrue(np.allclose(self.out, out_np)) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[10, 10, 2], dtype="float64") + out = paddle.as_complex(x) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out, out_np)) + + +class TestViewAsRealAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(10, 10) + 1j * np.random.randn(10, 10) + self.out = ref_view_as_real(self.x) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + out_np = paddle.as_real(x).numpy() + self.assertTrue(np.allclose(self.out, out_np)) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[10, 10], dtype="complex128") + out = paddle.as_real(x) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run(mp, feed={"x": self.x}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out, out_np)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index 8b4eae8ada4e8..66228856effe4 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -34,7 +34,8 @@ def setUp(self): paddle.set_flags({ 'FLAGS_allocator_strategy': 'auto_growth', 'FLAGS_sync_nccl_allreduce': False, - 'FLAGS_cudnn_deterministic': True + 'FLAGS_cudnn_deterministic': True, + 'FLAGS_use_stream_safe_cuda_allocator': False, }) def random_tensor(self, shape): @@ -187,6 +188,48 
@@ def test_concat_and_split(self): finally: graph.reset() + def test_dataloader(self): + if not can_use_cuda_graph(): + return + + class AutoIncDataset(paddle.io.Dataset): + def __init__(self, n, dtype): + self.n = n + self.dtype = dtype + + def __len__(self): + return self.n + + def __getitem__(self, idx): + return np.array([idx]).astype(self.dtype) + + n = 100 + dtype = 'int64' + dataset = AutoIncDataset(n, dtype) + data_loader = paddle.io.DataLoader( + dataset, batch_size=1, num_workers=2, use_buffer_reader=True) + x = None + y = None + + graph = None + for i, data in enumerate(data_loader): + if graph is None: + x = data + x = x.cuda() + graph = CUDAGraph() + graph.capture_begin() + y = x * x + graph.capture_end() + else: + x.copy_(data, False) + x = x.cuda() + + graph.replay() + actual_x = np.array([[i]]).astype(dtype) + actual_y = np.array([[i * i]]).astype(dtype) + self.assertTrue(np.array_equal(actual_x, x.numpy())) + self.assertTrue(np.array_equal(actual_y, y.numpy())) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index ccbe154a48753..4e3dfccee28a2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -167,6 +167,15 @@ def test(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True + + configs = {} + configs['__emb__'] = { + "table_parameters.__emb__.accessor.embed_sgd_param.name": + "SparseNaiveSGDRule", + "table_parameters.__emb__.accessor.embedx_sgd_param.name": + "SparseAdamSGDRule", + } + strategy.sparse_table_configs = configs optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index f1c12c90490c2..6cf2c5f6e2ca4 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -336,6 +336,29 @@ def init_static_data(self, batch_size, dims): name='values', shape=[dims], dtype='float32') +class UniformTestSample(unittest.TestCase): + def setUp(self): + self.init_param() + + def init_param(self): + self.low = 3.0 + self.high = 4.0 + + def test_uniform_sample(self): + paddle.disable_static() + uniform = Uniform(low=self.low, high=self.high) + s = uniform.sample([100]) + self.assertTrue((s >= self.low).all()) + self.assertTrue((s < self.high).all()) + paddle.enable_static() + + +class UniformTestSample2(UniformTestSample): + def init_param(self): + self.low = -5.0 + self.high = 2.0 + + class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): self.loc = np.array(loc) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index c5cf8c5d5ed69..f76dcb5687c2a 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus): def test_dygraph_sharding_optimizer_stage2(self): self.run_mnist_2gpu('dygraph_sharding_stage2.py') + def test_dygraph_sharding_optimizer_stage2_offload(self): + self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + if __name__ == "__main__": 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index a9193c0abdfc1..7d611ed6e06d4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -257,19 +257,19 @@ def test_a_sync_configs(self): def test_sparse_table_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() - configs = { - "table_parameters.accessor.embed_sgd_param.adagrad.learning_rate": + configs = {} + configs['emb'] = { + "table_parameters.emb.accessor.embed_sgd_param.adagrad.learning_rate": 0.05, - "table_parameters.accessor.table_accessor_save_param.num": 2, - "table_parameters.accessor.table_accessor_save_param.param": + "table_parameters.emb.accessor.table_accessor_save_param.num": 2, + "table_parameters.emb.accessor.table_accessor_save_param.param": [1, 2] } strategy.sparse_table_configs = configs - self.assertEqual(strategy.sparse_table_configs.accessor.embed_sgd_param. - adagrad.learning_rate, 0.05) - self.assertEqual( - strategy.sparse_table_configs.accessor.table_accessor_save_param[ - 0].param, 1) + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.embed_sgd_param.adagrad.learning_rate, 0.05) + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.table_accessor_save_param[0].param, 1) strategy.adam_d2sum = True self.assertEqual(strategy.adam_d2sum, True) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index 09f9fa6ce105d..fbc5db341e5e9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -47,6 +47,18 @@ def run_fleet_executor(self, place, x_data, y_data): name='y', shape=y_data.shape, dtype=y_data.dtype) z = x + y a = 2 * x + 3 * y + loss = paddle.mean(a) + base_lr = 0.1 + passes = [30, 60, 80, 90] + steps_per_pass = 10 + bd = [steps_per_pass * p for p in passes] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + lr_val = paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr) + opt = paddle.optimizer.AdamW( + learning_rate=lr_val, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + opt.minimize(loss) # TODO: section_program will be removed in the future empty_program._pipeline_opt = { "fleet_opt": self.fake_fleet_opt(), diff --git a/python/paddle/fluid/tests/unittests/test_gcd.py b/python/paddle/fluid/tests/unittests/test_gcd.py new file mode 100644 index 0000000000000..820216dc56cd6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gcd.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
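The gcd tests that follow validate paddle.gcd against np.gcd, including the zero and negative edge cases covered by TestGcdAPI3 through TestGcdAPI5. The numpy conventions they rely on, for reference:

import numpy as np

print(np.gcd(12, 20))     # 4
print(np.gcd(0, 20))      # 20  (gcd(0, n) == |n|)
print(np.gcd(0, 0))       # 0   (by convention)
print(np.gcd(12, -20))    # 4   (signs are ignored)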
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestGcdAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + y = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.gcd(x, y) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + y = paddle.to_tensor(self.y_np) + result = paddle.gcd(x, y) + self.assertEqual( + np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestGcdAPI2(TestGcdAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestGcdAPI3(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI4(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI5(TestGcdAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/fluid/tests/unittests/test_lcm.py b/python/paddle/fluid/tests/unittests/test_lcm.py new file mode 100644 index 0000000000000..123c3e3d444e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lcm.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
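Likewise, the lcm tests below mirror the gcd suite and lean on np.lcm, which satisfies lcm(a, b) == |a * b| // gcd(a, b) when gcd(a, b) != 0 and is 0 otherwise. A quick check of that identity over the tested edge cases (illustrative only):

import numpy as np

for a, b in [(12, 20), (0, 20), (0, 0), (12, -20)]:
    g = np.gcd(a, b)
    ref = abs(a * b) // g if g != 0 else 0
    assert ref == np.lcm(a, b)   # e.g. np.lcm(12, -20) == 60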
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+class TestLcmAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_np = 12
+        self.y_np = 20
+        self.x_shape = [1]
+        self.y_shape = [1]
+
+    def test_static_graph(self):
+        startup_program = fluid.Program()
+        train_program = fluid.Program()
+        # fluid.program_guard takes the main program first, then the
+        # startup program.
+        with fluid.program_guard(train_program, startup_program):
+            x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape)
+            x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape)
+            out = paddle.lcm(x1, x2)
+
+            place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            res = exe.run(fluid.default_main_program(),
+                          feed={'input1': self.x_np,
+                                'input2': self.y_np},
+                          fetch_list=[out])
+            self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np)
+                             ).all())
+
+    def test_dygraph(self):
+        paddle.disable_static()
+        x1 = paddle.to_tensor(self.x_np)
+        x2 = paddle.to_tensor(self.y_np)
+        result = paddle.lcm(x1, x2)
+        self.assertTrue(
+            np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()))
+
+        paddle.enable_static()
+
+
+class TestLcmAPI2(TestLcmAPI):
+    def setUp(self):
+        self.x_np = np.arange(6).astype(np.int32)
+        self.y_np = np.array([20]).astype(np.int32)
+        self.x_shape = [6]
+        self.y_shape = [1]
+
+
+class TestLcmAPI3(TestLcmAPI):
+    def setUp(self):
+        self.x_np = 0
+        self.y_np = 20
+        self.x_shape = [1]
+        self.y_shape = [1]
+
+
+class TestLcmAPI4(TestLcmAPI):
+    def setUp(self):
+        self.x_np = 0
+        self.y_np = 0
+        self.x_shape = [1]
+        self.y_shape = [1]
+
+
+class TestLcmAPI5(TestLcmAPI):
+    def setUp(self):
+        self.x_np = 12
+        self.y_np = -20
+        self.x_shape = [1]
+        self.y_shape = [1]
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index ce84fb739c000..722003c034091 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -23,6 +23,7 @@
 from ..fluid.param_attr import ParamAttr  # noqa: F401
 from ..fluid.layers.tensor import create_parameter  # noqa: F401
 from ..fluid.core import CPUPlace  # noqa: F401
+from ..fluid.core import IPUPlace  # noqa: F401
 from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 53001c0715989..36dfd717a12a0 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -111,6 +111,9 @@
 from .manipulation import roll  # noqa: F401
 from .manipulation import chunk  # noqa: F401
 from .manipulation import tensordot  # noqa: F401
+from .manipulation import as_complex  # noqa: F401
+from .manipulation import as_real  # noqa: F401
+
 from .math import abs  # noqa: F401
 from .math import acos  # noqa: F401
 from .math import asin  # noqa: F401
@@ -194,6 +197,8 @@
 from .math import lerp_  # noqa: F401
 from .math import rad2deg  # noqa: F401
 from .math import deg2rad  # noqa: F401
+from .math import gcd  # noqa: F401
+from .math import lcm  # noqa: F401
 from .math import diff  # noqa: F401
 from .math import angle  # noqa: F401
 
@@ -409,6 +414,12 @@
     'multi_dot',
     'solve',
     'triangular_solve',
+    'as_complex',
+    'as_real',
+    'rad2deg',
+    'deg2rad',
+    'gcd',
+    'lcm',
     'diff',
     'lerp',
     'lerp_',
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index b4f00ab4ffb8b..a81d8c54ffc42 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -34,6 +34,7 @@
 from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
 import paddle
 from paddle import _C_ops
+from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype
 
 __all__ = []
 
@@ -2488,3 +2489,94 @@ def _var_to_list(var):
         [contraction_size, not_contraction_size_y])
     out = x.matmul(y).reshape(shape_out)
     return out
+
+
+def as_complex(x, name=None):
+    """Transform a real tensor to a complex tensor.
+
+    The data type of the input tensor is 'float32' or 'float64', and the data
+    type of the returned tensor is 'complex64' or 'complex128', respectively.
+
+    The shape of the input tensor is ``(*, 2)`` (``*`` means arbitrary shape),
+    i.e. the size of the last axis should be 2, representing the real and
+    imaginary parts of a complex number. The shape of the returned tensor is
+    ``(*,)``.
+
+    Args:
+        x (Tensor): The input tensor. Data type is 'float32' or 'float64'.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
+            y = paddle.as_complex(x)
+            print(y.numpy())
+
+            # [[ 0. +1.j  2. +3.j  4. +5.j]
+            #  [ 6. +7.j  8. +9.j 10.+11.j]]
+    """
+    if in_dygraph_mode():
+        return paddle._C_ops.as_complex(x)
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'as_complex')
+    op_type = "as_complex"
+    helper = LayerHelper(op_type, **locals())
+    inputs = {"X": x}
+    out = helper.create_variable_for_type_inference(
+        dtype=_real_to_complex_dtype(x.dtype))
+    outputs = {"Out": out}
+    attrs = {}
+    helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
+    return out
+
+
+def as_real(x, name=None):
+    """Transform a complex tensor to a real tensor.
+
+    The data type of the input tensor is 'complex64' or 'complex128', and the
+    data type of the returned tensor is 'float32' or 'float64', respectively.
+
+    When the shape of the input tensor is ``(*, )`` (``*`` means arbitrary
+    shape), the shape of the output tensor is ``(*, 2)``, i.e. the shape of
+    the output is the shape of the input with an extra ``2`` appended.
+
+    Args:
+        x (Tensor): The input tensor. Data type is 'complex64' or 'complex128'.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2])
+            y = paddle.as_complex(x)
+            z = paddle.as_real(y)
+            print(z.numpy())
+
+            # [[[ 0.  1.]
+            #   [ 2.  3.]
+            #   [ 4.  5.]]
+
+            #  [[ 6.  7.]
+            #   [ 8.  9.]
+            #   [10. 11.]]]
+    """
+    if in_dygraph_mode():
+        return paddle._C_ops.as_real(x)
+
+    check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'as_real')
+    op_type = "as_real"
+    helper = LayerHelper(op_type, **locals())
+    inputs = {"X": x}
+    out = helper.create_variable_for_type_inference(
+        dtype=_complex_to_real_dtype(x.dtype))
+    outputs = {"Out": out}
+    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
+    return out
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index f705510f848be..fefaecaf604a0 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None):
             lerp(x, y, weight) = x + weight * (y - x).
 
     Args:
-        x (Tensor): An N-D Tensor, the data type is float32, float64.
-        y (Tensor): An N-D Tensor, the data type is float32, float64.
-        weight (float|Tensor): the weight for the interpolation formula.
+        x (Tensor): An N-D Tensor with starting points, the data type is float32, float64.
+        y (Tensor): An N-D Tensor with ending points, the data type is float32, float64.
+        weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -2788,6 +2788,139 @@ def deg2rad(x, name=None):
             type='scale', inputs={'X':out_cast}, outputs={'Out': out},
             attrs={'scale': deg2rad_scale})
         return out
+
+def gcd(x, y, name=None):
+    """
+    Computes the element-wise greatest common divisor (GCD) of the inputs |x| and |y|.
+    Both x and y must have integer types.
+
+    Note:
+        gcd(0, 0) = 0, gcd(0, y) = |y|
+
+    Args:
+        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64, uint8.
+            If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor): An N-D Tensor, the data type is the same as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x1 = paddle.to_tensor(12)
+            x2 = paddle.to_tensor(20)
+            paddle.gcd(x1, x2)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [4])
+
+            x3 = paddle.to_tensor(np.arange(6))
+            paddle.gcd(x3, x2)
+            # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [20, 1 , 2 , 1 , 4 , 5])
+
+            x4 = paddle.to_tensor(0)
+            paddle.gcd(x4, x2)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [20])
+
+            paddle.gcd(x4, x4)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [0])
+
+            x5 = paddle.to_tensor(-20)
+            paddle.gcd(x1, x5)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [4])
+    """
+    shape = paddle.broadcast_shape(x.shape, y.shape)
+    x = paddle.broadcast_to(x, shape)
+    y = paddle.broadcast_to(y, shape)
+    x = paddle.abs(x)
+    y = paddle.abs(y)
+
+    def _gcd_cond_fn(x, y):
+        return paddle.any(y != 0)
+
+    def _gcd_body_fn(x, y):
+        # paddle.mod will raise an error when any element of y is 0. To
+        # avoid that, we change those zeros to ones. Their values don't
+        # matter because they won't be used.
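+        # Each call below performs one vectorized step of the Euclidean
+        # algorithm: wherever y != 0, the pair (x, y) becomes (y, x mod y);
+        # wherever y == 0, the pair is already fully reduced, so x is kept
+        # and y is pinned to 0. The final where() reorders each pair so
+        # that x >= y, mirroring the classic Euclidean iteration.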
+        y_not_equal_0 = (y != 0)
+        y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype))
+        x, y = (paddle.where(y_not_equal_0, y, x),
+                paddle.where(y_not_equal_0, paddle.mod(x, y_safe),
+                             paddle.zeros(y.shape, y.dtype)))
+        return (paddle.where(x < y, y, x), paddle.where(x < y, x, y))
+
+    if in_dygraph_mode():
+        while _gcd_cond_fn(x, y):
+            x, y = _gcd_body_fn(x, y)
+
+        return x
+    else:
+        check_variable_and_dtype(
+            x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
+        check_variable_and_dtype(
+            y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
+        out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn,
+                                             [x, y])
+        return out
+
+def lcm(x, y, name=None):
+    """
+    Computes the element-wise least common multiple (LCM) of the inputs |x| and |y|.
+    Both x and y must have integer types.
+
+    Note:
+        lcm(0, 0) = 0, lcm(0, y) = 0
+
+    Args:
+        x, y (Tensor): An N-D Tensor, the data type is int8, int16, int32, int64, uint8.
+            If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor): An N-D Tensor, the data type is the same as the input.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            x1 = paddle.to_tensor(12)
+            x2 = paddle.to_tensor(20)
+            paddle.lcm(x1, x2)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [60])
+
+            x3 = paddle.to_tensor(np.arange(6))
+            paddle.lcm(x3, x2)
+            # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [0, 20, 20, 60, 20, 20])
+
+            x4 = paddle.to_tensor(0)
+            paddle.lcm(x4, x2)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [0])
+
+            paddle.lcm(x4, x4)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [0])
+
+            x5 = paddle.to_tensor(-20)
+            paddle.lcm(x1, x5)
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [60])
+    """
+    d = paddle.gcd(x, y)
+    # A zero in d would trigger a divide-by-zero error in the floor division
+    # below. To avoid that, we change those zeros to ones. Their values
+    # don't matter because the corresponding outputs are overwritten with
+    # zeros anyway.
+    d_equal_0 = paddle.equal(d, 0)
+    d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d)
+    out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype),
+                       paddle.abs(x * y) // d_safe)
+    return out
+
 def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
     r"""
     Computes the n-th forward difference along the given axis.
@@ -2949,7 +3082,6 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
 
     return out
 
-
 def angle(x, name=None):
     r"""
     Element-wise angle of complex numbers. For non-negative real numbers, the angle is 0 while
@@ -2965,7 +3097,7 @@ def angle(x, name=None):
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        out (Tensor): y (Tensor): An N-D Tensor of real data type with the same precision as that of x's data type.
+        Tensor: An N-D Tensor of real data type with the same precision as that of x's data type.
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 3d61caae002e9..2c47bbe4566d6 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -76,7 +76,7 @@
   infer_meta :
     func : MatmulInferMeta
   kernel :
-    func : matmul_v2
+    func : matmul
 
 - api : mean
   args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 5506ee95bd7c9..ed3bb1dc5f1f0 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -345,6 +345,7 @@ def source_include(header_file_path):
 #include "glog/logging.h"
 
 #include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_declare.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -353,22 +354,6 @@ def source_include(header_file_path):
 """
 
 
-def module_declare():
-    return """
-PT_DECLARE_MODULE(CreationCPU);
-PT_DECLARE_MODULE(LinalgCPU);
-PT_DECLARE_MODULE(ManipulationCPU);
-PT_DECLARE_MODULE(MathCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(CreationCUDA);
-PT_DECLARE_MODULE(LinalgCUDA);
-PT_DECLARE_MODULE(ManipulationCUDA);
-PT_DECLARE_MODULE(MathCUDA);
-#endif
-"""
-
-
 def api_register():
     return """
 PT_REGISTER_API(Creation);
@@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
 
     include_header_file = "paddle/pten/api/include/api.h"
     source_file.write(source_include(include_header_file))
-    source_file.write(module_declare())
    source_file.write(namespace[0])
 
     for api in apis:
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 79a742c314bd0..46b415e24d8ca 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -202,7 +202,7 @@
     'test_fleet_runtime',
     'test_rnn_cudnn_params_packing',
     'test_mkldnn_placement_pass',
-    'test_fc_elementwise_layernorm_fuse_pass',
+    'test_fc_elementwise_layernorm_fuse_pass_cc',
     'program_desc_test',
     'test_simplify_with_basic_ops_pass',
     'test_dygraph_mode_of_unittest',
@@ -1417,7 +1417,7 @@
     'test_fc_mkldnn_op',
     'test_fc_lstm_fuse_pass',
     'test_fc_gru_fuse_pass',
-    'test_fc_elementwise_layernorm_fuse_pass',
+    'test_fc_elementwise_layernorm_fuse_pass_cc',
     'test_fc_bf16_mkldnn_op',
     'test_executor_feed_non_tensor',
     'test_executor_check_feed',