diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 42de34fb52061a..640e2e37ad434d 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
 ENDIF()
 
 SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701")
+SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729")
 SET(XPU_XRE_URL  "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 485fddff4df424..c632a1dc6935ae 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -188,8 +188,13 @@ cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 
 cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference)
 
+IF(WITH_XPU)  # XPU build: operator.cc includes xpu_op_list.h, so add xpu_op_list
+cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ELSE()  # non-XPU build: identical dependency list minus xpu_op_list
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
     shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 20cffaa9590196..0f7012940d76b0 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -36,7 +36,8 @@ class LoDTensor;
 }  // namespace framework
 }  // namespace paddle
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -1254,7 +1255,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
 #endif
 #ifdef PADDLE_WITH_XPU
   if (kernel_iter == kernels.end() &&
-      is_xpu_place(expected_kernel_key.place_)) {
+      is_xpu_place(expected_kernel_key.place_) &&
+      !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) {
     VLOG(3) << "missing XPU kernel: " << type_
             << ", expected_kernel_key:" << expected_kernel_key
             << ", fallbacking to CPU one!";
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index c9dffe2d76a436..93b18e7e553b5e 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,6 +1,10 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
+IF(WITH_XPU)  # XPU build: prepared_operator.cc includes xpu_op_list.h, so add xpu_op_list
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+ELSE()  # non-XPU build: identical dependency list minus xpu_op_list
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 57c6ae3cbb0a13..e463ddae383ff3 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -17,7 +17,9 @@
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
-
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/platform/xpu/xpu_op_list.h"
+#endif
 DECLARE_bool(check_nan_inf);
 
 namespace paddle {
@@ -127,7 +129,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_XPU
   if (kernel_iter == kernels.end() &&
-      is_xpu_place(expected_kernel_key.place_)) {
+      is_xpu_place(expected_kernel_key.place_) &&
+      !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) {
     VLOG(3) << "missing XPU kernel: " << op.Type()
             << ", expected_kernel_key:" << expected_kernel_key
             << ", fallbacking to CPU one!";
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 8bc9775381be57..bfc4a1d598200e 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -33,7 +33,7 @@
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
 #endif
 #include "paddle/fluid/platform/npu_info.h"
 
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index bc72b4b20d0614..6c2fb82cb7cbe1 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -31,7 +31,7 @@
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
 
 DEFINE_bool(init_allocated_mem, false,
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index f2f8c5d1fb5551..3b3be9776c4c54 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
index 2c7219ef6885b5..257a91d7c15d73 100644
--- a/paddle/fluid/operators/activation_op_xpu.cc
+++ b/paddle/fluid/operators/activation_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
 #include <string>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc
index be299babdba7a4..dc9359ecf5c3d1 100644
--- a/paddle/fluid/operators/concat_op_xpu.cc
+++ b/paddle/fluid/operators/concat_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc
index 18bab83b0edb84..457616756215c2 100644
--- a/paddle/fluid/operators/deformable_conv_op_xpu.cc
+++ b/paddle/fluid/operators/deformable_conv_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <algorithm>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc
index 79d239074845ad..0b0b7095bd5d16 100644
--- a/paddle/fluid/operators/dropout_op_xpu.cc
+++ b/paddle/fluid/operators/dropout_op_xpu.cc
@@ -11,7 +11,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/dropout_op.h"
 #include <memory>
 #include <string>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
index d73e46df3491be..cb75616221bc4d 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include "paddle/fluid/operators/metrics/accuracy_op.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
index 9cc8ac200b8eec..4f98dde210f7a7 100644
--- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
index a4ed0c85f4f9d9..ae27a5d7df4734 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc
@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
index fa9503ec3f0aec..5ae60713bc912b 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/operators/reduce_ops/reduce_op.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
index bf55221bd3ffdd..f759b104d01d18 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc
@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc
index fb82d18e62f3bf..9d637e1cee1176 100644
--- a/paddle/fluid/operators/rnn_op_xpu.cc
+++ b/paddle/fluid/operators/rnn_op_xpu.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index fdb90797b69db5..e0dfad91570ad6 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
 #include <string>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc
index 86fe826c659ef9..a164a9b056677a 100644
--- a/paddle/fluid/operators/sign_op_xpu.cc
+++ b/paddle/fluid/operators/sign_op_xpu.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_XPU
 
 #include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc
index 264cc4e2cf7941..d16bb5562ed3a7 100644
--- a/paddle/fluid/operators/sum_op_xpu.cc
+++ b/paddle/fluid/operators/sum_op_xpu.cc
@@ -13,7 +13,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/sum_op.h"
 #include <vector>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc
index 2748c07f9e6d7c..360c2125ed1f6f 100644
--- a/paddle/fluid/operators/transpose_op_xpu.cc
+++ b/paddle/fluid/operators/transpose_op_xpu.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 36a956762174e1..efd25bc8929409 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -69,7 +69,8 @@ cc_library(place SRCS place.cc DEPS enforce boost)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 
 if(WITH_XPU)
-cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
+cc_library(xpu_info SRCS xpu/xpu_info.cc DEPS gflags glog enforce xpulib)
+cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib)
 endif()
 
 if(WITH_ASCEND)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 82f14c612d1fa2..c7162f58de284c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -196,7 +196,10 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
 #ifdef PADDLE_WITH_XPU
-XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
+XPUDeviceContext::XPUDeviceContext() {  // default ctor: place_ is default-built
+  context_ = xpu::create_context();  // xpu::Context later handed out by x_context()
+  xpu_version_ = get_xpu_version(place_.device);  // XPU1 or XPU2 for that device
+}
 
 XPUDeviceContext::~XPUDeviceContext() {}
 
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 68589f546dc787..abac12ff266486 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -68,8 +68,8 @@ struct GpuDevice;
 }  // namespace Eigen
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_header.h"
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -137,12 +137,14 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
 };
 
 #ifdef PADDLE_WITH_XPU
+namespace xpu = baidu::xpu::api;
 class XPUDeviceContext : public DeviceContext {
  public:
   XPUDeviceContext();
   explicit XPUDeviceContext(XPUPlace place);
   virtual ~XPUDeviceContext();
   Eigen::DefaultDevice* eigen_device() const { return nullptr; }
+  XPUVersion xpu_version() const { return xpu_version_; }
   Place GetPlace() const override;
   xpu::Context* x_context() const;
 
@@ -159,6 +161,7 @@ class XPUDeviceContext : public DeviceContext {
 
  private:
   XPUPlace place_;
+  XPUVersion xpu_version_;
   xpu::Context* context_;
 #ifdef PADDLE_WITH_XPU_BKCL
   BKCLContext_t bkcl_context_;
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index ac6988d350f4f3..2e0ba9d241c72b 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -29,8 +29,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_header.h"
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
 #endif
 
 #ifdef WITH_WIN_DUMP_DBG
diff --git a/paddle/fluid/platform/xpu/xpu1_op_list.h b/paddle/fluid/platform/xpu/xpu1_op_list.h
new file mode 100644
index 00000000000000..131525718cac75
--- /dev/null
+++ b/paddle/fluid/platform/xpu/xpu1_op_list.h
@@ -0,0 +1,230 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace platform {
+
+using vartype = paddle::framework::proto::VarType;
+using pOpKernelType = paddle::framework::OpKernelType;
+using XPUKernelSet =
+    std::unordered_set<pOpKernelType, paddle::framework::OpKernelType::Hash>;
+using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
+
+inline XPUOpMap& get_kl1_ops() {
+  // KL1-supported ops: op name -> set of supported (data_type, place) keys.
+  static XPUOpMap s_xpu1_kernels{
+      {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sigmoid_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"hard_swish_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"leaky_relu_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"affine_channel",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"affine_channel_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"batch_norm_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"clip_by_norm",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"coalesce_tensor",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"c_reduce_sum",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"c_allreduce_sum",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"logical_or", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"logical_and", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"logical_not", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"depthwise_conv2d",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"depthwise_conv2d_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"deformable_conv",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"deformable_conv_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"dropout_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_sub",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_sub_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_add",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_add_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_div",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_div_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_pow",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_floordiv",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_mul",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_mul_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_max",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_max_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_min",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"elementwise_min_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"fill_constant",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"gaussian_random",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"bilinear_interp",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"bilinear_interp_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"bilinear_interp_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"bilinear_interp_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"nearest_interp_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"layer_norm_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"load", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"log_loss_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lookup_table_v2",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lookup_table_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"matmul_v2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"one_hot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_sum_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reduce_max_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"reshape2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"roi_align_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"softmax_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"softmax_with_cross_entropy",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"squeeze_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"squeeze2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"transpose_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"transpose2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"truncated_gaussian_random",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"uniform_random",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"unsqueeze_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"unsqueeze2_grad",
+       XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+      {"momentum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}
+      // AddMore (callers look entries up by op type, so spell names exactly)
+  };
+
+  return s_xpu1_kernels;
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h
new file mode 100644
index 00000000000000..fc80e5ee962f99
--- /dev/null
+++ b/paddle/fluid/platform/xpu/xpu2_op_list.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace platform {
+
+using vartype = paddle::framework::proto::VarType;
+using pOpKernelType = paddle::framework::OpKernelType;
+using XPUKernelSet =
+    std::unordered_set<pOpKernelType, paddle::framework::OpKernelType::Hash>;
+using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;
+
+inline XPUOpMap& get_kl2_ops() {
+  // KL2-supported ops: op name -> set of supported (data_type, place) keys.
+  static XPUOpMap s_xpu2_kernels{
+      {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                            pOpKernelType(vartype::FP16, XPUPlace())})},
+      // AddMore
+  };
+
+  return s_xpu2_kernels;
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu/xpu_header.h
similarity index 95%
rename from paddle/fluid/platform/xpu_header.h
rename to paddle/fluid/platform/xpu/xpu_header.h
index 99f4224b5d408a..caee41ae299c75 100644
--- a/paddle/fluid/platform/xpu_header.h
+++ b/paddle/fluid/platform/xpu/xpu_header.h
@@ -21,12 +21,9 @@
 
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/float16.h"
-#include "xpu/api.h"
-#include "xpu/refactor/fusion.h"
-#include "xpu/refactor/math.h"
-#include "xpu/refactor/nn.h"
 #include "xpu/runtime.h"
 #include "xpu/runtime_ex.h"
+#include "xpu/xdnn.h"
 
 namespace xpu = baidu::xpu::api;
 
diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu/xpu_info.cc
similarity index 86%
rename from paddle/fluid/platform/xpu_info.cc
rename to paddle/fluid/platform/xpu/xpu_info.cc
index f88248fda7e65e..6b8ab16b47d68c 100644
--- a/paddle/fluid/platform/xpu_info.cc
+++ b/paddle/fluid/platform/xpu/xpu_info.cc
@@ -8,14 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
 
 #include <algorithm>
 #include <cstdlib>
 #include <string>
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 #include "paddle/fluid/string/split.h"
 
 DEFINE_string(selected_xpus, "",
@@ -103,5 +103,21 @@ void SetXPUDeviceId(int id) {
                         ret));
 }
 
+// Maps the model attribute of device `dev_id` to an XPUVersion: K100 and K200
+// are reported as XPU1, every other model as XPU2.
+XPUVersion get_xpu_version(int dev_id) {
+  uint64_t model = 0;
+  int ret = xpu_device_get_attr(&model, XPUATTR_MODEL, dev_id);
+  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                    platform::errors::External(
+                        "xpu_device_get_attr return wrong value[%d]", ret));
+  if (model != K100 && model != K200) {
+    VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n";
+    return XPU2;
+  }
+  VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n";
+  return XPU1;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu/xpu_info.h
similarity index 95%
rename from paddle/fluid/platform/xpu_info.h
rename to paddle/fluid/platform/xpu/xpu_info.h
index 2bf7b0b5cb647a..3cb79d51eb7bb6 100644
--- a/paddle/fluid/platform/xpu_info.h
+++ b/paddle/fluid/platform/xpu/xpu_info.h
@@ -51,6 +51,9 @@ class XPUDeviceGuard {
   int prev_id_{-1};
 };
 
+enum XPUVersion { XPU1, XPU2 };  // KUNLUN device version
+XPUVersion get_xpu_version(int dev_id);  // K100/K200 -> XPU1, otherwise XPU2
+
 }  // namespace platform
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/platform/xpu/xpu_op_list.cc b/paddle/fluid/platform/xpu/xpu_op_list.cc
new file mode 100644
index 00000000000000..b3349407942bd1
--- /dev/null
+++ b/paddle/fluid/platform/xpu/xpu_op_list.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_WITH_XPU
+#include <string>
+
+#include "paddle/fluid/platform/xpu/xpu1_op_list.h"
+#include "paddle/fluid/platform/xpu/xpu2_op_list.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_op_list.h"
+
+namespace paddle {
+namespace platform {
+
+// Returns true if `op_name` has a kernel of `type` in the op list selected by
+// the XPU version of the device in `type.place_`: the KL2 list for XPU2
+// devices, the KL1 list otherwise.
+bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) {
+  auto v =
+      get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device);
+  // Bind the reference once: assigning through a reference to the KL1 table
+  // would copy the KL2 table over it and corrupt later XPU1 lookups.
+  auto& ops = (v == XPU2) ? get_kl2_ops() : get_kl1_ops();
+
+  // Single lookup of the op; avoids operator[] on the shared table.
+  auto it = ops.find(op_name);
+  return it != ops.end() && it->second.find(type) != it->second.end();
+}
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/xpu/xpu_op_list.h b/paddle/fluid/platform/xpu/xpu_op_list.h
new file mode 100644
index 00000000000000..487bc8ac48b66f
--- /dev/null
+++ b/paddle/fluid/platform/xpu/xpu_op_list.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+#include <string>
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace platform {
+
+using pOpKernelType = paddle::framework::OpKernelType;
+// Returns true if op `op_name` with kernel type `type` is in the XPU op list.
+bool is_xpu_support_op(std::string op_name, const pOpKernelType& type);
+
+}  // namespace platform
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d646e06d8a47a9..fd0b352ec2efa6 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -117,7 +117,7 @@ limitations under the License. */
 #endif
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/platform/xpu_info.h"
+#include "paddle/fluid/platform/xpu/xpu_info.h"
 #endif
 
 #ifdef PADDLE_WITH_CRYPTO