diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index a296607bcb9db9..d616ecf147dd7d 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -193,6 +193,7 @@ XPUOpMap& get_kl2_ops() {
                    phi::DataType::BOOL,
                    phi::DataType::INT8,
                    phi::DataType::UINT8,
+                   phi::DataType::INT16,
                    phi::DataType::INT64,
                    phi::DataType::INT32})},
     {"check_finite_and_unscale",
diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
index 88e042994d5a6e..a22111207f92c5 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -96,6 +96,7 @@ XPUOpMap& get_kl3_ops() {
                    phi::DataType::BOOL})},
     {"assign_value",
      XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT64,
                    phi::DataType::INT32,
                    phi::DataType::INT64,
                    phi::DataType::FLOAT16,
@@ -215,6 +216,7 @@ XPUOpMap& get_kl3_ops() {
                    phi::DataType::BOOL,
                    phi::DataType::INT8,
                    phi::DataType::UINT8,
+                   phi::DataType::INT16,
                    phi::DataType::INT64,
                    phi::DataType::INT32})},
     {"check_finite_and_unscale",
diff --git a/paddle/phi/kernels/cpu/flatten2_grad_kernel.cc b/paddle/phi/kernels/cpu/flatten2_grad_kernel.cc
index ddb2f6140cb58f..74bab2ae02a6f7 100644
--- a/paddle/phi/kernels/cpu/flatten2_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/flatten2_grad_kernel.cc
@@ -25,5 +25,6 @@ PD_REGISTER_KERNEL(flatten2_grad,
                    uint8_t,
                    int,
                    int8_t,
+                   int16_t,
                    int64_t,
                    bool) {}
diff --git a/paddle/phi/kernels/cpu/flatten2_kernel.cc b/paddle/phi/kernels/cpu/flatten2_kernel.cc
index 358dfaa57c304f..2b2b6e696ff7c3 100644
--- a/paddle/phi/kernels/cpu/flatten2_kernel.cc
+++ b/paddle/phi/kernels/cpu/flatten2_kernel.cc
@@ -25,5 +25,6 @@ PD_REGISTER_KERNEL(flatten2,
                    uint8_t,
                    int,
                    int8_t,
+                   int16_t,
                    int64_t,
                    bool) {}
diff --git a/paddle/phi/kernels/cpu/pad_grad_kernel.cc b/paddle/phi/kernels/cpu/pad_grad_kernel.cc
index af1db8173f971c..7cea0820f97b4a 100644
--- a/paddle/phi/kernels/cpu/pad_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pad_grad_kernel.cc
@@ -24,6 +24,7 @@ PD_REGISTER_KERNEL(pad_grad,
                    phi::PadGradKernel,
                    float,
                    double,
+                   int16_t,
                    int,
                    int64_t,
                    phi::dtype::complex<float>,
diff --git a/paddle/phi/kernels/cpu/pad_kernel.cc b/paddle/phi/kernels/cpu/pad_kernel.cc
index ed0cb2f64442f4..474ba2ce29ad11 100644
--- a/paddle/phi/kernels/cpu/pad_kernel.cc
+++ b/paddle/phi/kernels/cpu/pad_kernel.cc
@@ -24,6 +24,7 @@ PD_REGISTER_KERNEL(pad,
                    phi::PadKernel,
                    float,
                    double,
+                   int16_t,
                    int,
                    int64_t,
                    phi::dtype::complex<float>,
diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc
index 25e6a2f3666510..f12194165a8e06 100644
--- a/paddle/phi/kernels/flatten_grad_kernel.cc
+++ b/paddle/phi/kernels/flatten_grad_kernel.cc
@@ -45,6 +45,7 @@ PD_REGISTER_KERNEL(flatten_grad,
                    double,
                    uint8_t,
                    int8_t,
+                   int16_t,
                    int,
                    int64_t,
                    bool) {}
diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc
index d5142fdef75685..2a168f938142ef 100644
--- a/paddle/phi/kernels/reshape_kernel.cc
+++ b/paddle/phi/kernels/reshape_kernel.cc
@@ -19,9 +19,6 @@
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
-#ifdef PADDLE_WITH_XPU
-#include "paddle/phi/backends/xpu/enforce_xpu.h"
-#endif

 namespace phi {

@@ -46,34 +43,6 @@ void ReshapeKernel(const Context& dev_ctx,
   out->ResetLoD(x.lod());
 }

-#ifdef PADDLE_WITH_XPU
-template <>
-void ReshapeKernel(const XPUContext& dev_ctx,
-                   const DenseTensor& x,
-                   const IntArray& shape,
-                   DenseTensor* out) {
-  MetaTensor meta_out(out);
-  InferMetaFromVecValue(x, shape.GetData(), &meta_out);
-
-  if (x.initialized() && x.Holder() == out->Holder()) {
-    dev_ctx.Alloc(out, x.dtype());
-    return;
-  }
-  dev_ctx.Alloc(out, x.dtype());
-  auto dims = out->dims();
-  auto* src_ptr = x.data();
-  auto* dst_ptr = out->data();
-  auto size = x.numel() * phi::SizeOf(x.dtype());
-  int ret = xpu::copy(dev_ctx.x_context(),
-                      reinterpret_cast<const int8_t*>(src_ptr),
-                      reinterpret_cast<int8_t*>(dst_ptr),
-                      size);
-  PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
-  out->Resize(dims);
-  out->ResetLoD(x.lod());
-}
-#endif
-
 template <typename Context>
 void ReshapeWithXShapeKernel(const Context& dev_ctx,
                              const DenseTensor& x,
diff --git a/paddle/phi/kernels/xpu/cast_kernel.cc b/paddle/phi/kernels/xpu/cast_kernel.cc
index 3096040472a0cd..a5e3e9290568b4 100644
--- a/paddle/phi/kernels/xpu/cast_kernel.cc
+++ b/paddle/phi/kernels/xpu/cast_kernel.cc
@@ -124,6 +124,7 @@ PD_REGISTER_KERNEL(cast,
                    XPU,
                    ALL_LAYOUT,
                    phi::CastKernel,
+                   int16_t,
                    int32_t,
                    float,
                    phi::dtype::float16,
diff --git a/test/dygraph_to_static/test_tensor_to.py b/test/dygraph_to_static/test_tensor_to.py
index 9b73ac99844d8b..c50c30e959cad6 100644
--- a/test/dygraph_to_static/test_tensor_to.py
+++ b/test/dygraph_to_static/test_tensor_to.py
@@ -25,6 +25,7 @@
 )

 import paddle
+from paddle import base

 # NOTE: only test in PIR mode

@@ -38,19 +39,20 @@
     "int32",
     "int64",
     "uint8",
-    "complex64",
-    "complex128",
     "bool",
-]
+] + ([] if base.core.is_compiled_with_xpu() else ["complex64", "complex128"])

 _cpu_place = "Place(cpu)"
 _gpu_place = "Place(gpu:0)"
+_xpu_place = "Place(xpu:0)"


 def place_res():
     def res():
         if paddle.is_compiled_with_cuda():
             return _gpu_place
+        elif paddle.is_compiled_with_xpu():
+            return _xpu_place
         else:
             return _cpu_place

@@ -125,6 +127,8 @@ def test_tensor_to_dtype(self):

     def test_tensor_to_device(self):
         if paddle.is_compiled_with_cuda():
             x = paddle.to_tensor([1, 2, 3], place="gpu")
+        elif paddle.is_compiled_with_xpu():
+            x = paddle.to_tensor([1, 2, 3], place="xpu")
         else:
             x = paddle.to_tensor([1, 2, 3])
@@ -136,6 +140,8 @@ def test_tensor_to_device(self):

     def test_tensor_to_device2(self):
         if paddle.is_compiled_with_cuda():
             x = paddle.to_tensor([1, 2, 3], place="gpu")
+        elif paddle.is_compiled_with_xpu():
+            x = paddle.to_tensor([1, 2, 3], place="xpu")
         else:
             x = paddle.to_tensor([1, 2, 3])
@@ -150,6 +156,8 @@ def test_tensor_to_device_dtype(self):
         places = ["cpu"]
         if paddle.is_compiled_with_cuda():
             places.append("gpu")
+        if paddle.is_compiled_with_xpu():
+            places.append("xpu")
         for dtype in _valid_dtypes:
             for place in places:
                 tensor_x = paddle.jit.to_static(to_device_dtype)(
@@ -158,6 +166,8 @@
                 place_x_str = str(tensor_x.place)
                 if "gpu" == place:
                     self.assertEqual(place_x_str, _gpu_place)
+                elif "xpu" == place:
+                    self.assertEqual(place_x_str, _xpu_place)
                 else:
                     self.assertEqual(place_x_str, _cpu_place)
                 type_x_str = str(tensor_x.dtype)
diff --git a/test/dygraph_to_static/test_to_tensor.py b/test/dygraph_to_static/test_to_tensor.py
index b36a09c4a12493..44ba50744852ee 100644
--- a/test/dygraph_to_static/test_to_tensor.py
+++ b/test/dygraph_to_static/test_to_tensor.py
@@ -40,6 +40,8 @@ def case1(x):
 def case2(x):
     if core.is_compiled_with_cuda():
         place = paddle.CUDAPlace(0)
+    elif core.is_compiled_with_xpu():
+        place = paddle.XPUPlace(0)
     else:
         place = paddle.CPUPlace()
     a = paddle.to_tensor(
@@ -53,6 +55,8 @@ def case3(x):
     paddle.set_default_dtype("float64")
     if core.is_compiled_with_cuda():
         place = paddle.CUDAPlace(0)
+    elif core.is_compiled_with_xpu():
+        place = paddle.XPUPlace(0)
     else:
         place = paddle.CPUPlace()
     a = paddle.to_tensor([1.0, 2.0, 3.0], place=place)
@@ -64,6 +68,8 @@ def case4(x):
     paddle.set_default_dtype("float64")
     if core.is_compiled_with_cuda():
         place = paddle.CUDAPlace(0)
+    elif core.is_compiled_with_xpu():
+        place = paddle.XPUPlace(0)
     else:
         place = paddle.CPUPlace()
     a = paddle.to_tensor([1], place=place)
@@ -182,6 +188,8 @@ def test_static(self):
         with paddle.static.program_guard(main_prog, startup_prog):
             if core.is_compiled_with_cuda():
                 place = paddle.CUDAPlace(0)
+            elif core.is_compiled_with_xpu():
+                place = paddle.XPUPlace(0)
             else:
                 place = paddle.CPUPlace()

diff --git a/test/legacy_test/test_Tensor_to.py b/test/legacy_test/test_Tensor_to.py
index 9821fac8616218..65aa691ed90992 100644
--- a/test/legacy_test/test_Tensor_to.py
+++ b/test/legacy_test/test_Tensor_to.py
@@ -31,10 +31,12 @@ def test_Tensor_to_dtype(self):
             "int32",
             "int64",
             "uint8",
-            "complex64",
-            "complex128",
             "bool",
-        ]
+        ] + (
+            []
+            if base.core.is_compiled_with_xpu()
+            else ["complex64", "complex128"]
+        )
         for dtype in valid_dtypes:
             tensorx = tensorx.to(dtype)
             typex_str = str(tensorx.dtype)
@@ -46,11 +48,14 @@ def test_Tensor_to_device(self):
         if base.core.is_compiled_with_cuda():
             places.append("gpu:0")
             places.append("gpu")
+        if base.core.is_compiled_with_xpu():
+            places.append("xpu:0")
+            places.append("xpu")

         for place in places:
             tensorx = tensorx.to(place)
             placex_str = str(tensorx.place)
-            if place == "gpu":
+            if place == "gpu" or place == "xpu":
                 self.assertTrue(placex_str, "Place(" + place + ":0)")
             else:
                 self.assertTrue(placex_str, "Place(" + place + ")")
@@ -68,6 +73,9 @@ def test_Tensor_to_device_dtype(self):
         if base.core.is_compiled_with_cuda():
             places.append("gpu:0")
             places.append("gpu")
+        if base.core.is_compiled_with_xpu():
+            places.append("xpu:0")
+            places.append("xpu")
         valid_dtypes = [
             "bfloat16",
             "float16",
@@ -78,15 +86,17 @@
             "int32",
             "int64",
             "uint8",
-            "complex64",
-            "complex128",
             "bool",
-        ]
+        ] + (
+            []
+            if base.core.is_compiled_with_xpu()
+            else ["complex64", "complex128"]
+        )
         for dtype in valid_dtypes:
             for place in places:
                 tensorx = tensorx.to(place, dtype)
                 placex_str = str(tensorx.place)
-                if place == "gpu":
+                if place == "gpu" or place == "xpu":
                     self.assertTrue(placex_str, "Place(" + place + ":0)")
                 else:
                     self.assertTrue(placex_str, "Place(" + place + ")")
diff --git a/test/legacy_test/test_dygraph_multi_forward.py b/test/legacy_test/test_dygraph_multi_forward.py
index 599160f5b39cb2..edbccb08d36c62 100644
--- a/test/legacy_test/test_dygraph_multi_forward.py
+++ b/test/legacy_test/test_dygraph_multi_forward.py
@@ -188,11 +188,12 @@ def test_mnist_forward_float32(self):
                 paddle.framework.random._manual_program_seed(SEED)
             else:
                 paddle.framework.random._manual_program_seed(SEED)
-            exe = base.Executor(
-                base.CPUPlace()
-                if not core.is_compiled_with_cuda()
-                else base.CUDAPlace(0)
-            )
+            if core.is_compiled_with_cuda():
+                exe = base.Executor(base.CUDAPlace(0))
+            elif core.is_compiled_with_xpu():
+                exe = base.Executor(base.XPUPlace(0))
+            else:
+                exe = base.Executor(base.CPUPlace())

             mnist = MNIST()
             sgd = paddle.optimizer.SGD(learning_rate=1e-3)
diff --git a/test/legacy_test/test_random_seed.py b/test/legacy_test/test_random_seed.py
index 8fbaf9a3d6942e..2af2bfff71551b 100644
--- a/test/legacy_test/test_random_seed.py
+++ b/test/legacy_test/test_random_seed.py
@@ -51,7 +51,7 @@ def test_generator_uniform_random_dygraph(self):
         x2_np = x2.numpy()
         x3_np = x3.numpy()

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05)
             np.testing.assert_allclose(x_np, x3_np, rtol=1e-05)

@@ -84,7 +84,10 @@ def test_generator_uniform_random_static(self):
             out2_res1 = np.array(out2[0])
             out2_res2 = np.array(out2[1])

-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
                 self.assertTrue(not np.allclose(out1_res2, out1_res1))
@@ -104,7 +107,7 @@ def test_gen_dropout_dygraph(self):
         y_np = y.numpy()
         y1_np = y1.numpy()

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             print(">>>>>>> dropout dygraph >>>>>>>")
             np.testing.assert_allclose(y_np, y1_np, rtol=1e-05)

@@ -129,7 +132,7 @@ def test_gen_dropout_static(self):
         out1_np = np.array(out1[0])
         out2_np = np.array(out2[0])

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             print(">>>>>>> dropout static >>>>>>>")
             np.testing.assert_allclose(out1_np, out2_np, rtol=1e-05)

@@ -150,7 +153,7 @@ def test_generator_gaussian_random_dygraph(self):
         x2_np = x2.numpy()
         x3_np = x3.numpy()

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             print(">>>>>>> gaussian random dygraph >>>>>>>")
             np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05)
             np.testing.assert_allclose(x_np, x3_np, rtol=1e-05)
@@ -184,7 +187,10 @@ def test_generator_gaussian_random_static(self):
             out2_res1 = np.array(out2[0])
             out2_res2 = np.array(out2[1])

-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 print(">>>>>>> gaussian random static >>>>>>>")
                 np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
@@ -207,7 +213,7 @@ def test_generator_randint_dygraph(self):
         x2_np = x2.numpy()
         x3_np = x3.numpy()

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             print(">>>>>>> randint dygraph >>>>>>>")
             np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05)
             np.testing.assert_allclose(x_np, x3_np, rtol=1e-05)
@@ -241,7 +247,10 @@ def test_generator_uniform_random_static_1(self):
             out2_res1 = np.array(out2[0])
             out2_res2 = np.array(out2[1])

-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
                 self.assertTrue(not np.allclose(out1_res2, out1_res1))
@@ -262,7 +271,7 @@ def test_generator_randint_dygraph_1(self):
         x1_np = x1.numpy()
         x2_np = x2.numpy()
         x3_np = x3.numpy()
-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05)
             np.testing.assert_allclose(x_np, x3_np, rtol=1e-05)

@@ -295,7 +304,10 @@ def test_generator_ranint_static(self):
             out2_res1 = np.array(out2[0])
             out2_res2 = np.array(out2[1])

-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 print(">>>>>>> randint static >>>>>>>")
                 np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
@@ -319,7 +331,7 @@ def test_generator_randperm_dygraph(self):
         x2_np = x2.numpy()
         x3_np = x3.numpy()

-        if not core.is_compiled_with_cuda():
+        if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu():
             print(">>>>>>> randperm dygraph >>>>>>>")
             np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05)
             np.testing.assert_allclose(x_np, x3_np, rtol=1e-05)
@@ -353,7 +365,10 @@ def test_generator_randperm_static(self):
             out2_res1 = np.array(out2[0])
             out2_res2 = np.array(out2[1])

-            if not core.is_compiled_with_cuda():
+            if (
+                not core.is_compiled_with_cuda()
+                and not core.is_compiled_with_xpu()
+            ):
                 print(">>>>>>> randperm static >>>>>>>")
                 np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05)
                 np.testing.assert_allclose(out1_res2, out2_res2, rtol=1e-05)
diff --git a/test/xpu/test_cast_op_xpu.py b/test/xpu/test_cast_op_xpu.py
index 76c310cf4a83fd..e9b66c9a84aa61 100644
--- a/test/xpu/test_cast_op_xpu.py
+++ b/test/xpu/test_cast_op_xpu.py
@@ -29,6 +29,7 @@
 from paddle.base import Program, core, program_guard

 typeid_dict = {
+    'int16': int(core.VarDesc.VarType.INT16),
     'int32': int(core.VarDesc.VarType.INT32),
     'int64': int(core.VarDesc.VarType.INT64),
     'float32': int(core.VarDesc.VarType.FP32),
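
Illustrative sketch (not part of the patch above): a minimal Python snippet showing the behavior these kernel registrations enable, assuming a Paddle build with XPU support. The "xpu" place string, paddle.is_compiled_with_xpu(), and Tensor.to(place, dtype) follow the usage in the tests touched by this diff; the snippet falls back to CPU when XPU is unavailable.

import paddle

# Pick the XPU device when this build has one; otherwise stay on CPU.
place = "xpu" if paddle.is_compiled_with_xpu() else "cpu"
x = paddle.to_tensor([1, 2, 3], dtype="int32", place=place)

# Exercises the int16 cast path that this change registers for the XPU backend.
y = x.astype("int16")
print(y.dtype)

# Tensor.to with an explicit device and dtype, as covered by test_Tensor_to.py.
z = x.to(place, "int16")
print(z.place, z.dtype)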