From bd454315e127465b9f434c12bf83df9189eac409 Mon Sep 17 00:00:00 2001 From: baocheny Date: Fri, 31 Mar 2023 00:36:58 +0800 Subject: [PATCH 1/7] rem is_compiled_with_npu --- .flake8 | 3 - .pre-commit-config.yaml | 1 - paddle/fluid/pybind/pybind.cc | 67 -------------- pyproject.toml | 2 - python/paddle/__init__.py | 3 - python/paddle/amp/grad_scaler.py | 5 +- python/paddle/device/__init__.py | 17 ++-- python/paddle/distributed/collective.py | 5 - python/paddle/distributed/fleet/launch.py | 27 +----- .../paddle/distributed/fleet/launch_utils.py | 80 +--------------- .../distributed/fleet/layers/mpu/mp_ops.py | 4 +- .../fleet/meta_optimizers/common.py | 2 +- .../sharding/offload_helper.py | 6 -- .../meta_optimizers/sharding_optimizer.py | 6 +- .../sharding/group_sharded_utils.py | 4 +- .../fleet/utils/hybrid_parallel_inference.py | 6 +- .../distributed/launch/context/device.py | 10 -- python/paddle/distributed/parallel.py | 18 +--- .../ps/utils/collective_transpiler.py | 33 +------ .../distributed/transpiler/collective.py | 32 +------ python/paddle/fluid/__init__.py | 6 -- python/paddle/fluid/device_worker.py | 2 - .../fluid/dygraph/varbase_patch_methods.py | 6 +- python/paddle/fluid/executor.py | 4 +- python/paddle/fluid/framework.py | 92 +------------------ python/paddle/fluid/optimizer.py | 10 +- .../tests/unittests/c_embedding_op_base.py | 11 +-- .../fluid/tests/unittests/eager_op_test.py | 10 +- .../fluid/tests/unittests/test_device.py | 23 ----- .../fluid/tests/unittests/test_dist_base.py | 33 ------- .../fluid/tests/unittests/test_var_base.py | 3 - python/paddle/hapi/model.py | 27 ------ python/paddle/nn/functional/conv.py | 6 +- python/paddle/static/amp/amp_nn.py | 2 +- python/paddle/static/amp/decorator.py | 4 +- python/paddle/static/amp/fp16_lists.py | 2 +- python/paddle/static/nn/common.py | 2 +- python/paddle/utils/install_check.py | 41 ++------- .../custom_kernel_dot_c_setup.py | 4 - test/custom_kernel/custom_kernel_dot_setup.py | 4 - tools/get_quick_disable_lt.py | 3 - 41 files changed, 57 insertions(+), 569 deletions(-) diff --git a/.flake8 b/.flake8 index d87d6b43e3a61..0015cd971fe23 100644 --- a/.flake8 +++ b/.flake8 @@ -8,9 +8,6 @@ exclude = ./python/paddle/fluid/tra**, # Exclude third-party libraries ./python/paddle/utils/gast/**, - # Exclude files that will be removed in the future, see more at - # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731 - ./python/paddle/fluid/tests/unittests/npu/**, ./python/paddle/fluid/tests/unittests/mlu/** ignore = # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ccf1db464d344..80010c9b0c431 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,6 @@ exclude: | paddle/fluid/framework/fleet/heter_ps/cudf/.+| paddle/fluid/distributed/ps/thirdparty/round_robin.h| python/paddle/utils/gast/.+| - python/paddle/fluid/tests/unittests/npu/.+| python/paddle/fluid/tests/unittests/mlu/.+ )$ repos: diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4328f638d852f..ac26b8c140cdf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -154,12 +154,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/device/npu/npu_profiler.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" @@ -291,14 +285,6 @@ bool IsCompiledWithXPU() { #endif } -bool IsCompiledWithNPU() { -#ifndef PADDLE_WITH_ASCEND_CL - return false; -#else - return true; -#endif -} - bool IsCompiledWithCustomDevice(std::string device_type) { #ifndef PADDLE_WITH_CUSTOM_DEVICE return false; @@ -1626,18 +1612,6 @@ All parameter, weight, gradient are variables in Paddle. "Please recompile or reinstall Paddle with MLU support.")); #else return new paddle::platform::MLUDeviceContext(place); -#endif - }) - .def_static( - "create", - [](paddle::platform::NPUPlace &place) - -> paddle::platform::DeviceContext * { -#ifndef PADDLE_WITH_ASCEND_CL - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with NPU support.")); -#else - return new paddle::platform::NPUDeviceContext(place); #endif }) .def_static("create", @@ -1809,13 +1783,6 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(scope, place); }) - .def("run", - [](OperatorBase &self, - const Scope &scope, - const platform::NPUPlace &place) { - pybind11::gil_scoped_release release; - self.Run(scope, place); - }) .def("run", [](OperatorBase &self, const Scope &scope, @@ -2034,7 +2001,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); - m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice); m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); @@ -2372,39 +2338,6 @@ All parameter, weight, gradient are variables in Paddle. 
#endif #endif -#ifdef PADDLE_WITH_ASCEND_CL - m.def("get_npu_device_count", platform::GetNPUDeviceCount); - m.def("npu_finalize", []() { - platform::HCCLCommContext::Instance().ReleaseHCCLComms(); - - auto &pool = platform::DeviceContextPool::Instance(); - auto devices = platform::GetSelectedNPUDevices(); - for (size_t i = 0; i < devices.size(); ++i) { - platform::NPUDeviceGuard guard(devices[i]); - pool.Get(platform::NPUPlace(devices[i]))->Wait(); - } - platform::AclInstance::Instance().Finalize(); - }); - - py::class_(m, "NPUProfConfigWrapper"); - - m.def("npu_prof_init", platform::NPUProfilerInit); - m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStart(c.ptr()); - }); - m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStop(c.ptr()); - }); - m.def("npu_prof_finalize", platform::NPUProfilerFinalize); - m.def("npu_prof_create_config", []() { - return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); - }); - - m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerDestroyConfig(c.ptr()); - }); -#endif - #ifdef PADDLE_WITH_IPU m.def("get_ipu_device_count", platform::GetIPUDeviceCount); #endif diff --git a/pyproject.toml b/pyproject.toml index 526b4e9e486cc..2a4fdc775f323 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ extend_skip_glob = [ "python/paddle/fluid/[!t]**", "python/paddle/fluid/tra**", "python/paddle/utils/gast/**", - "python/paddle/fluid/tests/unittests/npu/**", "python/paddle/fluid/tests/unittests/mlu/**", ] @@ -24,7 +23,6 @@ exclude = [ "./python/paddle/fluid/[!t]**", "./python/paddle/fluid/tra**", "./python/paddle/utils/gast/**", - "./python/paddle/fluid/tests/unittests/npu/**", "./python/paddle/fluid/tests/unittests/mlu/**", ] target-version = "py37" diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ca7c4b525434f..8b87fb298f323 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -333,7 +333,6 @@ from .framework import CPUPlace # noqa: F401 from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 -from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 from .framework import MLUPlace # noqa: F401 from .framework import CustomPlace # noqa: F401 @@ -363,7 +362,6 @@ from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 -from .device import is_compiled_with_npu # noqa: F401 from .device import is_compiled_with_ipu # noqa: F401 from .device import is_compiled_with_mlu # noqa: F401 from .device import is_compiled_with_cinn # noqa: F401 @@ -513,7 +511,6 @@ 'histogram', 'multiplex', 'CUDAPlace', - 'NPUPlace', 'empty', 'shape', 'real', diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 3268783c742ca..009316514f5ec 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -106,11 +106,10 @@ def __init__( tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() - or tracer._expected_place.is_npu_place() or tracer._expected_place.is_custom_place() ): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.' 
+ 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place ) enable = False @@ -327,7 +326,7 @@ def _unscale(self, optimizer): if param.dtype == core.VarDesc.VarType.FP32 ] self._found_inf = self._temp_found_inf_value_false - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): float_status = _legacy_C_ops.alloc_float_status() _legacy_C_ops.clear_float_status(float_status, float_status) diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index b05c5be8d4568..7ab7deb5c28a7 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -248,14 +248,14 @@ def _convert_to_place(device): device_id = int(selected_xpus[0]) place = core.XPUPlace(device_id) elif lower_device == 'npu': - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): raise ValueError( "The device should not be 'npu', " "since PaddlePaddle is not compiled with NPU" ) selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") device_id = int(selected_npus[0]) - place = core.NPUPlace(device_id) + place = core.CustomPlace('npu', device_id) elif lower_device == 'ipu': if not core.is_compiled_with_ipu(): raise ValueError( @@ -298,7 +298,7 @@ def _convert_to_place(device): device_id = int(device_id) place = core.XPUPlace(device_id) if avaliable_npu_device: - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): device_info_list = device.split(':', 1) device_type = device_info_list[0] if device_type in core.get_all_custom_device_type(): @@ -316,7 +316,7 @@ def _convert_to_place(device): device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) - place = core.NPUPlace(device_id) + place = core.CustomPlace('npu', device_id) if avaliable_mlu_device: if not core.is_compiled_with_mlu(): raise ValueError( @@ -404,9 +404,6 @@ def get_device(): elif isinstance(place, core.XPUPlace): device_id = place.get_device_id() device = 'xpu:' + str(device_id) - elif isinstance(place, core.NPUPlace): - device_id = place.get_device_id() - device = 'npu:' + str(device_id) elif isinstance(place, core.IPUPlace): num_devices = core.get_ipu_device_count() device = "ipus:{{0-{}}}".format(num_devices - 1) @@ -529,7 +526,7 @@ class Event: Parameters: device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). enable_timing (bool, optional): indicates if the event should measure time, default is False blocking (bool, optional): if True, ``wait`` will be blocking, default is False interprocess (bool): if True, the event can be shared between processes, default is False @@ -674,7 +671,7 @@ class Stream: Parameters: device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. 
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). priority(int, optional): priority of the CUDA stream. Can be either 1 (high priority) or 2 (low priority). By default, streams have priority 2. @@ -996,7 +993,7 @@ def synchronize(device=None): Parameters: device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None. It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n). Examples: .. code-block:: python # required: custom_device diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 774112467fb91..6dd74531a0225 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): core.NCCLParallelContext(strategy, place).init_with_ring_id( ring_id ) - elif core.is_compiled_with_npu(): - place = core.NPUPlace(genv.device_id) - core.HCCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) elif core.is_compiled_with_mlu(): place = core.MLUPlace(genv.device_id) core.CNCLParallelContext(strategy, place).init_with_ring_id( diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 32a36783a71b9..354b4b708619c 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -64,7 +64,7 @@ from argparse import REMAINDER, ArgumentParser from paddle import framework -from paddle.distributed.fleet import ascend_utils, cloud_utils, launch_utils +from paddle.distributed.fleet import cloud_utils, launch_utils from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic from paddle.distributed.fleet.launch_utils import ( DeviceMode, @@ -155,16 +155,6 @@ def _parse_args(): ) base_group.add_argument("--selected_xpus", dest="xpus") - if framework.core.is_compiled_with_npu(): - base_group.add_argument( - "--npus", - type=str, - default=None, - help="It's for xpu training. 
For example: " - "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu.", - ) - base_group.add_argument("--selected_npus", dest="npus") - if framework.core.is_compiled_with_mlu(): base_group.add_argument( "--mlus", @@ -417,13 +407,6 @@ def get_cluster_info(args): args.ips, device_mode, devices_per_proc, start_port ) logger.debug("get cluster from cloud:{}".format(cluster)) - elif device_mode == DeviceMode.ASCEND_NPU: - # for ascend - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port, - ) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args( @@ -503,8 +486,6 @@ def infer_backend(args): return if framework.core.is_compiled_with_cuda(): args.backend = 'nccl' - elif framework.core.is_compiled_with_npu(): - args.backend = 'unknown' elif framework.core.is_compiled_with_xpu(): args.backend = 'bkcl' elif framework.core.is_compiled_with_mlu(): @@ -557,8 +538,6 @@ def which_distributed_mode(args): if framework.core.is_compiled_with_cuda(): accelerators = framework.core.get_cuda_device_count() - elif framework.core.is_compiled_with_npu(): - accelerators = framework.core.get_npu_device_count() elif framework.core.is_compiled_with_xpu(): accelerators = framework.core.get_xpu_device_count() elif framework.core.is_compiled_with_mlu(): @@ -593,7 +572,7 @@ def which_distributed_mode(args): ): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. " + "Not found distinct arguments and not compiled with cuda or xpu or mlu. " "But found args.servers not empty, default use ps mode" ) return DistributeMode.PS @@ -601,7 +580,7 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu or npu or mlu. " + "Not found distinct arguments and compiled with cuda or xpu or mlu. " "Default use collective mode" ) return DistributeMode.COLLECTIVE diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index ef30b7af9bc86..82d304940ce51 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -55,7 +55,6 @@ class DeviceMode: GPU = 1 KUNLUN = 2 XPU = 2 - ASCEND_NPU = 3 UNKNOWN = 3 MLU = 4 @@ -308,11 +307,7 @@ def get_cluster( ), "current trainer_endpoints size should be greater equal than acclerators size." 
for i in range(len(devices_per_proc)): trainer = Trainer() - if ( - device_mode == DeviceMode.GPU - or device_mode == DeviceMode.ASCEND_NPU - or device_mode == DeviceMode.MLU - ): + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.MLU: if isinstance(devices_per_proc[i], (list, tuple)): trainer.accelerators.extend(devices_per_proc[i]) pod.accelerators.extend(devices_per_proc[i]) @@ -555,13 +550,6 @@ def start_local_trainers( proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators] ) - - elif ( - len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU - ): - proc_env["FLAGS_selected_npus"] = "%s" % ",".join( - [str(g) for g in t.accelerators] - ) elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU: proc_env["FLAGS_selected_mlus"] = "%s" % ",".join( [str(g) for g in t.accelerators] @@ -773,40 +761,6 @@ def get_xpus(xpus): return res_xpus -def get_npus(npus): - if npus is None: - npus_num = framework.core.get_npu_device_count() - res_npus = [str(x) for x in range(0, npus_num)] - else: - npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") - if npu_visible_devices is None or npu_visible_devices == "": - res_npus = [x.strip() for x in npus.split(',')] - else: - # change npus into relative values - # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7; - # therefore npus=0,1,2,3 - npu_visible_devices_list = npu_visible_devices.split(',') - for x in npus.split(','): - assert x in npu_visible_devices_list, ( - "Can't find " - "your npus %s in ASCEND_VISIBLE_DEVICES[%s]." - % (x, npu_visible_devices) - ) - res_npus = [ - npu_visible_devices_list.index(x.strip()) - for x in npus.split(',') - ] - logger.info( - "Change selected_npus into reletive values. --ips:{} " - "will change into relative_ips:{} according to your " - "ASCEND_VISIBLE_DEVICES:{}".format( - npus, res_npus, npu_visible_devices_list - ) - ) - - return res_npus - - def get_mlus(mlus): if mlus is None: mlus_num = framework.core.get_mlu_device_count() @@ -856,16 +810,6 @@ def get_device_mode(backend): ): print("launch train in heter mode with XPU device.") return DeviceMode.XPU - if ( - framework.core.is_compiled_with_npu() - and framework.core.get_npu_device_count() > 0 - ): - print("launch train in heter mode with NPU device.") - return DeviceMode.ASCEND_NPU - - if backend == 'hccl' and framework.core.get_npu_device_count() > 0: - print("launch train in ascend npu mode!") - return DeviceMode.ASCEND_NPU if backend == 'nccl' and framework.core.get_cuda_device_count() > 0: print("launch train in GPU mode!") @@ -905,19 +849,6 @@ def get_device_proc_info(args): devices_per_proc = [gpus[i : i + n] for i in range(0, len(gpus), n)] else: devices_per_proc = gpus - elif device_mode == DeviceMode.ASCEND_NPU: - npus = get_npus(args.npus) - if args.nproc_per_node is not None: - assert ( - len(npus) % int(args.nproc_per_node) - ) == 0, "npus' number:{} mod args.nproc_per_node:{} must == 0".format( - len(npus), args.nproc_per_node - ) - - n = int(len(npus) / int(args.nproc_per_node)) - devices_per_proc = [npus[i : i + n] for i in range(0, len(npus), n)] - else: - devices_per_proc = npus elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: @@ -2144,12 +2075,6 @@ def check_backend(backend): "your paddle is not compiled with xpu but you assign 'bkcl' as backend." 
) - if backend == 'hccl' and not framework.core.is_compiled_with_npu(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with npu but you assign 'hccl' as backend." - ) - if backend == 'cncl' and not framework.core.is_compiled_with_mlu(): raise ValueError( "paddle.distributed initialize error, " @@ -2177,9 +2102,6 @@ def get_backend_by_compile_flag(): if framework.core.is_compiled_with_xpu(): return 'bkcl' - if framework.core.is_compiled_with_npu(): - return 'hccl' - if framework.core.is_compiled_with_mlu(): return 'cncl' diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index fade4aa61ce84..44a01677bee5a 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -536,7 +536,9 @@ def _parallel_linear( # NOTE: npu linear function use matmul_v2 but linear use matmul linear_function = ( - _linear if core.is_compiled_with_npu() else paddle.nn.functional.linear + _linear + if core.is_compiled_with_custom_device('npu') + else paddle.nn.functional.linear ) linear_out = linear_function( x, diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index bc79bea4e2359..c9474d397417a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -196,7 +196,7 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward, }, ) - elif core.is_compiled_with_npu(): + elif core.is_compiled_with_custom_device('npu'): block.append_op( type='c_gen_hccl_id', inputs={}, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index d9a30150accdb..bcdd93ffba2e5 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -26,23 +26,17 @@ class PlaceType: CUDA = 1 CUDA_PINNED = 2 XPU = 3 # unsupport for now - NPU = 4 - NPU_PINNED = 5 @staticmethod def default_device(): if core.is_compiled_with_cuda(): return PlaceType.CUDA - elif core.is_compiled_with_npu(): - return PlaceType.NPU return PlaceType.CPU @staticmethod def default_pinned(): if core.is_compiled_with_cuda(): return PlaceType.CUDA_PINNED - elif core.is_compiled_with_npu(): - return PlaceType.NPU_PINNED return PlaceType.CPU diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 264c48870e84f..5a3c058ace66e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -600,7 +600,7 @@ def _adapt_amp_clip_without_sharding(self): rings = [self.mp_ring_id, self.pp_ring_id] # FIXME(wangxi): some problem with NPU found_finite, need sync with DP - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): rings += [self.dp_ring_id] FP16Utils.sync_amp_check_nan_inf(main_block, rings) @@ -725,7 +725,7 @@ def minimize_impl( self._dump_program_for_debug() # GPU need to wait server ready, GPU and NPU is Layered connection - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): self._wait() return optimize_ops, params_grads @@ -847,7 +847,7 @@ def 
_init_pipeline_comm(self, startup_block):
                 sync=False,
             )
 
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._init_npu_pipeline_comm(startup_block)
             return
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 1a09bb3fa92a2..100750dd28b34 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -200,7 +200,7 @@ def device_guard(dev_id=0, device="cpu"):
     origin_device = paddle.device.get_device()
     if device == "cpu":
         paddle.set_device(device)
-    elif device in ["gpu", "xpu", "npu"]:
+    elif device in ["gpu", "xpu"]:
         paddle.set_device("{}:{}".format(device, dev_id))
     try:
         yield
@@ -313,8 +313,6 @@ def cvt_to_device(x, dev_id, blocking=True):
     """
     if paddle.is_compiled_with_cuda():
         place = paddle.CUDAPlace(dev_id)
-    elif paddle.is_compiled_with_npu():
-        place = paddle.NPUPlace(dev_id)
     elif paddle.is_compiled_with_xpu():
         place = paddle.XPUPlace(dev_id)
     else:
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index b36af2c2a040b..68b5581d3be37 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -201,11 +201,9 @@ def __init__(
         assert isinstance(main_program, Program)
 
         self._device = None
-        if core.is_compiled_with_npu():
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             self._device = "gpu"
-        assert self._device, "Only gpu and npu are supported."
+        assert self._device, "Only gpu is supported."
 
         assert not in_dygraph_mode(), "Only static graph mode is supported."
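Every hunk in this series applies the same mechanical substitution: the dedicated NPU build checks and places are retired in favor of the CustomDevice plugin interface. A minimal sketch of the before/after pattern, using only calls that appear elsewhere in this patch (core.is_compiled_with_custom_device, core.CustomPlace); the device index 0 is an arbitrary example:

    # Sketch of the migration applied throughout this series, assuming a
    # Paddle build where NPU support, if present, comes from a CustomDevice
    # plugin rather than a dedicated PADDLE_WITH_ASCEND_CL build.
    from paddle.fluid import core

    # old (removed): core.is_compiled_with_npu() / core.NPUPlace(0)
    if core.is_compiled_with_custom_device('npu'):
        # Plugin devices are addressed as CustomPlace(device_type, device_id).
        place = core.CustomPlace('npu', 0)
    else:
        place = core.CPUPlace()
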
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 48dba9af56411..c3f6e504dcc77 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -24,7 +24,6 @@ class DeviceType: CPU = 'cpu' GPU = 'gpu' XPU = 'xpu' - NPU = 'npu' MLU = 'mlu' IPU = 'ipu' CUSTOM_DEVICE = 'custom_device' @@ -69,8 +68,6 @@ def get_selected_device_key(self): return 'FLAGS_selected_cpus' if self._dtype == DeviceType.GPU: return 'FLAGS_selected_gpus' - if self._dtype == DeviceType.NPU: - return 'FLAGS_selected_npus' if self._dtype == DeviceType.XPU: return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: @@ -114,9 +111,6 @@ def parse_device(self): elif 'XPU_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.XPU visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif 'ASCEND_VISIBLE_DEVICES' in os.environ: - dev._dtype = DeviceType.NPU - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") elif 'MLU_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.MLU visible_devices = os.getenv("MLU_VISIBLE_DEVICES") @@ -158,10 +152,6 @@ def get_custom_devices_count(device_type): dev._dtype = DeviceType.XPU num = core.get_xpu_device_count() visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif core.is_compiled_with_npu(): - dev._dtype = DeviceType.NPU - num = core.get_npu_device_count() - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") elif core.is_compiled_with_mlu(): dev._dtype = DeviceType.MLU num = core.get_mlu_device_count() diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 2be2f097be984..56dc3741e3c1f 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -721,9 +721,6 @@ def __init__(self): elif core.is_compiled_with_xpu(): selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") self._device_id = int(selected_xpus[0]) - elif core.is_compiled_with_npu(): - selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") - self._device_id = int(selected_npus[0]) elif core.is_compiled_with_mlu(): selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") self._device_id = int(selected_mlus[0]) @@ -892,11 +889,10 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) if ( - backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] + backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl'] and ( core.is_compiled_with_cuda() or core.is_compiled_with_xpu() - or core.is_compiled_with_npu() or core.is_compiled_with_mlu() ) ) or backend == 'xccl': @@ -998,7 +994,6 @@ def train(): is_cpu_only or core.is_compiled_with_cuda() or core.is_compiled_with_xpu() - or core.is_compiled_with_npu() or core.is_compiled_with_mlu() or backend == "xccl" ): @@ -1018,9 +1013,6 @@ def train(): elif not is_cpu_only and core.is_compiled_with_xpu(): _check_var_exists('FLAGS_selected_xpus') backend = "bkcl" if backend == "auto" else backend - elif not is_cpu_only and core.is_compiled_with_npu(): - _check_var_exists('FLAGS_selected_npus') - backend = "hccl" if backend == "auto" else backend elif not is_cpu_only and core.is_compiled_with_mlu(): _check_var_exists('FLAGS_selected_mlus') backend = "cncl" if backend == "auto" else backend @@ -1046,8 +1038,6 @@ def train(): place = core.CUDAPlace(parallel_env.device_id) elif core.is_compiled_with_xpu(): place = core.XPUPlace(parallel_env.device_id) - elif core.is_compiled_with_npu(): - place = 
core.NPUPlace(parallel_env.device_id) elif core.is_compiled_with_mlu(): place = core.MLUPlace(parallel_env.device_id) @@ -1146,7 +1136,7 @@ def train(): strategy.current_endpoint = parallel_env.current_endpoint strategy.nrings = parallel_env.nrings - # init nccl or hccl or bkcl or heter context + # init nccl or bkcl or heter context if is_cpu_only: parallel_helper._set_parallel_ctx( core.GLOOParallelContext(strategy, place) @@ -1163,10 +1153,6 @@ def train(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place) ) - elif core.is_compiled_with_npu(): - parallel_helper._set_parallel_ctx( - core.HCCLParallelContext(strategy, place) - ) elif core.is_compiled_with_mlu(): parallel_helper._set_parallel_ctx( core.CNCLParallelContext(strategy, place) diff --git a/python/paddle/distributed/ps/utils/collective_transpiler.py b/python/paddle/distributed/ps/utils/collective_transpiler.py index ea6f23de48d97..952652d155800 100644 --- a/python/paddle/distributed/ps/utils/collective_transpiler.py +++ b/python/paddle/distributed/ps/utils/collective_transpiler.py @@ -133,37 +133,8 @@ def _init_communicator( wait_server_ready(other_endpoints) block = program.global_block() - if core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - self.op_role_key: OpRole.Forward, - }, - ) - elif core.is_compiled_with_xpu(): + + if core.is_compiled_with_xpu(): bkcl_id_var = block.create_var( name=unique_name.generate('bkcl_id'), persistable=True, diff --git a/python/paddle/distributed/transpiler/collective.py b/python/paddle/distributed/transpiler/collective.py index b60ae1266e3c9..8ceb9c1e5c633 100644 --- a/python/paddle/distributed/transpiler/collective.py +++ b/python/paddle/distributed/transpiler/collective.py @@ -131,37 +131,7 @@ def _init_communicator( wait_server_ready(other_endpoints) block = program.global_block() - if core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - self.op_role_key: OpRole.Forward, - }, - ) - elif core.is_compiled_with_cuda(): + if core.is_compiled_with_cuda(): nccl_id_var = block.create_var( name=unique_name.generate('nccl_id'), persistable=True, diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7408d8e3b6e42..a6892377cb5a9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,6 
@@ XPUPlace, CUDAPlace, CUDAPinnedPlace, - NPUPlace, IPUPlace, MLUPlace, CustomPlace, @@ -127,7 +126,6 @@ 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', - 'NPUPlace', 'IPUPlace', 'MLUPlace', 'Tensor', @@ -220,10 +218,6 @@ def remove_flag_if_exists(name): __bootstrap__() monkey_patch_varbase() -# NOTE(zhiqiu): register npu_finalize on the exit of Python, -# do some clean up manually. -if core.is_compiled_with_npu(): - atexit.register(core.npu_finalize) # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index bebf5ebd5eae1..4083937e2d76d 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -654,8 +654,6 @@ def _gen_worker_desc(self, trainer_desc): place_id = pipeline_opt["place_id"] if core.is_compiled_with_cuda(): assert isinstance(place, core.CUDAPlace) - elif core.is_compiled_with_npu(): - assert isinstance(place, core.NPUPlace) cfg.place = cfg.CUDAPlace cfg.place_id = place_id diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 0bc2a15b7d7a5..1d294a0330d44 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -306,11 +306,7 @@ def backward(self, grad_tensor=None, retain_graph=False): if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) - if ( - paddle.is_compiled_with_xpu() - or paddle.is_compiled_with_npu() - or paddle.is_compiled_with_mlu() - ): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_mlu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) if framework.global_var._in_eager_mode_: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index ce1c55bd17168..82c077a883a7f 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2132,7 +2132,7 @@ def _run_from_dataset( for var in program.global_block().vars.values(): if var.is_data: data_vars.append(var) - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): dataset = paddle.fluid.DatasetFactory().create_dataset( 'InMemoryDataset' ) @@ -2309,7 +2309,7 @@ def _get_dataset(): for var in program.global_block().vars.values(): if var.is_data: data_vars.append(var) - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): dataset = paddle.fluid.DatasetFactory().create_dataset( 'InMemoryDataset' ) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7b17ecc3e150b..13d8609b3335e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -59,7 +59,6 @@ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', - 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -225,7 +224,7 @@ def _in_eager_without_dygraph_check(): return global_var._in_eager_mode_ -# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but +# FIXME(dev): We haven't fully verified eager mode on XPU et.al but # only GPU/CPU. Remove this after we improve this feature. 
_is_first_import_ = True @@ -728,15 +727,6 @@ def _xpu_ids(): return device_ids -def _npu_ids(): - npus_env = os.getenv("FLAGS_selected_npus") - if npus_env: - device_ids = [int(s) for s in npus_env.split(",")] - else: - device_ids = range(core.get_npu_device_count()) - return device_ids - - def _custom_device_ids(device_type): custom_devices_env = os.getenv("FLAGS_selected_" + device_type + "s") if custom_devices_env: @@ -770,21 +760,6 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() -def is_compiled_with_npu(): - """ - Whether this whl package can be used to run the model on NPU. - - Returns (bool): support npu or not. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - support_npu = fluid.is_compiled_with_npu() - """ - return core.is_compiled_with_npu() - - def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -943,47 +918,6 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] -def npu_places(device_ids=None): - """ - - Note: - For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. - - This function creates a list of :code:`paddle.NPUPlace` objects. - If :code:`device_ids` is None, environment variable of - :code:`FLAGS_selected_npus` would be checked first. For example, if - :code:`FLAGS_selected_npus=0,1,2`, the returned list would - be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - If :code:`FLAGS_selected_npus` is not set, all visible - npu places would be returned. - If :code:`device_ids` is not None, it should be the device - ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be - [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - - Parameters: - device_ids (list or tuple of int, optional): list of NPU device ids. - Returns: - list of paddle.NPUPlace: Created NPU place list. - Examples: - .. code-block:: python - - # required: npu - - import paddle - import paddle.static as static - - paddle.enable_static() - npu_places = static.npu_places() - """ - assert core.is_compiled_with_npu(), "Not compiled with NPU" - if device_ids is None: - device_ids = _npu_ids() - elif not isinstance(device_ids, (list, tuple)): - device_ids = [device_ids] - return [core.NPUPlace(dev_id) for dev_id in device_ids] - - def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -2641,10 +2575,6 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) - elif p.is_npu_place(): - p = core.Place() - p.set_place(t._place()) - place = core.NPUPlace(p.npu_device_id()) elif p.is_mlu_place(): p = core.Place() p.set_place(t._place()) @@ -7574,9 +7504,9 @@ def device_guard(device=None): device, index = device.split(':') if device == 'cpu': raise ValueError("Should not set device id for cpu.") - if device not in ['cpu', 'gpu', 'npu', 'xpu', 'mlu', '', None]: + if device not in ['cpu', 'gpu', 'xpu', 'mlu', '', None]: raise ValueError( - "The Attr(device) should be 'cpu' 'npu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None " + "The Attr(device) should be 'cpu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None " "when there is no need to specify device. 
But received %s" % device ) if index: @@ -7705,7 +7635,6 @@ def _get_paddle_place(place): core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, - core.NPUPlace, core.IPUPlace, core.MLUPlace, core.CustomPlace, @@ -7756,19 +7685,6 @@ def _get_paddle_place(place): device_id = int(device_id) return core.XPUPlace(device_id) - # NPU - avaliable_npu_place = re.match(r'npu:\d+', place) - if avaliable_npu_place: - if not core.is_compiled_with_npu(): - raise ValueError( - "The device should not be {}, since PaddlePaddle is " - "not compiled with NPU".format(avaliable_npu_place.group()) - ) - place_info_list = place.split(':', 1) - device_id = place_info_list[1] - device_id = int(device_id) - return core.NPUPlace(device_id) - # IPU avaliable_ipu_place = re.match(r'ipu:\d+', place) if avaliable_ipu_place: @@ -7796,7 +7712,7 @@ def _get_paddle_place(place): return core.MLUPlace(device_id) raise ValueError( - "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".format( + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlac and MLUPlace but received {}.".format( place ) ) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index df190d6627321..6ed9e674689ee 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4553,7 +4553,7 @@ def train_reader(): def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): self._device = 'cpu' - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): self._device = "npu" elif core.is_compiled_with_cuda(): self._device = "gpu" @@ -5770,7 +5770,7 @@ def _insert_accumulate_gradients_with_fuse( # If there are some not initialized sections in the fused var, # and the value in those sections are nan/inf, it will trigger the nan/inf check. 
# To avoid these problematic triggers, set constant is needed for npu - "set_constant": core.is_compiled_with_npu(), + "set_constant": core.is_compiled_with_custom_device('npu'), "constant": float(0.0), }, ) @@ -6387,8 +6387,8 @@ def device_cmp(device1, device2): dev_index = int(dev.split(":")[1]) if core.is_compiled_with_cuda(): place_list.append(core.CUDAPlace(dev_index % 1)) - elif core.is_compiled_with_npu(): - place_list.append(core.NPUPlace(dev_index % 1)) + elif paddle.is_compiled_with_custom_device('npu'): + place_list.append(paddle.CustomPlace('npu', dev_index % 1)) # Step6: Split startup program new_startup_program = self._split_startup_program( @@ -6411,7 +6411,7 @@ def device_cmp(device1, device2): if core.is_compiled_with_cuda(): place_id = int(os.getenv("FLAGS_selected_gpus", "0")) - elif core.is_compiled_with_npu(): + elif core.is_compiled_with_custom_device('npu'): place_id = int(os.getenv("FLAGS_selected_npus", "0")) # A pass to move the recv op to the beginning of # the forward/backward phase diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py index d8a25b48022b4..762961ca5e8a5 100644 --- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py +++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py @@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest): def setUp(self): self.init_dtype() self.initcase() - if core.is_compiled_with_npu(): - self.__class__.use_npu = True - elif core.is_compiled_with_xpu(): + if core.is_compiled_with_xpu(): self.__class__.use_xpu = True elif core.is_compiled_with_cuda(): self.__class__.exist_fp64_check_grad = True @@ -98,9 +96,6 @@ def init_dtype(self): if core.is_compiled_with_cuda(): self.dtype = "float64" self.ids_dtype = "int64" - elif core.is_compiled_with_npu(): - self.dtype = "float32" - self.ids_dtype = "int32" elif core.is_compiled_with_xpu(): self.dtype = "float32" self.ids_dtype = "int64" @@ -129,9 +124,7 @@ def initcase(self): self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.attrs = {'start_index': self.start_index} - if core.is_compiled_with_npu(): - self.__class__.use_npu = True - elif core.is_compiled_with_xpu(): + if core.is_compiled_with_xpu(): self.__class__.use_xpu = True elif core.is_compiled_with_cuda(): self.__class__.exist_fp64_check_grad = True diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index f8f6c8023da81..0bb5bd6c769a9 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -338,10 +338,7 @@ def setUpClass(cls): np.random.seed(123) random.seed(124) - if paddle.is_compiled_with_npu(): - cls._use_system_allocator = _set_use_system_allocator(False) - else: - cls._use_system_allocator = _set_use_system_allocator(True) + cls._use_system_allocator = _set_use_system_allocator(True) @classmethod def tearDownClass(cls): @@ -376,9 +373,6 @@ def is_mkldnn_op_test(): def is_rocm_op_test(): return core.is_compiled_with_rocm() - def is_npu_op_test(): - return hasattr(cls, "use_npu") and cls.use_npu - def is_mlu_op_test(): return hasattr(cls, "use_mlu") and cls.use_mlu @@ -414,7 +408,6 @@ def is_custom_device_op_test(): and not is_xpu_op_test() and not is_mkldnn_op_test() and not is_rocm_op_test() - and not is_npu_op_test() and not is_mlu_op_test() and not is_custom_device_op_test() and not cls.check_prim @@ -1970,7 +1963,6 @@ def _is_skip_name(self, name): # Currently not support 
ParallelExecutor on XPUPlace. if ( not paddle.is_compiled_with_xpu() - and not paddle.is_compiled_with_npu() and not paddle.is_compiled_with_mlu() and not isinstance(place, core.CustomPlace) ): diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index ac30b5112bc98..8396b2a39d699 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -46,10 +46,6 @@ def test_xpu_device(self): if core.is_compiled_with_xpu(): self._test_device("xpu:0", core.XPUPlace) - def test_npu_device(self): - if core.is_compiled_with_npu(): - self._test_device("npu:0", core.NPUPlace) - class TestImperativeDeviceManage(unittest.TestCase): def test_cpu(self): @@ -95,25 +91,6 @@ def test_xpu(self): self.assertTrue(out.place.is_xpu_place()) self.assertEqual(device, "xpu:0") - def test_npu(self): - if core.is_compiled_with_npu(): - with fluid.dygraph.guard(): - paddle.set_device('npu:0') - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - device = paddle.get_device() - self.assertEqual( - isinstance( - framework._current_expected_place(), core.NPUPlace - ), - True, - ) - self.assertTrue(out1.place.is_npu_place()) - self.assertTrue(out2.place.is_npu_place()) - self.assertTrue(out3.place.is_npu_place()) - self.assertEqual(device, "npu:0") - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index def841d16fb3b..507e8ac2422a2 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -684,9 +684,6 @@ def run_trainer(self, args): elif fluid.core.is_compiled_with_xpu(): device_id = int(os.getenv("FLAGS_selected_xpus", "0")) place = fluid.XPUPlace(device_id) - elif fluid.core.is_compiled_with_npu(): - device_id = int(os.getenv("FLAGS_selected_npus", "0")) - place = fluid.NPUPlace(device_id) elif fluid.core.is_compiled_with_mlu(): device_id = int(os.getenv("FLAGS_selected_mlus", "0")) place = fluid.MLUPlace(device_id) @@ -891,7 +888,6 @@ def runtime_main(test_class): parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') - parser.add_argument('--use_npu', action='store_true') parser.add_argument('--use_mlu', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') parser.add_argument('--find_unused_parameters', action='store_true') @@ -949,30 +945,20 @@ def _after_setup_config(self): self.__use_cuda = False self.__use_xpu = False self._use_dgc = False - self.__use_npu = False self._use_mlu = False elif self._enforce_place == "GPU": self.__use_cuda = True self.__use_xpu = False - self.__use_npu = False self._use_mlu = False elif self._enforce_place == "XPU": self.__use_cuda = False self.__use_xpu = True self._use_dgc = False - self.__use_npu = False - self._use_mlu = False - elif self._enforce_place == "NPU": - self.__use_cuda = False - self.__use_xpu = False - self._use_dgc = False - self.__use_npu = True self._use_mlu = False elif self._enforce_place == "MLU": self.__use_cuda = False self.__use_xpu = False self._use_dgc = False - self.__use_npu = False self._use_mlu = True else: if fluid.core.is_compiled_with_cuda(): @@ -1163,13 +1149,6 @@ def _run_local( "PADDLE_TRAINERS_NUM": "1", 
"PADDLE_TRAINER_ID": "0", } - elif self.__use_npu: - cmd += " --use_npu" - env_local = { - "FLAGS_selected_npus": devices, - "PADDLE_TRAINERS_NUM": "1", - "PADDLE_TRAINER_ID": "0", - } else: env_local = {'CPU_NUM': '1'} @@ -1461,18 +1440,6 @@ def _get_nccl2_trainer_cmd( "GLOG_v": "2", } ) - elif self.__use_npu: - tr_cmd += " --use_npu" - env.update( - { - "FLAGS_selected_npus": "{}".format(trainer_id), - "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), - "PADDLE_TRAINER_ID": "{}".format(trainer_id), - "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": ep, - "GLOG_v": "2", - } - ) elif self._use_mlu: tr_cmd += " --use_mlu" env.update( diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 17098ef9425a9..24920eb375ce1 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -272,9 +272,6 @@ def check_with_place(place): check_with_place("gpu_pinned") check_with_place(core.CUDAPlace(0)) check_with_place("gpu:0") - if core.is_compiled_with_npu(): - check_with_place(core.NPUPlace(0)) - check_with_place("npu:0") def test_to_tensor_not_change_input_stop_gradient(self): with paddle.fluid.dygraph.guard(core.CPUPlace()): diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index e34274ceaee0e..966e4ab3000c0 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -156,33 +156,6 @@ def init_communicator( 'ring_id': 0, }, ) - elif core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=fluid.unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': 0, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - }, - ) elif core.is_compiled_with_xpu(): bkcl_id_var = block.create_var( name=fluid.unique_name.generate('bkcl_id'), diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 816fd3266f184..a31744076e4f9 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -16,7 +16,7 @@ from paddle.device import ( get_all_custom_device_type, is_compiled_with_cuda, - is_compiled_with_npu, + is_compiled_with_custom_device, is_compiled_with_rocm, ) from paddle.fluid.framework import _global_flags, in_dygraph_mode @@ -466,7 +466,7 @@ def conv1d( use_cudnn = False # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if is_compiled_with_npu(): + if is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: @@ -756,7 +756,7 @@ def conv2d( use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if is_compiled_with_npu(): + if is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: diff --git a/python/paddle/static/amp/amp_nn.py b/python/paddle/static/amp/amp_nn.py index 0f936ae8f57b9..2361c11f23b82 100644 --- a/python/paddle/static/amp/amp_nn.py +++ b/python/paddle/static/amp/amp_nn.py @@ -54,7 +54,7 @@ def 
check_finite_and_unscale(x, scale, name=None, float_status=None): ) inputs = {'X': x, 'Scale': scale} - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): check_variable_and_dtype( float_status, "float_status", diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index b3af9952a4faf..f2cf0c4295d91 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -187,7 +187,7 @@ def backward( self._train_program = train_program # NOTE(zhiqiu): _float_status is only used for NPU. - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): float_status = paddle.static.data( name="float_status", shape=[8], dtype='float32' ) @@ -408,7 +408,7 @@ def _check_finite_and_unscale(self, params_grads): if self._is_distributed: # if distributed, split check_finite_and_unscale to overlap # unscale with communication - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): with self._train_program._optimized_guard(grads): _, found_inf = check_finite_and_unscale( grads, diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index b3f9b0331a86c..e1924336dd445 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -182,7 +182,7 @@ def _update_list(self): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'XPU', core.VarDesc.VarType.FP16 ) -elif core.is_compiled_with_npu(): +elif core.is_compiled_with_custom_device('npu'): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16 ) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 37fe41624a4a5..d25f44d8a73f7 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -946,7 +946,7 @@ def conv2d( l_type = 'depthwise_conv2d' # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 064d70ed4f892..21dbea143f920 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -65,22 +65,6 @@ def _is_cuda_available(): return False -def _is_npu_available(): - """ - Check whether NPU is avaiable. - """ - try: - assert len(paddle.static.npu_places()) > 0 - return True - except Exception as e: - logging.warning( - "You are using NPU version PaddlePaddle, but there is no NPU " - "detected on your machine. Maybe NPU devices is not set properly." - "\n Original Error is {}".format(e) - ) - return False - - def _is_xpu_available(): """ Check whether XPU is avaiable. @@ -97,22 +81,19 @@ def _is_xpu_available(): return False -def _run_dygraph_single(use_cuda, use_xpu, use_npu): +def _run_dygraph_single(use_cuda, use_xpu): """ - Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. + Testing the simple network in dygraph mode using one CPU/GPU/XPU. Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. 
""" paddle.disable_static() if use_cuda: paddle.set_device('gpu') elif use_xpu: paddle.set_device('xpu') - elif use_npu: - paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu): opt.step() -def _run_static_single(use_cuda, use_xpu, use_npu): +def _run_static_single(use_cuda, use_xpu): """ - Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. + Testing the simple network with executor running directly, using one CPU/GPU/XPU. Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. """ paddle.enable_static() with paddle.static.scope_guard(paddle.static.Scope()): @@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu): place = paddle.CUDAPlace(0) elif use_xpu: place = paddle.XPUPlace(0) - elif use_npu: - place = paddle.NPUPlace(0) else: place = paddle.CPUPlace() @@ -223,7 +201,6 @@ def _run_parallel(device_list): Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. device_list (int): The specified devices. """ paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list)) @@ -252,14 +229,11 @@ def run_check(): use_cuda = False use_xpu = False - use_npu = False if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() elif paddle.is_compiled_with_xpu(): use_xpu = _is_xpu_available() - elif paddle.is_compiled_with_npu(): - use_npu = _is_npu_available() if use_cuda: device_str = "GPU" @@ -267,16 +241,13 @@ def run_check(): elif use_xpu: device_str = "XPU" device_list = paddle.static.xpu_places() - elif use_npu: - device_str = "NPU" - device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=1) device_count = len(device_list) - _run_static_single(use_cuda, use_xpu, use_npu) - _run_dygraph_single(use_cuda, use_xpu, use_npu) + _run_static_single(use_cuda, use_xpu) + _run_dygraph_single(use_cuda, use_xpu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: diff --git a/test/custom_kernel/custom_kernel_dot_c_setup.py b/test/custom_kernel/custom_kernel_dot_c_setup.py index 34778e0d5714d..c8cb7f4de4002 100644 --- a/test/custom_kernel/custom_kernel_dot_c_setup.py +++ b/test/custom_kernel/custom_kernel_dot_c_setup.py @@ -18,8 +18,6 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from paddle.fluid import core - # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: @@ -40,8 +38,6 @@ def build_extensions(self): '-Wno-parentheses', '-DPADDLE_WITH_CUSTOM_KERNEL', ] -if core.is_compiled_with_npu(): - paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] # include path site_packages_path = get_python_lib() diff --git a/test/custom_kernel/custom_kernel_dot_setup.py b/test/custom_kernel/custom_kernel_dot_setup.py index 71fe70a77f75d..7bf6f2fbe6f2a 100644 --- a/test/custom_kernel/custom_kernel_dot_setup.py +++ b/test/custom_kernel/custom_kernel_dot_setup.py @@ -18,8 +18,6 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from paddle.fluid import core - # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: @@ -40,8 +38,6 @@ def build_extensions(self): '-Wno-parentheses', '-DPADDLE_WITH_CUSTOM_KERNEL', ] -if 
-    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
 
 # include path
 site_packages_path = site.getsitepackages()
diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py
index eaf439b04134e..9ab9a7c53d3df 100644
--- a/tools/get_quick_disable_lt.py
+++ b/tools/get_quick_disable_lt.py
@@ -32,9 +32,6 @@ def download_file():
     if paddle.is_compiled_with_rocm():
         url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm')
 
-    if paddle.is_compiled_with_npu():
-        url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
-
     if paddle.is_compiled_with_mlu():
         url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_mlu')
 

From 71cd2d9d5fbe1d84529d8cb599e5c6067a02d553 Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 21:22:16 +0800
Subject: [PATCH 2/7] remove npu related code

---
 python/paddle/static/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index d75c534aa32d8..2828927a6b156 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -60,8 +60,6 @@
 from ..fluid.framework import cpu_places  # noqa: F401
 from ..fluid.framework import cuda_places  # noqa: F401
 from ..fluid.framework import xpu_places  # noqa: F401
-from ..fluid.framework import mlu_places  # noqa: F401
-from ..fluid.framework import npu_places  # noqa: F401
 from ..fluid.framework import Variable  # noqa: F401
 from ..fluid.framework import Operator  # noqa: F401
 from ..fluid.framework import Parameter  # noqa: F401
@@ -119,8 +117,6 @@
     'cpu_places',
     'cuda_places',
     'xpu_places',
-    'npu_places',
-    'mlu_places',
     'Variable',
     'create_global_var',
     'accuracy',

From 61411a9af7a033b280caf22563a6ba9d6035cc7e Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 21:33:13 +0800
Subject: [PATCH 3/7] make lint happy

---
 python/paddle/utils/install_check.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py
index 8ca8a5b34b065..548d16e8a96e8 100644
--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -246,7 +246,6 @@ def run_check():
         device_list = paddle.static.cpu_places(device_count=1)
     device_count = len(device_list)
 
-
     _run_static_single(use_cuda, use_xpu)
     _run_dygraph_single(use_cuda, use_xpu)
     print(f"PaddlePaddle works well on 1 {device_str}.")

From b859cd38d8eff663b5318b160c9342e4a126608f Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 23:19:56 +0800
Subject: [PATCH 4/7] rem test

---
 .../paddle/distributed/fleet/ascend_utils.py  | 138 ------------------
 .../unittests/test_fleet_ascend_utils.py      |  53 -------
 2 files changed, 191 deletions(-)
 delete mode 100644 python/paddle/distributed/fleet/ascend_utils.py
 delete mode 100644 python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py

diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py
deleted file mode 100644
index 132ee3afac67c..0000000000000
--- a/python/paddle/distributed/fleet/ascend_utils.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -from paddle.distributed.fleet.launch_utils import ( - DeviceMode, - get_cluster, - get_host_name_ip, -) - -__all__ = [] - - -def _get_ascend_rankfile(rank_table_file_path): - """ - Args: - rank_table_file_path: ascend npu rank file json - { - "status": "completed", - "version": "1.0", - "server_count": "2", - "server_list": [ - { - "server_id": "192.168.24.217", - "device": [ - { - "device_id": "0", - "device_ip": "192.1.184.23", - "rank_id": "0" - }, - { - "device_id": "1", - "device_ip": "192.2.21.93", - "rank_id": "1" - } - ] - }, - { - "server_id": "192.168.26.177", - "device": [ - { - "device_id": "0", - "device_ip": "192.1.94.132", - "rank_id": "2" - }, - { - "device_id": "1", - "device_ip": "192.2.94.30", - "rank_id": "3" - } - ] - } - ] - } - - Returns: - node_ips: node ip list - device_count: number of npu per machine - """ - json_data = None - with open(rank_table_file_path) as json_file: - json_data = json.load(json_file) - - node_ips = [] - device_count = 0 - server_list = json_data['server_list'] - for server in server_list: - device_list = server['device'] - device_count = len(device_list) - if os.getenv("FLAGS_MODELARTS", None): - nodes = os.getenv("DLS_TASK_NUMBER", None) - assert nodes is not None, "DLS_TASK_NUMBER didn't set!" - for node in range(int(nodes)): - node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) - assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" - node_ips.append(node_ip) - return node_ips, device_count - node_ips.append(server['server_id']) - return node_ips, device_count - - -def get_cloud_cluster( - rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, start_port=6070 -): - """ - Args: - rank_table_file: string, ascend npu rank file path - device_mode: DeviceMode(Int) - start_port: the start port of current runtime env - """ - if rank_table_file: - # multi trainers - node_ips, device_count = _get_ascend_rankfile(rank_table_file) - if len(node_ips) == 1: - node_ip = node_ips[0] - else: - node_index = os.environ.get("PADDLE_TRAINER_ID") - node_ip = None - if node_index: - node_ip = node_ips[int(node_index)] - else: - _, node_ip = get_host_name_ip() - - assert ( - node_ip in node_ips - ), "Can't find your local ip {{{}}} in node_ips: {{{}}}".format( - node_ip, - node_ips, - ) - else: - # single trainer (single ascend card) - node_ips = ["127.0.0.1"] - node_ip = node_ips[0] - device_count = 1 - - devices_per_proc = [str(x) for x in range(device_count)] - free_ports = list(range(start_port, start_port + len(devices_per_proc))) - - trainer_endpoints = [] - for ip in node_ips: - trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) - - return get_cluster( - node_ips, node_ip, trainer_endpoints, device_mode, devices_per_proc - ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py deleted file mode 100644 index 218cf786a3a35..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import unittest - -from paddle.distributed.fleet import ascend_utils - -RANK_TABLE_JSON = { - "status": "completed", - "version": "1.0", - "server_count": "1", - "server_list": [ - { - "server_id": "127.0.0.1", - "device": [ - {"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"}, - {"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"}, - ], - } - ], -} - - -class TestAscendUtil(unittest.TestCase): - def test_get_cloud_cluster(self): - cluster, pod = ascend_utils.get_cloud_cluster() - self.assertTrue(cluster) - self.assertTrue(pod) - - with open('rank_table_file.json', 'w') as f: - json.dump(RANK_TABLE_JSON, f) - rank_table_file = "./rank_table_file.json" - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=rank_table_file - ) - self.assertTrue(cluster) - self.assertTrue(pod) - - -if __name__ == '__main__': - unittest.main() From e0f1c0d144256d442771d3efa3b5a6e2ff816583 Mon Sep 17 00:00:00 2001 From: baocheny Date: Sat, 1 Apr 2023 14:07:47 +0800 Subject: [PATCH 5/7] remove some tests --- python/paddle/amp/auto_cast.py | 5 -- python/paddle/device/__init__.py | 50 ---------------- .../sharding/group_sharded_storage.py | 2 +- .../ascend_multi_process_collective.py | 48 --------------- .../tests/unittests/c_embedding_op_base.py | 8 +-- .../test_cuda_max_memory_allocated.py | 1 - .../test_cuda_max_memory_reserved.py | 1 - .../unittests/test_cuda_memory_allocated.py | 1 - .../unittests/test_cuda_memory_reserved.py | 1 - .../tests/unittests/test_npu_identity_op.py | 59 ------------------- python/paddle/static/io.py | 8 --- python/paddle/tensor/creation.py | 2 - 12 files changed, 2 insertions(+), 184 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py delete mode 100644 python/paddle/fluid/tests/unittests/test_npu_identity_op.py diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 9262fab7a5ef5..a4c2ffaea70f5 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -345,7 +345,6 @@ def amp_guard( tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() - or tracer._expected_place.is_npu_place() or tracer._expected_place.is_custom_place() ): warnings.warn( @@ -353,10 +352,6 @@ def amp_guard( % tracer._expected_place ) enable = False - # For npu: - if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): - warnings.warn('NPUPlace only support float16 amp.') - enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 359b981a0925a..45285f51465d5 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -37,7 +37,6 @@ 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', - 'is_compiled_with_npu', 
     'is_compiled_with_mlu',
     'is_compiled_with_custom_device',
     'get_all_device_type',
@@ -55,24 +54,6 @@
 _cudnn_version = None
 
 
-# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
-# for consistent.
-def is_compiled_with_npu():
-    """
-    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
-
-    Return:
-        bool, ``True`` if NPU is supported, otherwise ``False``.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            support_npu = paddle.device.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
-
-
 def is_compiled_with_custom_device(device_type):
     """
     Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -247,15 +228,6 @@ def _convert_to_place(device):
         selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
         device_id = int(selected_xpus[0])
         place = core.XPUPlace(device_id)
-    elif lower_device == 'npu':
-        if not core.is_compiled_with_custom_device('npu'):
-            raise ValueError(
-                "The device should not be 'npu', "
-                "since PaddlePaddle is not compiled with NPU"
-            )
-        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-        device_id = int(selected_npus[0])
-        place = core.CustomPlace('npu', device_id)
     elif lower_device == 'ipu':
         if not core.is_compiled_with_ipu():
             raise ValueError(
@@ -275,7 +247,6 @@ def _convert_to_place(device):
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
-        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
@@ -297,26 +268,6 @@ def _convert_to_place(device):
                 device_id = device_info_list[1]
                 device_id = int(device_id)
                 place = core.XPUPlace(device_id)
-            if avaliable_npu_device:
-                if not core.is_compiled_with_custom_device('npu'):
-                    device_info_list = device.split(':', 1)
-                    device_type = device_info_list[0]
-                    if device_type in core.get_all_custom_device_type():
-                        device_id = device_info_list[1]
-                        device_id = int(device_id)
-                        place = core.CustomPlace(device_type, device_id)
-                        return place
-                    else:
-                        raise ValueError(
-                            "The device should not be {}, since PaddlePaddle is "
-                            "not compiled with NPU or compiled with custom device".format(
-                                avaliable_npu_device
-                            )
-                        )
-                device_info_list = device.split(':', 1)
-                device_id = device_info_list[1]
-                device_id = int(device_id)
-                place = core.CustomPlace('npu', device_id)
             if avaliable_mlu_device:
                 if not core.is_compiled_with_mlu():
                     raise ValueError(
@@ -330,7 +281,6 @@ def _convert_to_place(device):
         if (
             not avaliable_gpu_device
             and not avaliable_xpu_device
-            and not avaliable_npu_device
             and not avaliable_mlu_device
         ):
             device_info_list = device.split(':', 1)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
index 20f02b409b8db..73e1b9a9781a5 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
@@ -78,7 +78,7 @@ def to(self, device, dtype=None, keep_alignment=True):
         if self._device != device:
             tmp_buffer = (
                 cvt_to_device(self.buffer, self.dev_id)
-                if device in ["gpu", "xpu", "npu"]
+                if device in ["gpu", "xpu"]
                 else self.buffer.cpu()
             )
             for param in self._params:
diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
deleted file mode 100644
index 572e6caa1d7f4..0000000000000
--- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-
-
-def train(prefix):
-    selected_accelerators = os.getenv("FLAGS_selected_accelerators")
-    selected_npus = os.getenv("FLAGS_selected_npus")
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints.split(','))
-    device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
-    current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
-
-    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
-        selected_accelerators,
-        selected_npus,
-        worker_endpoints,
-        trainers_num,
-        current_endpoint,
-        trainer_id,
-        device_ids,
-        current_device_id,
-    )
-
-    print(details)
-    with open(f"multi_process_{prefix}.check_{trainer_id}.log", "w") as f:
-        f.write(details)
-
-
-if __name__ == '__main__':
-    prefix = sys.argv[1]
-    train(prefix)
diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
index 762961ca5e8a5..3eda046571e37 100644
--- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
+++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
@@ -55,9 +55,7 @@ def initcase(self):
         np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
         self.outputs = {'Out': np_out.reshape((2, 4, 64))}
         self.attrs = {'start_index': self.start_index}
-        if core.is_compiled_with_npu():
-            self.__class__.use_npu = True
-        elif core.is_compiled_with_xpu():
+        if core.is_compiled_with_xpu():
             self.__class__.use_xpu = True
 
     def test_check_cpu(self):
@@ -79,16 +77,12 @@ def setUp(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             self.check_output_with_place(core.CUDAPlace(0))
-        elif core.is_compiled_with_npu():
-            self.check_output_with_place(core.NPUPlace(0))
         elif core.is_compiled_with_xpu():
             self.check_output_with_place(core.XPUPlace(0))
 
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
-        elif core.is_compiled_with_npu():
-            self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
         elif core.is_compiled_with_xpu():
             self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
 
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
index 589c6643f726e..c7b6828a60c1d 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
@@ -59,7 +59,6 @@ def test_max_memory_allocated_exception(self):
-2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py index 9651d893fa776..07d2b5a9ded19 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py @@ -59,7 +59,6 @@ def test_max_memory_reserved_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py index 16bbb51d559ea..252dd6f93117c 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py @@ -44,7 +44,6 @@ def test_memory_allocated_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py index ca89a1a7dfe18..941ab27ecc5ba 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py @@ -44,7 +44,6 @@ def test_memory_reserved_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py deleted file mode 100644 index da87384c4566c..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import unittest
-
-import numpy as np
-
-import paddle
-
-
-class TestNPUIdentityOp(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "npu_identity"
-        self.shape = [64, 6, 28, 28]
-        self.x = np.random.random(self.shape).astype(np.float32)
-        self.format = 3  # ACL_FORMAT_NC1HWC0 = 3
-        self.place = paddle.CPUPlace()
-
-    def test_api_static(self):
-        paddle.enable_static()
-
-        main_program = paddle.static.default_main_program()
-        startup_program = paddle.static.default_startup_program()
-        with paddle.static.program_guard(main_program, startup_program):
-            x_data = paddle.static.data(
-                shape=self.shape, name="data", dtype='float32'
-            )
-            output = paddle.incubate._npu_identity(x=x_data, format=self.format)
-            exe = paddle.static.Executor()
-            exe.run(startup_program)
-            result = exe.run(
-                main_program, feed={x_data.name: self.x}, fetch_list=[output]
-            )
-
-        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
-
-    def test_api_dygraph(self):
-        paddle.disable_static(self.place)
-
-        x = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x, self.format)
-
-        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
-        paddle.enable_static()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index cac8f821c5d72..b9297ddde2a6b 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -1536,10 +1536,6 @@ def set_var(var, ndarray):
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
             place = paddle.fluid.XPUPlace(p.xpu_device_id())
-        elif p.is_npu_place():
-            p = paddle.fluid.core.Place()
-            p.set_place(t._place())
-            place = paddle.fluid.NPUPlace(p.npu_device_id())
         elif p.is_mlu_place():
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
@@ -1680,10 +1676,6 @@ def set_program_state(program, state_dict):
                 p = paddle.fluid.core.Place()
                 p.set_place(ten_place)
                 py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
-            elif ten_place.is_npu_place():
-                p = paddle.fluid.core.Place()
-                p.set_place(ten_place)
-                py_place = paddle.fluid.NPUPlace(p.npu_device_id())
             elif ten_place.is_mlu_place():
                 p = paddle.fluid.core.Place()
                 p.set_place(ten_place)
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 74d2cfb88c0df..602fa7186ec84 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None):
             dst_place_type = 2
         elif p.is_xpu_place():
             dst_place_type = 3
-        elif p.is_npu_place():
-            dst_place_type = 4
 
     attrs = {'dst_place_type': dst_place_type}
     helper.append_op(

From f052f64fd448a33be22fd5de8670afd350ca2c13 Mon Sep 17 00:00:00 2001
From: Kim Yann
Date: Mon, 3 Apr 2023 10:44:24 +0800
Subject: [PATCH 6/7] Update grad_scaler.py

---
 python/paddle/amp/grad_scaler.py | 98 ++++++++++----------------------
 1 file changed, 30 insertions(+), 68 deletions(-)

diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index 009316514f5ec..31d2981e8961e 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -326,74 +326,36 @@ def _unscale(self, optimizer):
             if param.dtype == core.VarDesc.VarType.FP32
         ]
         self._found_inf = self._temp_found_inf_value_false
-        if core.is_compiled_with_custom_device('npu'):
-            float_status = _legacy_C_ops.alloc_float_status()
-            _legacy_C_ops.clear_float_status(float_status, float_status)
-
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    float_status,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    float_status,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    float_status,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
-        else:
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+        if len(param_grads_fp16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp16,
+                self._scale,
+                param_grads_fp16,
+                self._temp_found_inf_fp16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp16
+            )
+        if len(param_grads_bf16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_bf16,
+                self._scale,
+                param_grads_bf16,
+                self._temp_found_inf_bf16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_bf16
+            )
+        if len(param_grads_fp32):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp32,
+                self._scale,
+                param_grads_fp32,
+                self._temp_found_inf_fp32,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp32
+            )
 
         optimizer_state["state"] = OptimizerState.UNSCALED

From 46a719336bc23e11fad35a575445ce846c7bddef Mon Sep 17 00:00:00 2001
From: baocheny
Date: Tue, 4 Apr 2023 16:59:03 +0800
Subject: [PATCH 7/7] fix an error

---
 paddle/fluid/pybind/pybind.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 45b6af736a9ed..88a984244498b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1960,7 +1960,6 @@ All parameter, weight, gradient are variables in Paddle.
       });
   m.def("is_compiled_with_avx", IsCompiledWithAVX);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
   m.def("is_compiled_with_ipu", IsCompiledWithIPU);
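
Editor's note on migrating out-of-tree code: after this series the NPU-specific helpers are gone (core.is_compiled_with_npu, paddle.NPUPlace, paddle.static.npu_places, the 'npu' special case in paddle.set_device), and Ascend NPU support is reached through the plugin-based custom-device interface instead, mirroring the substitutions made throughout these patches. Below is a minimal sketch of the replacement pattern; it assumes an out-of-tree plugin (for example PaddleCustomDevice) has registered an 'npu' device type at import time, and the plugin name and device index 0 are illustrative assumptions, not something this series provides.

    import paddle
    from paddle.fluid import core

    # Before: core.is_compiled_with_npu()
    if core.is_compiled_with_custom_device('npu'):
        # Before: place = paddle.NPUPlace(0)
        place = core.CustomPlace('npu', 0)
        # Before: paddle.set_device('npu'); the 'npu:<id>' form is resolved
        # by the generic custom-device string parsing kept in
        # python/paddle/device/__init__.py.
        paddle.set_device('npu:0')
    else:
        # No npu plugin registered on this install: fall back to CPU.
        place = core.CPUPlace()

    # Plugin device types currently registered; empty on stock CPU/GPU builds.
    print(core.get_all_custom_device_type())

The same substitution appears verbatim at the in-tree call sites touched above (amp_nn.py, decorator.py, fp16_lists.py, static/nn/common.py), so downstream code can apply it mechanically.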