rem is_compiled_with_npu #52385

Merged
.flake8 (3 changes: 0 additions & 3 deletions)
@@ -8,9 +8,6 @@ exclude =
./python/paddle/fluid/tra**,
# Exclude third-party libraries
./python/paddle/utils/gast/**,
# Exclude files that will be removed in the future, see more at
# https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
./python/paddle/fluid/tests/unittests/npu/**,
ignore =
# Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black
E203,
.pre-commit-config.yaml (3 changes: 1 addition & 2 deletions)
@@ -4,8 +4,7 @@ exclude: |
patches/.+|
paddle/fluid/framework/fleet/heter_ps/cudf/.+|
paddle/fluid/distributed/ps/thirdparty/round_robin.h|
python/paddle/utils/gast/.+|
python/paddle/fluid/tests/unittests/npu/.+
python/paddle/utils/gast/.+
)$
repos:
# Common hooks
paddle/fluid/pybind/pybind.cc (27 changes: 0 additions & 27 deletions)
@@ -265,14 +265,6 @@ bool IsCompiledWithROCM() {
#endif
}

bool IsCompiledWithAscend() {
#ifndef PADDLE_WITH_ASCEND
return false;
#else
return true;
#endif
}

bool IsCompiledWithXPU() {
#ifndef PADDLE_WITH_XPU
return false;
@@ -281,8 +273,6 @@ bool IsCompiledWithXPU() {
#endif
}

bool IsCompiledWithNPU() { return false; }

bool IsCompiledWithCustomDevice(std::string device_type) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
return false;
@@ -1592,14 +1582,6 @@ All parameter, weight, gradient are variables in Paddle.
return context;
#endif
})
.def_static(
"create",
[](paddle::platform::NPUPlace &place)
-> paddle::platform::DeviceContext * {
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
})
.def_static("create",
[](paddle::platform::CustomPlace &place)
-> paddle::platform::DeviceContext * {
@@ -1769,13 +1751,6 @@ All parameter, weight, gradient are variables in Paddle.
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
const platform::NPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self,
const Scope &scope,
@@ -1985,9 +1960,7 @@ All parameter, weight, gradient are variables in Paddle.
});
m.def("is_compiled_with_avx", IsCompiledWithAVX);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
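With `IsCompiledWithNPU` and `IsCompiledWithAscend` removed from the pybind layer, NPU availability can only be queried through the custom-device path that this diff keeps. A minimal sketch of the replacement check; the plugin name "npu" is an assumption for illustration and is not defined by this PR:

```python
# Hedged sketch: detect device support via the custom-device API kept by this PR.
# The plugin name "npu" is hypothetical; use whatever name your plugin registers.
import paddle

if paddle.device.is_compiled_with_custom_device("npu"):
    paddle.device.set_device("npu:0")   # resolves to CustomPlace("npu", 0)
else:
    paddle.device.set_device("cpu")     # fall back when no plugin is present

print(paddle.device.get_device())
```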
pyproject.toml (2 changes: 0 additions & 2 deletions)
@@ -14,7 +14,6 @@ extend_skip_glob = [
"python/paddle/fluid/[!t]**",
"python/paddle/fluid/tra**",
"python/paddle/utils/gast/**",
"python/paddle/fluid/tests/unittests/npu/**",
]

[tool.ruff]
@@ -23,7 +22,6 @@ exclude = [
"./python/paddle/fluid/[!t]**",
"./python/paddle/fluid/tra**",
"./python/paddle/utils/gast/**",
"./python/paddle/fluid/tests/unittests/npu/**",
]
target-version = "py37"
select = [
python/paddle/__init__.py (3 changes: 0 additions & 3 deletions)
@@ -334,7 +334,6 @@
from .framework import CPUPlace # noqa: F401
from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
from .framework import CustomPlace # noqa: F401

@@ -363,7 +362,6 @@
from .device import set_device # noqa: F401
from .device import get_device # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import is_compiled_with_cinn # noqa: F401
from .device import is_compiled_with_cuda # noqa: F401
@@ -512,7 +510,6 @@
'histogram',
'multiplex',
'CUDAPlace',
'NPUPlace',
'empty',
'shape',
'real',
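A short sketch of the user-visible effect of dropping the `NPUPlace` re-export, assuming a stock build with no NPU plugin installed; `CustomPlace` remains the generic entry point:

```python
# Hedged sketch: top-level NPUPlace is no longer re-exported after this change.
import paddle

cpu = paddle.CPUPlace()          # unchanged
# paddle.NPUPlace(0)             # no longer exported; would raise AttributeError

# CustomPlace stays available and covers plugin-provided devices:
if paddle.device.is_compiled_with_custom_device("npu"):
    place = paddle.CustomPlace("npu", 0)
else:
    place = cpu
```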
python/paddle/amp/auto_cast.py (5 changes: 0 additions & 5 deletions)
@@ -344,18 +344,13 @@ def amp_guard(
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
warnings.warn(
'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.'
% tracer._expected_place
)
enable = False
# For npu:
if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'):
warnings.warn('NPUPlace only support float16 amp.')
enable = False
# For xpu:
if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'):
warnings.warn('XPUPlace only support float16 amp.')
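After this change `amp_guard` stays enabled only on GPU, XPU, and custom places; anywhere else it warns and disables itself. A small usage sketch through the public `auto_cast` API (model and shapes are illustrative only):

```python
# Hedged sketch: auto_cast behaviour after the NPU branch removal.
import paddle

model = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

# On CUDAPlace/XPUPlace/CustomPlace this runs in float16; on CPUPlace it warns
# and silently falls back to float32.
with paddle.amp.auto_cast(enable=True, dtype='float16'):
    y = model(x)
```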
python/paddle/amp/grad_scaler.py (101 changes: 31 additions & 70 deletions)
@@ -105,11 +105,10 @@ def __init__(
if enable and not (
tracer._expected_place.is_gpu_place()
or tracer._expected_place.is_xpu_place()
or tracer._expected_place.is_npu_place()
or tracer._expected_place.is_custom_place()
):
warnings.warn(
'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.'
'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.'
% tracer._expected_place
)
enable = False
@@ -326,74 +325,36 @@ def _unscale(self, optimizer):
if param.dtype == core.VarDesc.VarType.FP32
]
self._found_inf = self._temp_found_inf_value_false
if core.is_compiled_with_npu():
float_status = _legacy_C_ops.alloc_float_status()
_legacy_C_ops.clear_float_status(float_status, float_status)

if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
float_status,
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
float_status,
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
float_status,
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
else:
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)
if len(param_grads_fp16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp16,
self._scale,
param_grads_fp16,
self._temp_found_inf_fp16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp16
)
if len(param_grads_bf16):
_legacy_C_ops.check_finite_and_unscale(
param_grads_bf16,
self._scale,
param_grads_bf16,
self._temp_found_inf_bf16,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_bf16
)
if len(param_grads_fp32):
_legacy_C_ops.check_finite_and_unscale(
param_grads_fp32,
self._scale,
param_grads_fp32,
self._temp_found_inf_fp32,
)
self._found_inf = _C_ops.bitwise_or(
self._found_inf, self._temp_found_inf_fp32
)

optimizer_state["state"] = OptimizerState.UNSCALED

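The `_unscale` rewrite keeps a single code path: gradients are grouped by dtype (fp16, bf16, fp32), each group goes through `check_finite_and_unscale`, and the per-group inf/nan flags are OR-ed into `_found_inf`. A hedged sketch of the public scaler API that exercises this path; the model and data are placeholders:

```python
# Hedged sketch: GradScaler.step() triggers the unified _unscale path shown above.
import paddle

model = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([2, 4])
with paddle.amp.auto_cast(dtype='float16'):
    loss = model(x).mean()

scaler.scale(loss).backward()   # backward on the scaled loss
scaler.step(opt)                # unscales per dtype group, checks for inf/nan
scaler.update()                 # adjusts the loss scaling factor
```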
python/paddle/device/__init__.py (64 changes: 4 additions & 60 deletions)
@@ -36,7 +36,6 @@
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
'is_compiled_with_npu',
'is_compiled_with_custom_device',
'get_all_device_type',
'get_all_custom_device_type',
@@ -53,24 +52,6 @@
_cudnn_version = None


# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
# for consistent.
def is_compiled_with_npu():
"""
Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.

Return:
bool, ``True`` if NPU is supported, otherwise ``False``.

Examples:
.. code-block:: python

import paddle
support_npu = paddle.device.is_compiled_with_npu()
"""
return core.is_compiled_with_npu()


def is_compiled_with_custom_device(device_type):
"""
Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -210,15 +191,6 @@ def _convert_to_place(device):
selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
device_id = int(selected_xpus[0])
place = core.XPUPlace(device_id)
elif lower_device == 'npu':
if not core.is_compiled_with_npu():
raise ValueError(
"The device should not be 'npu', "
"since PaddlePaddle is not compiled with NPU"
)
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
device_id = int(selected_npus[0])
place = core.NPUPlace(device_id)
elif lower_device == 'ipu':
if not core.is_compiled_with_ipu():
raise ValueError(
@@ -229,7 +201,6 @@ def _convert_to_place(device):
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
raise ValueError(
@@ -250,31 +221,7 @@ def _convert_to_place(device):
device_id = device_info_list[1]
device_id = int(device_id)
place = core.XPUPlace(device_id)
if avaliable_npu_device:
if not core.is_compiled_with_npu():
device_info_list = device.split(':', 1)
device_type = device_info_list[0]
if device_type in core.get_all_custom_device_type():
device_id = device_info_list[1]
device_id = int(device_id)
place = core.CustomPlace(device_type, device_id)
return place
else:
raise ValueError(
"The device should not be {}, since PaddlePaddle is "
"not compiled with NPU or compiled with custom device".format(
avaliable_npu_device
)
)
device_info_list = device.split(':', 1)
device_id = device_info_list[1]
device_id = int(device_id)
place = core.NPUPlace(device_id)
if (
not avaliable_gpu_device
and not avaliable_xpu_device
and not avaliable_npu_device
):
if not avaliable_gpu_device and not avaliable_xpu_device:
device_info_list = device.split(':', 1)
device_type = device_info_list[0]
if device_type in core.get_all_custom_device_type():
@@ -346,9 +293,6 @@ def get_device():
elif isinstance(place, core.XPUPlace):
device_id = place.get_device_id()
device = 'xpu:' + str(device_id)
elif isinstance(place, core.NPUPlace):
device_id = place.get_device_id()
device = 'npu:' + str(device_id)
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = f"ipus:{{0-{num_devices - 1}}}"
@@ -469,7 +413,7 @@ class Event:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
enable_timing (bool, optional): indicates if the event should measure time, default is False
blocking (bool, optional): if True, ``wait`` will be blocking, default is False
interprocess (bool): if True, the event can be shared between processes, default is False
@@ -614,7 +558,7 @@ class Stream:
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n).
priority(int, optional): priority of the CUDA stream. Can be either
1 (high priority) or 2 (low priority). By default, streams have
priority 2.
@@ -936,7 +880,7 @@ def synchronize(device=None):
Parameters:
device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None.
It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec,
where ``x`` is the index of the GPUs, XPUs or NPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
where ``x`` is the index of the GPUs, XPUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n).
Examples:
.. code-block:: python
# required: custom_device
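With the dedicated NPU branches removed, `set_device` now recognizes `cpu`, `gpu[:x]`, `xpu[:x]`, `ipu`, and custom-device strings; an `npu:x` string is treated like any other plugin name and resolves only if a matching CustomDevice plugin is registered. A minimal sketch of the remaining behaviour; the device strings are examples, not a statement about any particular build:

```python
# Hedged sketch of device selection after this diff.
import paddle

paddle.device.set_device('cpu')            # always available
print(paddle.device.get_device())          # -> 'cpu'

if paddle.device.is_compiled_with_cuda():
    paddle.device.set_device('gpu:0')      # CUDAPlace(0)

# 'npu:0' no longer has a special case: it only works when a CustomDevice
# plugin named 'npu' is registered; otherwise set_device raises ValueError.
```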
python/paddle/distributed/collective.py (5 changes: 0 additions & 5 deletions)
@@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
core.NCCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_npu():
place = core.NPUPlace(genv.device_id)
core.HCCLParallelContext(strategy, place).init_with_ring_id(
ring_id
)
elif core.is_compiled_with_xpu():
place = core.XPUPlace(genv.device_id)
core.BKCLParallelContext(strategy, place).init_with_ring_id(
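In `new_group`, the parallel context is now picked purely from the build, NCCL for CUDA and BKCL for XPU as the surrounding context shows; the HCCL/NPU branch is gone. A hedged sketch of the call site, assuming a multi-process launch via `paddle.distributed.launch` and two illustrative ranks:

```python
# Hedged sketch: new_group() after the HCCL branch removal. Must be run under a
# distributed launcher, e.g. `python -m paddle.distributed.launch --gpus 0,1 demo.py`.
import paddle.distributed as dist

dist.init_parallel_env()
group = dist.new_group(ranks=[0, 1])   # backend follows the build: NCCL (CUDA) or BKCL (XPU)
```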