From bd454315e127465b9f434c12bf83df9189eac409 Mon Sep 17 00:00:00 2001 From: baocheny Date: Fri, 31 Mar 2023 00:36:58 +0800 Subject: [PATCH 1/7] rem is_compiled_with_npu --- .flake8 | 3 - .pre-commit-config.yaml | 1 - paddle/fluid/pybind/pybind.cc | 67 -------------- pyproject.toml | 2 - python/paddle/__init__.py | 3 - python/paddle/amp/grad_scaler.py | 5 +- python/paddle/device/__init__.py | 17 ++-- python/paddle/distributed/collective.py | 5 - python/paddle/distributed/fleet/launch.py | 27 +----- .../paddle/distributed/fleet/launch_utils.py | 80 +--------------- .../distributed/fleet/layers/mpu/mp_ops.py | 4 +- .../fleet/meta_optimizers/common.py | 2 +- .../sharding/offload_helper.py | 6 -- .../meta_optimizers/sharding_optimizer.py | 6 +- .../sharding/group_sharded_utils.py | 4 +- .../fleet/utils/hybrid_parallel_inference.py | 6 +- .../distributed/launch/context/device.py | 10 -- python/paddle/distributed/parallel.py | 18 +--- .../ps/utils/collective_transpiler.py | 33 +------ .../distributed/transpiler/collective.py | 32 +------ python/paddle/fluid/__init__.py | 6 -- python/paddle/fluid/device_worker.py | 2 - .../fluid/dygraph/varbase_patch_methods.py | 6 +- python/paddle/fluid/executor.py | 4 +- python/paddle/fluid/framework.py | 92 +------------------ python/paddle/fluid/optimizer.py | 10 +- .../tests/unittests/c_embedding_op_base.py | 11 +-- .../fluid/tests/unittests/eager_op_test.py | 10 +- .../fluid/tests/unittests/test_device.py | 23 ----- .../fluid/tests/unittests/test_dist_base.py | 33 ------- .../fluid/tests/unittests/test_var_base.py | 3 - python/paddle/hapi/model.py | 27 ------ python/paddle/nn/functional/conv.py | 6 +- python/paddle/static/amp/amp_nn.py | 2 +- python/paddle/static/amp/decorator.py | 4 +- python/paddle/static/amp/fp16_lists.py | 2 +- python/paddle/static/nn/common.py | 2 +- python/paddle/utils/install_check.py | 41 ++------- .../custom_kernel_dot_c_setup.py | 4 - test/custom_kernel/custom_kernel_dot_setup.py | 4 - tools/get_quick_disable_lt.py | 3 - 41 files changed, 57 insertions(+), 569 deletions(-) diff --git a/.flake8 b/.flake8 index d87d6b43e3a61..0015cd971fe23 100644 --- a/.flake8 +++ b/.flake8 @@ -8,9 +8,6 @@ exclude = ./python/paddle/fluid/tra**, # Exclude third-party libraries ./python/paddle/utils/gast/**, - # Exclude files that will be removed in the future, see more at - # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731 - ./python/paddle/fluid/tests/unittests/npu/**, ./python/paddle/fluid/tests/unittests/mlu/** ignore = # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ccf1db464d344..80010c9b0c431 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,6 @@ exclude: | paddle/fluid/framework/fleet/heter_ps/cudf/.+| paddle/fluid/distributed/ps/thirdparty/round_robin.h| python/paddle/utils/gast/.+| - python/paddle/fluid/tests/unittests/npu/.+| python/paddle/fluid/tests/unittests/mlu/.+ )$ repos: diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4328f638d852f..ac26b8c140cdf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -154,12 +154,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/npu/npu_info.h" -#include "paddle/fluid/platform/device/npu/npu_profiler.h" -#endif - #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" @@ -291,14 +285,6 @@ bool IsCompiledWithXPU() { #endif } -bool IsCompiledWithNPU() { -#ifndef PADDLE_WITH_ASCEND_CL - return false; -#else - return true; -#endif -} - bool IsCompiledWithCustomDevice(std::string device_type) { #ifndef PADDLE_WITH_CUSTOM_DEVICE return false; @@ -1626,18 +1612,6 @@ All parameter, weight, gradient are variables in Paddle. "Please recompile or reinstall Paddle with MLU support.")); #else return new paddle::platform::MLUDeviceContext(place); -#endif - }) - .def_static( - "create", - [](paddle::platform::NPUPlace &place) - -> paddle::platform::DeviceContext * { -#ifndef PADDLE_WITH_ASCEND_CL - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use NPUPlace in CPU/GPU/XPU version, " - "Please recompile or reinstall Paddle with NPU support.")); -#else - return new paddle::platform::NPUDeviceContext(place); #endif }) .def_static("create", @@ -1809,13 +1783,6 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(scope, place); }) - .def("run", - [](OperatorBase &self, - const Scope &scope, - const platform::NPUPlace &place) { - pybind11::gil_scoped_release release; - self.Run(scope, place); - }) .def("run", [](OperatorBase &self, const Scope &scope, @@ -2034,7 +2001,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); - m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice); m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); @@ -2372,39 +2338,6 @@ All parameter, weight, gradient are variables in Paddle. 
#endif #endif -#ifdef PADDLE_WITH_ASCEND_CL - m.def("get_npu_device_count", platform::GetNPUDeviceCount); - m.def("npu_finalize", []() { - platform::HCCLCommContext::Instance().ReleaseHCCLComms(); - - auto &pool = platform::DeviceContextPool::Instance(); - auto devices = platform::GetSelectedNPUDevices(); - for (size_t i = 0; i < devices.size(); ++i) { - platform::NPUDeviceGuard guard(devices[i]); - pool.Get(platform::NPUPlace(devices[i]))->Wait(); - } - platform::AclInstance::Instance().Finalize(); - }); - - py::class_(m, "NPUProfConfigWrapper"); - - m.def("npu_prof_init", platform::NPUProfilerInit); - m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStart(c.ptr()); - }); - m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerStop(c.ptr()); - }); - m.def("npu_prof_finalize", platform::NPUProfilerFinalize); - m.def("npu_prof_create_config", []() { - return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); - }); - - m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { - platform::NPUProfilerDestroyConfig(c.ptr()); - }); -#endif - #ifdef PADDLE_WITH_IPU m.def("get_ipu_device_count", platform::GetIPUDeviceCount); #endif diff --git a/pyproject.toml b/pyproject.toml index 526b4e9e486cc..2a4fdc775f323 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ extend_skip_glob = [ "python/paddle/fluid/[!t]**", "python/paddle/fluid/tra**", "python/paddle/utils/gast/**", - "python/paddle/fluid/tests/unittests/npu/**", "python/paddle/fluid/tests/unittests/mlu/**", ] @@ -24,7 +23,6 @@ exclude = [ "./python/paddle/fluid/[!t]**", "./python/paddle/fluid/tra**", "./python/paddle/utils/gast/**", - "./python/paddle/fluid/tests/unittests/npu/**", "./python/paddle/fluid/tests/unittests/mlu/**", ] target-version = "py37" diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ca7c4b525434f..8b87fb298f323 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -333,7 +333,6 @@ from .framework import CPUPlace # noqa: F401 from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 -from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 from .framework import MLUPlace # noqa: F401 from .framework import CustomPlace # noqa: F401 @@ -363,7 +362,6 @@ from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 -from .device import is_compiled_with_npu # noqa: F401 from .device import is_compiled_with_ipu # noqa: F401 from .device import is_compiled_with_mlu # noqa: F401 from .device import is_compiled_with_cinn # noqa: F401 @@ -513,7 +511,6 @@ 'histogram', 'multiplex', 'CUDAPlace', - 'NPUPlace', 'empty', 'shape', 'real', diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 3268783c742ca..009316514f5ec 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -106,11 +106,10 @@ def __init__( tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() - or tracer._expected_place.is_npu_place() or tracer._expected_place.is_custom_place() ): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.' 
+ 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place ) enable = False @@ -327,7 +326,7 @@ def _unscale(self, optimizer): if param.dtype == core.VarDesc.VarType.FP32 ] self._found_inf = self._temp_found_inf_value_false - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): float_status = _legacy_C_ops.alloc_float_status() _legacy_C_ops.clear_float_status(float_status, float_status) diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index b05c5be8d4568..7ab7deb5c28a7 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -248,14 +248,14 @@ def _convert_to_place(device): device_id = int(selected_xpus[0]) place = core.XPUPlace(device_id) elif lower_device == 'npu': - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): raise ValueError( "The device should not be 'npu', " "since PaddlePaddle is not compiled with NPU" ) selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") device_id = int(selected_npus[0]) - place = core.NPUPlace(device_id) + place = core.CustomPlace('npu', device_id) elif lower_device == 'ipu': if not core.is_compiled_with_ipu(): raise ValueError( @@ -298,7 +298,7 @@ def _convert_to_place(device): device_id = int(device_id) place = core.XPUPlace(device_id) if avaliable_npu_device: - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): device_info_list = device.split(':', 1) device_type = device_info_list[0] if device_type in core.get_all_custom_device_type(): @@ -316,7 +316,7 @@ def _convert_to_place(device): device_info_list = device.split(':', 1) device_id = device_info_list[1] device_id = int(device_id) - place = core.NPUPlace(device_id) + place = core.CustomPlace('npu', device_id) if avaliable_mlu_device: if not core.is_compiled_with_mlu(): raise ValueError( @@ -404,9 +404,6 @@ def get_device(): elif isinstance(place, core.XPUPlace): device_id = place.get_device_id() device = 'xpu:' + str(device_id) - elif isinstance(place, core.NPUPlace): - device_id = place.get_device_id() - device = 'npu:' + str(device_id) elif isinstance(place, core.IPUPlace): num_devices = core.get_ipu_device_count() device = "ipus:{{0-{}}}".format(num_devices - 1) @@ -529,7 +526,7 @@ class Event: Parameters: device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). enable_timing (bool, optional): indicates if the event should measure time, default is False blocking (bool, optional): if True, ``wait`` will be blocking, default is False interprocess (bool): if True, the event can be shared between processes, default is False @@ -674,7 +671,7 @@ class Stream: Parameters: device(str|paddle.CUDAPlace(n)|paddle.CustomPlace(n)): Which device the stream runn on. If device is None, the device is the current device. Default: None. 
It can be ``gpu``, ``gpu:x``,``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.CustomPlace(n). priority(int, optional): priority of the CUDA stream. Can be either 1 (high priority) or 2 (low priority). By default, streams have priority 2. @@ -996,7 +993,7 @@ def synchronize(device=None): Parameters: device(str|paddle.CUDAPlace(n)|paddle.XPUPlace(n)|paddle.CustomPlace(n)): The device which want to wait for. If device is None, the device is the current device. Default: None. It can be ``gpu``, ``gpu:x``, ``xpu``, ``xpu:x``, ``custom_device``, ``custom_device:x``, where ``custom_device`` is the name of CustomDevicec, - where ``x`` is the index of the GPUs, XPUs, NPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n). + where ``x`` is the index of the GPUs, XPUs or MLUs. And it can be paddle.CUDAPlace(n) or paddle.XPUPlace(n) or paddle.CustomPlace(n). Examples: .. code-block:: python # required: custom_device diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 774112467fb91..6dd74531a0225 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -288,11 +288,6 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): core.NCCLParallelContext(strategy, place).init_with_ring_id( ring_id ) - elif core.is_compiled_with_npu(): - place = core.NPUPlace(genv.device_id) - core.HCCLParallelContext(strategy, place).init_with_ring_id( - ring_id - ) elif core.is_compiled_with_mlu(): place = core.MLUPlace(genv.device_id) core.CNCLParallelContext(strategy, place).init_with_ring_id( diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 32a36783a71b9..354b4b708619c 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -64,7 +64,7 @@ from argparse import REMAINDER, ArgumentParser from paddle import framework -from paddle.distributed.fleet import ascend_utils, cloud_utils, launch_utils +from paddle.distributed.fleet import cloud_utils, launch_utils from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic from paddle.distributed.fleet.launch_utils import ( DeviceMode, @@ -155,16 +155,6 @@ def _parse_args(): ) base_group.add_argument("--selected_xpus", dest="xpus") - if framework.core.is_compiled_with_npu(): - base_group.add_argument( - "--npus", - type=str, - default=None, - help="It's for xpu training. 
For example: " - "--npus=\"0,1,2,3\" will launch four training processes each bound to one npu.", - ) - base_group.add_argument("--selected_npus", dest="npus") - if framework.core.is_compiled_with_mlu(): base_group.add_argument( "--mlus", @@ -417,13 +407,6 @@ def get_cluster_info(args): args.ips, device_mode, devices_per_proc, start_port ) logger.debug("get cluster from cloud:{}".format(cluster)) - elif device_mode == DeviceMode.ASCEND_NPU: - # for ascend - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port, - ) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args( @@ -503,8 +486,6 @@ def infer_backend(args): return if framework.core.is_compiled_with_cuda(): args.backend = 'nccl' - elif framework.core.is_compiled_with_npu(): - args.backend = 'unknown' elif framework.core.is_compiled_with_xpu(): args.backend = 'bkcl' elif framework.core.is_compiled_with_mlu(): @@ -557,8 +538,6 @@ def which_distributed_mode(args): if framework.core.is_compiled_with_cuda(): accelerators = framework.core.get_cuda_device_count() - elif framework.core.is_compiled_with_npu(): - accelerators = framework.core.get_npu_device_count() elif framework.core.is_compiled_with_xpu(): accelerators = framework.core.get_xpu_device_count() elif framework.core.is_compiled_with_mlu(): @@ -593,7 +572,7 @@ def which_distributed_mode(args): ): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. " + "Not found distinct arguments and not compiled with cuda or xpu or mlu. " "But found args.servers not empty, default use ps mode" ) return DistributeMode.PS @@ -601,7 +580,7 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu or npu or mlu. " + "Not found distinct arguments and compiled with cuda or xpu or mlu. " "Default use collective mode" ) return DistributeMode.COLLECTIVE diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index ef30b7af9bc86..82d304940ce51 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -55,7 +55,6 @@ class DeviceMode: GPU = 1 KUNLUN = 2 XPU = 2 - ASCEND_NPU = 3 UNKNOWN = 3 MLU = 4 @@ -308,11 +307,7 @@ def get_cluster( ), "current trainer_endpoints size should be greater equal than acclerators size." 
for i in range(len(devices_per_proc)): trainer = Trainer() - if ( - device_mode == DeviceMode.GPU - or device_mode == DeviceMode.ASCEND_NPU - or device_mode == DeviceMode.MLU - ): + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.MLU: if isinstance(devices_per_proc[i], (list, tuple)): trainer.accelerators.extend(devices_per_proc[i]) pod.accelerators.extend(devices_per_proc[i]) @@ -555,13 +550,6 @@ def start_local_trainers( proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators] ) - - elif ( - len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU - ): - proc_env["FLAGS_selected_npus"] = "%s" % ",".join( - [str(g) for g in t.accelerators] - ) elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU: proc_env["FLAGS_selected_mlus"] = "%s" % ",".join( [str(g) for g in t.accelerators] @@ -773,40 +761,6 @@ def get_xpus(xpus): return res_xpus -def get_npus(npus): - if npus is None: - npus_num = framework.core.get_npu_device_count() - res_npus = [str(x) for x in range(0, npus_num)] - else: - npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") - if npu_visible_devices is None or npu_visible_devices == "": - res_npus = [x.strip() for x in npus.split(',')] - else: - # change npus into relative values - # e.g. ASCEND_VISIBLE_DEVICES=4,5,6,7; args.npus=4,5,6,7; - # therefore npus=0,1,2,3 - npu_visible_devices_list = npu_visible_devices.split(',') - for x in npus.split(','): - assert x in npu_visible_devices_list, ( - "Can't find " - "your npus %s in ASCEND_VISIBLE_DEVICES[%s]." - % (x, npu_visible_devices) - ) - res_npus = [ - npu_visible_devices_list.index(x.strip()) - for x in npus.split(',') - ] - logger.info( - "Change selected_npus into reletive values. --ips:{} " - "will change into relative_ips:{} according to your " - "ASCEND_VISIBLE_DEVICES:{}".format( - npus, res_npus, npu_visible_devices_list - ) - ) - - return res_npus - - def get_mlus(mlus): if mlus is None: mlus_num = framework.core.get_mlu_device_count() @@ -856,16 +810,6 @@ def get_device_mode(backend): ): print("launch train in heter mode with XPU device.") return DeviceMode.XPU - if ( - framework.core.is_compiled_with_npu() - and framework.core.get_npu_device_count() > 0 - ): - print("launch train in heter mode with NPU device.") - return DeviceMode.ASCEND_NPU - - if backend == 'hccl' and framework.core.get_npu_device_count() > 0: - print("launch train in ascend npu mode!") - return DeviceMode.ASCEND_NPU if backend == 'nccl' and framework.core.get_cuda_device_count() > 0: print("launch train in GPU mode!") @@ -905,19 +849,6 @@ def get_device_proc_info(args): devices_per_proc = [gpus[i : i + n] for i in range(0, len(gpus), n)] else: devices_per_proc = gpus - elif device_mode == DeviceMode.ASCEND_NPU: - npus = get_npus(args.npus) - if args.nproc_per_node is not None: - assert ( - len(npus) % int(args.nproc_per_node) - ) == 0, "npus' number:{} mod args.nproc_per_node:{} must == 0".format( - len(npus), args.nproc_per_node - ) - - n = int(len(npus) / int(args.nproc_per_node)) - devices_per_proc = [npus[i : i + n] for i in range(0, len(npus), n)] - else: - devices_per_proc = npus elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: @@ -2144,12 +2075,6 @@ def check_backend(backend): "your paddle is not compiled with xpu but you assign 'bkcl' as backend." 
) - if backend == 'hccl' and not framework.core.is_compiled_with_npu(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with npu but you assign 'hccl' as backend." - ) - if backend == 'cncl' and not framework.core.is_compiled_with_mlu(): raise ValueError( "paddle.distributed initialize error, " @@ -2177,9 +2102,6 @@ def get_backend_by_compile_flag(): if framework.core.is_compiled_with_xpu(): return 'bkcl' - if framework.core.is_compiled_with_npu(): - return 'hccl' - if framework.core.is_compiled_with_mlu(): return 'cncl' diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index fade4aa61ce84..44a01677bee5a 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -536,7 +536,9 @@ def _parallel_linear( # NOTE: npu linear function use matmul_v2 but linear use matmul linear_function = ( - _linear if core.is_compiled_with_npu() else paddle.nn.functional.linear + _linear + if core.is_compiled_with_custom_device('npu') + else paddle.nn.functional.linear ) linear_out = linear_function( x, diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index bc79bea4e2359..c9474d397417a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -196,7 +196,7 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward, }, ) - elif core.is_compiled_with_npu(): + elif core.is_compiled_with_custom_device('npu'): block.append_op( type='c_gen_hccl_id', inputs={}, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index d9a30150accdb..bcdd93ffba2e5 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -26,23 +26,17 @@ class PlaceType: CUDA = 1 CUDA_PINNED = 2 XPU = 3 # unsupport for now - NPU = 4 - NPU_PINNED = 5 @staticmethod def default_device(): if core.is_compiled_with_cuda(): return PlaceType.CUDA - elif core.is_compiled_with_npu(): - return PlaceType.NPU return PlaceType.CPU @staticmethod def default_pinned(): if core.is_compiled_with_cuda(): return PlaceType.CUDA_PINNED - elif core.is_compiled_with_npu(): - return PlaceType.NPU_PINNED return PlaceType.CPU diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 264c48870e84f..5a3c058ace66e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -600,7 +600,7 @@ def _adapt_amp_clip_without_sharding(self): rings = [self.mp_ring_id, self.pp_ring_id] # FIXME(wangxi): some problem with NPU found_finite, need sync with DP - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): rings += [self.dp_ring_id] FP16Utils.sync_amp_check_nan_inf(main_block, rings) @@ -725,7 +725,7 @@ def minimize_impl( self._dump_program_for_debug() # GPU need to wait server ready, GPU and NPU is Layered connection - if not core.is_compiled_with_npu(): + if not core.is_compiled_with_custom_device('npu'): self._wait() return optimize_ops, params_grads @@ -847,7 +847,7 @@ def 
_init_pipeline_comm(self, startup_block):
                 sync=False,
             )
 
-        if core.is_compiled_with_npu():
+        if core.is_compiled_with_custom_device('npu'):
             self._init_npu_pipeline_comm(startup_block)
             return
 
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
index 1a09bb3fa92a2..100750dd28b34 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
@@ -200,7 +200,7 @@ def device_guard(dev_id=0, device="cpu"):
     origin_device = paddle.device.get_device()
     if device == "cpu":
         paddle.set_device(device)
-    elif device in ["gpu", "xpu", "npu"]:
+    elif device in ["gpu", "xpu"]:
         paddle.set_device("{}:{}".format(device, dev_id))
     try:
         yield
@@ -313,8 +313,6 @@ def cvt_to_device(x, dev_id, blocking=True):
     """
     if paddle.is_compiled_with_cuda():
         place = paddle.CUDAPlace(dev_id)
-    elif paddle.is_compiled_with_npu():
-        place = paddle.NPUPlace(dev_id)
     elif paddle.is_compiled_with_xpu():
         place = paddle.XPUPlace(dev_id)
     else:
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index b36af2c2a040b..68b5581d3be37 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -201,11 +201,9 @@ def __init__(
         assert isinstance(main_program, Program)
 
         self._device = None
-        if core.is_compiled_with_npu():
-            self._device = "npu"
-        elif core.is_compiled_with_cuda():
+        if core.is_compiled_with_cuda():
             self._device = "gpu"
-        assert self._device, "Only gpu and npu are supported."
+        assert self._device, "Only gpu is supported."
 
         assert not in_dygraph_mode(), "Only static graph mode is supported."
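Every hunk in this series applies the same mechanical substitution: the dedicated NPU build checks and places are retired in favor of the CustomDevice plugin interface. A minimal sketch of the before/after pattern, using only calls that appear elsewhere in this patch (core.is_compiled_with_custom_device, core.CustomPlace); the device index 0 is an arbitrary example:

    # Sketch of the migration applied throughout this series, assuming a
    # Paddle build where NPU support, if present, comes from a CustomDevice
    # plugin rather than a dedicated PADDLE_WITH_ASCEND_CL build.
    from paddle.fluid import core

    # old (removed): core.is_compiled_with_npu() / core.NPUPlace(0)
    if core.is_compiled_with_custom_device('npu'):
        # Plugin devices are addressed as CustomPlace(device_type, device_id).
        place = core.CustomPlace('npu', 0)
    else:
        place = core.CPUPlace()
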
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 48dba9af56411..c3f6e504dcc77 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -24,7 +24,6 @@ class DeviceType: CPU = 'cpu' GPU = 'gpu' XPU = 'xpu' - NPU = 'npu' MLU = 'mlu' IPU = 'ipu' CUSTOM_DEVICE = 'custom_device' @@ -69,8 +68,6 @@ def get_selected_device_key(self): return 'FLAGS_selected_cpus' if self._dtype == DeviceType.GPU: return 'FLAGS_selected_gpus' - if self._dtype == DeviceType.NPU: - return 'FLAGS_selected_npus' if self._dtype == DeviceType.XPU: return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: @@ -114,9 +111,6 @@ def parse_device(self): elif 'XPU_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.XPU visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif 'ASCEND_VISIBLE_DEVICES' in os.environ: - dev._dtype = DeviceType.NPU - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") elif 'MLU_VISIBLE_DEVICES' in os.environ: dev._dtype = DeviceType.MLU visible_devices = os.getenv("MLU_VISIBLE_DEVICES") @@ -158,10 +152,6 @@ def get_custom_devices_count(device_type): dev._dtype = DeviceType.XPU num = core.get_xpu_device_count() visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif core.is_compiled_with_npu(): - dev._dtype = DeviceType.NPU - num = core.get_npu_device_count() - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") elif core.is_compiled_with_mlu(): dev._dtype = DeviceType.MLU num = core.get_mlu_device_count() diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 2be2f097be984..56dc3741e3c1f 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -721,9 +721,6 @@ def __init__(self): elif core.is_compiled_with_xpu(): selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") self._device_id = int(selected_xpus[0]) - elif core.is_compiled_with_npu(): - selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") - self._device_id = int(selected_npus[0]) elif core.is_compiled_with_mlu(): selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") self._device_id = int(selected_mlus[0]) @@ -892,11 +889,10 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) if ( - backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] + backend in ['auto', 'nccl', 'bkcl', 'heter', 'cncl'] and ( core.is_compiled_with_cuda() or core.is_compiled_with_xpu() - or core.is_compiled_with_npu() or core.is_compiled_with_mlu() ) ) or backend == 'xccl': @@ -998,7 +994,6 @@ def train(): is_cpu_only or core.is_compiled_with_cuda() or core.is_compiled_with_xpu() - or core.is_compiled_with_npu() or core.is_compiled_with_mlu() or backend == "xccl" ): @@ -1018,9 +1013,6 @@ def train(): elif not is_cpu_only and core.is_compiled_with_xpu(): _check_var_exists('FLAGS_selected_xpus') backend = "bkcl" if backend == "auto" else backend - elif not is_cpu_only and core.is_compiled_with_npu(): - _check_var_exists('FLAGS_selected_npus') - backend = "hccl" if backend == "auto" else backend elif not is_cpu_only and core.is_compiled_with_mlu(): _check_var_exists('FLAGS_selected_mlus') backend = "cncl" if backend == "auto" else backend @@ -1046,8 +1038,6 @@ def train(): place = core.CUDAPlace(parallel_env.device_id) elif core.is_compiled_with_xpu(): place = core.XPUPlace(parallel_env.device_id) - elif core.is_compiled_with_npu(): - place = 
core.NPUPlace(parallel_env.device_id) elif core.is_compiled_with_mlu(): place = core.MLUPlace(parallel_env.device_id) @@ -1146,7 +1136,7 @@ def train(): strategy.current_endpoint = parallel_env.current_endpoint strategy.nrings = parallel_env.nrings - # init nccl or hccl or bkcl or heter context + # init nccl or bkcl or heter context if is_cpu_only: parallel_helper._set_parallel_ctx( core.GLOOParallelContext(strategy, place) @@ -1163,10 +1153,6 @@ def train(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place) ) - elif core.is_compiled_with_npu(): - parallel_helper._set_parallel_ctx( - core.HCCLParallelContext(strategy, place) - ) elif core.is_compiled_with_mlu(): parallel_helper._set_parallel_ctx( core.CNCLParallelContext(strategy, place) diff --git a/python/paddle/distributed/ps/utils/collective_transpiler.py b/python/paddle/distributed/ps/utils/collective_transpiler.py index ea6f23de48d97..952652d155800 100644 --- a/python/paddle/distributed/ps/utils/collective_transpiler.py +++ b/python/paddle/distributed/ps/utils/collective_transpiler.py @@ -133,37 +133,8 @@ def _init_communicator( wait_server_ready(other_endpoints) block = program.global_block() - if core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - self.op_role_key: OpRole.Forward, - }, - ) - elif core.is_compiled_with_xpu(): + + if core.is_compiled_with_xpu(): bkcl_id_var = block.create_var( name=unique_name.generate('bkcl_id'), persistable=True, diff --git a/python/paddle/distributed/transpiler/collective.py b/python/paddle/distributed/transpiler/collective.py index b60ae1266e3c9..8ceb9c1e5c633 100644 --- a/python/paddle/distributed/transpiler/collective.py +++ b/python/paddle/distributed/transpiler/collective.py @@ -131,37 +131,7 @@ def _init_communicator( wait_server_ready(other_endpoints) block = program.global_block() - if core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - self.op_role_key: OpRole.Forward, - }, - ) - elif core.is_compiled_with_cuda(): + if core.is_compiled_with_cuda(): nccl_id_var = block.create_var( name=unique_name.generate('nccl_id'), persistable=True, diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7408d8e3b6e42..a6892377cb5a9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -71,7 +71,6 
@@ XPUPlace, CUDAPlace, CUDAPinnedPlace, - NPUPlace, IPUPlace, MLUPlace, CustomPlace, @@ -127,7 +126,6 @@ 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', - 'NPUPlace', 'IPUPlace', 'MLUPlace', 'Tensor', @@ -220,10 +218,6 @@ def remove_flag_if_exists(name): __bootstrap__() monkey_patch_varbase() -# NOTE(zhiqiu): register npu_finalize on the exit of Python, -# do some clean up manually. -if core.is_compiled_with_npu(): - atexit.register(core.npu_finalize) # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index bebf5ebd5eae1..4083937e2d76d 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -654,8 +654,6 @@ def _gen_worker_desc(self, trainer_desc): place_id = pipeline_opt["place_id"] if core.is_compiled_with_cuda(): assert isinstance(place, core.CUDAPlace) - elif core.is_compiled_with_npu(): - assert isinstance(place, core.NPUPlace) cfg.place = cfg.CUDAPlace cfg.place_id = place_id diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 0bc2a15b7d7a5..1d294a0330d44 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -306,11 +306,7 @@ def backward(self, grad_tensor=None, retain_graph=False): if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) - if ( - paddle.is_compiled_with_xpu() - or paddle.is_compiled_with_npu() - or paddle.is_compiled_with_mlu() - ): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_mlu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) if framework.global_var._in_eager_mode_: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index ce1c55bd17168..82c077a883a7f 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2132,7 +2132,7 @@ def _run_from_dataset( for var in program.global_block().vars.values(): if var.is_data: data_vars.append(var) - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): dataset = paddle.fluid.DatasetFactory().create_dataset( 'InMemoryDataset' ) @@ -2309,7 +2309,7 @@ def _get_dataset(): for var in program.global_block().vars.values(): if var.is_data: data_vars.append(var) - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): dataset = paddle.fluid.DatasetFactory().create_dataset( 'InMemoryDataset' ) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7b17ecc3e150b..13d8609b3335e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -59,7 +59,6 @@ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', - 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -225,7 +224,7 @@ def _in_eager_without_dygraph_check(): return global_var._in_eager_mode_ -# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but +# FIXME(dev): We haven't fully verified eager mode on XPU et.al but # only GPU/CPU. Remove this after we improve this feature. 
_is_first_import_ = True @@ -728,15 +727,6 @@ def _xpu_ids(): return device_ids -def _npu_ids(): - npus_env = os.getenv("FLAGS_selected_npus") - if npus_env: - device_ids = [int(s) for s in npus_env.split(",")] - else: - device_ids = range(core.get_npu_device_count()) - return device_ids - - def _custom_device_ids(device_type): custom_devices_env = os.getenv("FLAGS_selected_" + device_type + "s") if custom_devices_env: @@ -770,21 +760,6 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() -def is_compiled_with_npu(): - """ - Whether this whl package can be used to run the model on NPU. - - Returns (bool): support npu or not. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - support_npu = fluid.is_compiled_with_npu() - """ - return core.is_compiled_with_npu() - - def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -943,47 +918,6 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] -def npu_places(device_ids=None): - """ - - Note: - For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. - - This function creates a list of :code:`paddle.NPUPlace` objects. - If :code:`device_ids` is None, environment variable of - :code:`FLAGS_selected_npus` would be checked first. For example, if - :code:`FLAGS_selected_npus=0,1,2`, the returned list would - be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - If :code:`FLAGS_selected_npus` is not set, all visible - npu places would be returned. - If :code:`device_ids` is not None, it should be the device - ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be - [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - - Parameters: - device_ids (list or tuple of int, optional): list of NPU device ids. - Returns: - list of paddle.NPUPlace: Created NPU place list. - Examples: - .. code-block:: python - - # required: npu - - import paddle - import paddle.static as static - - paddle.enable_static() - npu_places = static.npu_places() - """ - assert core.is_compiled_with_npu(), "Not compiled with NPU" - if device_ids is None: - device_ids = _npu_ids() - elif not isinstance(device_ids, (list, tuple)): - device_ids = [device_ids] - return [core.NPUPlace(dev_id) for dev_id in device_ids] - - def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -2641,10 +2575,6 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) - elif p.is_npu_place(): - p = core.Place() - p.set_place(t._place()) - place = core.NPUPlace(p.npu_device_id()) elif p.is_mlu_place(): p = core.Place() p.set_place(t._place()) @@ -7574,9 +7504,9 @@ def device_guard(device=None): device, index = device.split(':') if device == 'cpu': raise ValueError("Should not set device id for cpu.") - if device not in ['cpu', 'gpu', 'npu', 'xpu', 'mlu', '', None]: + if device not in ['cpu', 'gpu', 'xpu', 'mlu', '', None]: raise ValueError( - "The Attr(device) should be 'cpu' 'npu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None " + "The Attr(device) should be 'cpu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None " "when there is no need to specify device. 
But received %s" % device ) if index: @@ -7705,7 +7635,6 @@ def _get_paddle_place(place): core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, - core.NPUPlace, core.IPUPlace, core.MLUPlace, core.CustomPlace, @@ -7756,19 +7685,6 @@ def _get_paddle_place(place): device_id = int(device_id) return core.XPUPlace(device_id) - # NPU - avaliable_npu_place = re.match(r'npu:\d+', place) - if avaliable_npu_place: - if not core.is_compiled_with_npu(): - raise ValueError( - "The device should not be {}, since PaddlePaddle is " - "not compiled with NPU".format(avaliable_npu_place.group()) - ) - place_info_list = place.split(':', 1) - device_id = place_info_list[1] - device_id = int(device_id) - return core.NPUPlace(device_id) - # IPU avaliable_ipu_place = re.match(r'ipu:\d+', place) if avaliable_ipu_place: @@ -7796,7 +7712,7 @@ def _get_paddle_place(place): return core.MLUPlace(device_id) raise ValueError( - "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".format( + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlac and MLUPlace but received {}.".format( place ) ) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index df190d6627321..6ed9e674689ee 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4553,7 +4553,7 @@ def train_reader(): def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): self._device = 'cpu' - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): self._device = "npu" elif core.is_compiled_with_cuda(): self._device = "gpu" @@ -5770,7 +5770,7 @@ def _insert_accumulate_gradients_with_fuse( # If there are some not initialized sections in the fused var, # and the value in those sections are nan/inf, it will trigger the nan/inf check. 
# To avoid these problematic triggers, set constant is needed for npu - "set_constant": core.is_compiled_with_npu(), + "set_constant": core.is_compiled_with_custom_device('npu'), "constant": float(0.0), }, ) @@ -6387,8 +6387,8 @@ def device_cmp(device1, device2): dev_index = int(dev.split(":")[1]) if core.is_compiled_with_cuda(): place_list.append(core.CUDAPlace(dev_index % 1)) - elif core.is_compiled_with_npu(): - place_list.append(core.NPUPlace(dev_index % 1)) + elif paddle.is_compiled_with_custom_device('npu'): + place_list.append(paddle.CustomPlace('npu', dev_index % 1)) # Step6: Split startup program new_startup_program = self._split_startup_program( @@ -6411,7 +6411,7 @@ def device_cmp(device1, device2): if core.is_compiled_with_cuda(): place_id = int(os.getenv("FLAGS_selected_gpus", "0")) - elif core.is_compiled_with_npu(): + elif core.is_compiled_with_custom_device('npu'): place_id = int(os.getenv("FLAGS_selected_npus", "0")) # A pass to move the recv op to the beginning of # the forward/backward phase diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py index d8a25b48022b4..762961ca5e8a5 100644 --- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py +++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py @@ -37,9 +37,7 @@ class TestCEmbeddingCPU(OpTest): def setUp(self): self.init_dtype() self.initcase() - if core.is_compiled_with_npu(): - self.__class__.use_npu = True - elif core.is_compiled_with_xpu(): + if core.is_compiled_with_xpu(): self.__class__.use_xpu = True elif core.is_compiled_with_cuda(): self.__class__.exist_fp64_check_grad = True @@ -98,9 +96,6 @@ def init_dtype(self): if core.is_compiled_with_cuda(): self.dtype = "float64" self.ids_dtype = "int64" - elif core.is_compiled_with_npu(): - self.dtype = "float32" - self.ids_dtype = "int32" elif core.is_compiled_with_xpu(): self.dtype = "float32" self.ids_dtype = "int64" @@ -129,9 +124,7 @@ def initcase(self): self.outputs = {'Out': np_out.reshape((2, 4, 64))} self.attrs = {'start_index': self.start_index} - if core.is_compiled_with_npu(): - self.__class__.use_npu = True - elif core.is_compiled_with_xpu(): + if core.is_compiled_with_xpu(): self.__class__.use_xpu = True elif core.is_compiled_with_cuda(): self.__class__.exist_fp64_check_grad = True diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index f8f6c8023da81..0bb5bd6c769a9 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -338,10 +338,7 @@ def setUpClass(cls): np.random.seed(123) random.seed(124) - if paddle.is_compiled_with_npu(): - cls._use_system_allocator = _set_use_system_allocator(False) - else: - cls._use_system_allocator = _set_use_system_allocator(True) + cls._use_system_allocator = _set_use_system_allocator(True) @classmethod def tearDownClass(cls): @@ -376,9 +373,6 @@ def is_mkldnn_op_test(): def is_rocm_op_test(): return core.is_compiled_with_rocm() - def is_npu_op_test(): - return hasattr(cls, "use_npu") and cls.use_npu - def is_mlu_op_test(): return hasattr(cls, "use_mlu") and cls.use_mlu @@ -414,7 +408,6 @@ def is_custom_device_op_test(): and not is_xpu_op_test() and not is_mkldnn_op_test() and not is_rocm_op_test() - and not is_npu_op_test() and not is_mlu_op_test() and not is_custom_device_op_test() and not cls.check_prim @@ -1970,7 +1963,6 @@ def _is_skip_name(self, name): # Currently not support 
ParallelExecutor on XPUPlace. if ( not paddle.is_compiled_with_xpu() - and not paddle.is_compiled_with_npu() and not paddle.is_compiled_with_mlu() and not isinstance(place, core.CustomPlace) ): diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index ac30b5112bc98..8396b2a39d699 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -46,10 +46,6 @@ def test_xpu_device(self): if core.is_compiled_with_xpu(): self._test_device("xpu:0", core.XPUPlace) - def test_npu_device(self): - if core.is_compiled_with_npu(): - self._test_device("npu:0", core.NPUPlace) - class TestImperativeDeviceManage(unittest.TestCase): def test_cpu(self): @@ -95,25 +91,6 @@ def test_xpu(self): self.assertTrue(out.place.is_xpu_place()) self.assertEqual(device, "xpu:0") - def test_npu(self): - if core.is_compiled_with_npu(): - with fluid.dygraph.guard(): - paddle.set_device('npu:0') - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - device = paddle.get_device() - self.assertEqual( - isinstance( - framework._current_expected_place(), core.NPUPlace - ), - True, - ) - self.assertTrue(out1.place.is_npu_place()) - self.assertTrue(out2.place.is_npu_place()) - self.assertTrue(out3.place.is_npu_place()) - self.assertEqual(device, "npu:0") - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index def841d16fb3b..507e8ac2422a2 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -684,9 +684,6 @@ def run_trainer(self, args): elif fluid.core.is_compiled_with_xpu(): device_id = int(os.getenv("FLAGS_selected_xpus", "0")) place = fluid.XPUPlace(device_id) - elif fluid.core.is_compiled_with_npu(): - device_id = int(os.getenv("FLAGS_selected_npus", "0")) - place = fluid.NPUPlace(device_id) elif fluid.core.is_compiled_with_mlu(): device_id = int(os.getenv("FLAGS_selected_mlus", "0")) place = fluid.MLUPlace(device_id) @@ -891,7 +888,6 @@ def runtime_main(test_class): parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') - parser.add_argument('--use_npu', action='store_true') parser.add_argument('--use_mlu', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') parser.add_argument('--find_unused_parameters', action='store_true') @@ -949,30 +945,20 @@ def _after_setup_config(self): self.__use_cuda = False self.__use_xpu = False self._use_dgc = False - self.__use_npu = False self._use_mlu = False elif self._enforce_place == "GPU": self.__use_cuda = True self.__use_xpu = False - self.__use_npu = False self._use_mlu = False elif self._enforce_place == "XPU": self.__use_cuda = False self.__use_xpu = True self._use_dgc = False - self.__use_npu = False - self._use_mlu = False - elif self._enforce_place == "NPU": - self.__use_cuda = False - self.__use_xpu = False - self._use_dgc = False - self.__use_npu = True self._use_mlu = False elif self._enforce_place == "MLU": self.__use_cuda = False self.__use_xpu = False self._use_dgc = False - self.__use_npu = False self._use_mlu = True else: if fluid.core.is_compiled_with_cuda(): @@ -1163,13 +1149,6 @@ def _run_local( "PADDLE_TRAINERS_NUM": "1", 
"PADDLE_TRAINER_ID": "0", } - elif self.__use_npu: - cmd += " --use_npu" - env_local = { - "FLAGS_selected_npus": devices, - "PADDLE_TRAINERS_NUM": "1", - "PADDLE_TRAINER_ID": "0", - } else: env_local = {'CPU_NUM': '1'} @@ -1461,18 +1440,6 @@ def _get_nccl2_trainer_cmd( "GLOG_v": "2", } ) - elif self.__use_npu: - tr_cmd += " --use_npu" - env.update( - { - "FLAGS_selected_npus": "{}".format(trainer_id), - "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), - "PADDLE_TRAINER_ID": "{}".format(trainer_id), - "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": ep, - "GLOG_v": "2", - } - ) elif self._use_mlu: tr_cmd += " --use_mlu" env.update( diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 17098ef9425a9..24920eb375ce1 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -272,9 +272,6 @@ def check_with_place(place): check_with_place("gpu_pinned") check_with_place(core.CUDAPlace(0)) check_with_place("gpu:0") - if core.is_compiled_with_npu(): - check_with_place(core.NPUPlace(0)) - check_with_place("npu:0") def test_to_tensor_not_change_input_stop_gradient(self): with paddle.fluid.dygraph.guard(core.CPUPlace()): diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index e34274ceaee0e..966e4ab3000c0 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -156,33 +156,6 @@ def init_communicator( 'ring_id': 0, }, ) - elif core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=fluid.unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW, - ) - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': 0, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - }, - ) elif core.is_compiled_with_xpu(): bkcl_id_var = block.create_var( name=fluid.unique_name.generate('bkcl_id'), diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 816fd3266f184..a31744076e4f9 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -16,7 +16,7 @@ from paddle.device import ( get_all_custom_device_type, is_compiled_with_cuda, - is_compiled_with_npu, + is_compiled_with_custom_device, is_compiled_with_rocm, ) from paddle.fluid.framework import _global_flags, in_dygraph_mode @@ -466,7 +466,7 @@ def conv1d( use_cudnn = False # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if is_compiled_with_npu(): + if is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: @@ -756,7 +756,7 @@ def conv2d( use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if is_compiled_with_npu(): + if is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: diff --git a/python/paddle/static/amp/amp_nn.py b/python/paddle/static/amp/amp_nn.py index 0f936ae8f57b9..2361c11f23b82 100644 --- a/python/paddle/static/amp/amp_nn.py +++ b/python/paddle/static/amp/amp_nn.py @@ -54,7 +54,7 @@ def 
check_finite_and_unscale(x, scale, name=None, float_status=None): ) inputs = {'X': x, 'Scale': scale} - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): check_variable_and_dtype( float_status, "float_status", diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index b3af9952a4faf..f2cf0c4295d91 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -187,7 +187,7 @@ def backward( self._train_program = train_program # NOTE(zhiqiu): _float_status is only used for NPU. - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): float_status = paddle.static.data( name="float_status", shape=[8], dtype='float32' ) @@ -408,7 +408,7 @@ def _check_finite_and_unscale(self, params_grads): if self._is_distributed: # if distributed, split check_finite_and_unscale to overlap # unscale with communication - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): with self._train_program._optimized_guard(grads): _, found_inf = check_finite_and_unscale( grads, diff --git a/python/paddle/static/amp/fp16_lists.py b/python/paddle/static/amp/fp16_lists.py index b3f9b0331a86c..e1924336dd445 100644 --- a/python/paddle/static/amp/fp16_lists.py +++ b/python/paddle/static/amp/fp16_lists.py @@ -182,7 +182,7 @@ def _update_list(self): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'XPU', core.VarDesc.VarType.FP16 ) -elif core.is_compiled_with_npu(): +elif core.is_compiled_with_custom_device('npu'): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16 ) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 37fe41624a4a5..d25f44d8a73f7 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -946,7 +946,7 @@ def conv2d( l_type = 'depthwise_conv2d' # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" - if core.is_compiled_with_npu(): + if core.is_compiled_with_custom_device('npu'): if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 064d70ed4f892..21dbea143f920 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -65,22 +65,6 @@ def _is_cuda_available(): return False -def _is_npu_available(): - """ - Check whether NPU is avaiable. - """ - try: - assert len(paddle.static.npu_places()) > 0 - return True - except Exception as e: - logging.warning( - "You are using NPU version PaddlePaddle, but there is no NPU " - "detected on your machine. Maybe NPU devices is not set properly." - "\n Original Error is {}".format(e) - ) - return False - - def _is_xpu_available(): """ Check whether XPU is avaiable. @@ -97,22 +81,19 @@ def _is_xpu_available(): return False -def _run_dygraph_single(use_cuda, use_xpu, use_npu): +def _run_dygraph_single(use_cuda, use_xpu): """ - Testing the simple network in dygraph mode using one CPU/GPU/XPU/NPU. + Testing the simple network in dygraph mode using one CPU/GPU/XPU. Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. 
""" paddle.disable_static() if use_cuda: paddle.set_device('gpu') elif use_xpu: paddle.set_device('xpu') - elif use_npu: - paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -135,14 +116,13 @@ def _run_dygraph_single(use_cuda, use_xpu, use_npu): opt.step() -def _run_static_single(use_cuda, use_xpu, use_npu): +def _run_static_single(use_cuda, use_xpu): """ - Testing the simple network with executor running directly, using one CPU/GPU/XPU/NPU. + Testing the simple network with executor running directly, using one CPU/GPU/XPU. Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. """ paddle.enable_static() with paddle.static.scope_guard(paddle.static.Scope()): @@ -159,8 +139,6 @@ def _run_static_single(use_cuda, use_xpu, use_npu): place = paddle.CUDAPlace(0) elif use_xpu: place = paddle.XPUPlace(0) - elif use_npu: - place = paddle.NPUPlace(0) else: place = paddle.CPUPlace() @@ -223,7 +201,6 @@ def _run_parallel(device_list): Args: use_cuda (bool): Whether running with CUDA. use_xpu (bool): Whether running with XPU. - use_npu (bool): Whether running with NPU. device_list (int): The specified devices. """ paddle.distributed.spawn(train_for_run_parallel, nprocs=len(device_list)) @@ -252,14 +229,11 @@ def run_check(): use_cuda = False use_xpu = False - use_npu = False if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() elif paddle.is_compiled_with_xpu(): use_xpu = _is_xpu_available() - elif paddle.is_compiled_with_npu(): - use_npu = _is_npu_available() if use_cuda: device_str = "GPU" @@ -267,16 +241,13 @@ def run_check(): elif use_xpu: device_str = "XPU" device_list = paddle.static.xpu_places() - elif use_npu: - device_str = "NPU" - device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=1) device_count = len(device_list) - _run_static_single(use_cuda, use_xpu, use_npu) - _run_dygraph_single(use_cuda, use_xpu, use_npu) + _run_static_single(use_cuda, use_xpu) + _run_dygraph_single(use_cuda, use_xpu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: diff --git a/test/custom_kernel/custom_kernel_dot_c_setup.py b/test/custom_kernel/custom_kernel_dot_c_setup.py index 34778e0d5714d..c8cb7f4de4002 100644 --- a/test/custom_kernel/custom_kernel_dot_c_setup.py +++ b/test/custom_kernel/custom_kernel_dot_c_setup.py @@ -18,8 +18,6 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from paddle.fluid import core - # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: @@ -40,8 +38,6 @@ def build_extensions(self): '-Wno-parentheses', '-DPADDLE_WITH_CUSTOM_KERNEL', ] -if core.is_compiled_with_npu(): - paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] # include path site_packages_path = get_python_lib() diff --git a/test/custom_kernel/custom_kernel_dot_setup.py b/test/custom_kernel/custom_kernel_dot_setup.py index 71fe70a77f75d..7bf6f2fbe6f2a 100644 --- a/test/custom_kernel/custom_kernel_dot_setup.py +++ b/test/custom_kernel/custom_kernel_dot_setup.py @@ -18,8 +18,6 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -from paddle.fluid import core - # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: @@ -40,8 +38,6 @@ def build_extensions(self): '-Wno-parentheses', '-DPADDLE_WITH_CUSTOM_KERNEL', ] -if 
-    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
 
 # include path
 site_packages_path = site.getsitepackages()
diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py
index eaf439b04134e..9ab9a7c53d3df 100644
--- a/tools/get_quick_disable_lt.py
+++ b/tools/get_quick_disable_lt.py
@@ -32,9 +32,6 @@ def download_file():
     if paddle.is_compiled_with_rocm():
         url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm')
 
-    if paddle.is_compiled_with_npu():
-        url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu')
-
     if paddle.is_compiled_with_mlu():
         url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_mlu')
 

From 71cd2d9d5fbe1d84529d8cb599e5c6067a02d553 Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 21:22:16 +0800
Subject: [PATCH 2/7] remove npu related code

---
 python/paddle/static/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py
index d75c534aa32d8..2828927a6b156 100644
--- a/python/paddle/static/__init__.py
+++ b/python/paddle/static/__init__.py
@@ -60,8 +60,6 @@
 from ..fluid.framework import cpu_places  # noqa: F401
 from ..fluid.framework import cuda_places  # noqa: F401
 from ..fluid.framework import xpu_places  # noqa: F401
-from ..fluid.framework import mlu_places  # noqa: F401
-from ..fluid.framework import npu_places  # noqa: F401
 from ..fluid.framework import Variable  # noqa: F401
 from ..fluid.framework import Operator  # noqa: F401
 from ..fluid.framework import Parameter  # noqa: F401
@@ -119,8 +117,6 @@
     'cpu_places',
     'cuda_places',
     'xpu_places',
-    'npu_places',
-    'mlu_places',
     'Variable',
     'create_global_var',
     'accuracy',

From 61411a9af7a033b280caf22563a6ba9d6035cc7e Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 21:33:13 +0800
Subject: [PATCH 3/7] make lint happy

---
 python/paddle/utils/install_check.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py
index 8ca8a5b34b065..548d16e8a96e8 100644
--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -246,7 +246,6 @@ def run_check():
         device_list = paddle.static.cpu_places(device_count=1)
     device_count = len(device_list)
 
-
     _run_static_single(use_cuda, use_xpu)
     _run_dygraph_single(use_cuda, use_xpu)
     print(f"PaddlePaddle works well on 1 {device_str}.")

From b859cd38d8eff663b5318b160c9342e4a126608f Mon Sep 17 00:00:00 2001
From: baocheny
Date: Fri, 31 Mar 2023 23:19:56 +0800
Subject: [PATCH 4/7] rem test

---
 .../paddle/distributed/fleet/ascend_utils.py  | 138 ------------------
 .../unittests/test_fleet_ascend_utils.py      |  53 -------
 2 files changed, 191 deletions(-)
 delete mode 100644 python/paddle/distributed/fleet/ascend_utils.py
 delete mode 100644 python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py

diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py
deleted file mode 100644
index 132ee3afac67c..0000000000000
--- a/python/paddle/distributed/fleet/ascend_utils.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -from paddle.distributed.fleet.launch_utils import ( - DeviceMode, - get_cluster, - get_host_name_ip, -) - -__all__ = [] - - -def _get_ascend_rankfile(rank_table_file_path): - """ - Args: - rank_table_file_path: ascend npu rank file json - { - "status": "completed", - "version": "1.0", - "server_count": "2", - "server_list": [ - { - "server_id": "192.168.24.217", - "device": [ - { - "device_id": "0", - "device_ip": "192.1.184.23", - "rank_id": "0" - }, - { - "device_id": "1", - "device_ip": "192.2.21.93", - "rank_id": "1" - } - ] - }, - { - "server_id": "192.168.26.177", - "device": [ - { - "device_id": "0", - "device_ip": "192.1.94.132", - "rank_id": "2" - }, - { - "device_id": "1", - "device_ip": "192.2.94.30", - "rank_id": "3" - } - ] - } - ] - } - - Returns: - node_ips: node ip list - device_count: number of npu per machine - """ - json_data = None - with open(rank_table_file_path) as json_file: - json_data = json.load(json_file) - - node_ips = [] - device_count = 0 - server_list = json_data['server_list'] - for server in server_list: - device_list = server['device'] - device_count = len(device_list) - if os.getenv("FLAGS_MODELARTS", None): - nodes = os.getenv("DLS_TASK_NUMBER", None) - assert nodes is not None, "DLS_TASK_NUMBER didn't set!" - for node in range(int(nodes)): - node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) - assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" - node_ips.append(node_ip) - return node_ips, device_count - node_ips.append(server['server_id']) - return node_ips, device_count - - -def get_cloud_cluster( - rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, start_port=6070 -): - """ - Args: - rank_table_file: string, ascend npu rank file path - device_mode: DeviceMode(Int) - start_port: the start port of current runtime env - """ - if rank_table_file: - # multi trainers - node_ips, device_count = _get_ascend_rankfile(rank_table_file) - if len(node_ips) == 1: - node_ip = node_ips[0] - else: - node_index = os.environ.get("PADDLE_TRAINER_ID") - node_ip = None - if node_index: - node_ip = node_ips[int(node_index)] - else: - _, node_ip = get_host_name_ip() - - assert ( - node_ip in node_ips - ), "Can't find your local ip {{{}}} in node_ips: {{{}}}".format( - node_ip, - node_ips, - ) - else: - # single trainer (single ascend card) - node_ips = ["127.0.0.1"] - node_ip = node_ips[0] - device_count = 1 - - devices_per_proc = [str(x) for x in range(device_count)] - free_ports = list(range(start_port, start_port + len(devices_per_proc))) - - trainer_endpoints = [] - for ip in node_ips: - trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) - - return get_cluster( - node_ips, node_ip, trainer_endpoints, device_mode, devices_per_proc - ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py deleted file mode 100644 index 218cf786a3a35..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import unittest - -from paddle.distributed.fleet import ascend_utils - -RANK_TABLE_JSON = { - "status": "completed", - "version": "1.0", - "server_count": "1", - "server_list": [ - { - "server_id": "127.0.0.1", - "device": [ - {"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"}, - {"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"}, - ], - } - ], -} - - -class TestAscendUtil(unittest.TestCase): - def test_get_cloud_cluster(self): - cluster, pod = ascend_utils.get_cloud_cluster() - self.assertTrue(cluster) - self.assertTrue(pod) - - with open('rank_table_file.json', 'w') as f: - json.dump(RANK_TABLE_JSON, f) - rank_table_file = "./rank_table_file.json" - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=rank_table_file - ) - self.assertTrue(cluster) - self.assertTrue(pod) - - -if __name__ == '__main__': - unittest.main() From e0f1c0d144256d442771d3efa3b5a6e2ff816583 Mon Sep 17 00:00:00 2001 From: baocheny Date: Sat, 1 Apr 2023 14:07:47 +0800 Subject: [PATCH 5/7] remove some tests --- python/paddle/amp/auto_cast.py | 5 -- python/paddle/device/__init__.py | 50 ---------------- .../sharding/group_sharded_storage.py | 2 +- .../ascend_multi_process_collective.py | 48 --------------- .../tests/unittests/c_embedding_op_base.py | 8 +-- .../test_cuda_max_memory_allocated.py | 1 - .../test_cuda_max_memory_reserved.py | 1 - .../unittests/test_cuda_memory_allocated.py | 1 - .../unittests/test_cuda_memory_reserved.py | 1 - .../tests/unittests/test_npu_identity_op.py | 59 ------------------- python/paddle/static/io.py | 8 --- python/paddle/tensor/creation.py | 2 - 12 files changed, 2 insertions(+), 184 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py delete mode 100644 python/paddle/fluid/tests/unittests/test_npu_identity_op.py diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 9262fab7a5ef5..a4c2ffaea70f5 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -345,7 +345,6 @@ def amp_guard( tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() - or tracer._expected_place.is_npu_place() or tracer._expected_place.is_custom_place() ): warnings.warn( @@ -353,10 +352,6 @@ def amp_guard( % tracer._expected_place ) enable = False - # For npu: - if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): - warnings.warn('NPUPlace only support float16 amp.') - enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 359b981a0925a..45285f51465d5 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -37,7 +37,6 @@ 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', - 'is_compiled_with_npu', 
     'is_compiled_with_mlu',
     'is_compiled_with_custom_device',
     'get_all_device_type',
@@ -55,24 +54,6 @@
 _cudnn_version = None
 
 
-# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future
-# for consistent.
-def is_compiled_with_npu():
-    """
-    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.
-
-    Return:
-        bool, ``True`` if NPU is supported, otherwise ``False``.
-
-    Examples:
-        .. code-block:: python
-
-            import paddle
-            support_npu = paddle.device.is_compiled_with_npu()
-    """
-    return core.is_compiled_with_npu()
-
-
 def is_compiled_with_custom_device(device_type):
     """
     Whether paddle was built with Paddle_CUSTOM_DEVICE .
@@ -247,15 +228,6 @@ def _convert_to_place(device):
         selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
         device_id = int(selected_xpus[0])
         place = core.XPUPlace(device_id)
-    elif lower_device == 'npu':
-        if not core.is_compiled_with_custom_device('npu'):
-            raise ValueError(
-                "The device should not be 'npu', "
-                "since PaddlePaddle is not compiled with NPU"
-            )
-        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
-        device_id = int(selected_npus[0])
-        place = core.CustomPlace('npu', device_id)
     elif lower_device == 'ipu':
         if not core.is_compiled_with_ipu():
             raise ValueError(
@@ -275,7 +247,6 @@ def _convert_to_place(device):
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
-        avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         avaliable_mlu_device = re.match(r'mlu:\d+', lower_device)
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
@@ -297,26 +268,6 @@ def _convert_to_place(device):
                 device_id = device_info_list[1]
                 device_id = int(device_id)
                 place = core.XPUPlace(device_id)
-            if avaliable_npu_device:
-                if not core.is_compiled_with_custom_device('npu'):
-                    device_info_list = device.split(':', 1)
-                    device_type = device_info_list[0]
-                    if device_type in core.get_all_custom_device_type():
-                        device_id = device_info_list[1]
-                        device_id = int(device_id)
-                        place = core.CustomPlace(device_type, device_id)
-                        return place
-                    else:
-                        raise ValueError(
-                            "The device should not be {}, since PaddlePaddle is "
-                            "not compiled with NPU or compiled with custom device".format(
-                                avaliable_npu_device
-                            )
-                        )
-                device_info_list = device.split(':', 1)
-                device_id = device_info_list[1]
-                device_id = int(device_id)
-                place = core.CustomPlace('npu', device_id)
             if avaliable_mlu_device:
                 if not core.is_compiled_with_mlu():
                     raise ValueError(
@@ -330,7 +281,6 @@ def _convert_to_place(device):
         if (
             not avaliable_gpu_device
             and not avaliable_xpu_device
-            and not avaliable_npu_device
             and not avaliable_mlu_device
         ):
             device_info_list = device.split(':', 1)
diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
index 20f02b409b8db..73e1b9a9781a5 100644
--- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
+++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
@@ -78,7 +78,7 @@ def to(self, device, dtype=None, keep_alignment=True):
         if self._device != device:
             tmp_buffer = (
                 cvt_to_device(self.buffer, self.dev_id)
-                if device in ["gpu", "xpu", "npu"]
+                if device in ["gpu", "xpu"]
                 else self.buffer.cpu()
             )
             for param in self._params:
diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
deleted file mode 100644
index 572e6caa1d7f4..0000000000000
--- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-
-
-def train(prefix):
-    selected_accelerators = os.getenv("FLAGS_selected_accelerators")
-    selected_npus = os.getenv("FLAGS_selected_npus")
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
-    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
-    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
-    worker_endpoints = worker_endpoints_env
-    trainers_num = len(worker_endpoints.split(','))
-    device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
-    current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
-
-    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
-        selected_accelerators,
-        selected_npus,
-        worker_endpoints,
-        trainers_num,
-        current_endpoint,
-        trainer_id,
-        device_ids,
-        current_device_id,
-    )
-
-    print(details)
-    with open(f"multi_process_{prefix}.check_{trainer_id}.log", "w") as f:
-        f.write(details)
-
-
-if __name__ == '__main__':
-    prefix = sys.argv[1]
-    train(prefix)
diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
index 762961ca5e8a5..3eda046571e37 100644
--- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
+++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
@@ -55,9 +55,7 @@ def initcase(self):
         np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
         self.outputs = {'Out': np_out.reshape((2, 4, 64))}
         self.attrs = {'start_index': self.start_index}
-        if core.is_compiled_with_npu():
-            self.__class__.use_npu = True
-        elif core.is_compiled_with_xpu():
+        if core.is_compiled_with_xpu():
             self.__class__.use_xpu = True
 
     def test_check_cpu(self):
@@ -79,16 +77,12 @@ def setUp(self):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             self.check_output_with_place(core.CUDAPlace(0))
-        elif core.is_compiled_with_npu():
-            self.check_output_with_place(core.NPUPlace(0))
         elif core.is_compiled_with_xpu():
             self.check_output_with_place(core.XPUPlace(0))
 
     def test_check_grad(self):
         if core.is_compiled_with_cuda():
             self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
-        elif core.is_compiled_with_npu():
-            self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
         elif core.is_compiled_with_xpu():
             self.check_grad_with_place(core.XPUPlace(0), ['W'], 'Out')
 
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
index 589c6643f726e..c7b6828a60c1d 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
@@ -59,7 +59,6 @@ def test_max_memory_allocated_exception(self):
-2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py index 9651d893fa776..07d2b5a9ded19 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_reserved.py @@ -59,7 +59,6 @@ def test_max_memory_reserved_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py index 16bbb51d559ea..252dd6f93117c 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_allocated.py @@ -44,7 +44,6 @@ def test_memory_allocated_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py index ca89a1a7dfe18..941ab27ecc5ba 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py @@ -44,7 +44,6 @@ def test_memory_reserved_exception(self): -2, 0.5, "gpu1", - "npu", ] for device in wrong_device: with self.assertRaises(BaseException): diff --git a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py deleted file mode 100644 index da87384c4566c..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import unittest
-
-import numpy as np
-
-import paddle
-
-
-class TestNPUIdentityOp(unittest.TestCase):
-    def setUp(self):
-        self.op_type = "npu_identity"
-        self.shape = [64, 6, 28, 28]
-        self.x = np.random.random(self.shape).astype(np.float32)
-        self.format = 3  # ACL_FORMAT_NC1HWC0 = 3
-        self.place = paddle.CPUPlace()
-
-    def test_api_static(self):
-        paddle.enable_static()
-
-        main_program = paddle.static.default_main_program()
-        startup_program = paddle.static.default_startup_program()
-        with paddle.static.program_guard(main_program, startup_program):
-            x_data = paddle.static.data(
-                shape=self.shape, name="data", dtype='float32'
-            )
-            output = paddle.incubate._npu_identity(x=x_data, format=self.format)
-            exe = paddle.static.Executor()
-            exe.run(startup_program)
-            result = exe.run(
-                main_program, feed={x_data.name: self.x}, fetch_list=[output]
-            )
-
-        np.testing.assert_allclose(result[0], self.x, rtol=1e-08)
-
-    def test_api_dygraph(self):
-        paddle.disable_static(self.place)
-
-        x = paddle.to_tensor(self.x)
-        out = paddle.incubate._npu_identity(x, self.format)
-
-        np.testing.assert_allclose(out.numpy(), self.x, rtol=1e-08)
-        paddle.enable_static()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index cac8f821c5d72..b9297ddde2a6b 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -1536,10 +1536,6 @@ def set_var(var, ndarray):
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
             place = paddle.fluid.XPUPlace(p.xpu_device_id())
-        elif p.is_npu_place():
-            p = paddle.fluid.core.Place()
-            p.set_place(t._place())
-            place = paddle.fluid.NPUPlace(p.npu_device_id())
         elif p.is_mlu_place():
             p = paddle.fluid.core.Place()
             p.set_place(t._place())
@@ -1680,10 +1676,6 @@ def set_program_state(program, state_dict):
                 p = paddle.fluid.core.Place()
                 p.set_place(ten_place)
                 py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
-            elif ten_place.is_npu_place():
-                p = paddle.fluid.core.Place()
-                p.set_place(ten_place)
-                py_place = paddle.fluid.NPUPlace(p.npu_device_id())
             elif ten_place.is_mlu_place():
                 p = paddle.fluid.core.Place()
                 p.set_place(ten_place)
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 74d2cfb88c0df..602fa7186ec84 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -2246,8 +2246,6 @@ def _memcpy(input, place=None, output=None):
             dst_place_type = 2
         elif p.is_xpu_place():
             dst_place_type = 3
-        elif p.is_npu_place():
-            dst_place_type = 4
 
     attrs = {'dst_place_type': dst_place_type}
     helper.append_op(

From f052f64fd448a33be22fd5de8670afd350ca2c13 Mon Sep 17 00:00:00 2001
From: Kim Yann
Date: Mon, 3 Apr 2023 10:44:24 +0800
Subject: [PATCH 6/7] Update grad_scaler.py

---
 python/paddle/amp/grad_scaler.py | 98 ++++++++++----------------------
 1 file changed, 30 insertions(+), 68 deletions(-)

diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index 009316514f5ec..31d2981e8961e 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -326,74 +326,36 @@ def _unscale(self, optimizer):
             if param.dtype == core.VarDesc.VarType.FP32
         ]
         self._found_inf = self._temp_found_inf_value_false
-        if core.is_compiled_with_custom_device('npu'):
-            float_status = _legacy_C_ops.alloc_float_status()
-            _legacy_C_ops.clear_float_status(float_status, float_status)
-
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    float_status,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    float_status,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    float_status,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
-        else:
-            if len(param_grads_fp16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp16,
-                    self._scale,
-                    param_grads_fp16,
-                    self._temp_found_inf_fp16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
-            if len(param_grads_bf16):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_bf16,
-                    self._scale,
-                    param_grads_bf16,
-                    self._temp_found_inf_bf16,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
-            if len(param_grads_fp32):
-                _legacy_C_ops.check_finite_and_unscale(
-                    param_grads_fp32,
-                    self._scale,
-                    param_grads_fp32,
-                    self._temp_found_inf_fp32,
-                )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+        if len(param_grads_fp16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp16,
+                self._scale,
+                param_grads_fp16,
+                self._temp_found_inf_fp16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp16
+            )
+        if len(param_grads_bf16):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_bf16,
+                self._scale,
+                param_grads_bf16,
+                self._temp_found_inf_bf16,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_bf16
+            )
+        if len(param_grads_fp32):
+            _legacy_C_ops.check_finite_and_unscale(
+                param_grads_fp32,
+                self._scale,
+                param_grads_fp32,
+                self._temp_found_inf_fp32,
+            )
+            self._found_inf = _C_ops.bitwise_or(
+                self._found_inf, self._temp_found_inf_fp32
+            )
 
         optimizer_state["state"] = OptimizerState.UNSCALED

From 46a719336bc23e11fad35a575445ce846c7bddef Mon Sep 17 00:00:00 2001
From: baocheny
Date: Tue, 4 Apr 2023 16:59:03 +0800
Subject: [PATCH 7/7] fix an error

---
 paddle/fluid/pybind/pybind.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 45b6af736a9ed..88a984244498b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1960,7 +1960,6 @@ All parameter, weight, gradient are variables in Paddle.
       });
   m.def("is_compiled_with_avx", IsCompiledWithAVX);
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
-  m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice);
   m.def("is_compiled_with_ipu", IsCompiledWithIPU);
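
Editor's note on migrating out-of-tree code: after this series the NPU-specific helpers are gone (core.is_compiled_with_npu, paddle.NPUPlace, paddle.static.npu_places, the 'npu' special case in paddle.set_device), and Ascend NPU support is reached through the plugin-based custom-device interface instead, mirroring the substitutions made throughout these patches. Below is a minimal sketch of the replacement pattern; it assumes an out-of-tree plugin (for example PaddleCustomDevice) has registered an 'npu' device type at import time, and the plugin name and device index 0 are illustrative assumptions, not something this series provides.

    import paddle
    from paddle.fluid import core

    # Before: core.is_compiled_with_npu()
    if core.is_compiled_with_custom_device('npu'):
        # Before: place = paddle.NPUPlace(0)
        place = core.CustomPlace('npu', 0)
        # Before: paddle.set_device('npu'); the 'npu:<id>' form is resolved
        # by the generic custom-device string parsing kept in
        # python/paddle/device/__init__.py.
        paddle.set_device('npu:0')
    else:
        # No npu plugin registered on this install: fall back to CPU.
        place = core.CPUPlace()

    # Plugin device types currently registered; empty on stock CPU/GPU builds.
    print(core.get_all_custom_device_type())

The same substitution appears verbatim at the in-tree call sites touched above (amp_nn.py, decorator.py, fp16_lists.py, static/nn/common.py), so downstream code can apply it mechanically.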